Add fullband-melgan DE vocoder

This commit is contained in:
Eren Gölge 2021-07-26 14:56:05 +02:00
parent 764f684e1b
commit 4b7b88dd3d
6 changed files with 25 additions and 17 deletions

View File

@@ -230,6 +230,11 @@
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller", "author": "@thorstenMueller",
"commit": "unknown" "commit": "unknown"
},
"fullband-melgan":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
"author": "@thorstenMueller",
"commit": "unknown"
} }
} }
} }

View File

@@ -1,10 +1,9 @@
import argparse import argparse
import os import os
from argparse import RawTextHelpFormatter
from tqdm import tqdm from tqdm import tqdm
from argparse import RawTextHelpFormatter
from TTS.config import load_config from TTS.config import load_config
from TTS.tts.datasets import load_meta_data from TTS.tts.datasets import load_meta_data
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
@@ -40,7 +39,9 @@ c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval) meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
wav_files = meta_data_train + meta_data_eval wav_files = meta_data_train + meta_data_eval
speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda) speaker_manager = SpeakerManager(
encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
)
# compute speaker embeddings # compute speaker embeddings
speaker_mapping = {} speaker_mapping = {}
@@ -62,7 +63,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
if speaker_mapping: if speaker_mapping:
# save speaker_mapping if target dataset is defined # save speaker_mapping if target dataset is defined
if '.json' not in args.output_path: if ".json" not in args.output_path:
mapping_file_path = os.path.join(args.output_path, "speakers.json") mapping_file_path = os.path.join(args.output_path, "speakers.json")
else: else:
mapping_file_path = args.output_path mapping_file_path = args.output_path

View File

@@ -1,8 +1,9 @@
"""Find all the unique characters in a dataset""" """Find all the unique characters in a dataset"""
import argparse import argparse
from argparse import RawTextHelpFormatter from argparse import RawTextHelpFormatter
from TTS.tts.datasets import load_meta_data
from TTS.config import load_config from TTS.config import load_config
from TTS.tts.datasets import load_meta_data
def main(): def main():
@@ -16,9 +17,7 @@ def main():
""", """,
formatter_class=RawTextHelpFormatter, formatter_class=RawTextHelpFormatter,
) )
parser.add_argument( parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
"--config_path", type=str, help="Path to dataset config file.", required=True
)
args = parser.parse_args() args = parser.parse_args()
c = load_config(args.config_path) c = load_config(args.config_path)
@@ -38,5 +37,6 @@ def main():
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -1,5 +1,5 @@
import torch
import numpy as np import numpy as np
import torch
from torch import nn from torch import nn
@@ -81,12 +81,12 @@ class LSTMSpeakerEncoder(nn.Module):
if max_len < num_frames: if max_len < num_frames:
num_frames = max_len num_frames = max_len
offsets = np.linspace(0, max_len-num_frames, num=num_eval) offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = [] frames_batch = []
for offset in offsets: for offset in offsets:
offset = int(offset) offset = int(offset)
end_offset = int(offset+num_frames) end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset] frames = x[:, offset:end_offset]
frames_batch.append(frames) frames_batch.append(frames)

View File

@@ -291,18 +291,20 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
return items return items
def mls(root_path, meta_files=None): def mls(root_path, meta_files=None):
"""http://www.openslr.org/94/""" """http://www.openslr.org/94/"""
items = [] items = []
with open(os.path.join(root_path, meta_files), "r") as meta: with open(os.path.join(root_path, meta_files), "r") as meta:
for line in meta: for line in meta:
file, text = line.split('\t') file, text = line.split("\t")
text = text[:-1] text = text[:-1]
speaker, book, *_ = file.split('_') speaker, book, *_ = file.split("_")
wav_file = os.path.join(root_path, os.path.dirname(meta_files), 'audio', speaker, book, file + ".wav") wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
items.append([text, wav_file, "MLS_" + speaker]) items.append([text, wav_file, "MLS_" + speaker])
return items return items
# ======================================== VOX CELEB =========================================== # ======================================== VOX CELEB ===========================================
def voxceleb2(root_path, meta_file=None): def voxceleb2(root_path, meta_file=None):
""" """