mirror of https://github.com/coqui-ai/TTS.git
Add fullband-melgan DE vocoder
This commit is contained in:
parent
764f684e1b
commit
4b7b88dd3d
|
@ -230,6 +230,11 @@
|
||||||
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
|
},
|
||||||
|
"fullband-melgan":{
|
||||||
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
|
||||||
|
"author": "@thorstenMueller",
|
||||||
|
"commit": "unknown"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,9 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from argparse import RawTextHelpFormatter
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_meta_data
|
from TTS.tts.datasets import load_meta_data
|
||||||
from TTS.tts.utils.speakers import SpeakerManager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
@ -40,7 +39,9 @@ c_dataset = load_config(args.config_dataset_path)
|
||||||
meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
|
meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
|
||||||
wav_files = meta_data_train + meta_data_eval
|
wav_files = meta_data_train + meta_data_eval
|
||||||
|
|
||||||
speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda)
|
speaker_manager = SpeakerManager(
|
||||||
|
encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
|
||||||
|
)
|
||||||
|
|
||||||
# compute speaker embeddings
|
# compute speaker embeddings
|
||||||
speaker_mapping = {}
|
speaker_mapping = {}
|
||||||
|
@ -62,7 +63,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
|
||||||
|
|
||||||
if speaker_mapping:
|
if speaker_mapping:
|
||||||
# save speaker_mapping if target dataset is defined
|
# save speaker_mapping if target dataset is defined
|
||||||
if '.json' not in args.output_path:
|
if ".json" not in args.output_path:
|
||||||
mapping_file_path = os.path.join(args.output_path, "speakers.json")
|
mapping_file_path = os.path.join(args.output_path, "speakers.json")
|
||||||
else:
|
else:
|
||||||
mapping_file_path = args.output_path
|
mapping_file_path = args.output_path
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
"""Find all the unique characters in a dataset"""
|
"""Find all the unique characters in a dataset"""
|
||||||
import argparse
|
import argparse
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
from TTS.tts.datasets import load_meta_data
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
|
from TTS.tts.datasets import load_meta_data
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -16,9 +17,7 @@ def main():
|
||||||
""",
|
""",
|
||||||
formatter_class=RawTextHelpFormatter,
|
formatter_class=RawTextHelpFormatter,
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
|
||||||
"--config_path", type=str, help="Path to dataset config file.", required=True
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
c = load_config(args.config_path)
|
c = load_config(args.config_path)
|
||||||
|
@ -38,5 +37,6 @@ def main():
|
||||||
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
|
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
|
||||||
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
|
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import torch
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
@ -81,12 +81,12 @@ class LSTMSpeakerEncoder(nn.Module):
|
||||||
if max_len < num_frames:
|
if max_len < num_frames:
|
||||||
num_frames = max_len
|
num_frames = max_len
|
||||||
|
|
||||||
offsets = np.linspace(0, max_len-num_frames, num=num_eval)
|
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
|
||||||
|
|
||||||
frames_batch = []
|
frames_batch = []
|
||||||
for offset in offsets:
|
for offset in offsets:
|
||||||
offset = int(offset)
|
offset = int(offset)
|
||||||
end_offset = int(offset+num_frames)
|
end_offset = int(offset + num_frames)
|
||||||
frames = x[:, offset:end_offset]
|
frames = x[:, offset:end_offset]
|
||||||
frames_batch.append(frames)
|
frames_batch.append(frames)
|
||||||
|
|
||||||
|
|
|
@ -291,18 +291,20 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
def mls(root_path, meta_files=None):
|
def mls(root_path, meta_files=None):
|
||||||
"""http://www.openslr.org/94/"""
|
"""http://www.openslr.org/94/"""
|
||||||
items = []
|
items = []
|
||||||
with open(os.path.join(root_path, meta_files), "r") as meta:
|
with open(os.path.join(root_path, meta_files), "r") as meta:
|
||||||
for line in meta:
|
for line in meta:
|
||||||
file, text = line.split('\t')
|
file, text = line.split("\t")
|
||||||
text = text[:-1]
|
text = text[:-1]
|
||||||
speaker, book, *_ = file.split('_')
|
speaker, book, *_ = file.split("_")
|
||||||
wav_file = os.path.join(root_path, os.path.dirname(meta_files), 'audio', speaker, book, file + ".wav")
|
wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
|
||||||
items.append([text, wav_file, "MLS_" + speaker])
|
items.append([text, wav_file, "MLS_" + speaker])
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
# ======================================== VOX CELEB ===========================================
|
# ======================================== VOX CELEB ===========================================
|
||||||
def voxceleb2(root_path, meta_file=None):
|
def voxceleb2(root_path, meta_file=None):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue