mirror of https://github.com/coqui-ai/TTS.git

styling and linting

commit 4cf211348d
parent a878d8fb42
@@ -184,8 +184,14 @@ def main():
     # load models
     synthesizer = Synthesizer(
-        model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, encoder_path,
-        encoder_config_path, args.use_cuda
+        model_path,
+        config_path,
+        speakers_file_path,
+        vocoder_path,
+        vocoder_config_path,
+        encoder_path,
+        encoder_config_path,
+        args.use_cuda,
     )

     # query speaker ids of a multi-speaker model.
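Note: the hunk above is layout-only; black puts one argument per line, but the constructor signature is unchanged. For context, a minimal usage sketch of this Synthesizer API — the file paths are hypothetical placeholders, and tts()/save_wav() are assumed from this version of the codebase:

    from TTS.utils.synthesizer import Synthesizer

    # Hypothetical local paths; a real setup would obtain these via ModelManager.
    synthesizer = Synthesizer(
        "tts_model.pth.tar",  # model_path
        "config.json",        # config_path
        None,                 # speakers_file_path
        None,                 # vocoder_path (no vocoder -> Griffin-Lim fallback)
        None,                 # vocoder_config_path
        None,                 # encoder_path
        None,                 # encoder_config_path
        False,                # use_cuda
    )
    wav = synthesizer.tts("Hello world.")
    synthesizer.save_wav(wav, "output.wav")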
@@ -1,12 +1,11 @@
 #!flask/bin/python
-from typing import Union
-
 import argparse
 import io
+import json
 import os
 import sys
 from pathlib import Path
-import json
+from typing import Union

 from flask import Flask, render_template, request, send_file
@@ -32,19 +31,12 @@ def create_argparser():
         "--model_name",
         type=str,
         default="tts_models/en/ljspeech/tacotron2-DDC",
-        help=
-        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
+        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
     )
-    parser.add_argument("--vocoder_name",
-                        type=str,
-                        default=None,
-                        help="name of one of the released vocoder models.")
+    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")

     # Args for running custom models
-    parser.add_argument("--config_path",
-                        default=None,
-                        type=str,
-                        help="Path to model config file.")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
     parser.add_argument(
         "--model_path",
         type=str,
@@ -54,34 +46,15 @@ def create_argparser():
     parser.add_argument(
         "--vocoder_path",
         type=str,
-        help=
-        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
         default=None,
     )
-    parser.add_argument("--vocoder_config_path",
-                        type=str,
-                        help="Path to vocoder model config file.",
-                        default=None)
-    parser.add_argument("--speakers_file_path",
-                        type=str,
-                        help="JSON file for multi-speaker model.",
-                        default=None)
-    parser.add_argument("--port",
-                        type=int,
-                        default=5002,
-                        help="port to listen on.")
-    parser.add_argument("--use_cuda",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to use CUDA.")
-    parser.add_argument("--debug",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to enable Flask debug mode.")
-    parser.add_argument("--show_details",
-                        type=convert_boolean,
-                        default=False,
-                        help="Generate model detail page.")
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
+    parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
+    parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
     return parser
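convert_boolean is referenced by the flags above but defined outside this hunk. A minimal sketch of such an argparse type, assuming the usual string-to-bool convention (the repo's own definition may differ):

    import argparse


    def convert_boolean(x: str) -> bool:
        # argparse passes the raw CLI string; treat common truthy spellings as True.
        return x.lower() in ["true", "1", "yes"]


    parser = argparse.ArgumentParser()
    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
    print(parser.parse_args(["--use_cuda", "true"]).use_cuda)  # True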
@@ -109,14 +82,11 @@ if args.list_models:

 # CASE2: load pre-trained model paths
 if args.model_name is not None and not args.model_path:
-    model_path, config_path, model_item = manager.download_model(
-        args.model_name)
-    args.vocoder_name = model_item[
-        "default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+    model_path, config_path, model_item = manager.download_model(args.model_name)
+    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

 if args.vocoder_name is not None and not args.vocoder_path:
-    vocoder_path, vocoder_config_path, _ = manager.download_model(
-        args.vocoder_name)
+    vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

 # CASE3: set custome model paths
 if args.model_path is not None:
@@ -129,8 +99,7 @@ if args.vocoder_path is not None:
     vocoder_config_path = args.vocoder_config_path

 # load models
-synthesizer = Synthesizer(model_path, config_path, speakers_file_path,
-                          vocoder_path, vocoder_config_path, args.use_cuda)
+synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)

 use_multi_speaker = synthesizer.speaker_manager is not None
 # TODO: set this from SpeakerManager
@@ -154,17 +123,18 @@ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:

         style_wav = json.loads(style_wav)
         return style_wav  # style_wav is a gst dictionary with {token1_id : token1_weigth, ...}
-    else:
-        return None
+    return None


 @app.route("/")
 def index():
-    return render_template("index.html",
-                           show_details=args.show_details,
-                           use_multi_speaker=use_multi_speaker,
-                           speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
-                           use_gst=use_gst)
+    return render_template(
+        "index.html",
+        show_details=args.show_details,
+        use_multi_speaker=use_multi_speaker,
+        speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
+        use_gst=use_gst,
+    )


 @app.route("/details")
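For reference, the logic of style_wav_uri_to_dict implied by the hunk above, rewritten as a self-contained sketch — the .wav-file check at the top is assumed from context, not shown verbatim in this diff:

    import json
    import os
    from typing import Union


    def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict, None]:
        if style_wav:
            if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
                return style_wav  # path to a style reference wav file
            style_wav = json.loads(style_wav)
            return style_wav  # gst dictionary with {token1_id: token1_weight, ...}
        return None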
@@ -1,6 +1,7 @@
 import json
 import os
 import random
+from typing import Union

 import numpy as np
 import torch
@@ -9,8 +10,6 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config

-from typing import Union
-

 def make_speakers_json_path(out_path):
     """Returns conventional speakers.json location."""
@@ -52,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
                 print(
                     "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
                 )
-                speaker_mapping = load_speaker_mapping(
-                    c.external_speaker_embedding_file)
+                speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                 if not speaker_mapping:
                     raise RuntimeError(
                         "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                     )
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
-                not c.use_external_speaker_embedding_file
+            not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(
-                speaker in speaker_mapping
-                for speaker in speakers), ("As of now you, you cannot "
-                                           "introduce new speakers to "
-                                           "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            assert all(speaker in speaker_mapping for speaker in speakers), (
+                "As of now you, you cannot " "introduce new speakers to " "a previously trained model."
+            )
         elif (
-                c.use_external_speaker_embedding_file
-                and not c.external_speaker_embedding_file
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
         else:  # if start new train and don't use External Embedding file
@@ -88,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -134,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "".
     """
+
     def __init__(
         self,
         x_vectors_file_path: str = "",
@@ -192,34 +184,23 @@ class SpeakerManager:

     def load_x_vectors_file(self, file_path: str):
         self.x_vectors = self._load_json(file_path)
-        self.speaker_ids = list(
-            set(sorted(x["name"] for x in self.x_vectors.values())))
-        self.clip_ids = list(
-            set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+        self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
+        self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))

     def get_x_vector_by_clip(self, clip_idx: str):
         return self.x_vectors[clip_idx]["embedding"]

     def get_x_vectors_by_speaker(self, speaker_idx: str):
-        return [
-            x["embedding"] for x in self.x_vectors.values()
-            if x["name"] == speaker_idx
-        ]
+        return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]

-    def get_mean_x_vector(self,
-                          speaker_idx: str,
-                          num_samples: int = None,
-                          randomize: bool = False):
+    def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
         x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
         if num_samples is None:
             x_vectors = np.stack(x_vectors).mean(0)
         else:
-            assert len(
-                x_vectors
-            ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
+            assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
             if randomize:
-                x_vectors = np.stack(random.choices(x_vectors,
-                                                    k=num_samples)).mean(0)
+                x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
             else:
                 x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
         return x_vectors
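The get_mean_x_vector change above is layout-only; the averaging logic is unchanged. Isolated with dummy data, the numeric core looks like this (256 dimensions chosen to match the x-vector size asserted in the tests below):

    import random

    import numpy as np

    x_vectors = [np.random.rand(256) for _ in range(10)]  # dummy speaker embeddings

    num_samples = 4
    assert len(x_vectors) >= num_samples
    # random.choices samples with replacement, exactly as in get_mean_x_vector
    mean_randomized = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
    mean_first_n = np.stack(x_vectors[:num_samples]).mean(0)
    print(mean_randomized.shape, mean_first_n.shape)  # (256,) (256,)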
@@ -234,21 +215,20 @@ class SpeakerManager:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
         self.speaker_encoder.load_checkpoint(config_path, model_path, True)
-        self.speaker_encoder_ap = AudioProcessor(
-            **self.speaker_encoder_config.audio)
+        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
         # normalize the input audio level and trim silences
         self.speaker_encoder_ap.do_sound_norm = True
         self.speaker_encoder_ap.do_trim_silence = True

     def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
         def _compute(wav_file: str):
-            waveform = self.speaker_encoder_ap.load_wav(
-                wav_file, sr=self.speaker_encoder_ap.sample_rate)
+            waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
             spec = self.speaker_encoder_ap.melspectrogram(waveform)
             spec = torch.from_numpy(spec.T)
             spec = spec.unsqueeze(0)
             x_vector = self.speaker_encoder.compute_embedding(spec)
             return x_vector

         if isinstance(wav_file, list):
             # compute the mean x_vector
             x_vectors = None
@@ -259,9 +239,8 @@ class SpeakerManager:
             else:
                 x_vectors += x_vector
             return (x_vectors / len(wav_file))[0].tolist()
-        else:
-            x_vector = _compute(wav_file)
-            return x_vector[0].tolist()
+        x_vector = _compute(wav_file)
+        return x_vector[0].tolist()

     def compute_x_vector(self, feats):
         if isinstance(feats, np.ndarray):
@@ -93,7 +93,9 @@ class Synthesizer(object):
             speaker_file (str): path to the speakers meta-data file.
         """
         print("Loading speakers ...")
-        self.speaker_manager = SpeakerManager(encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config)
+        self.speaker_manager = SpeakerManager(
+            encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
+        )
         self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file))
         self.num_speakers = self.speaker_manager.num_speakers
         self.speaker_embedding_dim = self.speaker_manager.x_vector_dim
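The x-vectors file loaded here is, judging from the x["name"] / x["embedding"] accessors earlier in this commit, a JSON object keyed by clip name. A hand-made example of that assumed shape, with dummy values:

    import json

    # Shape inferred from SpeakerManager's accessors; values are dummies.
    x_vectors = {
        "LJ001-0001.wav": {"name": "ljspeech", "embedding": [0.1] * 256},
        "LJ001-0002.wav": {"name": "ljspeech", "embedding": [0.2] * 256},
    }
    with open("dummy_speakers.json", "w") as f:
        json.dump(x_vectors, f)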
@@ -1,8 +1,8 @@
 import os
-import torch
 import unittest

 import numpy as np
+import torch

 from tests import get_tests_input_path
 from TTS.tts.utils.speakers import SpeakerManager
@@ -15,8 +15,10 @@ sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ
 sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
 x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")

+
 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""

+
     @staticmethod
     def test_speaker_embedding():
         # load config
@@ -47,7 +49,6 @@ class SpeakerManagerTest(unittest.TestCase):
         assert x_vector3.shape[0] == 256
         assert (x_vector - x_vector3).sum() != 0.0

-
     @staticmethod
     def test_speakers_file_processing():
         manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)