styling and linting

Eren Gölge 2021-04-23 17:57:27 +02:00
parent a878d8fb42
commit 4cf211348d
5 changed files with 64 additions and 106 deletions

View File

@@ -184,8 +184,14 @@ def main():
# load models
synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, encoder_path,
encoder_config_path, args.use_cuda
model_path,
config_path,
speakers_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
args.use_cuda,
)
# query speaker ids of a multi-speaker model.
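The exploded call above ends with a trailing comma after the last argument, which matches black's "magic trailing comma" behavior (assuming black is the formatter behind this styling pass): once that comma is present, the call stays at one argument per line. A small, hypothetical illustration of the rule, not taken from this commit:

def add(a, b, c):
    return a + b + c

# The trailing comma after the last argument tells black to keep the
# one-argument-per-line layout instead of collapsing the call onto one line.
total = add(
    1,
    2,
    3,
)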

View File

@@ -1,12 +1,11 @@
#!flask/bin/python
from typing import Union
import argparse
import io
import json
import os
import sys
from pathlib import Path
import json
from typing import Union
from flask import Flask, render_template, request, send_file
@@ -32,19 +31,12 @@ def create_argparser():
"--model_name",
type=str,
default="tts_models/en/ljspeech/tacotron2-DDC",
help=
"Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
)
parser.add_argument("--vocoder_name",
type=str,
default=None,
help="name of one of the released vocoder models.")
parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
# Args for running custom models
parser.add_argument("--config_path",
default=None,
type=str,
help="Path to model config file.")
parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
parser.add_argument(
"--model_path",
type=str,
@@ -54,34 +46,15 @@ def create_argparser():
parser.add_argument(
"--vocoder_path",
type=str,
help=
"Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
default=None,
)
parser.add_argument("--vocoder_config_path",
type=str,
help="Path to vocoder model config file.",
default=None)
parser.add_argument("--speakers_file_path",
type=str,
help="JSON file for multi-speaker model.",
default=None)
parser.add_argument("--port",
type=int,
default=5002,
help="port to listen on.")
parser.add_argument("--use_cuda",
type=convert_boolean,
default=False,
help="true to use CUDA.")
parser.add_argument("--debug",
type=convert_boolean,
default=False,
help="true to enable Flask debug mode.")
parser.add_argument("--show_details",
type=convert_boolean,
default=False,
help="Generate model detail page.")
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
return parser
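Several flags above (--use_cuda, --debug, --show_details) use convert_boolean as their argparse type. The helper is not shown in this diff; a plausible minimal sketch, purely illustrative and possibly different from the real implementation:

def convert_boolean(value):
    # argparse passes the raw command-line string; map common truthy
    # spellings to True and everything else to False.
    return str(value).lower() in ("true", "1", "yes")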
@@ -109,14 +82,11 @@ if args.list_models:
# CASE2: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(
args.model_name)
args.vocoder_name = model_item[
"default_vocoder"] if args.vocoder_name is None else args.vocoder_name
model_path, config_path, model_item = manager.download_model(args.model_name)
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(
args.vocoder_name)
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
# CASE3: set custom model paths
if args.model_path is not None:
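The conditional expression that fills in the default vocoder above is equivalent to this more explicit form (an illustrative restatement, not part of the commit):

# Fall back to the model's default vocoder only when no vocoder was requested.
if args.vocoder_name is None:
    args.vocoder_name = model_item["default_vocoder"]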
@@ -129,8 +99,7 @@ if args.vocoder_path is not None:
vocoder_config_path = args.vocoder_config_path
# load models
synthesizer = Synthesizer(model_path, config_path, speakers_file_path,
vocoder_path, vocoder_config_path, args.use_cuda)
synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)
use_multi_speaker = synthesizer.speaker_manager is not None
# TODO: set this from SpeakerManager
@@ -154,17 +123,18 @@ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
style_wav = json.loads(style_wav)
return style_wav # style_wav is a GST dictionary with {token1_id : token1_weight, ...}
else:
return None
return None
@app.route("/")
def index():
return render_template("index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
use_gst=use_gst)
return render_template(
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
use_gst=use_gst,
)
@app.route("/details")
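The dedented return None in style_wav_uri_to_dict above, like a similar change in compute_x_vector_from_clip later in this commit, drops an else: that followed a returning branch, the pattern linters typically report as no-else-return. A small, hypothetical example of the rule:

def sign(x):
    if x >= 0:
        return 1
    # The else: that previously wrapped this return was redundant: control
    # only reaches this line when the condition above is false.
    return -1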

View File

@@ -1,6 +1,7 @@
import json
import os
import random
from typing import Union
import numpy as np
import torch
@@ -9,8 +10,6 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from typing import Union
def make_speakers_json_path(out_path):
"""Returns conventional speakers.json location."""
@@ -52,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
print(
"WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
)
speaker_mapping = load_speaker_mapping(
c.external_speaker_embedding_file)
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
if not speaker_mapping:
raise RuntimeError(
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
)
speaker_embedding_dim = len(speaker_mapping[list(
speaker_mapping.keys())[0]]["embedding"])
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
elif (
not c.use_external_speaker_embedding_file
not c.use_external_speaker_embedding_file
): # if restore checkpoint and don't use External Embedding file
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
speaker_embedding_dim = None
assert all(
speaker in speaker_mapping
for speaker in speakers), ("As of now, you cannot "
"introduce new speakers to "
"a previously trained model.")
elif (c.use_external_speaker_embedding_file
and c.external_speaker_embedding_file
): # if start new train using External Embedding file
speaker_mapping = load_speaker_mapping(
c.external_speaker_embedding_file)
speaker_embedding_dim = len(speaker_mapping[list(
speaker_mapping.keys())[0]]["embedding"])
assert all(speaker in speaker_mapping for speaker in speakers), (
"As of now, you cannot " "introduce new speakers to " "a previously trained model."
)
elif (
c.use_external_speaker_embedding_file
and not c.external_speaker_embedding_file
c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
): # if start new train using External Embedding file
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
elif (
c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
): # if start new train using External Embedding file and don't pass external embedding file
raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
else: # if start new train and don't use External Embedding file
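One context line above raises a bare string ("use_external_speaker_embedding_file is True, ..."). In Python 3 that is itself an error: raising a non-exception object fails with TypeError: exceptions must derive from BaseException, so the intended message never reaches the user. A hedged sketch of a fix, not part of this commit:

raise ValueError(
    "use_external_speaker_embedding_file is True, so you need to pass an external "
    "speaker embedding file. Run the GE2E or AngularPrototypical "
    "ExtractSpeakerEmbeddings-by-sample notebook in the notebooks/ folder."
)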
@@ -88,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
speaker_embedding_dim = None
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print(" > Training with {} speakers: {}".format(
len(speakers), ", ".join(speakers)))
print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
else:
num_speakers = 0
speaker_embedding_dim = None
@@ -134,6 +125,7 @@ class SpeakerManager:
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
"""
def __init__(
self,
x_vectors_file_path: str = "",
@@ -192,34 +184,23 @@ class SpeakerManager:
def load_x_vectors_file(self, file_path: str):
self.x_vectors = self._load_json(file_path)
self.speaker_ids = list(
set(sorted(x["name"] for x in self.x_vectors.values())))
self.clip_ids = list(
set(sorted(clip_name for clip_name in self.x_vectors.keys())))
self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))
def get_x_vector_by_clip(self, clip_idx: str):
return self.x_vectors[clip_idx]["embedding"]
def get_x_vectors_by_speaker(self, speaker_idx: str):
return [
x["embedding"] for x in self.x_vectors.values()
if x["name"] == speaker_idx
]
return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]
def get_mean_x_vector(self,
speaker_idx: str,
num_samples: int = None,
randomize: bool = False):
def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
if num_samples is None:
x_vectors = np.stack(x_vectors).mean(0)
else:
assert len(
x_vectors
) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
if randomize:
x_vectors = np.stack(random.choices(x_vectors,
k=num_samples)).mean(0)
x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
else:
x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
return x_vectors
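get_mean_x_vector above averages a speaker's embeddings; note that random.choices samples with replacement, so the same clip can be drawn more than once. A self-contained sketch of the same averaging logic on dummy data (random.sample shown as the without-replacement alternative, which may or may not be the intended behavior):

import random
import numpy as np

x_vectors = [np.full(256, float(i)) for i in range(5)]  # five dummy 256-dim embeddings
num_samples = 3

with_replacement = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
without_replacement = np.stack(random.sample(x_vectors, k=num_samples)).mean(0)
first_n = np.stack(x_vectors[:num_samples]).mean(0)  # the non-randomized branch above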
@@ -234,21 +215,20 @@ class SpeakerManager:
self.speaker_encoder_config = load_config(config_path)
self.speaker_encoder = setup_model(self.speaker_encoder_config)
self.speaker_encoder.load_checkpoint(config_path, model_path, True)
self.speaker_encoder_ap = AudioProcessor(
**self.speaker_encoder_config.audio)
self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
# normalize the input audio level and trim silences
self.speaker_encoder_ap.do_sound_norm = True
self.speaker_encoder_ap.do_trim_silence = True
def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
def _compute(wav_file: str):
waveform = self.speaker_encoder_ap.load_wav(
wav_file, sr=self.speaker_encoder_ap.sample_rate)
waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
spec = self.speaker_encoder_ap.melspectrogram(waveform)
spec = torch.from_numpy(spec.T)
spec = spec.unsqueeze(0)
x_vector = self.speaker_encoder.compute_embedding(spec)
return x_vector
if isinstance(wav_file, list):
# compute the mean x_vector
x_vectors = None
@@ -259,9 +239,8 @@ class SpeakerManager:
else:
x_vectors += x_vector
return (x_vectors / len(wav_file))[0].tolist()
else:
x_vector = _compute(wav_file)
return x_vector[0].tolist()
x_vector = _compute(wav_file)
return x_vector[0].tolist()
def compute_x_vector(self, feats):
if isinstance(feats, np.ndarray):
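compute_x_vector_from_clip accepts either a single wav path or a list of paths; for a list, the per-clip embeddings are summed and divided by the number of clips, i.e. the mean x-vector. A hedged usage sketch (file names are placeholders and manager is assumed to be a SpeakerManager with the encoder already loaded):

# Single clip: the embedding of that clip as a plain Python list.
x_vec = manager.compute_x_vector_from_clip("speaker1_clip1.wav")

# Several clips: the mean x-vector across the clips.
mean_x_vec = manager.compute_x_vector_from_clip(["speaker1_clip1.wav", "speaker1_clip2.wav"])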

View File

@@ -93,7 +93,9 @@ class Synthesizer(object):
speaker_file (str): path to the speakers meta-data file.
"""
print("Loading speakers ...")
self.speaker_manager = SpeakerManager(encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config)
self.speaker_manager = SpeakerManager(
encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
)
self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file))
self.num_speakers = self.speaker_manager.num_speakers
self.speaker_embedding_dim = self.speaker_manager.x_vector_dim

View File

@@ -1,8 +1,8 @@
import os
import torch
import unittest
import numpy as np
import torch
from tests import get_tests_input_path
from TTS.tts.utils.speakers import SpeakerManager
@@ -15,8 +15,10 @@ sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
class SpeakerManagerTest(unittest.TestCase):
"""Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
@staticmethod
def test_speaker_embedding():
# load config
@@ -47,7 +49,6 @@ class SpeakerManagerTest(unittest.TestCase):
assert x_vector3.shape[0] == 256
assert (x_vector - x_vector3).sum() != 0.0
@staticmethod
def test_speakers_file_processing():
manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)