styling and linting

Eren Gölge 2021-04-23 17:57:27 +02:00
parent a878d8fb42
commit 4cf211348d
5 changed files with 64 additions and 106 deletions

View File

@@ -184,8 +184,14 @@ def main():

     # load models
     synthesizer = Synthesizer(
-        model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, encoder_path,
-        encoder_config_path, args.use_cuda
+        model_path,
+        config_path,
+        speakers_file_path,
+        vocoder_path,
+        vocoder_config_path,
+        encoder_path,
+        encoder_config_path,
+        args.use_cuda,
     )

     # query speaker ids of a multi-speaker model.
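The trailing comma added after args.use_cuda is black's "magic trailing comma": a call that already ends in a trailing comma is kept with one argument per line rather than collapsed. A minimal sketch of the behavior, using a hypothetical function:

    # with a magic trailing comma, black preserves the exploded form
    synthesize(
        text,
        speaker_id,
    )
    # without one, black collapses the call whenever it fits the line length
    synthesize(text, speaker_id)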

View File

@@ -1,12 +1,11 @@
 #!flask/bin/python
-from typing import Union
 import argparse
 import io
+import json
 import os
 import sys
 from pathlib import Path
-import json
+from typing import Union

 from flask import Flask, render_template, request, send_file
@@ -32,19 +31,12 @@ def create_argparser():
         "--model_name",
         type=str,
         default="tts_models/en/ljspeech/tacotron2-DDC",
-        help=
-        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
+        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
     )
-    parser.add_argument("--vocoder_name",
-                        type=str,
-                        default=None,
-                        help="name of one of the released vocoder models.")
+    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
     # Args for running custom models
-    parser.add_argument("--config_path",
-                        default=None,
-                        type=str,
-                        help="Path to model config file.")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
     parser.add_argument(
         "--model_path",
         type=str,
@@ -54,34 +46,15 @@ def create_argparser():
     parser.add_argument(
         "--vocoder_path",
         type=str,
-        help=
-        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
         default=None,
     )
-    parser.add_argument("--vocoder_config_path",
-                        type=str,
-                        help="Path to vocoder model config file.",
-                        default=None)
-    parser.add_argument("--speakers_file_path",
-                        type=str,
-                        help="JSON file for multi-speaker model.",
-                        default=None)
-    parser.add_argument("--port",
-                        type=int,
-                        default=5002,
-                        help="port to listen on.")
-    parser.add_argument("--use_cuda",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to use CUDA.")
-    parser.add_argument("--debug",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to enable Flask debug mode.")
-    parser.add_argument("--show_details",
-                        type=convert_boolean,
-                        default=False,
-                        help="Generate model detail page.")
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
+    parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
+    parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
     return parser
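Several of the collapsed add_argument calls above run past black's default 88-character limit, so the repository presumably configures a longer line length for black and isort. A plausible configuration that would reproduce this formatting; the values are assumptions, not taken from this commit:

    # pyproject.toml (assumed, not shown in this commit)
    [tool.black]
    line-length = 120

    [tool.isort]
    line_length = 120
    profile = "black"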
@@ -109,14 +82,11 @@ if args.list_models:

 # CASE2: load pre-trained model paths
 if args.model_name is not None and not args.model_path:
-    model_path, config_path, model_item = manager.download_model(
-        args.model_name)
-    args.vocoder_name = model_item[
-        "default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+    model_path, config_path, model_item = manager.download_model(args.model_name)
+    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

 if args.vocoder_name is not None and not args.vocoder_path:
-    vocoder_path, vocoder_config_path, _ = manager.download_model(
-        args.vocoder_name)
+    vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

 # CASE3: set custome model paths
 if args.model_path is not None:
@@ -129,8 +99,7 @@ if args.vocoder_path is not None:
     vocoder_config_path = args.vocoder_config_path

 # load models
-synthesizer = Synthesizer(model_path, config_path, speakers_file_path,
-                          vocoder_path, vocoder_config_path, args.use_cuda)
+synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)

 use_multi_speaker = synthesizer.speaker_manager is not None
 # TODO: set this from SpeakerManager
@@ -154,17 +123,18 @@ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
         style_wav = json.loads(style_wav)
         return style_wav  # style_wav is a gst dictionary with {token1_id : token1_weigth, ...}
-    else:
-        return None
+    return None


 @app.route("/")
 def index():
-    return render_template("index.html",
-                           show_details=args.show_details,
-                           use_multi_speaker=use_multi_speaker,
-                           speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
-                           use_gst=use_gst)
+    return render_template(
+        "index.html",
+        show_details=args.show_details,
+        use_multi_speaker=use_multi_speaker,
+        speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
+        use_gst=use_gst,
+    )


 @app.route("/details")

View File

@@ -1,6 +1,7 @@
 import json
 import os
 import random
+from typing import Union

 import numpy as np
 import torch
@@ -9,8 +10,6 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
-from typing import Union
-


 def make_speakers_json_path(out_path):
     """Returns conventional speakers.json location."""
@@ -52,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
                     print(
                         "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
                     )
-                    speaker_mapping = load_speaker_mapping(
-                        c.external_speaker_embedding_file)
+                    speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                     if not speaker_mapping:
                         raise RuntimeError(
                             "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                         )
-                speaker_embedding_dim = len(speaker_mapping[list(
-                    speaker_mapping.keys())[0]]["embedding"])
+                speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
             elif (
                 not c.use_external_speaker_embedding_file
             ):  # if restore checkpoint and don't use External Embedding file
                 prev_out_path = os.path.dirname(args.restore_path)
                 speaker_mapping = load_speaker_mapping(prev_out_path)
                 speaker_embedding_dim = None
-                assert all(
-                    speaker in speaker_mapping
-                    for speaker in speakers), ("As of now you, you cannot "
-                                               "introduce new speakers to "
-                                               "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+                assert all(speaker in speaker_mapping for speaker in speakers), (
+                    "As of now you, you cannot " "introduce new speakers to " "a previously trained model."
+                )
         elif (
-            c.use_external_speaker_embedding_file
-            and not c.external_speaker_embedding_file
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
         else:  # if start new train and don't use External Embedding file
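Note that the raise of a plain string in the branch above survives this commit unchanged; in Python 3 raising a non-exception is itself a TypeError ("exceptions must derive from BaseException"), so the intended message would never reach the user. A corrected sketch of that branch (message shortened here):

    if c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:
        raise ValueError(
            "use_external_speaker_embedding_file is True, so you need to pass an external "
            "speaker embedding file; see the extraction notebooks in notebooks/."
        )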
@@ -88,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -134,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "".
     """
+
     def __init__(
         self,
         x_vectors_file_path: str = "",
@@ -192,34 +184,23 @@ class SpeakerManager:
     def load_x_vectors_file(self, file_path: str):
         self.x_vectors = self._load_json(file_path)
-        self.speaker_ids = list(
-            set(sorted(x["name"] for x in self.x_vectors.values())))
-        self.clip_ids = list(
-            set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+        self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
+        self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))

     def get_x_vector_by_clip(self, clip_idx: str):
         return self.x_vectors[clip_idx]["embedding"]

     def get_x_vectors_by_speaker(self, speaker_idx: str):
-        return [
-            x["embedding"] for x in self.x_vectors.values()
-            if x["name"] == speaker_idx
-        ]
+        return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]

-    def get_mean_x_vector(self,
-                          speaker_idx: str,
-                          num_samples: int = None,
-                          randomize: bool = False):
+    def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
         x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
         if num_samples is None:
             x_vectors = np.stack(x_vectors).mean(0)
         else:
-            assert len(
-                x_vectors
-            ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
+            assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
             if randomize:
-                x_vectors = np.stack(random.choices(x_vectors,
-                                                    k=num_samples)).mean(0)
+                x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
             else:
                 x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
         return x_vectors
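For context, a hypothetical usage of the reformatted method, assuming an x-vectors file keyed by clip with "name" and "embedding" fields as the code above expects (the speaker id is made up):

    manager = SpeakerManager(x_vectors_file_path="dummy_speakers.json")  # assumed path
    # average five randomly chosen embeddings for one speaker
    mean_vec = manager.get_mean_x_vector("p225", num_samples=5, randomize=True)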
@@ -234,21 +215,20 @@ class SpeakerManager:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
         self.speaker_encoder.load_checkpoint(config_path, model_path, True)
-        self.speaker_encoder_ap = AudioProcessor(
-            **self.speaker_encoder_config.audio)
+        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
         # normalize the input audio level and trim silences
         self.speaker_encoder_ap.do_sound_norm = True
         self.speaker_encoder_ap.do_trim_silence = True

     def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
         def _compute(wav_file: str):
-            waveform = self.speaker_encoder_ap.load_wav(
-                wav_file, sr=self.speaker_encoder_ap.sample_rate)
+            waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
             spec = self.speaker_encoder_ap.melspectrogram(waveform)
             spec = torch.from_numpy(spec.T)
             spec = spec.unsqueeze(0)
             x_vector = self.speaker_encoder.compute_embedding(spec)
             return x_vector

         if isinstance(wav_file, list):
             # compute the mean x_vector
             x_vectors = None
@@ -259,9 +239,8 @@ class SpeakerManager:
             else:
                 x_vectors += x_vector
             return (x_vectors / len(wav_file))[0].tolist()
-        else:
-            x_vector = _compute(wav_file)
-            return x_vector[0].tolist()
+        x_vector = _compute(wav_file)
+        return x_vector[0].tolist()

     def compute_x_vector(self, feats):
         if isinstance(feats, np.ndarray):

View File

@@ -93,7 +93,9 @@ class Synthesizer(object):
             speaker_file (str): path to the speakers meta-data file.
         """
         print("Loading speakers ...")
-        self.speaker_manager = SpeakerManager(encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config)
+        self.speaker_manager = SpeakerManager(
+            encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
+        )
         self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file))
         self.num_speakers = self.speaker_manager.num_speakers
         self.speaker_embedding_dim = self.speaker_manager.x_vector_dim

View File

@ -1,8 +1,8 @@
import os import os
import torch
import unittest import unittest
import numpy as np import numpy as np
import torch
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
@@ -15,8 +15,10 @@ sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ
 sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
 x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")

+
 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
+
     @staticmethod
     def test_speaker_embedding():
         # load config
@@ -47,7 +49,6 @@ class SpeakerManagerTest(unittest.TestCase):
         assert x_vector3.shape[0] == 256
         assert (x_vector - x_vector3).sum() != 0.0

-
     @staticmethod
     def test_speakers_file_processing():
         manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)