diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 1f7725eb..b61113a7 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -184,8 +184,14 @@ def main():

     # load models
     synthesizer = Synthesizer(
-        model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, encoder_path,
-        encoder_config_path, args.use_cuda
+        model_path,
+        config_path,
+        speakers_file_path,
+        vocoder_path,
+        vocoder_config_path,
+        encoder_path,
+        encoder_config_path,
+        args.use_cuda,
     )

     # query speaker ids of a multi-speaker model.
diff --git a/TTS/server/server.py b/TTS/server/server.py
index bed2ab39..f6335c42 100644
--- a/TTS/server/server.py
+++ b/TTS/server/server.py
@@ -1,12 +1,11 @@
 #!flask/bin/python
-from typing import Union
-
 import argparse
 import io
+import json
 import os
 import sys
 from pathlib import Path
-import json
+from typing import Union

 from flask import Flask, render_template, request, send_file

@@ -32,19 +31,12 @@ def create_argparser():
         "--model_name",
         type=str,
         default="tts_models/en/ljspeech/tacotron2-DDC",
-        help=
-        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
+        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
     )
-    parser.add_argument("--vocoder_name",
-                        type=str,
-                        default=None,
-                        help="name of one of the released vocoder models.")
+    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")

     # Args for running custom models
-    parser.add_argument("--config_path",
-                        default=None,
-                        type=str,
-                        help="Path to model config file.")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
     parser.add_argument(
         "--model_path",
         type=str,
@@ -54,34 +46,15 @@ def create_argparser():
     parser.add_argument(
         "--vocoder_path",
         type=str,
-        help=
-        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
         default=None,
     )
-    parser.add_argument("--vocoder_config_path",
-                        type=str,
-                        help="Path to vocoder model config file.",
-                        default=None)
-    parser.add_argument("--speakers_file_path",
-                        type=str,
-                        help="JSON file for multi-speaker model.",
-                        default=None)
-    parser.add_argument("--port",
-                        type=int,
-                        default=5002,
-                        help="port to listen on.")
-    parser.add_argument("--use_cuda",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to use CUDA.")
-    parser.add_argument("--debug",
-                        type=convert_boolean,
-                        default=False,
-                        help="true to enable Flask debug mode.")
-    parser.add_argument("--show_details",
-                        type=convert_boolean,
-                        default=False,
-                        help="Generate model detail page.")
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
+    parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
+    parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")

     return parser

@@ -109,14 +82,11 @@ if args.list_models:

 # CASE2: load pre-trained model paths
 if args.model_name is not None and not args.model_path:
-    model_path, config_path, model_item = manager.download_model(
-        args.model_name)
-    args.vocoder_name = model_item[
-        "default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+    model_path, config_path, model_item = manager.download_model(args.model_name)
+    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

     if args.vocoder_name is not None and not args.vocoder_path:
-        vocoder_path, vocoder_config_path, _ = manager.download_model(
-            args.vocoder_name)
+        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

 # CASE3: set custome model paths
 if args.model_path is not None:
@@ -129,8 +99,7 @@ if args.vocoder_path is not None:
     vocoder_config_path = args.vocoder_config_path

 # load models
-synthesizer = Synthesizer(model_path, config_path, speakers_file_path,
-                          vocoder_path, vocoder_config_path, args.use_cuda)
+synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)

 use_multi_speaker = synthesizer.speaker_manager is not None
 # TODO: set this from SpeakerManager
@@ -154,17 +123,18 @@ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:

         style_wav = json.loads(style_wav)
         return style_wav  # style_wav is a gst dictionary with {token1_id : token1_weigth, ...}
-    else:
-        return None
+    return None


 @app.route("/")
 def index():
-    return render_template("index.html",
-                           show_details=args.show_details,
-                           use_multi_speaker=use_multi_speaker,
-                           speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
-                           use_gst=use_gst)
+    return render_template(
+        "index.html",
+        show_details=args.show_details,
+        use_multi_speaker=use_multi_speaker,
+        speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
+        use_gst=use_gst,
+    )


 @app.route("/details")
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 7bf91bc4..2d6873e1 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -1,6 +1,7 @@
 import json
 import os
 import random
+from typing import Union

 import numpy as np
 import torch
@@ -9,8 +10,6 @@
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
-from typing import Union
-

 def make_speakers_json_path(out_path):
     """Returns conventional speakers.json location."""
@@ -52,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
                     print(
                         "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
                     )
-                    speaker_mapping = load_speaker_mapping(
-                        c.external_speaker_embedding_file)
+                    speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                     if not speaker_mapping:
                         raise RuntimeError(
                             "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                         )
-                speaker_embedding_dim = len(speaker_mapping[list(
-                    speaker_mapping.keys())[0]]["embedding"])
+                speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
             elif (
-                    not c.use_external_speaker_embedding_file
+                not c.use_external_speaker_embedding_file
             ):  # if restore checkpoint and don't use External Embedding file
                 prev_out_path = os.path.dirname(args.restore_path)
                 speaker_mapping = load_speaker_mapping(prev_out_path)
                 speaker_embedding_dim = None
-                assert all(
-                    speaker in speaker_mapping
-                    for speaker in speakers), ("As of now you, you cannot "
-                                               "introduce new speakers to " "a previously trained model.")
+                assert all(speaker in speaker_mapping for speaker in speakers), (
+                    "As of now you, you cannot " "introduce new speakers to " "a previously trained model."
+                )
         elif (
-            c.use_external_speaker_embedding_file
-            and c.external_speaker_embedding_file
-        ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
-            c.use_external_speaker_embedding_file
-            and not c.external_speaker_embedding_file
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
         else:  # if start new train and don't use External Embedding file
@@ -88,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -134,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "".
     """
+
     def __init__(
         self,
         x_vectors_file_path: str = "",
@@ -192,34 +184,23 @@ class SpeakerManager:

     def load_x_vectors_file(self, file_path: str):
         self.x_vectors = self._load_json(file_path)
-        self.speaker_ids = list(
-            set(sorted(x["name"] for x in self.x_vectors.values())))
-        self.clip_ids = list(
-            set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+        self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
+        self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))

     def get_x_vector_by_clip(self, clip_idx: str):
         return self.x_vectors[clip_idx]["embedding"]

     def get_x_vectors_by_speaker(self, speaker_idx: str):
-        return [
-            x["embedding"] for x in self.x_vectors.values()
-            if x["name"] == speaker_idx
-        ]
+        return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]

-    def get_mean_x_vector(self,
-                          speaker_idx: str,
-                          num_samples: int = None,
-                          randomize: bool = False):
+    def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
         x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
         if num_samples is None:
             x_vectors = np.stack(x_vectors).mean(0)
         else:
-            assert len(
-                x_vectors
-            ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
+            assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
             if randomize:
-                x_vectors = np.stack(random.choices(x_vectors,
-                                                    k=num_samples)).mean(0)
+                x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
             else:
                 x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
         return x_vectors
@@ -234,21 +215,20 @@ class SpeakerManager:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
         self.speaker_encoder.load_checkpoint(config_path, model_path, True)
-        self.speaker_encoder_ap = AudioProcessor(
-            **self.speaker_encoder_config.audio)
+        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
         # normalize the input audio level and trim silences
         self.speaker_encoder_ap.do_sound_norm = True
         self.speaker_encoder_ap.do_trim_silence = True

     def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
         def _compute(wav_file: str):
-            waveform = self.speaker_encoder_ap.load_wav(
-                wav_file, sr=self.speaker_encoder_ap.sample_rate)
+            waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
             spec = self.speaker_encoder_ap.melspectrogram(waveform)
             spec = torch.from_numpy(spec.T)
             spec = spec.unsqueeze(0)
             x_vector = self.speaker_encoder.compute_embedding(spec)
             return x_vector
+
         if isinstance(wav_file, list):
             # compute the mean x_vector
             x_vectors = None
@@ -259,9 +239,8 @@ class SpeakerManager:
             else:
                 x_vectors += x_vector
             return (x_vectors / len(wav_file))[0].tolist()
-        else:
-            x_vector = _compute(wav_file)
-            return x_vector[0].tolist()
+        x_vector = _compute(wav_file)
+        return x_vector[0].tolist()

     def compute_x_vector(self, feats):
         if isinstance(feats, np.ndarray):
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 8054b181..3753c121 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -93,7 +93,9 @@ class Synthesizer(object):
             speaker_file (str): path to the speakers meta-data file.
""" print("Loading speakers ...") - self.speaker_manager = SpeakerManager(encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config) + self.speaker_manager = SpeakerManager( + encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config + ) self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers self.speaker_embedding_dim = self.speaker_manager.x_vector_dim diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index 3e272f42..b8697ca8 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -1,8 +1,8 @@ import os -import torch import unittest import numpy as np +import torch from tests import get_tests_input_path from TTS.tts.utils.speakers import SpeakerManager @@ -15,8 +15,10 @@ sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") + class SpeakerManagerTest(unittest.TestCase): """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + @staticmethod def test_speaker_embedding(): # load config @@ -47,7 +49,6 @@ class SpeakerManagerTest(unittest.TestCase): assert x_vector3.shape[0] == 256 assert (x_vector - x_vector3).sum() != 0.0 - @staticmethod def test_speakers_file_processing(): manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)