diff --git a/TTS/server/server.py b/TTS/server/server.py index f6335c42..ebbf7988 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -9,6 +9,7 @@ from typing import Union from flask import Flask, render_template, request, send_file +from TTS.utils.generic_utils import style_wav_uri_to_dict from TTS.utils.io import load_config from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 2d6873e1..c026d50e 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -51,28 +51,35 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" ) - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + speaker_mapping = load_speaker_mapping( + c.external_speaker_embedding_file) if not speaker_mapping: raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" ) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) + speaker_embedding_dim = len(speaker_mapping[list( + speaker_mapping.keys())[0]]["embedding"]) elif ( - not c.use_external_speaker_embedding_file + not c.use_external_speaker_embedding_file ): # if restore checkpoint and don't use External Embedding file prev_out_path = os.path.dirname(args.restore_path) speaker_mapping = load_speaker_mapping(prev_out_path) speaker_embedding_dim = None - assert all(speaker in speaker_mapping for speaker in speakers), ( - "As of now you, you cannot " "introduce new speakers to " "a previously trained model." - ) + assert all( + speaker in speaker_mapping + for speaker in speakers), ("As of now you, you cannot " + "introduce new speakers to " + "a previously trained model.") + elif (c.use_external_speaker_embedding_file + and c.external_speaker_embedding_file + ): # if start new train using External Embedding file + speaker_mapping = load_speaker_mapping( + c.external_speaker_embedding_file) + speaker_embedding_dim = len(speaker_mapping[list( + speaker_mapping.keys())[0]]["embedding"]) elif ( - c.use_external_speaker_embedding_file and c.external_speaker_embedding_file - ): # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file + c.use_external_speaker_embedding_file + and not c.external_speaker_embedding_file ): # if start new train using External Embedding file and don't pass external embedding file raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" else: # if start new train and don't use External Embedding file @@ -80,7 +87,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH): speaker_embedding_dim = None save_speaker_mapping(OUT_PATH, speaker_mapping) num_speakers = len(speaker_mapping) - print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers))) + print(" > Training with {} speakers: {}".format( + len(speakers), ", ".join(speakers))) else: num_speakers = 0 speaker_embedding_dim = None @@ -125,7 +133,10 @@ class SpeakerManager: encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "". """ +<<<<<<< HEAD +======= +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 def __init__( self, x_vectors_file_path: str = "", @@ -138,7 +149,10 @@ class SpeakerManager: self.speaker_ids = None self.clip_ids = None self.speaker_encoder = None +<<<<<<< HEAD self.speaker_encoder_ap = None +======= +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 if x_vectors_file_path: self.load_x_vectors_file(x_vectors_file_path) @@ -184,23 +198,51 @@ class SpeakerManager: def load_x_vectors_file(self, file_path: str): self.x_vectors = self._load_json(file_path) +<<<<<<< HEAD self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) +======= + self.speaker_ids = list( + set(sorted(x["name"] for x in self.x_vectors.values()))) + self.clip_ids = list( + set(sorted(clip_name for clip_name in self.x_vectors.keys()))) +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 def get_x_vector_by_clip(self, clip_idx: str): return self.x_vectors[clip_idx]["embedding"] def get_x_vectors_by_speaker(self, speaker_idx: str): +<<<<<<< HEAD return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False): +======= + return [ + x["embedding"] for x in self.x_vectors.values() + if x["name"] == speaker_idx + ] + + def get_mean_x_vector(self, + speaker_idx: str, + num_samples: int = None, + randomize: bool = False): +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 x_vectors = self.get_x_vectors_by_speaker(speaker_idx) if num_samples is None: x_vectors = np.stack(x_vectors).mean(0) else: +<<<<<<< HEAD assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" if randomize: x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) +======= + assert len( + x_vectors + ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" + if randomize: + x_vectors = np.stack(random.choices(x_vectors, + k=num_samples)).mean(0) +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 else: x_vectors = np.stack(x_vectors[:num_samples]).mean(0) return x_vectors @@ -211,6 +253,7 @@ class SpeakerManager: def get_clips(self): return sorted(self.x_vectors.keys()) +<<<<<<< HEAD def init_speaker_encoder(self, model_path: str, config_path: str) -> None: self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_model(self.speaker_encoder_config) @@ -241,6 +284,12 @@ class SpeakerManager: return (x_vectors / len(wav_file))[0].tolist() x_vector = _compute(wav_file) return x_vector[0].tolist() +======= + def init_speaker_encoder(self, model_path: str, config_path: str): + self.speaker_encoder_config = load_config(config_path) + self.speaker_encoder = setup_model(self.speaker_encoder_config) + self.speaker_encoder.load_checkpoint(config_path, model_path, True) +>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 def compute_x_vector(self, feats): if isinstance(feats, np.ndarray): diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 57e22707..4de25300 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,10 +1,12 @@ import datetime import glob +import json import os import shutil import subprocess import sys from pathlib import Path +from typing import Union def get_git_branch(): @@ -163,3 +165,20 @@ def check_argument( assert ( isinstance(c[name], val_type) or c[name] is None ), f" [!] {name} has wrong type - {type(c[name])} vs {val_type}" + + +def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: + """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) + or a dict (gst tokens/values to be use for styling) + + Args: + style_wav (str): uri + + Returns: + Union[str, dict]: path to file (str) or gst style (dict) + """ + if os.path.isfile(style_wav) and style_wav.endswith(".wav"): + return style_wav # style_wav is a .wav file located on the server + + style_wav = json.loads(style_wav) + return style_wav # style_wav is a gst dictionary with {token1_id : token1_weigth, ...} diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index b1ae968d..8b8d1e3e 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -189,7 +189,7 @@ class Synthesizer(object): """ start_time = time.time() wavs = [] - sens = self.split_into_sentences(text) + sens = self._split_into_sentences(text) print(" > Text splitted to sentences.") print(sens) diff --git a/tests/test_speakers_manager.py b/tests/test_speakers_manager.py new file mode 100644 index 00000000..40914224 --- /dev/null +++ b/tests/test_speakers_manager.py @@ -0,0 +1,49 @@ +import os +import unittest + +import numpy as np + +from tests import get_tests_input_path +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config + +encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") +encoder_model_path = os.path.join(get_tests_input_path(), "dummy_speaker_encoder.pth.tar") +sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") +x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") + + +class SpeakerManagerTest(unittest.TestCase): + """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + @staticmethod + def test_speaker_embedding(): + # load config + config = load_config(encoder_config_path) + config["audio"]["resample"] = True + + # load audio processor and speaker encoder + ap = AudioProcessor(**config.audio) + manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) + + # load a sample audio and compute embedding + waveform = ap.load_wav(sample_wav_path) + mel = ap.melspectrogram(waveform) + x_vector = manager.compute_x_vector(mel.T) + assert x_vector.shape[1] == 256 + + @staticmethod + def test_speakers_file_processing(): + manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) + print(manager.num_speakers) + print(manager.x_vector_dim) + print(manager.clip_ids) + x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) + assert len(x_vector) == 256 + x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) + assert len(x_vectors[0]) == 256 + x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + assert len(x_vector1) == 256 + x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + assert len(x_vector2) == 256 + assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0