Merge branch 'speaker-manager' of https://github.com/coqui-ai/TTS into speaker-manager

Eren Gölge 2021-04-26 15:25:25 +02:00
commit f37b488876
5 changed files with 132 additions and 14 deletions

View File

@@ -9,6 +9,7 @@ from typing import Union

 from flask import Flask, render_template, request, send_file
+from TTS.utils.generic_utils import style_wav_uri_to_dict
 from TTS.utils.io import load_config
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer

View File

@@ -51,28 +51,35 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             print(
                 "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
             )
-            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_mapping = load_speaker_mapping(
+                c.external_speaker_embedding_file)
             if not speaker_mapping:
                 raise RuntimeError(
                     "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                 )
-            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(
+                speaker_mapping.keys())[0]]["embedding"])
         elif (
             not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(speaker in speaker_mapping for speaker in speakers), (
-                "As of now, you cannot " "introduce new speakers to " "a previously trained model."
-            )
-        elif (
-            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
-        ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
-        elif (
-            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
+            assert all(
+                speaker in speaker_mapping
+                for speaker in speakers), ("As of now, you cannot "
+                                           "introduce new speakers to "
+                                           "a previously trained model.")
+        elif (c.use_external_speaker_embedding_file
+              and c.external_speaker_embedding_file
+              ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(
+                c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(
+                speaker_mapping.keys())[0]]["embedding"])
+        elif (c.use_external_speaker_embedding_file
+              and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise RuntimeError(
                 "use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder"
             )
         else:  # if start new train and don't use External Embedding file
@@ -80,7 +87,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
             save_speaker_mapping(OUT_PATH, speaker_mapping)
             num_speakers = len(speaker_mapping)
-            print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
+            print(" > Training with {} speakers: {}".format(
+                len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
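For orientation: every branch of parse_speakers above only assumes that speakers.json maps a clip id to a speaker name and an embedding list; that is all the speaker_mapping[...]["embedding"] indexing here and the x["name"] lookups in SpeakerManager below rely on. A minimal sketch of that assumed layout (clip ids, speaker name, and the 3-dim embedding are made up; the dummy test data later in this commit uses 256-dim vectors):

# Hypothetical speakers.json content, inferred from how parse_speakers and
# SpeakerManager index the mapping; ids and values are illustrative only.
speaker_mapping = {
    "clip_0001": {"name": "speaker_a", "embedding": [0.01, -0.23, 0.54]},
    "clip_0002": {"name": "speaker_a", "embedding": [0.02, -0.21, 0.48]},
}
# The dimension lookup used above (3 here, 256 in the dummy test data).
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])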
@@ -125,7 +133,10 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
     """
+<<<<<<< HEAD
+=======
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777

     def __init__(
         self,
         x_vectors_file_path: str = "",
@@ -138,7 +149,10 @@ class SpeakerManager:
         self.speaker_ids = None
         self.clip_ids = None
         self.speaker_encoder = None
+<<<<<<< HEAD
         self.speaker_encoder_ap = None
+=======
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777

         if x_vectors_file_path:
             self.load_x_vectors_file(x_vectors_file_path)
@@ -184,23 +198,51 @@ class SpeakerManager:
     def load_x_vectors_file(self, file_path: str):
         self.x_vectors = self._load_json(file_path)
+<<<<<<< HEAD
         self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
         self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+=======
+        self.speaker_ids = list(
+            set(sorted(x["name"] for x in self.x_vectors.values())))
+        self.clip_ids = list(
+            set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777

     def get_x_vector_by_clip(self, clip_idx: str):
         return self.x_vectors[clip_idx]["embedding"]

     def get_x_vectors_by_speaker(self, speaker_idx: str):
+<<<<<<< HEAD
         return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]

     def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
+=======
+        return [
+            x["embedding"] for x in self.x_vectors.values()
+            if x["name"] == speaker_idx
+        ]
+
+    def get_mean_x_vector(self,
+                          speaker_idx: str,
+                          num_samples: int = None,
+                          randomize: bool = False):
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
         x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
         if num_samples is None:
             x_vectors = np.stack(x_vectors).mean(0)
         else:
+<<<<<<< HEAD
             assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
             if randomize:
                 x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
+=======
+            assert len(
+                x_vectors
+            ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
+            if randomize:
+                x_vectors = np.stack(random.choices(x_vectors,
+                                                    k=num_samples)).mean(0)
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
             else:
                 x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
         return x_vectors
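Read together, get_mean_x_vector averages a speaker's clip embeddings: over all clips, over the first num_samples clips, or over a random num_samples subset. A hedged usage sketch (the x-vectors file path is hypothetical; the methods and signatures are the ones in the hunk above):

# Sketch only: three ways to get a speaker's mean x-vector.
from TTS.tts.utils.speakers import SpeakerManager

manager = SpeakerManager(x_vectors_file_path="dummy_speakers.json")  # hypothetical path
speaker = manager.speaker_ids[0]
mean_all = manager.get_mean_x_vector(speaker)                 # mean over every clip
mean_two = manager.get_mean_x_vector(speaker, num_samples=2)  # mean over the first two clips
mean_rnd = manager.get_mean_x_vector(speaker, num_samples=2, randomize=True)  # two clips drawn with replacement via random.choices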
@@ -211,6 +253,7 @@ class SpeakerManager:
     def get_clips(self):
         return sorted(self.x_vectors.keys())

+<<<<<<< HEAD
     def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
@@ -241,6 +284,12 @@ class SpeakerManager:
             return (x_vectors / len(wav_file))[0].tolist()
         x_vector = _compute(wav_file)
         return x_vector[0].tolist()
+=======
+    def init_speaker_encoder(self, model_path: str, config_path: str):
+        self.speaker_encoder_config = load_config(config_path)
+        self.speaker_encoder = setup_model(self.speaker_encoder_config)
+        self.speaker_encoder.load_checkpoint(config_path, model_path, True)
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777

     def compute_x_vector(self, feats):
         if isinstance(feats, np.ndarray):

View File

@@ -1,10 +1,12 @@
 import datetime
 import glob
+import json
 import os
 import shutil
 import subprocess
 import sys
 from pathlib import Path
+from typing import Union


 def get_git_branch():
@@ -163,3 +165,20 @@ def check_argument(
         assert (
             isinstance(c[name], val_type) or c[name] is None
         ), f" [!] {name} has wrong type - {type(c[name])} vs {val_type}"
+
+
+def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
+    """Transform a style_wav URI into either a string (path to a wav file to be used for style transfer)
+    or a dict (GST tokens/values to be used for styling)
+
+    Args:
+        style_wav (str): uri
+
+    Returns:
+        Union[str, dict]: path to file (str) or gst style (dict)
+    """
+    if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
+        return style_wav  # style_wav is a .wav file located on the server
+
+    style_wav = json.loads(style_wav)
+    return style_wav  # style_wav is a gst dictionary with {token1_id: token1_weight, ...}
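The two input shapes the new helper distinguishes, sketched with made-up values (the path, token ids, and weights are illustrative only):

# Sketch: the two cases style_wav_uri_to_dict() handles.
from TTS.utils.generic_utils import style_wav_uri_to_dict

# 1) A path to an existing .wav file on the server is returned unchanged.
style = style_wav_uri_to_dict("styles/happy.wav")       # -> "styles/happy.wav" (if the file exists)

# 2) Anything else must parse as a JSON object of GST token weights.
style = style_wav_uri_to_dict('{"0": 0.3, "1": -0.1}')  # -> {"0": 0.3, "1": -0.1}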

View File

@@ -189,7 +189,7 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self.split_into_sentences(text)
+        sens = self._split_into_sentences(text)
         print(" > Text split into sentences.")
         print(sens)

View File

@@ -0,0 +1,49 @@
+import os
+import unittest
+
+import numpy as np
+from tests import get_tests_input_path
+
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_config
+
+encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
+encoder_model_path = os.path.join(get_tests_input_path(), "dummy_speaker_encoder.pth.tar")
+sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
+x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
+
+
+class SpeakerManagerTest(unittest.TestCase):
+    """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
+
+    @staticmethod
+    def test_speaker_embedding():
+        # load config
+        config = load_config(encoder_config_path)
+        config["audio"]["resample"] = True
+
+        # load audio processor and speaker encoder
+        ap = AudioProcessor(**config.audio)
+        manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
+
+        # load a sample audio and compute embedding
+        waveform = ap.load_wav(sample_wav_path)
+        mel = ap.melspectrogram(waveform)
+        x_vector = manager.compute_x_vector(mel.T)
+        assert x_vector.shape[1] == 256
+
+    @staticmethod
+    def test_speakers_file_processing():
+        manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)
+        print(manager.num_speakers)
+        print(manager.x_vector_dim)
+        print(manager.clip_ids)
+        x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0])
+        assert len(x_vector) == 256
+        x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0])
+        assert len(x_vectors[0]) == 256
+        x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True)
+        assert len(x_vector1) == 256
+        x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False)
+        assert len(x_vector2) == 256
+        assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0
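A hedged way to run just this module with the standard library runner; the location tests/test_speaker_manager.py is an assumption, since this diff view does not show file names:

# Assumed module path (not shown in this view): tests/test_speaker_manager.py
# Equivalent shell command: python -m unittest tests.test_speaker_manager -v
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName("tests.test_speaker_manager")
unittest.TextTestRunner(verbosity=2).run(suite)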