mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'speaker-manager' of https://github.com/coqui-ai/TTS into speaker-manager
This commit is contained in: commit f37b488876
@@ -9,6 +9,7 @@ from typing import Union
 from flask import Flask, render_template, request, send_file
+from TTS.utils.generic_utils import style_wav_uri_to_dict
 from TTS.utils.io import load_config
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 
 
@@ -51,28 +51,35 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             print(
                 "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
             )
-            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_mapping = load_speaker_mapping(
+                c.external_speaker_embedding_file)
             if not speaker_mapping:
                 raise RuntimeError(
                     "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                 )
-            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(
+                speaker_mapping.keys())[0]]["embedding"])
         elif (
             not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(speaker in speaker_mapping for speaker in speakers), (
-                "As of now you, you cannot " "introduce new speakers to " "a previously trained model."
-            )
-        elif (
-            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
-        ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
-        elif (
-            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
+            assert all(
+                speaker in speaker_mapping
+                for speaker in speakers), ("As of now you, you cannot "
+                                           "introduce new speakers to "
+                                           "a previously trained model.")
+        elif (c.use_external_speaker_embedding_file
+              and c.external_speaker_embedding_file
+              ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(
+                c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(
+                speaker_mapping.keys())[0]]["embedding"])
+        elif (
+            c.use_external_speaker_embedding_file
+            and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
         else:  # if start new train and don't use External Embedding file
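A side note on the raise statement in this hunk: in Python 3 only instances or subclasses of BaseException can be raised, so `raise "..."` with a bare string is itself a TypeError and the intended message never surfaces. A minimal sketch of a fix, assuming RuntimeError (already used a few lines above) is the appropriate type:

    elif (c.use_external_speaker_embedding_file
          and not c.external_speaker_embedding_file
    ):  # external embeddings requested, but no embedding file was given
        # Python 3 forbids raising a bare str, so wrap the message instead.
        raise RuntimeError(
            "use_external_speaker_embedding_file is True, so you need to pass an "
            "external speaker embedding file; run one of the speaker encoder "
            "extraction notebooks in the notebooks/ folder first."
        )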
@@ -80,7 +87,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
             save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(
+            len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -125,7 +133,10 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
     """
+<<<<<<< HEAD
 
+=======
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
     def __init__(
         self,
         x_vectors_file_path: str = "",
@@ -138,7 +149,10 @@ class SpeakerManager:
         self.speaker_ids = None
         self.clip_ids = None
         self.speaker_encoder = None
+<<<<<<< HEAD
         self.speaker_encoder_ap = None
+=======
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
 
         if x_vectors_file_path:
             self.load_x_vectors_file(x_vectors_file_path)
@@ -184,23 +198,51 @@ class SpeakerManager:
 
     def load_x_vectors_file(self, file_path: str):
         self.x_vectors = self._load_json(file_path)
+<<<<<<< HEAD
         self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
         self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+=======
+        self.speaker_ids = list(
+            set(sorted(x["name"] for x in self.x_vectors.values())))
+        self.clip_ids = list(
+            set(sorted(clip_name for clip_name in self.x_vectors.keys())))
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
 
     def get_x_vector_by_clip(self, clip_idx: str):
         return self.x_vectors[clip_idx]["embedding"]
 
     def get_x_vectors_by_speaker(self, speaker_idx: str):
+<<<<<<< HEAD
         return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]
 
     def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
+=======
+        return [
+            x["embedding"] for x in self.x_vectors.values()
+            if x["name"] == speaker_idx
+        ]
+
+    def get_mean_x_vector(self,
+                          speaker_idx: str,
+                          num_samples: int = None,
+                          randomize: bool = False):
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
         x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
         if num_samples is None:
             x_vectors = np.stack(x_vectors).mean(0)
         else:
+<<<<<<< HEAD
             assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
             if randomize:
                 x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
+=======
+            assert len(
+                x_vectors
+            ) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
+            if randomize:
+                x_vectors = np.stack(random.choices(x_vectors,
+                                                    k=num_samples)).mean(0)
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
             else:
                 x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
         return x_vectors
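Both sides of these conflicts implement the same logic and differ only in line wrapping. For orientation, the x-vectors file that load_x_vectors_file reads is a JSON mapping from clip name to a record holding the speaker name and the embedding, and get_mean_x_vector averages one speaker's embeddings. A standalone sketch of that averaging, with the file name and the 256-dim embeddings assumed for illustration:

    import json
    import random

    import numpy as np

    # Assumed layout, matching what load_x_vectors_file expects:
    # {"clip_0001.wav": {"name": "speaker_a", "embedding": [256 floats]}, ...}
    with open("dummy_speakers.json", "r", encoding="utf-8") as f:  # hypothetical path
        x_vectors = json.load(f)

    def mean_x_vector(speaker_idx, num_samples=None, randomize=False):
        """Average one speaker's embeddings, optionally over a subset of clips."""
        vecs = [x["embedding"] for x in x_vectors.values() if x["name"] == speaker_idx]
        if num_samples is None:
            return np.stack(vecs).mean(0)
        assert len(vecs) >= num_samples, f"speaker {speaker_idx} has fewer than {num_samples} samples"
        if randomize:
            vecs = random.choices(vecs, k=num_samples)  # sample with replacement
        else:
            vecs = vecs[:num_samples]
        return np.stack(vecs).mean(0)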
@@ -211,6 +253,7 @@ class SpeakerManager:
     def get_clips(self):
         return sorted(self.x_vectors.keys())
 
+<<<<<<< HEAD
     def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
@@ -241,6 +284,12 @@ class SpeakerManager:
             return (x_vectors / len(wav_file))[0].tolist()
         x_vector = _compute(wav_file)
         return x_vector[0].tolist()
+=======
+    def init_speaker_encoder(self, model_path: str, config_path: str):
+        self.speaker_encoder_config = load_config(config_path)
+        self.speaker_encoder = setup_model(self.speaker_encoder_config)
+        self.speaker_encoder.load_checkpoint(config_path, model_path, True)
+>>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777
 
     def compute_x_vector(self, feats):
         if isinstance(feats, np.ndarray):
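Note that the <<<<<<< HEAD / ======= / >>>>>>> 757dfb9289c7185b0b78d2aa75e8a0c9b2911777 blocks in the hunks above are unresolved merge-conflict markers committed into the file; as Python they are syntax errors, so the module cannot even be imported until they are resolved. A rough standard-library sketch for catching leftover markers before committing:

    from pathlib import Path

    # Flag lines that start with a conflict marker in any tracked .py file.
    MARKERS = ("<<<<<<<", "=======", ">>>>>>>")
    for path in Path("TTS").rglob("*.py"):
        for lineno, line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
            if line.startswith(MARKERS):
                print(f"{path}:{lineno}: {line.rstrip()}")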
@@ -1,10 +1,12 @@
 import datetime
 import glob
+import json
 import os
 import shutil
 import subprocess
 import sys
 from pathlib import Path
+from typing import Union
 
 
 def get_git_branch():
@@ -163,3 +165,20 @@ def check_argument(
     assert (
         isinstance(c[name], val_type) or c[name] is None
     ), f" [!] {name} has wrong type - {type(c[name])} vs {val_type}"
+
+
+def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
+    """Transform a style_wav URI into either a string (path to a wav file to be used for style transfer)
+    or a dict (GST tokens/values to be used for styling).
+
+    Args:
+        style_wav (str): uri
+
+    Returns:
+        Union[str, dict]: path to file (str) or gst style (dict)
+    """
+    if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
+        return style_wav  # style_wav is a .wav file located on the server
+
+    style_wav = json.loads(style_wav)
+    return style_wav  # style_wav is a gst dictionary with {token1_id: token1_weight, ...}
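A short usage sketch for the new style_wav_uri_to_dict helper: an existing .wav path on the server is returned unchanged, while anything else must be a JSON dictionary of GST token weights (the token ids and weights below are invented for illustration):

    from TTS.utils.generic_utils import style_wav_uri_to_dict

    # JSON-encoded GST token weights are parsed into a dict.
    style = style_wav_uri_to_dict('{"0": 0.3, "1": -0.1}')
    assert style == {"0": 0.3, "1": -0.1}

    # An existing .wav file path would be returned as-is instead, e.g.
    #   style_wav_uri_to_dict("/data/styles/sad.wav")  # hypothetical server path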
@@ -189,7 +189,7 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self.split_into_sentences(text)
+        sens = self._split_into_sentences(text)
         print(" > Text split into sentences.")
         print(sens)
 
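The hunk above renames the public split_into_sentences to the private _split_into_sentences, which breaks any external caller of the old name. If old callers had to keep working, a thin deprecated alias would do; this is a hypothetical sketch, not part of the commit:

    class Synthesizer(object):
        def _split_into_sentences(self, text):
            ...  # real implementation in TTS.utils.synthesizer

        def split_into_sentences(self, text):
            # Deprecated alias kept for backward compatibility.
            return self._split_into_sentences(text)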
@@ -0,0 +1,49 @@
+import os
+import unittest
+
+import numpy as np
+
+from tests import get_tests_input_path
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_config
+
+encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
+encoder_model_path = os.path.join(get_tests_input_path(), "dummy_speaker_encoder.pth.tar")
+sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
+x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
+
+
+class SpeakerManagerTest(unittest.TestCase):
+    """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms."""
+    @staticmethod
+    def test_speaker_embedding():
+        # load config
+        config = load_config(encoder_config_path)
+        config["audio"]["resample"] = True
+
+        # load audio processor and speaker encoder
+        ap = AudioProcessor(**config.audio)
+        manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
+
+        # load a sample audio and compute embedding
+        waveform = ap.load_wav(sample_wav_path)
+        mel = ap.melspectrogram(waveform)
+        x_vector = manager.compute_x_vector(mel.T)
+        assert x_vector.shape[1] == 256
+
+    @staticmethod
+    def test_speakers_file_processing():
+        manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path)
+        print(manager.num_speakers)
+        print(manager.x_vector_dim)
+        print(manager.clip_ids)
+        x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0])
+        assert len(x_vector) == 256
+        x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0])
+        assert len(x_vectors[0]) == 256
+        x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True)
+        assert len(x_vector1) == 256
+        x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False)
+        assert len(x_vector2) == 256
+        assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0
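One caveat on the final assertion of test_speakers_file_processing: get_mean_x_vector(..., randomize=True) draws clips with random.choices, i.e. sampling with replacement, so on a small dummy speaker set the randomized mean can occasionally equal the deterministic one and the inequality check can flake. Seeding the module-level RNG in the test would pin it down; the seed value below is arbitrary:

    import random

    random.seed(0)  # arbitrary seed: makes the randomize=True draws
                    # reproducible, so x_vector1 != x_vector2 cannot flake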