enable multi-speaker CoquiTTS models for synthesize.py

Eren Gölge 2021-04-26 19:36:53 +02:00
parent b531fa699c
commit 2f0716073e
5 changed files with 109 additions and 43 deletions


@@ -29,27 +29,65 @@ def main():
"""You can either use your trained model or choose a model from the provided list.\n\n"""
"""If you don't specify any models, then it uses LJSpeech based English models\n\n"""
"""
Example runs:
# Example Runs:
# list provided models
./TTS/bin/synthesize.py --list_models
## Single Speaker Models
# run tts with default models.
./TTS/bin synthesize.py --text "Text for TTS"
- list provided models
# run a tts model with its default vocoder model.
./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
```
$ ./TTS/bin/synthesize.py --list_models
```
# run with specific tts and vocoder models from the list
./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
- run tts with default models.
# run your own TTS model (Using Griffin-Lim Vocoder)
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
```
$ ./TTS/bin synthesize.py --text "Text for TTS"
```
# run your own TTS and Vocoder models
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
- run a tts model with its default vocoder model.
```
$ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
```
- run with specific tts and vocoder models from the list
```
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
```
- run your own TTS model (Using Griffin-Lim Vocoder)
```
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
```
- run your own TTS and Vocoder models
```
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
```
## MULTI-SPEAKER MODELS
- list the available speakers and choose as <speaker_id> among them.
```
$ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- run the multi-speaker TTS model with the target speaker ID.
```
$ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- run your own multi-speaker TTS model.
```
$ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
""",
formatter_class=RawTextHelpFormatter,
)
@@ -113,21 +151,23 @@ def main():
parser.add_argument(
    "--speaker_idx",
    type=str,
    help="Target speaker ID for a multi-speaker TTS model.",
    default=None,
)
parser.add_argument(
    "--speaker_wav",
    nargs="+",
    help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths; the x_vector is computed as their average.",
    default=None,
)
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
parser.add_argument(
    "--list_speaker_idxs",
    help="List available speaker ids for the defined multi-speaker model.",
    type=str2bool,
    nargs="?",
    const=True,
    default=False,
)
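`--list_speaker_idxs` is declared with `type=str2bool, nargs="?", const=True`, so it works both as a bare flag and with an explicit value. `str2bool` is a small helper used by this script; a minimal sketch of such a helper (the exact implementation in the script may differ):

```python
import argparse

def str2bool(v):
    """Parse common truthy/falsy strings into a bool for argparse."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
```

With `nargs="?"` and `const=True`, a bare `--list_speaker_idxs` yields `True` (argparse uses `const` without applying the type conversion), while omitting the flag falls back to `default=False`.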
# aux args
parser.add_argument(
@@ -202,6 +242,14 @@ def main():
print(synthesizer.speaker_manager.speaker_ids)
return
# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
    print(
        " [!] It looks like you are using a multi-speaker model. Define `--speaker_idx` to "
        "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
    )
    return
# RUN THE SYNTHESIS
print(" > Text: {}".format(args.text))


@@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
            print(
                "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
            )
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            if not speaker_mapping:
                raise RuntimeError(
                    "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                )
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
        elif (
            not c.use_external_speaker_embedding_file
        ):  # if restore checkpoint and don't use External Embedding file
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            speaker_embedding_dim = None
            assert all(speaker in speaker_mapping for speaker in speakers), (
                "As of now, you cannot introduce new speakers to a previously trained model."
            )
        elif (
            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
        ):  # if start new train using External Embedding file
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
        elif (
            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
        ):  # if start new train using External Embedding file without passing one
            raise RuntimeError(
                "use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file. "
                "Run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or "
                "AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder."
            )
        else:  # if start new train and don't use External Embedding file
@@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
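Throughout `parse_speakers`, the speaker mapping is a plain JSON dict. Judging only from the lookups above (`speaker_mapping[key]["embedding"]`), a minimal external speaker embedding file might look like the following sketch; everything besides the `embedding` key is an assumption:

```python
# Illustrative speakers.json content: only the "embedding" key is exercised
# by parse_speakers; the "name" key and the entry names are assumptions.
speaker_mapping = {
    "speaker_0_sample_0.wav": {"name": "speaker_0", "embedding": [0.01, -0.23, 0.11]},
    "speaker_1_sample_0.wav": {"name": "speaker_1", "embedding": [0.47, 0.02, -0.19]},
}

# Mirrors how the code above derives the embedding size (3 here; real x-vectors are longer).
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
```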
@@ -133,6 +125,7 @@ class SpeakerManager:
        encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
    """

    def __init__(
        self,
        x_vectors_file_path: str = "",


@@ -107,6 +107,7 @@ class ModelManager(object):
        os.makedirs(output_path, exist_ok=True)
        print(f" > Downloading model to {output_path}")
        output_stats_path = os.path.join(output_path, "scale_stats.npy")
        output_speakers_path = os.path.join(output_path, "speakers.json")
        # download files to the output path
        if self._check_dict_key(model_item, "github_rls_url"):
            # download from github release
@@ -119,7 +120,7 @@ class ModelManager(object):
        if self._check_dict_key(model_item, "stats_file"):
            self._download_gdrive_file(model_item["stats_file"], output_stats_path)
        # update the scale_stats.npy file path in the model config.json
        if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
            # set scale stats path in config.json
            config_path = output_config_path
@@ -127,6 +128,14 @@ class ModelManager(object):
            config["audio"]["stats_path"] = output_stats_path
            with open(config_path, "w") as jf:
                json.dump(config, jf)
        # update the speakers.json file path in the model config.json to the current path
        if os.path.exists(output_speakers_path):
            # set the speakers file path in config.json
            config_path = output_config_path
            config = load_config(config_path)
            config["external_speaker_embedding_file"] = output_speakers_path
            with open(config_path, "w") as jf:
                json.dump(config, jf)
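Both patches follow the same load-modify-dump pattern. Condensed into one standalone sketch (plain `json` stands in for the project's `load_config`, which presumably returns a dict-like object):

```python
import json
import os

def point_config_at_local_files(config_path: str, output_path: str) -> None:
    """Rewrite path entries in a downloaded model's config.json so they point
    at the files sitting next to it, mirroring the two patches above."""
    with open(config_path) as jf:
        config = json.load(jf)
    stats_path = os.path.join(output_path, "scale_stats.npy")
    speakers_path = os.path.join(output_path, "speakers.json")
    if os.path.exists(stats_path):
        config["audio"]["stats_path"] = stats_path
    if os.path.exists(speakers_path):
        config["external_speaker_embedding_file"] = speakers_path
    with open(config_path, "w") as jf:
        json.dump(config, jf)
```

This keeps downloaded models self-contained: the Synthesizer change below reads `external_speaker_embedding_file` straight from the patched config.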
        return output_model_path, output_config_path, model_item

    def _download_gdrive_file(self, gdrive_idx, output):


@@ -127,6 +127,9 @@ class Synthesizer(object):
        self.input_size = len(symbols)
        if self.tts_config.use_speaker_embedding is True:
            self.tts_speakers_file = (
                self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
            )
            self._load_speakers(self.tts_speakers_file)
        self.tts_model = setup_model(
@@ -189,15 +192,27 @@ class Synthesizer(object):
        """
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text split into sentences.")
        print(sens)
        if self.tts_speakers_file:
            # get the speaker embedding from the saved x_vectors.
            if speaker_idx and isinstance(speaker_idx, str):
                speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
            elif not speaker_idx and not speaker_wav:
                raise ValueError(
                    " [!] It looks like you are using a multi-speaker model. "
                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            speaker_embedding = None
            if speaker_idx:
                raise ValueError(
                    f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}. "
                    "Define the path to speaker.json if this is a multi-speaker model, or remove the speaker idx."
                )
        # compute a new x_vector from the given clip.
        if speaker_wav is not None:
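Taken together, these branches let a caller drive multi-speaker synthesis either by a stored speaker ID or by reference wav(s). A usage sketch; the constructor keyword names are assumptions inferred from the attributes used here (e.g. `tts_speakers_file`), and all paths are placeholders:

```python
from TTS.utils.synthesizer import Synthesizer  # assumed import path

synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth.tar",    # assumed keyword name
    tts_config_path="path/to/config.json",     # assumed keyword name
    tts_speakers_file="path/to/speakers.json",
)

# Select a stored speaker by ID...
wav = synthesizer.tts("Text for TTS.", speaker_idx="speaker_0")

# ...or condition on reference clips; their x-vectors are averaged.
wav = synthesizer.tts("Text for TTS.", speaker_wav=["path/to/reference.wav"])
```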


@@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake
class SpeakerManagerTest(unittest.TestCase):
    """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""

    @staticmethod
    def test_speaker_embedding():
        # load config