Enable multi-speaker Coqui TTS models for synthesize.py

Eren Gölge 2021-04-26 19:36:53 +02:00
parent b531fa699c
commit 2f0716073e
5 changed files with 109 additions and 43 deletions


@@ -29,27 +29,65 @@ def main():
         """You can either use your trained model or choose a model from the provided list.\n\n"""
         """If you don't specify any models, then it uses LJSpeech based English models\n\n"""
         """
-    Example runs:
-    # list provided models
-    ./TTS/bin/synthesize.py --list_models
-    # run tts with default models.
-    ./TTS/bin synthesize.py --text "Text for TTS"
-    # run a tts model with its default vocoder model.
-    ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
-    # run with specific tts and vocoder models from the list
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
-    # run your own TTS model (Using Griffin-Lim Vocoder)
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
-    # run your own TTS and Vocoder models
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
+    # Example Runs:
+
+    ## Single Speaker Models
+
+    - list the provided models.
+    ```
+    $ ./TTS/bin/synthesize.py --list_models
+    ```
+    - run tts with the default models.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS"
+    ```
+    - run a tts model with its default vocoder model.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+    - run with specific tts and vocoder models from the list.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+    ```
+    - run your own TTS model (using the Griffin-Lim vocoder).
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
+    - run your own TTS and vocoder models.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
         --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    ```
+
+    ## Multi-Speaker Models
+
+    - list the available speakers and choose a <speaker_id> among them.
+    ```
+    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+    ```
+    - run the multi-speaker TTS model with the target speaker ID.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+    ```
+    - run your own multi-speaker TTS model.
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speakers.json --speaker_idx <speaker_id>
+    ```
     """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -113,21 +151,23 @@ def main():
     parser.add_argument(
         "--speaker_idx",
         type=str,
-        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
+        help="Target speaker ID for a multi-speaker TTS model.",
         default=None,
     )
     parser.add_argument(
         "--speaker_wav",
         nargs="+",
-        help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vectors is computed as their average.",
+        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths; the x_vector is computed as their average.",
         default=None,
     )
     parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
-        default=False,
         type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
     )
     # aux args
     parser.add_argument(
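Stepping out of the diff for a moment: the `--list_speaker_idxs` change is the standard argparse pattern for a boolean flag that can be passed bare. A minimal sketch of why `nargs="?"` and `const=True` matter; the `str2bool` helper shown here is an assumption modeled on common implementations, not a copy of the repo's:

```python
import argparse


def str2bool(v):
    """Parse common truthy/falsy strings (assumed to mirror TTS's helper)."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


parser = argparse.ArgumentParser()
# nargs="?" lets the flag appear without a value; const=True is used in that
# case, and default=False applies when the flag is omitted entirely.
parser.add_argument("--list_speaker_idxs", type=str2bool, nargs="?", const=True, default=False)

assert parser.parse_args(["--list_speaker_idxs"]).list_speaker_idxs is True
assert parser.parse_args([]).list_speaker_idxs is False
```

Without `nargs="?"`/`const=True`, a bare `--list_speaker_idxs` would fail with "expected one argument", which is presumably why this commit reworks the argument.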
@@ -202,6 +242,14 @@ def main():
         print(synthesizer.speaker_manager.speaker_ids)
         return

+    # check the arguments against a multi-speaker model.
+    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+        print(
+            " [!] Looks like you are using a multi-speaker model. Define `--speaker_idx` to "
+            "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
+        )
+        return
+
     # RUN THE SYNTHESIS
     print(" > Text: {}".format(args.text))


@@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             print(
                 "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
             )
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
             if not speaker_mapping:
                 raise RuntimeError(
                     "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                 )
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
             not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(
-                speaker in speaker_mapping
-                for speaker in speakers), ("As of now you, you cannot "
-                                           "introduce new speakers to "
-                                           "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            assert all(speaker in speaker_mapping for speaker in speakers), (
+                "As of now, you cannot introduce new speakers to a previously trained model."
+            )
         elif (
-            c.use_external_speaker_embedding_file
-            and not c.external_speaker_embedding_file
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise RuntimeError("use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder")
         else:  # if start new train and don't use External Embedding file
@@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
             save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
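`parse_speakers` leans on the shape of the external speakers.json file. As a rough sketch of the data it expects when `use_external_speaker_embedding_file` is set; the exact schema is an assumption inferred from the indexing in the code above, not taken from the file format docs:

```python
# speakers.json (external embedding file) is assumed to look like:
#   {"<sample_or_speaker_key>": {"name": "<speaker>", "embedding": [floats]}, ...}
speaker_mapping = {
    "ljspeech_0001": {"name": "ljspeech", "embedding": [0.1, -0.3, 0.7]},
    "ljspeech_0002": {"name": "ljspeech", "embedding": [0.2, -0.1, 0.5]},
}

# this mirrors the dimension probe in parse_speakers(): grab any entry's
# embedding and use its length as speaker_embedding_dim
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
assert speaker_embedding_dim == 3
```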
@@ -133,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
     """
+
     def __init__(
         self,
         x_vectors_file_path: str = "",
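For orientation, this `SpeakerManager` is what the new synthesize.py flags drive. A minimal usage sketch, using only the attribute and method names visible in this diff (`speaker_ids`, `get_x_vectors_by_speaker`); the speaker id "p225" is a hypothetical placeholder:

```python
from TTS.tts.utils.speakers import SpeakerManager

# load precomputed x-vectors from a speakers.json of the shape sketched above
manager = SpeakerManager(x_vectors_file_path="path/to/speakers.json")

print(manager.speaker_ids)  # the ids accepted by --speaker_idx

# a speaker may have several saved x-vectors; Synthesizer.tts() takes the first
embedding = manager.get_x_vectors_by_speaker("p225")[0]
```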


@@ -107,6 +107,7 @@ class ModelManager(object):
         os.makedirs(output_path, exist_ok=True)
         print(f" > Downloading model to {output_path}")
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
+        output_speakers_path = os.path.join(output_path, "speakers.json")
         # download files to the output path
         if self._check_dict_key(model_item, "github_rls_url"):
             # download from github release
@@ -119,7 +120,7 @@ class ModelManager(object):
         if self._check_dict_key(model_item, "stats_file"):
             self._download_gdrive_file(model_item["stats_file"], output_stats_path)
-        # set the scale_stats.npy file path in the model config.json
+        # update the scale_stats.npy file path in the model config.json
         if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
             # set scale stats path in config.json
             config_path = output_config_path
@@ -127,6 +128,14 @@ class ModelManager(object):
             config["audio"]["stats_path"] = output_stats_path
             with open(config_path, "w") as jf:
                 json.dump(config, jf)
+        # update the speakers.json file path in the model config.json to the current path
+        if os.path.exists(output_speakers_path):
+            # set the speakers file path in config.json
+            config_path = output_config_path
+            config = load_config(config_path)
+            config["external_speaker_embedding_file"] = output_speakers_path
+            with open(config_path, "w") as jf:
+                json.dump(config, jf)
         return output_model_path, output_config_path, model_item

     def _download_gdrive_file(self, gdrive_idx, output):
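The `ModelManager` addition follows a simple load-patch-save pattern on the downloaded config.json, so downloaded multi-speaker models point at their local speakers.json out of the box. A standalone sketch of the same pattern; the function name is ours, and the real code reads the config through TTS's `load_config` helper rather than raw `json.load`:

```python
import json


def point_config_to_local_speakers_file(config_path, speakers_path):
    # load the downloaded config.json, rewrite the external speaker embedding
    # entry to the locally downloaded speakers.json, and save it back in place
    with open(config_path, "r") as f:
        config = json.load(f)
    config["external_speaker_embedding_file"] = speakers_path
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
```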


@@ -127,6 +127,9 @@ class Synthesizer(object):
         self.input_size = len(symbols)

         if self.tts_config.use_speaker_embedding is True:
+            self.tts_speakers_file = (
+                self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
+            )
             self._load_speakers(self.tts_speakers_file)

         self.tts_model = setup_model(
@@ -189,15 +192,27 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self._split_into_sentences(text)
+        sens = self.split_into_sentences(text)
         print(" > Text split into sentences.")
         print(sens)

-        # get the speaker embedding from the saved x_vectors.
-        if speaker_idx and isinstance(speaker_idx, str):
-            speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+        if self.tts_speakers_file:
+            # get the speaker embedding from the saved x_vectors.
+            if speaker_idx and isinstance(speaker_idx, str):
+                speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+            elif not speaker_idx and not speaker_wav:
+                raise ValueError(
+                    " [!] Looks like you are using a multi-speaker model. "
+                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
+                )
+            else:
+                speaker_embedding = None
         else:
-            speaker_embedding = None
+            if speaker_idx:
+                raise ValueError(
+                    f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}. "
+                    "Define the path to speakers.json if it is a multi-speaker model, or drop the speaker idx."
+                )
+            speaker_embedding = None

         # compute a new x_vector from the given clip.
         if speaker_wav is not None:
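Right after this block, the code computes a fresh x-vector from `speaker_wav`; per the `--speaker_wav` help text, multiple clips are reduced to one embedding by averaging. A sketch of that reduction, where the encoder callable is a stand-in rather than the repo's exact speaker-encoder API:

```python
import numpy as np


def average_x_vector(compute_x_vector, wav_paths):
    # one x-vector per conditioning clip, then the element-wise mean:
    # the "computed as their average" behavior the --speaker_wav help describes
    x_vectors = [np.asarray(compute_x_vector(path)) for path in wav_paths]
    return np.stack(x_vectors).mean(axis=0)


# usage with any callable mapping a wav path to an embedding vector:
fake_encoder = lambda path: [1.0, 2.0] if "a" in path else [3.0, 4.0]
print(average_x_vector(fake_encoder, ["a.wav", "b.wav"]))  # [2. 3.]
```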


@@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake
 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
+
     @staticmethod
     def test_speaker_embedding():
         # load config