mirror of https://github.com/coqui-ai/TTS.git
enable multi-speaker CoquiTTS models for synthesize.py
This commit is contained in:
parent
b531fa699c
commit
2f0716073e
|
@ -29,27 +29,65 @@ def main():
|
||||||
"""You can either use your trained model or choose a model from the provided list.\n\n"""
|
"""You can either use your trained model or choose a model from the provided list.\n\n"""
|
||||||
"""If you don't specify any models, then it uses LJSpeech based English models\n\n"""
|
"""If you don't specify any models, then it uses LJSpeech based English models\n\n"""
|
||||||
"""
|
"""
|
||||||
Example runs:
|
# Example Runs:
|
||||||
|
|
||||||
# list provided models
|
## Single Speaker Models
|
||||||
./TTS/bin/synthesize.py --list_models
|
|
||||||
|
|
||||||
# run tts with default models.
|
- list provided models
|
||||||
./TTS/bin synthesize.py --text "Text for TTS"
|
|
||||||
|
|
||||||
# run a tts model with its default vocoder model.
|
```
|
||||||
./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
|
$ ./TTS/bin/synthesize.py --list_models
|
||||||
|
```
|
||||||
|
|
||||||
# run with specific tts and vocoder models from the list
|
- run tts with default models.
|
||||||
./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
|
|
||||||
|
|
||||||
# run your own TTS model (Using Griffin-Lim Vocoder)
|
```
|
||||||
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
|
$ ./TTS/bin/synthesize.py --text "Text for TTS"
|
||||||
|
```
|
||||||
|
|
||||||
# run your own TTS and Vocoder models
|
- run a tts model with its default vocoder model.
|
||||||
./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
|
||||||
|
```
|
||||||
|
|
||||||
|
- run with specific tts and vocoder models from the list
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
- run your own TTS model (Using Griffin-Lim Vocoder)
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
- run your own TTS and Vocoder models
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
|
||||||
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
|
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## MULTI-SPEAKER MODELS
|
||||||
|
|
||||||
|
- list the available speakers and choose a <speaker_id> among them.
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
|
||||||
|
```
|
||||||
|
|
||||||
|
- run the multi-speaker TTS model with the target speaker ID.
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
|
||||||
|
```
|
||||||
|
|
||||||
|
- run your own multi-speaker TTS model.
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
|
||||||
|
```
|
||||||
""",
|
""",
|
||||||
formatter_class=RawTextHelpFormatter,
|
formatter_class=RawTextHelpFormatter,
|
||||||
)
|
)
|
||||||
|
@ -113,21 +151,23 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--speaker_idx",
|
"--speaker_idx",
|
||||||
type=str,
|
type=str,
|
||||||
help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
|
help="Target speaker ID for a multi-speaker TTS model.",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--speaker_wav",
|
"--speaker_wav",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vectors is computed as their average.",
|
help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
|
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list_speaker_idxs",
|
"--list_speaker_idxs",
|
||||||
help="List available speaker ids for the defined multi-speaker model.",
|
help="List available speaker ids for the defined multi-speaker model.",
|
||||||
default=False,
|
|
||||||
type=str2bool,
|
type=str2bool,
|
||||||
|
nargs="?",
|
||||||
|
const=True,
|
||||||
|
default=False,
|
||||||
)
|
)
|
||||||
# aux args
|
# aux args
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -202,6 +242,14 @@ def main():
|
||||||
print(synthesizer.speaker_manager.speaker_ids)
|
print(synthesizer.speaker_manager.speaker_ids)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# check the arguments against a multi-speaker model.
|
||||||
|
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
|
||||||
|
print(
|
||||||
|
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
|
||||||
|
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
# RUN THE SYNTHESIS
|
# RUN THE SYNTHESIS
|
||||||
print(" > Text: {}".format(args.text))
|
print(" > Text: {}".format(args.text))
|
||||||
|
|
||||||
|
|
|
@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
|
||||||
print(
|
print(
|
||||||
"WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
|
"WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
|
||||||
)
|
)
|
||||||
speaker_mapping = load_speaker_mapping(
|
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
|
||||||
c.external_speaker_embedding_file)
|
|
||||||
if not speaker_mapping:
|
if not speaker_mapping:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
|
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
|
||||||
)
|
)
|
||||||
speaker_embedding_dim = len(speaker_mapping[list(
|
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
|
||||||
speaker_mapping.keys())[0]]["embedding"])
|
|
||||||
elif (
|
elif (
|
||||||
not c.use_external_speaker_embedding_file
|
not c.use_external_speaker_embedding_file
|
||||||
): # if restore checkpoint and don't use External Embedding file
|
): # if restore checkpoint and don't use External Embedding file
|
||||||
prev_out_path = os.path.dirname(args.restore_path)
|
prev_out_path = os.path.dirname(args.restore_path)
|
||||||
speaker_mapping = load_speaker_mapping(prev_out_path)
|
speaker_mapping = load_speaker_mapping(prev_out_path)
|
||||||
speaker_embedding_dim = None
|
speaker_embedding_dim = None
|
||||||
assert all(
|
assert all(speaker in speaker_mapping for speaker in speakers), (
|
||||||
speaker in speaker_mapping
|
"As of now, you cannot " "introduce new speakers to " "a previously trained model."
|
||||||
for speaker in speakers), ("As of now you, you cannot "
|
)
|
||||||
"introduce new speakers to "
|
|
||||||
"a previously trained model.")
|
|
||||||
elif (c.use_external_speaker_embedding_file
|
|
||||||
and c.external_speaker_embedding_file
|
|
||||||
): # if start new train using External Embedding file
|
|
||||||
speaker_mapping = load_speaker_mapping(
|
|
||||||
c.external_speaker_embedding_file)
|
|
||||||
speaker_embedding_dim = len(speaker_mapping[list(
|
|
||||||
speaker_mapping.keys())[0]]["embedding"])
|
|
||||||
elif (
|
elif (
|
||||||
c.use_external_speaker_embedding_file
|
c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
|
||||||
and not c.external_speaker_embedding_file
|
): # if start new train using External Embedding file
|
||||||
|
speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
|
||||||
|
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
|
||||||
|
elif (
|
||||||
|
c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
|
||||||
): # if start new train using External Embedding file and don't pass external embedding file
|
): # if start new train using External Embedding file and don't pass external embedding file
|
||||||
raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
|
raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
|
||||||
else: # if start new train and don't use External Embedding file
|
else: # if start new train and don't use External Embedding file
|
||||||
|
@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
|
||||||
speaker_embedding_dim = None
|
speaker_embedding_dim = None
|
||||||
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
||||||
num_speakers = len(speaker_mapping)
|
num_speakers = len(speaker_mapping)
|
||||||
print(" > Training with {} speakers: {}".format(
|
print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
|
||||||
len(speakers), ", ".join(speakers)))
|
|
||||||
else:
|
else:
|
||||||
num_speakers = 0
|
num_speakers = 0
|
||||||
speaker_embedding_dim = None
|
speaker_embedding_dim = None
|
||||||
|
@ -133,6 +125,7 @@ class SpeakerManager:
|
||||||
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
|
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
|
||||||
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
|
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
x_vectors_file_path: str = "",
|
x_vectors_file_path: str = "",
|
||||||
|
|
|
@ -107,6 +107,7 @@ class ModelManager(object):
|
||||||
os.makedirs(output_path, exist_ok=True)
|
os.makedirs(output_path, exist_ok=True)
|
||||||
print(f" > Downloading model to {output_path}")
|
print(f" > Downloading model to {output_path}")
|
||||||
output_stats_path = os.path.join(output_path, "scale_stats.npy")
|
output_stats_path = os.path.join(output_path, "scale_stats.npy")
|
||||||
|
output_speakers_path = os.path.join(output_path, "speakers.json")
|
||||||
# download files to the output path
|
# download files to the output path
|
||||||
if self._check_dict_key(model_item, "github_rls_url"):
|
if self._check_dict_key(model_item, "github_rls_url"):
|
||||||
# download from github release
|
# download from github release
|
||||||
|
@ -119,7 +120,7 @@ class ModelManager(object):
|
||||||
if self._check_dict_key(model_item, "stats_file"):
|
if self._check_dict_key(model_item, "stats_file"):
|
||||||
self._download_gdrive_file(model_item["stats_file"], output_stats_path)
|
self._download_gdrive_file(model_item["stats_file"], output_stats_path)
|
||||||
|
|
||||||
# set the scale_path.npy file path in the model config.json
|
# update the scale_stats.npy file path in the model config.json
|
||||||
if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
|
if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
|
||||||
# set scale stats path in config.json
|
# set scale stats path in config.json
|
||||||
config_path = output_config_path
|
config_path = output_config_path
|
||||||
|
@ -127,6 +128,14 @@ class ModelManager(object):
|
||||||
config["audio"]["stats_path"] = output_stats_path
|
config["audio"]["stats_path"] = output_stats_path
|
||||||
with open(config_path, "w") as jf:
|
with open(config_path, "w") as jf:
|
||||||
json.dump(config, jf)
|
json.dump(config, jf)
|
||||||
|
# update the speakers.json file path in the model config.json to the current path
|
||||||
|
if os.path.exists(output_speakers_path):
|
||||||
|
# set the speakers file path in config.json
|
||||||
|
config_path = output_config_path
|
||||||
|
config = load_config(config_path)
|
||||||
|
config["external_speaker_embedding_file"] = output_speakers_path
|
||||||
|
with open(config_path, "w") as jf:
|
||||||
|
json.dump(config, jf)
|
||||||
return output_model_path, output_config_path, model_item
|
return output_model_path, output_config_path, model_item
|
||||||
|
|
||||||
def _download_gdrive_file(self, gdrive_idx, output):
|
def _download_gdrive_file(self, gdrive_idx, output):
|
||||||
|
|
|
@ -127,6 +127,9 @@ class Synthesizer(object):
|
||||||
self.input_size = len(symbols)
|
self.input_size = len(symbols)
|
||||||
|
|
||||||
if self.tts_config.use_speaker_embedding is True:
|
if self.tts_config.use_speaker_embedding is True:
|
||||||
|
self.tts_speakers_file = (
|
||||||
|
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
|
||||||
|
)
|
||||||
self._load_speakers(self.tts_speakers_file)
|
self._load_speakers(self.tts_speakers_file)
|
||||||
|
|
||||||
self.tts_model = setup_model(
|
self.tts_model = setup_model(
|
||||||
|
@ -189,15 +192,27 @@ class Synthesizer(object):
|
||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
wavs = []
|
wavs = []
|
||||||
sens = self._split_into_sentences(text)
|
sens = self.split_into_sentences(text)
|
||||||
print(" > Text splitted to sentences.")
|
print(" > Text splitted to sentences.")
|
||||||
print(sens)
|
print(sens)
|
||||||
|
|
||||||
# get the speaker embedding from the saved x_vectors.
|
if self.tts_speakers_file:
|
||||||
if speaker_idx and isinstance(speaker_idx, str):
|
# get the speaker embedding from the saved x_vectors.
|
||||||
speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
|
if speaker_idx and isinstance(speaker_idx, str):
|
||||||
|
speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
|
||||||
|
elif not speaker_idx and not speaker_wav:
|
||||||
|
raise ValueError(
|
||||||
|
" [!] Looks like you use a multi-speaker model. "
|
||||||
|
"You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
speaker_embedding = None
|
||||||
else:
|
else:
|
||||||
speaker_embedding = None
|
if speaker_idx:
|
||||||
|
raise ValueError(
|
||||||
|
f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}. "
|
||||||
|
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
|
||||||
|
)
|
||||||
|
|
||||||
# compute a new x_vector from the given clip.
|
# compute a new x_vector from the given clip.
|
||||||
if speaker_wav is not None:
|
if speaker_wav is not None:
|
||||||
|
|
|
@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake
|
||||||
|
|
||||||
class SpeakerManagerTest(unittest.TestCase):
|
class SpeakerManagerTest(unittest.TestCase):
|
||||||
"""Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
|
"""Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def test_speaker_embedding():
|
def test_speaker_embedding():
|
||||||
# load config
|
# load config
|
||||||
|
|
Loading…
Reference in New Issue