mirror of https://github.com/coqui-ai/TTS.git
enable multi-speaker CoquiTTS models for synthesize.py
This commit is contained in:
parent b531fa699c
commit 2f0716073e
@@ -29,27 +29,65 @@ def main():
         """You can either use your trained model or choose a model from the provided list.\n\n"""
+        """If you don't specify any models, then it uses LJSpeech based English models\n\n"""
         """
-    Example runs:
+    # Example Runs:
 
-    # list provided models
-    ./TTS/bin/synthesize.py --list_models
+    ## Single Speaker Models
 
-    # run tts with default models.
-    ./TTS/bin/synthesize.py --text "Text for TTS"
+    - list provided models
 
-    # run a tts model with its default vocoder model.
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+    $ ./TTS/bin/synthesize.py --list_models
+    ```
 
-    # run with specific tts and vocoder models from the list
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
+    - run tts with default models.
 
-    # run your own TTS model (Using Griffin-Lim Vocoder)
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS"
+    ```
+
+    - run a tts model with its default vocoder model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+
+    - run with specific tts and vocoder models from the list
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
+    ```
+
+    - run your own TTS model (Using Griffin-Lim Vocoder)
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
 
-    # run your own TTS and Vocoder models
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
-        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    - run your own TTS and Vocoder models
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    ```
+
+    ## MULTI-SPEAKER MODELS
+
+    - list the available speakers and choose a <speaker_id> among them.
+
+    ```
+    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+    ```
+
+    - run the multi-speaker TTS model with the target speaker ID.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+    ```
+
+    - run your own multi-speaker TTS model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+    ```
     """,
         formatter_class=RawTextHelpFormatter,
     )
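The same flow is available from Python through the `Synthesizer` class this commit extends. A minimal sketch, assuming the constructor keyword names (`tts_checkpoint`, `tts_config`, `use_cuda`) and the `save_wav` helper match this revision of the synthesizer module; they are read off this code, not a documented stable API:

```
# Sketch only: constructor keywords and save_wav() are assumptions
# based on this revision of the repo.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth.tar",  # mirrors --model_path
    tts_config="path/to/config.json",        # mirrors --config_path
    use_cuda=False,
)

# Single-speaker model: only the text is required.
wav = synthesizer.tts("Text for TTS")

# Multi-speaker model: pass the target speaker ID, mirroring --speaker_idx.
# wav = synthesizer.tts("Text for TTS", speaker_idx="<speaker_id>")

synthesizer.save_wav(wav, "output/path/speech.wav")
```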
@@ -113,21 +151,23 @@ def main():
     parser.add_argument(
         "--speaker_idx",
         type=str,
-        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
+        help="Target speaker ID for a multi-speaker TTS model.",
         default=None,
     )
     parser.add_argument(
         "--speaker_wav",
         nargs="+",
-        help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vector is computed as their average.",
+        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vector is computed as their average.",
         default=None,
     )
     parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
+        type=str2bool,
+        nargs="?",
+        const=True,
         default=False,
     )
     # aux args
     parser.add_argument(
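The `--list_speaker_idxs` definition combines `type=str2bool`, `nargs="?"`, and `const=True` so the option works as a bare flag yet still accepts an explicit true/false value. A self-contained sketch of the pattern, with a stand-in `str2bool` since the repo's helper is defined elsewhere:

```
import argparse

def str2bool(v):
    # Stand-in for the repo's helper: map common strings to booleans.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")

parser = argparse.ArgumentParser()
parser.add_argument("--list_speaker_idxs", type=str2bool, nargs="?", const=True, default=False)

print(parser.parse_args([]).list_speaker_idxs)                               # False (default)
print(parser.parse_args(["--list_speaker_idxs"]).list_speaker_idxs)          # True (bare flag -> const)
print(parser.parse_args(["--list_speaker_idxs", "false"]).list_speaker_idxs) # False (explicit value)
```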
@@ -202,6 +242,14 @@ def main():
         print(synthesizer.speaker_manager.speaker_ids)
         return
 
+    # check the arguments against a multi-speaker model.
+    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+        print(
+            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+            "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
+        )
+        return
+
     # RUN THE SYNTHESIS
     print(" > Text: {}".format(args.text))
@@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             print(
                 "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
             )
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
             if not speaker_mapping:
                 raise RuntimeError(
                     "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                 )
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
             not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(
-                speaker in speaker_mapping
-                for speaker in speakers), ("As of now, you cannot "
-                                           "introduce new speakers to "
-                                           "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            assert all(speaker in speaker_mapping for speaker in speakers), (
+                "As of now, you cannot introduce new speakers to a previously trained model."
+            )
+        elif (
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
-        elif (c.use_external_speaker_embedding_file
-              and not c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file and don't pass external embedding file
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file and don't pass external embedding file
             raise RuntimeError(
                 "use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder"
             )
         else:  # if start new train and don't use External Embedding file
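All branches above derive `speaker_embedding_dim` from the first entry of the loaded mapping. A sketch of the speakers.json shape this implies; the keys and the "name" field are illustrative, only "embedding" is read off the code above:

```
import json

# Assumed layout of an external speaker embedding file (speakers.json),
# inferred from speaker_mapping[...]["embedding"] in parse_speakers().
speakers_json = """
{
    "ljspeech-1": {"name": "ljspeech", "embedding": [0.01, -0.23, 0.11, 0.98]},
    "ljspeech-2": {"name": "ljspeech", "embedding": [0.07, -0.12, 0.30, 0.74]}
}
"""

speaker_mapping = json.loads(speakers_json)
# Same expression as in parse_speakers(): dimension of the first embedding.
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
print(speaker_embedding_dim)  # 4 in this toy example
```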
@@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -133,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
+        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
     """

     def __init__(
         self,
         x_vectors_file_path: str = "",
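The `--speaker_wav` help earlier says multiple reference clips are averaged into a single x-vector. A minimal sketch of that averaging step, with the speaker encoder replaced by a stand-in function (the real call would be something like SpeakerManager's per-clip embedding method, an assumption here):

```
import numpy as np

def average_x_vector(wav_paths, compute_x_vector):
    # compute_x_vector is a stand-in for the speaker encoder's
    # per-clip embedding function.
    vectors = [np.asarray(compute_x_vector(w)) for w in wav_paths]
    return np.stack(vectors).mean(axis=0)

def fake_encoder(wav_path):
    # Toy "encoder": a deterministic fixed-size embedding per clip.
    return np.random.RandomState(len(wav_path)).randn(256)

x_vector = average_x_vector(["clip1.wav", "clip2.wav"], fake_encoder)
print(x_vector.shape)  # (256,)
```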
@@ -107,6 +107,7 @@ class ModelManager(object):
         os.makedirs(output_path, exist_ok=True)
         print(f" > Downloading model to {output_path}")
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
+        output_speakers_path = os.path.join(output_path, "speakers.json")
         # download files to the output path
         if self._check_dict_key(model_item, "github_rls_url"):
             # download from github release
@@ -119,7 +120,7 @@ class ModelManager(object):
         if self._check_dict_key(model_item, "stats_file"):
             self._download_gdrive_file(model_item["stats_file"], output_stats_path)

-        # set the scale_path.npy file path in the model config.json
+        # update the scale_path.npy file path in the model config.json
         if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
             # set scale stats path in config.json
             config_path = output_config_path
@@ -127,6 +128,14 @@ class ModelManager(object):
             config["audio"]["stats_path"] = output_stats_path
             with open(config_path, "w") as jf:
                 json.dump(config, jf)
+        # update the speakers.json file path in the model config.json to the current path
+        if os.path.exists(output_speakers_path):
+            # set the speakers file path in config.json
+            config_path = output_config_path
+            config = load_config(config_path)
+            config["external_speaker_embedding_file"] = output_speakers_path
+            with open(config_path, "w") as jf:
+                json.dump(config, jf)
         return output_model_path, output_config_path, model_item

     def _download_gdrive_file(self, gdrive_idx, output):
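The block added here rewrites the downloaded config so `external_speaker_embedding_file` points at the local speakers.json. The same read-modify-write step as a standalone sketch, using plain `json` where the real code goes through `load_config`:

```
import json
import os

def point_config_at_speakers_file(config_path, output_path):
    # Rewrite the model's config.json so that it references the
    # speakers.json sitting next to the downloaded model files.
    speakers_path = os.path.join(output_path, "speakers.json")
    if not os.path.exists(speakers_path):
        return
    with open(config_path) as jf:  # the real code uses load_config()
        config = json.load(jf)
    config["external_speaker_embedding_file"] = speakers_path
    with open(config_path, "w") as jf:
        json.dump(config, jf)
```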
@@ -127,6 +127,9 @@ class Synthesizer(object):
         self.input_size = len(symbols)

+        if self.tts_config.use_speaker_embedding is True:
+            self.tts_speakers_file = (
+                self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
+            )
+            self._load_speakers(self.tts_speakers_file)
+
         self.tts_model = setup_model(
@@ -189,15 +192,27 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self._split_into_sentences(text)
+        sens = self.split_into_sentences(text)
         print(" > Text split into sentences.")
         print(sens)

-        # get the speaker embedding from the saved x_vectors.
-        if speaker_idx and isinstance(speaker_idx, str):
-            speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
-        else:
-            speaker_embedding = None
+        if self.tts_speakers_file:
+            # get the speaker embedding from the saved x_vectors.
+            if speaker_idx and isinstance(speaker_idx, str):
+                speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+            elif not speaker_idx and not speaker_wav:
+                raise ValueError(
+                    " [!] Looks like you use a multi-speaker model. "
+                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
+                )
+            else:
+                speaker_embedding = None
+        else:
+            speaker_embedding = None
+            if speaker_idx:
+                raise ValueError(
+                    f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}. "
+                    "Define a path for speakers.json if it is a multi-speaker model, or remove the defined speaker idx."
+                )

         # compute a new x_vector from the given clip.
         if speaker_wav is not None:
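The new branching in `tts()` reduces to four cases. A hypothetical distillation of it as a standalone helper; the function name and error strings are illustrative, the logic mirrors the diff:

```
def select_speaker_embedding(tts_speakers_file, speaker_manager, speaker_idx=None, speaker_wav=None):
    """Hypothetical distillation of the branching added to Synthesizer.tts()."""
    if tts_speakers_file:
        if speaker_idx and isinstance(speaker_idx, str):
            # saved x-vector looked up by speaker ID
            return speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
        if not speaker_idx and not speaker_wav:
            raise ValueError("multi-speaker model: pass a speaker_idx or a speaker_wav")
        return None  # speaker_wav given; the x-vector is computed later from the clip
    if speaker_idx:
        raise ValueError(f"no speakers.json available to resolve speaker {speaker_idx}")
    return None  # single-speaker model
```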
@@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake

 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
+
     @staticmethod
     def test_speaker_embedding():
         # load config
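A hedged companion to the test above: checking that every entry in a dummy speaker mapping carries an embedding of the same dimension, mirroring how `parse_speakers()` reads the first entry. The inline mapping stands in for loading the dummy speakers file:

```
import unittest

class DummySpeakersFileTest(unittest.TestCase):
    """Hypothetical companion check: the dummy speaker mapping yields a
    consistent embedding dimension, mirroring parse_speakers()."""

    def test_embedding_dim(self):
        # stand-in for loading the dummy speakers file from the test inputs
        speaker_mapping = {
            "clip_0001": {"name": "spk_a", "embedding": [0.0] * 256},
            "clip_0002": {"name": "spk_b", "embedding": [1.0] * 256},
        }
        dims = {len(v["embedding"]) for v in speaker_mapping.values()}
        self.assertEqual(dims, {256})

if __name__ == "__main__":
    unittest.main()
```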