diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 2a063fbf..67a7c40c 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -51,36 +51,42 @@ }, "datasets": [ + { + "name": "common_voice_wav", + "path": "../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + }, { "name": "voxceleb1", "path": "../../audio-datasets/en/voxceleb1/", "meta_file_train": null, "meta_file_val": null }, -// { -// "name": "voxceleb2", -// "path": "../../audio-datasets/en/voxceleb2/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "vctk", -// "path": "../../audio-datasets/en/VCTK-Corpus/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", -// "meta_file_train": null, -// "meta_file_val": null -// }, + { + "name": "voxceleb2", + "path": "../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "vctk", + "path": "../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, { "name": "libri_tts", "path": "../../audio-datasets/en/LibriTTS/train-other-500", diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 40fc66dd..3bcf416c 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -161,7 +161,7 @@ def nancy(root_path, meta_file): return items -def common_voice(root_path, meta_file): +def common_voice_wav(root_path, meta_file): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -172,8 +172,8 @@ def common_voice(root_path, meta_file): cols = line.split("\t") text = cols[2] speaker_name = cols[0] - wav_file = os.path.join(root_path, "clips", cols[1] + ".wav") - items.append([text, wav_file, speaker_name]) + wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) + items.append([text, wav_file, 'MCV_' + speaker_name]) return items @@ -251,9 +251,9 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id, + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + '.wav') - items.append([text, wav_file, speaker_id]) + items.append([text, wav_file, 'VCTK_' + speaker_id]) return items @@ -298,3 +298,5 @@ def _voxcel_x(root_path, voxcel_idx): with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()] + + diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py index bf5e13d8..78e9b8b2 100644 --- a/TTS/tts/utils/io.py +++ b/TTS/tts/utils/io.py @@ -50,7 +50,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc if target_loss < best_loss: file_name = 'best_model.pth.tar' checkpoint_path = os.path.join(output_folder, file_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) + print(" >> BEST MODEL : {}".format(checkpoint_path)) save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs) best_loss = target_loss return best_loss