add: Mozilla Common Voice, VoxCeleb1+2, LibriTTS to Speaker Encoder Training

This commit is contained in:
mueller 2020-09-16 16:49:53 +02:00
parent c909ca3855
commit 95d2906307
3 changed files with 38 additions and 30 deletions

View File

@ -51,36 +51,42 @@
},
"datasets":
[
{
"name": "common_voice_wav",
"path": "../../audio-datasets/en/MozillaCommonVoice",
"meta_file_train": "train.tsv",
"meta_file_val": "test.tsv"
},
{
"name": "voxceleb1",
"path": "../../audio-datasets/en/voxceleb1/",
"meta_file_train": null,
"meta_file_val": null
},
// {
// "name": "voxceleb2",
// "path": "../../audio-datasets/en/voxceleb2/",
// "meta_file_train": null,
// "meta_file_val": null
// },
// {
// "name": "vctk",
// "path": "../../audio-datasets/en/VCTK-Corpus/",
// "meta_file_train": null,
// "meta_file_val": null
// },
// {
// "name": "libri_tts",
// "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
// "meta_file_train": null,
// "meta_file_val": null
// },
// {
// "name": "libri_tts",
// "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
// "meta_file_train": null,
// "meta_file_val": null
// },
{
"name": "voxceleb2",
"path": "../../audio-datasets/en/voxceleb2/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "vctk",
"path": "../../audio-datasets/en/VCTK-Corpus/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../audio-datasets/en/LibriTTS/train-clean-100",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../audio-datasets/en/LibriTTS/train-clean-360",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../audio-datasets/en/LibriTTS/train-other-500",

View File

@ -161,7 +161,7 @@ def nancy(root_path, meta_file):
return items
def common_voice(root_path, meta_file):
def common_voice_wav(root_path, meta_file):
"""Normalize the common voice meta data file to TTS format."""
txt_file = os.path.join(root_path, meta_file)
items = []
@ -172,8 +172,8 @@ def common_voice(root_path, meta_file):
cols = line.split("\t")
text = cols[2]
speaker_name = cols[0]
wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
items.append([text, wav_file, speaker_name])
wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav"))
items.append([text, wav_file, 'MCV_' + speaker_name])
return items
@ -251,9 +251,9 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'):
continue
with open(meta_file) as file_text:
text = file_text.readlines()[0]
wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id,
wav_file = os.path.join(root_path, wavs_path, speaker_id,
file_id + '.wav')
items.append([text, wav_file, speaker_id])
items.append([text, wav_file, 'VCTK_' + speaker_id])
return items
@ -298,3 +298,5 @@ def _voxcel_x(root_path, voxcel_idx):
with open(str(cache_to), 'r') as f:
return [x.strip().split('|') for x in f.readlines()]

View File

@ -50,7 +50,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc
if target_loss < best_loss:
file_name = 'best_model.pth.tar'
checkpoint_path = os.path.join(output_folder, file_name)
print(" > BEST MODEL : {}".format(checkpoint_path))
print(" >> BEST MODEL : {}".format(checkpoint_path))
save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
best_loss = target_loss
return best_loss