add: Mozilla Common Voice, VoxCeleb1+2, LibriTTS to Speaker Encoder Training

2020-09-16 16:49:53 +02:00 · 2020-09-16 16:49:53 +02:00 · 95d2906307
parent c909ca3855
commit 95d2906307
3 changed files with 38 additions and 30 deletions
--- a/TTS/speaker_encoder/config.json
+++ b/TTS/speaker_encoder/config.json
@@ -51,36 +51,42 @@
    },
    "datasets": 
        [
+            {
+                "name": "common_voice_wav",
+                "path": "../../audio-datasets/en/MozillaCommonVoice",
+                "meta_file_train": "train.tsv",
+                "meta_file_val": "test.tsv"
+            },
            {
                "name": "voxceleb1",
                "path": "../../audio-datasets/en/voxceleb1/",
                "meta_file_train": null,
                "meta_file_val": null
            },
-//            {
-//                "name": "voxceleb2",
-//                "path": "../../audio-datasets/en/voxceleb2/",
-//                "meta_file_train": null,
-//                "meta_file_val": null
-//            },
-//            {
-//                "name": "vctk",
-//                "path": "../../audio-datasets/en/VCTK-Corpus/",
-//                "meta_file_train": null,
-//                "meta_file_val": null
-//            },
-//            {
-//                "name": "libri_tts",
-//                "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
-//                "meta_file_train": null,
-//                "meta_file_val": null
-//            },
-//            {
-//                "name": "libri_tts",
-//                "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
-//                "meta_file_train": null,
-//                "meta_file_val": null
-//            },
+            {
+                "name": "voxceleb2",
+                "path": "../../audio-datasets/en/voxceleb2/",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "vctk",
+                "path": "../../audio-datasets/en/VCTK-Corpus/",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "libri_tts",
+                "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "libri_tts",
+                "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
            {
                "name": "libri_tts",
                "path": "../../audio-datasets/en/LibriTTS/train-other-500",
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -161,7 +161,7 @@ def nancy(root_path, meta_file):
    return items


-def common_voice(root_path, meta_file):
+def common_voice_wav(root_path, meta_file):
    """Normalize the common voice meta data file to TTS format."""
    txt_file = os.path.join(root_path, meta_file)
    items = []
@@ -172,8 +172,8 @@ def common_voice(root_path, meta_file):
            cols = line.split("\t")
            text = cols[2]
            speaker_name = cols[0]
-            wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
-            items.append([text, wav_file, speaker_name])
+            wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav"))
+            items.append([text, wav_file, 'MCV_' + speaker_name])
    return items


@@ -251,9 +251,9 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'):
                continue
        with open(meta_file) as file_text:
            text = file_text.readlines()[0]
-        wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id,
+        wav_file = os.path.join(root_path, wavs_path, speaker_id,
                                file_id + '.wav')
-        items.append([text, wav_file, speaker_id])
+        items.append([text, wav_file, 'VCTK_' + speaker_id])

    return items

@@ -298,3 +298,5 @@ def _voxcel_x(root_path, voxcel_idx):

    with open(str(cache_to), 'r') as f:
        return [x.strip().split('|') for x in f.readlines()]
+
+
--- a/TTS/tts/utils/io.py
+++ b/TTS/tts/utils/io.py
@@ -50,7 +50,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc
    if target_loss < best_loss:
        file_name = 'best_model.pth.tar'
        checkpoint_path = os.path.join(output_folder, file_name)
-        print(" > BEST MODEL : {}".format(checkpoint_path))
+        print(" >> BEST MODEL : {}".format(checkpoint_path))
        save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
        best_loss = target_loss
    return best_loss