Merge branch 'bokeh-interactive-1' into dev

2019-11-14 15:38:14 +01:00 · 2019-11-14 15:38:14 +01:00 · 574de86b9b
parent 3f1f1f3316 e602534ed7
commit 574de86b9b
4 changed files with 636 additions and 78 deletions
--- a/speaker_encoder/README.md
+++ b/speaker_encoder/README.md
@ -1,8 +1,12 @@
 ### Speaker embedding (Experimental)
-This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. So you can generate d-vectors for multi-speaker TTS or prune bad samples from your TTS dataset. Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook. 
+This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
-![](https://user-images.githubusercontent.com/1402048/64603079-7fa5c100-d3c8-11e9-88e7-88a00d0e37d1.png)
+With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
 Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook, as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
 ![](umap.png)
 Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
--- a/speaker_encoder/compute_embeddings.py
+++ b/speaker_encoder/compute_embeddings.py
@ -14,27 +14,52 @@ parser = argparse.ArgumentParser(
    description="Compute embedding vectors for each wav file in a dataset. "
 )
 parser.add_argument(
-    "model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.)."
+    'data_path',
-)
+    type=str,
    help='Data path for wav files - directory or CSV file')
 parser.add_argument(
    "config_path", type=str, help="Path to config file for training.",
 )
 parser.add_argument(
    "data_path", type=str, help="Defines the data path. It overwrites config.json."
 )
-parser.add_argument("output_path", type=str, help="path for training outputs.")
+parser.add_argument(
-parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
+    '--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
 )
 args = parser.parse_args()
 c = load_config(args.config_path)
 ap = AudioProcessor(**c["audio"])
-wav_files = glob.glob(args.data_path + "/**/*.wav", recursive=True)
+data_path = args.data_path
-output_files = [
+split_ext = os.path.splitext(data_path)
-    wav_file.replace(args.data_path, args.output_path).replace(".wav", ".npy")
+sep = args.separator
-    for wav_file in wav_files
+
-]
+if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
    # Parse CSV
    print(f'CSV file: {data_path}')
    with open(data_path) as f:
        wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
        wav_files = []
        print(f'Separator is: {sep}')
        for line in f:
            components = line.split(sep)
            if len(components) != 2:
                print("Invalid line")
                continue
            wav_file = os.path.join(wav_path, components[0] + '.wav')
            #print(f'wav_file: {wav_file}')
            if os.path.exists(wav_file):
                wav_files.append(wav_file)
    print(f'Count of wavs imported: {len(wav_files)}')
 else:
    # Parse all wav files in data_path
    wav_path = data_path
    wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
 output_files = [wav_file.replace(wav_path, args.output_path).replace(
    '.wav', '.npy') for wav_file in wav_files]
 for output_file in output_files:
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
--- a/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb
+++ b/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb
--- a/speaker_encoder/umap.png
+++ b/speaker_encoder/umap.png