mirror of https://github.com/coqui-ai/TTS.git
update compute_attention_masks.py
This commit is contained in:
parent 0a9767afd7
commit 7beaacc55b
@@ -1,17 +1,3 @@
-"""Compute attention masks from pre-trained Tacotron or Tacotron2 models.
-Sample run on LJSpeech dataset.
-
->>>> CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py \
---model_path /home/erogol/Cluster/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_100000.pth.tar \
---config_path /home/erogol/Cluster/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json --dataset ljspeech \
---dataset_metafile /home/erogol/Data/LJSpeech-1.1/metadata.csv \
---data_path /home/erogol/Data/LJSpeech-1.1/ \
---batch_size 16 \
---use_cuda true
-
-"""
-
-
 import argparse
 import importlib
 import os
@@ -20,6 +6,7 @@ import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from argparse import RawTextHelpFormatter
 from TTS.tts.datasets.TTSDataset import MyDataset
 from TTS.tts.utils.generic_utils import setup_model
 from TTS.tts.utils.io import load_checkpoint
@@ -30,40 +17,52 @@ from TTS.utils.io import load_config
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
-        description='Extract attention masks from trained Tacotron models.')
+        description='''Extract attention masks from trained Tacotron/Tacotron2 models.
+These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''
+
+        '''Each attention mask is written to the same path as the input wav file with ".npy" file extension.
+(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n'''
+
+        '''
+Example run:
+CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
+--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
+--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
+--dataset_metafile /root/LJSpeech-1.1/metadata.csv
+--data_path /root/LJSpeech-1.1/
+--batch_size 32
+--dataset ljspeech
+--use_cuda True
+''',
+        formatter_class=RawTextHelpFormatter
+    )
     parser.add_argument('--model_path',
                         type=str,
-                        help='Path to Tacotron or Tacotron2 model file ')
+                        required=True,
+                        help='Path to Tacotron/Tacotron2 model file ')
     parser.add_argument(
         '--config_path',
         type=str,
         required=True,
-        help='Path to config file for training.',
+        help='Path to Tacotron/Tacotron2 config file.',
     )
     parser.add_argument('--dataset',
                         type=str,
-                        default='',
-                        help='Dataset from TTS.tts.dataset.preprocess.')
+                        required=True,
+                        help='Target dataset processor name from TTS.tts.dataset.preprocess.')
 
     parser.add_argument(
         '--dataset_metafile',
         type=str,
-        default='',
+        required=True,
         help='Dataset metafile inclusing file paths with transcripts.')
     parser.add_argument(
         '--data_path',
         type=str,
         default='',
         help='Defines the data path. It overwrites config.json.')
-    parser.add_argument('--output_path',
-                        type=str,
-                        help='path for training outputs.',
-                        default='')
+    parser.add_argument('--output_folder',
+                        type=str,
-                        default='',
+                        help='folder name for training outputs.')
 
     parser.add_argument('--use_cuda',
                         type=bool,
                         default=False,
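The newly imported RawTextHelpFormatter is what keeps the multi-line description above readable in --help output: argparse's default formatter re-wraps text and would collapse the example command onto one line. Below is a minimal standalone sketch of that behaviour, not part of the commit; the parser, option, and text are placeholders.

import argparse
from argparse import RawTextHelpFormatter

# With the default HelpFormatter the embedded newlines below would be
# re-wrapped; RawTextHelpFormatter prints the description verbatim.
parser = argparse.ArgumentParser(
    description='Example run:\n'
                '  CUDA_VISIBLE_DEVICES="0" python compute_attention_masks.py ...\n',
    formatter_class=RawTextHelpFormatter)
parser.add_argument('--use_cuda', type=bool, default=False)

args = parser.parse_args([])   # empty argv, just to exercise the sketch
print(parser.format_help())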
@@ -148,10 +147,8 @@ if __name__ == '__main__':
                 mode='nearest',
                 align_corners=None,
                 recompute_scale_factor=None).squeeze(0).transpose(0, 1)

             # remove paddings
             alignment = alignment[:mel_lengths[idx], :text_lengths[idx]].cpu().numpy()

             # set file paths
             wav_file_name = os.path.basename(item_idx)
             align_file_name = os.path.splitext(wav_file_name)[0] + '.npy'
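The hunk above touches the block that rescales each attention map to the mel-spectrogram length and then strips padding. A self-contained sketch of the same two steps follows; the tensor sizes and variable names are invented here, and the exact shapes inside the script may differ.

import torch
import torch.nn.functional as F

# Dummy alignment of shape (text_len, decoder_steps); sizes are made up.
text_len, decoder_steps, mel_len = 40, 120, 250
alignment = torch.rand(text_len, decoder_steps)

# Resize the decoder axis to the mel length with nearest-neighbour
# interpolation, then flip to (mel_len, text_len) as the script does.
alignment = F.interpolate(alignment.unsqueeze(0),
                          size=mel_len,
                          mode='nearest').squeeze(0).transpose(0, 1)

# Remove paddings, mirroring alignment[:mel_lengths[idx], :text_lengths[idx]].
true_mel_len, true_text_len = 230, 36   # per-item lengths, also made up
alignment = alignment[:true_mel_len, :true_text_len].cpu().numpy()
print(alignment.shape)   # (230, 36)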
@@ -160,7 +157,7 @@ if __name__ == '__main__':
             file_paths.append([item_idx, file_path])
             np.save(file_path, alignment)

-    # ourpur metafile
+    # ourput metafile
     metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

     with open(metafile, "w") as f:
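The script ends by writing data_path/metadata_attn_mask.txt, pairing each wav file with its saved attention mask. Here is a minimal sketch of consuming that file, assuming one "wav_path|attn_mask_path" pair per line; the delimiter and the example path are assumptions, since the write loop falls outside the shown hunk.

import numpy as np

# Assumed location and line format; adjust to how the script actually writes it.
metafile = "/root/LJSpeech-1.1/metadata_attn_mask.txt"

with open(metafile, "r") as f:
    pairs = [line.strip().split("|") for line in f if line.strip()]

for wav_path, attn_path in pairs[:3]:
    mask = np.load(attn_path)   # (mel_len, text_len) attention mask
    print(wav_path, mask.shape)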