Merge remote-tracking branch 'TTS/dev' into dev

Author: thllwg
Date:   2020-07-13 09:22:32 +02:00
Commit: 9530a6bdaf

15 changed files with 239 additions and 69 deletions

View File

@@ -1,15 +1,20 @@
-numpy>=1.16.0
 torch>=1.5
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+tensorflow>=2.2
+numpy>=1.16.0
+scipy>=0.19.0
+numba==0.48
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
-phonemizer
 inflect
 bokeh==1.4.0
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0
+pylint==2.5.3

View File

@@ -1,18 +1,19 @@
+torch>=1.5
+tensorflow==2.3rc
 numpy>=1.16.0
+scipy>=0.19.0
 numba==0.48
-torch>=0.4.1
-tensorflow>=2.2
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
 inflect
-phonemizer
 bokeh==1.4.0
-nose
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0

View File

@@ -15,12 +15,9 @@ def create_argparser():
     parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
     parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
     parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.')
+    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
     parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
     parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
-    parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.')
-    parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.')
     parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
     parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
     parser.add_argument('--port', type=int, default=5002, help='port to listen on.')

@@ -46,10 +43,6 @@ embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
 wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
 wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

-embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan')
-pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl')
-pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml')
-
 args = create_argparser().parse_args()

 # If these were not specified in the CLI args, use default values with embedded model files

@@ -57,19 +50,16 @@ if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
     args.tts_checkpoint = tts_checkpoint_file
 if not args.tts_config and os.path.isfile(tts_config_file):
     args.tts_config = tts_config_file
-if not args.vocoder_checkpoint and os.path.isfile(tts_checkpoint_file):
-    args.tts_checkpoint = tts_checkpoint_file
-if not args.vocoder_config and os.path.isfile(tts_config_file):
-    args.tts_config = tts_config_file
-if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file):
-    args.wavernn_file = wavernn_checkpoint_file
+if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
+    args.vocoder_checkpoint = vocoder_checkpoint_file
+if not args.vocoder_config and os.path.isfile(vocoder_config_file):
+    args.vocoder_config = vocoder_config_file
+if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
+    args.wavernn_checkpoint = wavernn_checkpoint_file
 if not args.wavernn_config and os.path.isfile(wavernn_config_file):
     args.wavernn_config = wavernn_config_file
-if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file):
-    args.pwgan_file = pwgan_checkpoint_file
-if not args.pwgan_config and os.path.isfile(pwgan_config_file):
-    args.pwgan_config = pwgan_config_file

 synthesizer = Synthesizer(args)
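Besides dropping the pwgan flags, this hunk fixes a copy-paste bug: the old fallback tested the vocoder flags but then assigned the TTS checkpoint and config. A minimal sketch of the corrected pattern (the helper name is hypothetical, not in the commit):

```python
import os

def fallback_to_embedded(cli_value, embedded_path):
    # Keep the value given on the CLI; otherwise fall back to the embedded
    # model file, but only when that file actually exists on disk.
    if cli_value:
        return cli_value
    return embedded_path if os.path.isfile(embedded_path) else None

# e.g. args.vocoder_checkpoint = fallback_to_embedded(
#          args.vocoder_checkpoint, vocoder_checkpoint_file)
```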

View File

@@ -31,15 +31,16 @@ class Synthesizer(object):
         self.wavernn = None
         self.vocoder_model = None
         self.config = config
+        print(config)
         self.use_cuda = self.config.use_cuda
         if self.use_cuda:
             assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
         self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                       self.config.use_cuda)
-        if self.config.vocoder_file:
+        if self.config.vocoder_checkpoint:
             self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
         if self.config.wavernn_lib_path:
-            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file,
+            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                               self.config.wavernn_config, self.config.use_cuda)

     def load_tts(self, tts_checkpoint, tts_config, use_cuda):

View File

@@ -69,6 +69,41 @@ if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config:
     shutil.copy(args.model_config, embedded_config_path)
     package_data.extend([embedded_checkpoint_path, embedded_config_path])

+
+def pip_install(package_name):
+    subprocess.call(
+        [sys.executable, '-m', 'pip', 'install', package_name]
+    )
+
+
+requirements = {
+    'install_requires':[
+        "torch>=1.5",
+        "numpy>=1.16.0",
+        "numba==0.48",
+        "scipy>=0.19.0",
+        "librosa==0.7.2",
+        "unidecode==0.4.20",
+        "attrdict",
+        "tensorboardX",
+        "matplotlib",
+        "Pillow",
+        "flask",
+        "tqdm",
+        "inflect",
+        "bokeh==1.4.0",
+        "soundfile",
+        "phonemizer>=2.2.0",
+        "nose==1.3.7",
+        "cardboardlint==1.3.0",
+        "pylint==2.5.3",
+    ],
+    'pip_install':[
+        'tensorflow>=2.2.0',
+    ]
+}
+
 setup(
     name='TTS',
     version=version,

@@ -95,24 +130,23 @@ setup(
         'build_py': build_py,
         'develop': develop,
     },
-    install_requires=[
-        "scipy>=0.19.0",
-        "torch>=1.5",
-        "numpy>=1.16.0",
-        "librosa==0.6.2",
-        "unidecode==0.4.20",
-        "attrdict",
-        "tensorboardX",
-        "matplotlib",
-        "Pillow",
-        "flask",
-        "tqdm",
-        "inflect",
-        "bokeh==1.4.0",
-        "soundfile",
-        "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master",
-    ],
-    dependency_links=[
-        "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1"
-    ]
+    install_requires=requirements['install_requires'],
+    python_requires='>=3.6.0',
+    classifiers=[
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        'Development Status :: 3 - Alpha',
+        "Intended Audience :: Science/Research :: Developers",
+        "Operating System :: POSIX :: Linux",
+        'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
+        "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
+    ]
 )
+
+# for some reason having tensorflow in 'install_requires'
+# breaks some of the dependencies.
+for module in requirements['pip_install']:
+    pip_install(module)
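The `pip_install` helper runs pip through `sys.executable`, which guarantees the package lands in the same interpreter that is executing `setup.py`, including inside a virtualenv. The same pattern in isolation:

```python
import subprocess
import sys

# Install into the interpreter running this script, not whichever
# `pip` binary happens to be first on PATH.
subprocess.call([sys.executable, '-m', 'pip', 'install', 'tensorflow>=2.2.0'])
```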

Binary file not shown.

View File

@ -6,7 +6,7 @@
"wavernn_file": null, // wavernn checkpoint file name "wavernn_file": null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file "wavernn_config": null, // wavernn config file
"vocoder_config":null, "vocoder_config":null,
"vocoder_file": null, "vocoder_checkpoint": null,
"is_wavernn_batched":true, "is_wavernn_batched":true,
"port": 5002, "port": 5002,
"use_cuda": false, "use_cuda": false,

View File

@@ -14,6 +14,9 @@ rm -f dist/*.whl
 python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
 pip install --quiet dist/TTS*.whl

+# this is related to https://github.com/librosa/librosa/issues/1160
+pip install numba==0.48
+
 python -m TTS.server.server &
 SERVER_PID=$!
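As I read the linked librosa issue, numba releases after 0.48 broke librosa 0.7.x imports, so the freshly installed wheel needs the older numba pinned back before the server starts. A quick sanity check one could run after the install (a sketch, not part of this commit):

```python
# Verify the pin took effect and that librosa still imports under it.
import numba
assert numba.__version__.startswith('0.48'), numba.__version__

import librosa  # should import cleanly with numba 0.48
print(librosa.__version__)
```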

tf/__init__.py (new empty file)

View File

tf/tests/__init__.py (new file, 1 line added)

View File

@@ -0,0 +1 @@
+
View File

View File

@@ -0,0 +1,134 @@
import os
import torch
import unittest
import numpy as np
import tensorflow as tf

tf.get_logger().setLevel('INFO')

from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

#pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTFTrainTest(unittest.TestCase):

    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids

    def test_train_step(self):
        ''' test forward pass '''
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_forward_attention(self,):
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_tflite_conversion(self, ):  #pylint:disable=no-self-use
        model = Tacotron2(num_chars=24,
                          num_speakers=0,
                          r=3,
                          postnet_output_dim=80,
                          decoder_output_dim=80,
                          attn_type='original',
                          attn_win=False,
                          attn_norm='sigmoid',
                          prenet_type='original',
                          prenet_dropout=True,
                          forward_attn=False,
                          trans_agent=False,
                          forward_attn_mask=False,
                          location_attn=True,
                          attn_K=0,
                          separate_stopnet=True,
                          bidirectional_decoder=False,
                          enable_tflite=True)
        model.build_inference()
        convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True)
        # init tflite model
        tflite_model = load_tflite_model('test_tacotron2.tflite')
        # fake input
        inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)  #pylint:disable=unexpected-keyword-arg
        # run inference
        # get input and output details
        input_details = tflite_model.get_input_details()
        output_details = tflite_model.get_output_details()
        # reshape input tensor for the new input shape
        tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape)  #pylint:disable=unexpected-keyword-arg
        tflite_model.allocate_tensors()
        detail = input_details[0]
        input_shape = detail['shape']
        tflite_model.set_tensor(detail['index'], inputs)
        # run the tflite_model
        tflite_model.invoke()
        # collect outputs
        decoder_output = tflite_model.get_tensor(output_details[0]['index'])
        postnet_output = tflite_model.get_tensor(output_details[1]['index'])
        # remove tflite binary
        os.remove('test_tacotron2.tflite')
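Assuming `load_tflite_model` wraps `tf.lite.Interpreter` (the interpreter methods used above are the stock TF Lite API), the conversion test boils down to this inference sequence:

```python
import tensorflow as tf

# Load the converted model and run one inference with a dynamic input length.
interpreter = tf.lite.Interpreter(model_path='test_tacotron2.tflite')
inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Character sequences vary in length, so resize the input tensor first.
interpreter.resize_tensor_input(input_details[0]['index'], inputs.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], inputs)
interpreter.invoke()

decoder_output = interpreter.get_tensor(output_details[0]['index'])
postnet_output = interpreter.get_tensor(output_details[1]['index'])
```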

View File

@@ -16,23 +16,23 @@ You can see here an example (Soon)[Colab Notebook]() training MelGAN with LJSpee

 In order to train a new model, you need to collect all your wav files under a common parent folder and give this path to the `data_path` field in `config.json`.

-You need to define other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path.
+You need to define other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path, where '0' is the id of the GPU you wish to use.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --config_path path/to/config.json```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --config_path path/to/config.json```

 Example config files can be found under the `vocoder/configs/` folder.

 You can continue a previous training run with the following command.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --continue_path path/to/your/model/folder```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --continue_path path/to/your/model/folder```

 You can fine-tune a pre-trained model with the following command.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --restore_path path/to/your/model.pth.tar```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --restore_path path/to/your/model.pth.tar```

 Restoring a model starts a new training run in a different output folder; it only restores model weights from the given checkpoint file. Continuing a training run, by contrast, picks up from the same conditions the previous run left off at.

 You can also follow your training runs on Tensorboard as you do with our TTS models.

 ## Acknowledgement
 Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work.

View File

@@ -87,6 +87,11 @@ class GANDataset(Dataset):
                 audio, mel = self.cache[idx]
             else:
                 audio = self.ap.load_wav(wavpath)
+
+                if len(audio) < self.seq_len + self.pad_short:
+                    audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
+                                   mode='constant', constant_values=0.0)
+
                 mel = self.ap.melspectrogram(audio)
         else:

@@ -99,10 +104,6 @@ class GANDataset(Dataset):
             audio = self.ap.load_wav(wavpath)
             mel = np.load(feat_path)

-            if len(audio) < self.seq_len + self.pad_short:
-                audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
-                        mode='constant', constant_values=0.0)
-
         # correct the audio length wrt padding applied in stft
         audio = np.pad(audio, (0, self.hop_len), mode="edge")
         audio = audio[:mel.shape[-1] * self.hop_len]
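Taken together, the two hunks move the short-clip padding into the branch that computes the mel from the raw audio, so the spectrogram is taken over the already-padded signal; in the branch that loads precomputed features, padding the audio afterwards would only desynchronize it from the stored mel. A standalone sketch of the padding step (the `seq_len` and `pad_short` values here are illustrative, not from the config):

```python
import numpy as np

seq_len, pad_short = 16384, 2000            # illustrative values
audio = np.zeros(12000, dtype=np.float32)   # a clip shorter than seq_len + pad_short

# Pad *before* computing the mel, so audio and mel lengths stay consistent.
if len(audio) < seq_len + pad_short:
    audio = np.pad(audio, (0, seq_len + pad_short - len(audio)),
                   mode='constant', constant_values=0.0)

assert len(audio) == seq_len + pad_short    # now long enough to slice a training segment
```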

View File

@@ -59,9 +59,9 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us
                 audio = wav1[idx].squeeze()
                 feat = feat1[idx]
                 mel = ap.melspectrogram(audio)
-                # the first 2 and the last frame is skipped due to the padding
-                # applied in spec. computation.
-                assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum() == 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum()}'
+                # the first 2 and the last 2 frames are skipped due to the padding
+                # differences in stft
+                assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum() <= 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum()}'

                 count_iter += 1
                 # if count_iter == max_iter: