mirror of https://github.com/coqui-ai/TTS.git
Merge remote-tracking branch 'TTS/dev' into dev
commit 8b4eb256f6
@@ -0,0 +1,18 @@
---
name: 'Contribution Guideline '
about: Refer to Contribution Guideline
title: ''
labels: ''
assignees: ''

---

### Contribution Guideline

Please send your PRs to the `dev` branch if they are not directly related to a specific branch.
Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter.
We have cardboardlinter set up in this repository, so for example, if you've made some changes and would like to run the linter on just the changed code, you can use the following command:

```bash
pip install pylint cardboardlint
cardboardlinter --refspec master
```

@@ -128,3 +128,4 @@ tests/outputs/*
TODO.txt
.vscode/*
data/*
notebooks/data/*

.travis.yml
@@ -6,6 +6,8 @@ git:
before_install:
  - sudo apt-get update
  - sudo apt-get -y install espeak
  - python -m pip install --upgrade pip
  - pip install six==1.12.0

matrix:
  include:
@@ -15,7 +17,15 @@ matrix:
      env: TEST_SUITE="lint"
    - name: "Unit tests"
      python: "3.6"
      install: pip install --quiet -r requirements_tests.txt
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="unittest"
    - name: "Unit tests"
      python: "3.6"
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="testscripts"

script: ./.travis/script

@@ -10,13 +10,12 @@ if [[ ( "$TRAVIS_PULL_REQUEST" != "false" ) && ( "$TEST_SUITE" == "lint" ) ]]; t
fi

if [[ "$TEST_SUITE" == "unittest" ]]; then
    # Run tests on all pushes
    pushd tts_namespace
    nosetests TTS.speaker_encoder.tests --nocapture
    nosetests TTS.vocoder.tests --nocapture
    nosetests TTS.tests --nocapture
    nosetests TTS.tf.tests --nocapture
    popd
    # Test server package
    nosetests tests --nocapture
    ./tests/test_server_package.sh
fi

if [[ "$TEST_SUITE" == "testscripts" ]]; then
    # test model training scripts
    ./tests/test_tts_train.sh
    ./tests/test_vocoder_train.sh
fi

README.md
@@ -1,12 +1,17 @@
<p align="center"><img src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" data-canonical-src="
" width="320" height="95" /></p>

<br/>

<p align='center'>
<img src="https://travis-ci.org/mozilla/TTS.svg?branch=dev"/>
<a href='https://discourse.mozilla.org/c/tts'><img src="https://img.shields.io/badge/discourse-online-green.svg"/></a>
<a href='https://opensource.org/licenses/MPL-2.0'> <img src="https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg"/></a>
</p>

This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en).

Mozilla TTS aims to provide a deep-learning-based Text2Speech engine, low in cost and high in quality.

@@ -79,32 +84,32 @@ Or you can use ```requirements.txt``` to install the requirements only.

### Directory Structure
```
|- TTS/
|  |- train.py (train your TTS model.)
|  |- distribute.py (train your TTS model using Multiple GPUs)
|  |- config.json (TTS model configuration file)
|  |- tf/ (Tensorflow 2 utilities and model implementations)
|  |- layers/ (model layer definitions)
|  |- models/ (model definitions)
|  |- notebooks/ (Jupyter Notebooks for model evaluation and parameter selection)
|  |- data_analysis/ (TTS Dataset analysis tools and notebooks.)
|  |- utils/ (TTS utilities -io, visualization, data processing etc.-)
|  |- speaker_encoder/ (Speaker Encoder implementation with the same folder structure.)
|  |- vocoder/ (Vocoder implementations with the same folder structure.)
|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
|- utils/ (common utilities.)
|- TTS
    |- bin/ (folder for all the executables.)
      |- train*.py (train your target model.)
      |- distribute.py (train your TTS model using Multiple GPUs.)
      |- compute_statistics.py (compute dataset statistics for normalization.)
      |- convert*.py (convert target torch model to TF.)
    |- tts/ (text to speech models)
        |- layers/ (model layer definitions)
        |- models/ (model definitions)
        |- tf/ (Tensorflow 2 utilities and model implementations)
        |- utils/ (model specific utilities.)
    |- speaker_encoder/ (Speaker Encoder models.)
        |- (same)
    |- vocoder/ (Vocoder models.)
        |- (same)
```

### Docker
A barebones `Dockerfile` exists at the root of the project, which should let you quickly set up the environment. By default, it will start the server and let you query it. Make sure to use `nvidia-docker` to use your GPUs. Make sure you follow the instructions in the [`server README`](server/README.md) before you build your image so that the server can find the model within the image.
A docker image is created by [@synesthesiam](https://github.com/synesthesiam) and shared in a separate [repository](https://github.com/synesthesiam/docker-mozillatts) with the latest LJSpeech models.

```
docker build -t mozilla-tts .
nvidia-docker run -it --rm -p 5002:5002 mozilla-tts
```

## Checkpoints and Audio Samples
## Release Models
Please visit [our wiki.](https://github.com/mozilla/TTS/wiki/Released-Models)

## Example Model Outputs
## Sample Model Output
Below you can see the Tacotron model state after 16K iterations with batch size 32, trained on the LJSpeech dataset.

> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."

@@ -116,8 +121,8 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-articl
## [Mozilla TTS Tutorials and Notebooks](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials)

## Datasets and Data-Loading
TTS provides a generic dataloader that is easy to use for your custom dataset.
You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples.
After that, you need to set the ```dataset``` fields in ```config.json```.
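
A minimal sketch of such a formatter, modeled on the functions in ```datasets/preprocess.py```; the metadata filename, column layout, and speaker name below are illustrative assumptions, not fixed by the repository:

```python
import os

def my_dataset(root_path, meta_file):
    """Parse a pipe-separated metadata file into [text, wav_path, speaker] items."""
    items = []
    speaker_name = "my_speaker"  # hypothetical single-speaker dataset
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items
```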

Some of the public datasets where we successfully applied TTS:

@@ -142,15 +147,19 @@ tail -n 1100 metadata_shuf.csv > metadata_val.csv

To train a new model, you need to define your own ```config.json``` file (check the example) and call it with the command below. You also set the model architecture in ```config.json```.

```train.py --config_path config.json```
```python TTS/bin/train.py --config_path TTS/tts/configs/config.json```

To fine-tune a model, use ```--restore_path```.

```train.py --config_path config.json --restore_path /path/to/your/model.pth.tar```
```python TTS/bin/train.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```

To continue an old training run, use ```--continue_path```.

```python TTS/bin/train.py --continue_path /path/to/your/run_folder/```

For multi-GPU training, use ```distribute.py```. It enables process-based multi-GPU training where each process uses a single GPU.

```CUDA_VISIBLE_DEVICES="0,1,4" distribute.py --config_path config.json```
```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --config_path TTS/tts/configs/config.json```

Each run creates a new output folder, and ```config.json``` is copied under this folder.

@@ -187,7 +196,7 @@ If you like to use TTS to try a new idea and like to share your experiments with
- [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
- [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN)
- [ ] Multi-speaker embedding.
- [ ] Model optimization (model export, model pruning etc.)
- [x] Model optimization (model export, model pruning etc.)

<!--## References
- [Efficient Neural Audio Synthesis](https://arxiv.org/pdf/1802.08435.pdf)

@@ -203,3 +212,4 @@ If you like to use TTS to try a new idea and like to share your experiments with
### References
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)
- https://github.com/kan-bayashi/ParallelWaveGAN (vocoder library)

@@ -7,16 +7,16 @@ import argparse
import numpy as np
from tqdm import tqdm

from TTS.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor

def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="TTS config file path.")
                        help="TTS config file path to define audio processing parameters.")
    parser.add_argument("--out_path", default=None, type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

@@ -2,10 +2,10 @@

import argparse

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint
from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.tf.utils.generic_utils import setup_generator
from mozilla_voice_tts.vocoder.tf.utils.io import load_checkpoint
from mozilla_voice_tts.vocoder.tf.utils.tflite import convert_melgan_to_tflite


parser = argparse.ArgumentParser()

@@ -30,4 +30,3 @@ model = load_checkpoint(model, args.tf_model)

# create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)

@@ -6,13 +6,13 @@ import tensorflow as tf
import torch
from fuzzywuzzy import fuzz

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.tf.utils.convert_torch_to_tf_utils import (
    compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from TTS.vocoder.tf.utils.generic_utils import \
from mozilla_voice_tts.vocoder.tf.utils.generic_utils import \
    setup_generator as setup_tf_generator
from TTS.vocoder.tf.utils.io import save_checkpoint
from TTS.vocoder.utils.generic_utils import setup_generator
from mozilla_voice_tts.vocoder.tf.utils.io import save_checkpoint
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator

# prevent GPU use
os.environ['CUDA_VISIBLE_DEVICES'] = ''

@@ -114,4 +114,3 @@ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
                args.output_path)
print(' > Model conversion is successfully completed :).')

@@ -2,11 +2,11 @@

import argparse

from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes
from mozilla_voice_tts.tts.tf.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.tf.utils.io import load_checkpoint
from mozilla_voice_tts.tts.tf.utils.tflite import convert_tacotron2_to_tflite


parser = argparse.ArgumentParser()

@@ -34,4 +34,4 @@ model = load_checkpoint(model, args.tf_model)
model.decoder.set_max_decoder_steps(1000)

# create tflite model
tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)

@@ -1,21 +1,27 @@
# %%
import sys
sys.path.append('/home/erogol/Projects')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# %%
import argparse
import numpy as np
import torch
import tensorflow as tf
from fuzzywuzzy import fuzz
import os
import sys
# %%
# print variable match
from pprint import pprint

import numpy as np
import tensorflow as tf
import torch
from fuzzywuzzy import fuzz
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.convert_torch_to_tf_utils import (
    compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from mozilla_voice_tts.tts.tf.utils.generic_utils import save_checkpoint
from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.text.symbols import phonemes, symbols
from mozilla_voice_tts.utils.io import load_config

sys.path.append('/home/erogol/Projects')
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from TTS.utils.text.symbols import phonemes, symbols
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
from TTS.tf.utils.generic_utils import save_checkpoint

parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',

@@ -86,7 +92,7 @@ var_map = [

# %%
# get tf_model graph
mel_pred = model_tf.build_inference()
model_tf.build_inference()

# get tf variables
tf_vars = model_tf.weights

@@ -108,9 +114,6 @@ for tf_name in tf_var_names:
    del torch_var_names[max_idx]
    var_map.append((tf_name, matching_name))

# %%
# print variable match
from pprint import pprint
pprint(var_map)
pprint(torch_var_names)

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import pathlib
import time
import subprocess
import argparse
import torch


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use it to continue a training run. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use it to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv
    )
    args = parser.parse_args()

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    folder_path = pathlib.Path(__file__).parent.absolute()
    command = [os.path.join(folder_path, 'train_tts.py')]
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
# pylint: disable=redefined-outer-name, unused-argument
import os
import string
import time

import torch

from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator


def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
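    # Tacotron (v1) predicts linear spectrograms from its postnet; convert to
    # mel before handing the result to a mel-conditioned neural vocoder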
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return waveform


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    parser.add_argument('config_path',
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        'model_path',
        type=str,
        help='Path to model file.',
    )
    parser.add_argument(
        'out_path',
        type=str,
        help='Path to save the final wav file. The wav file will be named after the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, the model uses GL as vocoder. Please make sure that you have installed the vocoder library (WaveRNN) before.',
        default="",
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default="")
    parser.add_argument(
        '--batched_vocoder',
        type=bool,
        help="If True, vocoder model uses faster batch processing.",
        default=True)
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_fileid',
        type=str,
        help="If CONFIG.use_external_speaker_embedding_file is true, the name of the speaker embedding reference file present in speakers.json; else the target speaker_fileid if the model is multi-speaker.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Path to a wav file for GST style reference.",
        default=None)

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if args.speakers_json != '':
        speaker_mapping = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if args.speaker_fileid is not None:
                speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
            else:  # if speaker_fileid is not specified, use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
            speaker_embedding_dim = len(speaker_embedding)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(args.model_path, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    model.decoder.set_r(cp['r'])

    # load vocoder model
    if args.vocoder_path != "":
        VC = load_config(args.vocoder_config_path)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(torch.load(args.vocoder_path, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        if args.use_cuda:
            vocoder_model.cuda()
        vocoder_model.eval()
    else:
        vocoder_model = None
        VC = None

    # synthesize voice
    use_griffin_lim = args.vocoder_path == ""
    print(" > Text: {}".format(args.text))

    if not C.use_external_speaker_embedding_file:
        if args.speaker_fileid is not None and args.speaker_fileid.isdigit():
            args.speaker_fileid = int(args.speaker_fileid)
        else:
            args.speaker_fileid = None
    else:
        args.speaker_fileid = None

    if args.gst_style is None:
        gst_style = C.gst['gst_style_input']
    else:
        # check if the gst_style string is a dict; if it is a dict, convert it, else use the string
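        # e.g. --gst_style '{"0": 0.2, "1": -0.1}' weights individual style
        # tokens, while a plain string is treated as a reference wav path
        # (illustrative values, not from the original script)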
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
                raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style

    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)

    # save the results
    file_name = args.text.replace(" ", "_")
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

@@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
import sys

@@ -6,19 +9,22 @@ import traceback

import torch
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.speaker_encoder.generic_utils import save_best_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import load_config, copy_config_file
from TTS.utils.training import check_update, NoamLR
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.radam import RAdam

from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.utils.generic_utils import (
    create_experiment_folder, get_git_branch, remove_experiment_folder,
    set_init_dict)
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.generic_utils import count_parameters
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import NoamLR, check_update

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

@@ -94,7 +100,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
        if global_step % c.steps_plot_stats == 0:
            # Plot Training Epoch Stats
            train_stats = {
                "GE2Eloss": avg_loss,
                "loss": avg_loss,
                "lr": current_lr,
                "grad_norm": grad_norm,
                "step_time": step_time

@@ -129,12 +135,18 @@ def main(args):  # pylint: disable=redefined-outer-name
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(input_dim=40,
                           proj_dim=128,
                           lstm_dim=384,
                           num_lstm_layers=3)
    model = SpeakerEncoder(input_dim=c.model['input_dim'],
                           proj_dim=c.model['proj_dim'],
                           lstm_dim=c.model['lstm_dim'],
                           num_lstm_layers=c.model['num_lstm_layers'])
    optimizer = RAdam(model.parameters(), lr=c.lr)
    criterion = GE2ELoss(loss_method='softmax')

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method='softmax')
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("%s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)

@@ -177,8 +189,8 @@ def main(args):  # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
                                    global_step)
    _, global_step = train(model, criterion, optimizer, scheduler, ap,
                           global_step)


if __name__ == '__main__':

@@ -236,7 +248,7 @@ if __name__ == '__main__':
                         new_fields)

    LOG_DIR = OUT_PATH
    tb_logger = TensorboardLogger(LOG_DIR)
    tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')

    try:
        main(args)

@@ -1,7 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import glob
import os
import sys
import glob
import time
import traceback

@@ -9,42 +12,51 @@ import numpy as np
import torch
from torch.utils.data import DataLoader

from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce,
                        init_distributed, reduce_tensor)
from TTS.layers.losses import TacotronLoss
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder,
                                     get_git_branch, set_init_dict,
                                     setup_model, KeepAverage, check_config)
from TTS.utils.io import (save_best_model, save_checkpoint,
                          load_config, copy_config_file)
from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
                                gradual_training_scheduler, set_weight_decay,
                                setup_torch_training_env)
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
    get_speakers
from TTS.utils.synthesis import synthesis
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam
from TTS.utils.measures import alignment_diagonal_score

from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
from mozilla_voice_tts.tts.layers.losses import TacotronLoss
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
                                                    apply_gradient_allreduce,
                                                    init_distributed,
                                                    reduce_tensor)
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
from mozilla_voice_tts.tts.utils.speakers import (get_speakers,
                                                  load_speaker_mapping,
                                                  save_speaker_mapping)
from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
                                                      symbols)
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
                                                   count_parameters,
                                                   create_experiment_folder,
                                                   get_git_branch,
                                                   remove_experiment_folder,
                                                   set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
                                              check_update,
                                              gradual_training_scheduler,
                                              set_weight_decay,
                                              setup_torch_training_env)

use_cuda, num_gpus = setup_torch_training_env(True, False)


def setup_loader(ap, r, is_val=False, verbose=False):
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
    if is_val and not c.run_eval:
        loader = None
    else:
        dataset = MyDataset(
            r,
            c.text_cleaner,
            compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
            compute_linear_spec=c.model.lower() == 'tacotron',
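            # linear spectrograms are only needed by Tacotron (v1), for its
            # linear postnet and Griffin-Lim; Tacotron2 trains on mels alone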
            meta_data=meta_data_eval if is_val else meta_data_train,
            ap=ap,
            tp=c.characters if 'characters' in c.keys() else None,

@@ -56,7 +68,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
            use_phonemes=c.use_phonemes,
            phoneme_language=c.phoneme_language,
            enable_eos_bos=c.enable_eos_bos_chars,
            verbose=verbose)
            verbose=verbose,
            speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
        sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        loader = DataLoader(
            dataset,

@@ -70,9 +83,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
            pin_memory=False)
    return loader


def format_data(data):
    if c.use_speaker_embedding:
def format_data(data, speaker_mapping=None):
    if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
        speaker_mapping = load_speaker_mapping(OUT_PATH)

    # setup input data

@@ -87,13 +99,20 @@ def format_data(data):
    avg_spec_length = torch.mean(mel_lengths.float())

    if c.use_speaker_embedding:
        speaker_ids = [
            speaker_mapping[speaker_name] for speaker_name in speaker_names
        ]
        speaker_ids = torch.LongTensor(speaker_ids)
        if c.use_external_speaker_embedding_file:
            speaker_embeddings = data[8]
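            # external (precomputed) speaker embeddings collated into the
            # batch by the data loader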
            speaker_ids = None
        else:
            speaker_ids = [
                speaker_mapping[speaker_name] for speaker_name in speaker_names
            ]
            speaker_ids = torch.LongTensor(speaker_ids)
            speaker_embeddings = None
    else:
        speaker_embeddings = None
        speaker_ids = None

    # set stop targets view, we predict a single stop token per iteration.
    stop_targets = stop_targets.view(text_input.shape[0],
                                     stop_targets.size(1) // c.r, -1)

@@ -110,13 +129,16 @@ def format_data(data):
        stop_targets = stop_targets.cuda(non_blocking=True)
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
        if speaker_embeddings is not None:
            speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)

    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length


def train(model, criterion, optimizer, optimizer_st, scheduler,
          ap, global_step, epoch):
          ap, global_step, epoch, amp, speaker_mapping=None):
    data_loader = setup_loader(ap, model.decoder.r, is_val=False,
                               verbose=(epoch == 0))
                               verbose=(epoch == 0), speaker_mapping=speaker_mapping)
    model.train()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -131,7 +153,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        start_time = time.time()

        # format data
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length = format_data(data, speaker_mapping)
        loader_time = time.time() - end_time

        global_step += 1

@@ -146,14 +168,14 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        # forward pass model
        if c.bidirectional_decoder or c.double_decoder_consistency:
            decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model(
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
            decoder_backward_output = None
            alignments_backward = None

        # set the alignment lengths wrt reduction factor for guided attention
        if mel_lengths.max() % model.decoder.r != 0:
            alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
        else:

@@ -167,9 +189,18 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                              text_lengths)

        # backward pass
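        # with apex AMP, the loss is scaled before backward to avoid fp16
        # underflow; gradient clipping then reads amp's master (fp32) params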
        loss_dict['loss'].backward()
        if amp is not None:
            with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss_dict['loss'].backward()

        optimizer, current_lr = adam_weight_decay(optimizer)
        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
        if amp:
            amp_opt_params = amp.master_params(optimizer)
        else:
            amp_opt_params = None
        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True, amp_opt_params=amp_opt_params)
        optimizer.step()

        # compute alignment error (the lower, the better)

@@ -180,7 +211,11 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        if c.separate_stopnet:
            loss_dict['stopnet_loss'].backward()
            optimizer_st, _ = adam_weight_decay(optimizer_st)
            grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
            if amp:
                amp_opt_params = amp.master_params(optimizer)
            else:
                amp_opt_params = None
            grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0, amp_opt_params=amp_opt_params)
            optimizer_st.step()
        else:
            grad_norm_st = 0

@@ -214,10 +249,15 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,

        # print training progress
        if global_step % c.print_step == 0:
            log_dict = {
                "avg_spec_length": [avg_spec_length, 1],  # value, precision
                "avg_text_length": [avg_text_length, 1],
                "step_time": [step_time, 4],
                "loader_time": [loader_time, 2],
                "current_lr": current_lr,
            }
            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
                                      avg_spec_length, avg_text_length,
                                      step_time, loader_time, current_lr,
                                      loss_dict, keep_avg.avg_values)
                                      log_dict, loss_dict, keep_avg.avg_values)

        if args.rank == 0:
            # Plot Training Iter Stats

@@ -237,7 +277,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                # save model
                save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
                                optimizer_st=optimizer_st,
                                model_loss=loss_dict['postnet_loss'])
                                model_loss=loss_dict['postnet_loss'],
                                amp_state_dict=amp.state_dict() if amp else None)

                # Diagnostic visualizations
                const_spec = postnet_output[0].data.cpu().numpy()

@@ -247,13 +288,13 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                align_img = alignments[0].data.cpu().numpy()

                figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img),
                    "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
                    "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
                    "alignment": plot_alignment(align_img, output_fig=False),
                }

                if c.bidirectional_decoder or c.double_decoder_consistency:
                    figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())
                    figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)

                tb_logger.tb_train_figures(global_step, figures)

@@ -281,8 +322,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,


@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping)
    model.eval()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -292,16 +333,16 @@ def evaluate(model, criterion, ap, global_step, epoch):
        start_time = time.time()

        # format data
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping)
        assert mel_input.shape[1] % model.decoder.r == 0

        # forward pass model
        if c.bidirectional_decoder or c.double_decoder_consistency:
            decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model(
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
            decoder_backward_output = None
            alignments_backward = None

@@ -361,9 +402,9 @@ def evaluate(model, criterion, ap, global_step, epoch):
            align_img = alignments[idx].data.cpu().numpy()

            eval_figures = {
                "prediction": plot_spectrogram(const_spec, ap),
                "ground_truth": plot_spectrogram(gt_spec, ap),
                "alignment": plot_alignment(align_img)
                "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
                "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
                "alignment": plot_alignment(align_img, output_fig=False)
            }

            # Sample audio

@@ -378,7 +419,7 @@ def evaluate(model, criterion, ap, global_step, epoch):

        if c.bidirectional_decoder or c.double_decoder_consistency:
            align_b_img = alignments_backward[idx].data.cpu().numpy()
            eval_figures['alignment2'] = plot_alignment(align_b_img)
            eval_figures['alignment2'] = plot_alignment(align_b_img, output_fig=False)
        tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
        tb_logger.tb_eval_figures(global_step, eval_figures)

@@ -403,7 +444,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
                wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
                    model,
                    test_sentence,
                    c,

@@ -423,10 +464,10 @@ def evaluate(model, criterion, ap, global_step, epoch):
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                    postnet_output, ap, output_fig=False)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except:
                    alignment, output_fig=False)
            except:  #pylint: disable=bare-except
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,

@@ -453,26 +494,51 @@ def main(args):  # pylint: disable=redefined-outer-name
    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            if c.use_external_speaker_embedding_file:  # if restore checkpoint and use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                if not speaker_mapping:
                    print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
                    speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                    if not speaker_mapping:
                        raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
                speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
            elif not c.use_external_speaker_embedding_file:  # if restore checkpoint and don't use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                speaker_embedding_dim = None
                assert all([speaker in speaker_mapping
                            for speaker in speakers]), "As of now, you cannot " \
                                                       "introduce new speakers to " \
                                                       "a previously trained model."
        elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:  # if start new train using External Embedding file
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
        elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:  # if start new train using External Embedding file and don't pass external embedding file
            raise RuntimeError("use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb in the notebooks/ folder")
        else:  # if start new train and don't use External Embedding file
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
        speaker_mapping = None

    model = setup_model(num_chars, num_speakers, c)
    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)

@@ -483,6 +549,14 @@ def main(args):  # pylint: disable=redefined-outer-name
    else:
        optimizer_st = None

    if c.apex_amp_level == "O1":
        # pylint: disable=import-outside-toplevel
        from apex import amp
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level)
    else:
        amp = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

@@ -495,12 +569,18 @@ def main(args):  # pylint: disable=redefined-outer-name
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:
        except KeyError:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],

@@ -543,14 +623,14 @@ def main(args):  # pylint: disable=redefined-outer-name
        print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
                                                 global_step, epoch, amp, speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
                                    OUT_PATH)
                                    OUT_PATH, amp_state_dict=amp.state_dict() if amp else None)


if __name__ == '__main__':

@@ -602,6 +682,9 @@ if __name__ == '__main__':
    check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    if c.apex_amp_level == 'O1':
        print(" > apex AMP level: ", c.apex_amp_level)

    OUT_PATH = args.continue_path
    if args.continue_path == '':
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)

@@ -4,31 +4,34 @@ import os
import sys
import time
import traceback
from inspect import signature

import torch
from torch.utils.data import DataLoader

from inspect import signature

from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
                                                   count_parameters,
                                                   create_experiment_folder,
                                                   get_git_branch,
                                                   remove_experiment_folder,
                                                   set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import setup_torch_training_env
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
from mozilla_voice_tts.vocoder.datasets.preprocess import (load_wav_data,
                                                           load_wav_feat_data)
# from distribute import (DistributedSampler, apply_gradient_allreduce,
#                         init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.io import save_checkpoint, save_best_model
from TTS.vocoder.utils.console_logger import ConsoleLogger
from TTS.vocoder.utils.generic_utils import (check_config, plot_results,
                                             setup_discriminator,
                                             setup_generator)

from mozilla_voice_tts.vocoder.layers.losses import (DiscriminatorLoss,
                                                     GeneratorLoss)
from mozilla_voice_tts.vocoder.utils.generic_utils import (plot_results,
                                                           setup_discriminator,
                                                           setup_generator)
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint

use_cuda, num_gpus = setup_torch_training_env(True, True)

@@ -124,6 +127,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
            y_hat_vis = y_hat
            y_G_sub = model_G.pqmf_analysis(y_G)

        scores_fake, feats_fake, feats_real = None, None, None
        if global_step > c.steps_to_start_discriminator:

            # run D with or without cond. features

@@ -146,8 +150,6 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
                    _, feats_real = D_out_real
                else:
                    scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,

@@ -239,10 +241,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,

        # print training stats
        if global_step % c.print_step == 0:
            log_dict = {
                'step_time': [step_time, 2],
                'loader_time': [loader_time, 4],
                "current_lr_G": current_lr_G,
                "current_lr_D": current_lr_D
            }
            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
                                      step_time, loader_time, current_lr_G,
                                      current_lr_D, loss_dict,
                                      keep_avg.avg_values)
                                      log_dict, loss_dict, keep_avg.avg_values)

        # plot step stats
        if global_step % 10 == 0:

@@ -328,6 +334,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
            y_G_sub = model_G.pqmf_analysis(y_G)


        scores_fake, feats_fake, feats_real = None, None, None
        if global_step > c.steps_to_start_discriminator:

            if len(signature(model_D.forward).parameters) == 2:

@@ -349,8 +356,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
                    _, feats_real = D_out_real
                else:
                    scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None
                    feats_fake, feats_real = None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,

@@ -615,7 +621,7 @@ if __name__ == '__main__':

    # setup output paths and read configs
    c = load_config(args.config_path)
    check_config(c)
    # check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path

@@ -15,7 +15,7 @@ If you have the environment set already for TTS, then you can directly call ```s
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. python -m TTS.server.server
6. python -m mozilla_voice_tts.server.server

You can now open http://localhost:5002 in a browser
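
Once the server is running, you can also query it from code. A minimal sketch using `requests`, assuming the demo server's default `/api/tts` endpoint and the port shown above:

```python
import requests

# fetch synthesized speech for a sentence; the endpoint responds with a wav payload
resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from Mozilla TTS."},
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)
```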

@@ -3,7 +3,7 @@ import argparse
import os

from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
from mozilla_voice_tts.server.synthesizer import Synthesizer


def create_argparser():

@@ -18,8 +18,8 @@ def create_argparser():
    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
    parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
    parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to mozilla_voice_tts.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to mozilla_voice_tts.vocoder checkpoint file.')
    parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
    parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
    parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
@@ -4,19 +4,18 @@ import time

 import numpy as np
 import torch
 import yaml
 import pysbd

-from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
-from TTS.utils.generic_utils import setup_model
-from TTS.utils.speakers import load_speaker_mapping
-from TTS.vocoder.utils.generic_utils import setup_generator
+from mozilla_voice_tts.utils.audio import AudioProcessor
+from mozilla_voice_tts.utils.io import load_config
+from mozilla_voice_tts.tts.utils.generic_utils import setup_model
+from mozilla_voice_tts.tts.utils.speakers import load_speaker_mapping
+from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.utils.synthesis import *
+from mozilla_voice_tts.tts.utils.synthesis import *

-from TTS.utils.text import make_symbols, phonemes, symbols
+from mozilla_voice_tts.tts.utils.text import make_symbols, phonemes, symbols


 class Synthesizer(object):
@@ -10,7 +10,7 @@ Below is an example showing embedding results of various speakers. You can gener

Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.

-To run the code, you need to follow the same flow as in TTS.
+To run the code, you need to follow the same flow as in mozilla_voice_tts.

- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
@@ -6,9 +6,9 @@ import numpy as np
 from tqdm import tqdm

 import torch
-from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import load_config
+from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
+from mozilla_voice_tts.tts.utils.audio import AudioProcessor
+from mozilla_voice_tts.tts.utils.generic_utils import load_config

 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset. ')
@@ -0,0 +1,61 @@
{
    "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning",
    "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech",
    "audio": {
        // Audio processing parameters
        "num_mels": 40,           // size of the mel spec frame.
        "fft_size": 400,          // number of stft frequency levels. Size of the linear spectrogram frame.
        "sample_rate": 16000,     // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
        "win_length": 400,        // stft window length in samples.
        "hop_length": 160,        // stft window hop length in samples.
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,      // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "min_level_db": -100,     // normalization range.
        "ref_level_db": 20,       // reference level db; theoretically 20 db is the sound of air.
        "power": 1.5,             // value to sharpen wav signals after the GL algorithm.
        "griffin_lim_iters": 60,  // number of Griffin-Lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
        // Normalization parameters
        "signal_norm": true,      // normalize the spec values to the range [0, 1].
        "symmetric_norm": true,   // move normalization to the range [-1, 1].
        "max_norm": 4.0,          // scale normalization to the range [-max_norm, max_norm] or [0, max_norm].
        "clip_norm": true,        // clip normalized values into the range.
        "mel_fmin": 0.0,          // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!
        "mel_fmax": 8000.0,       // maximum freq level for mel-spec. Tune for your dataset!
        "do_trim_silence": false, // enable trimming of silence as audio is loaded. LJSpeech (false), TWEB (false), Nancy (true).
        "trim_db": 60             // threshold for trimming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
    "loss": "ge2e",               // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA).
    "grad_clip": 3.0,             // upper limit for gradient clipping.
    "epochs": 1000,               // total number of epochs to train.
    "lr": 0.0001,                 // initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false,            // if true, Noam learning rate decay is applied through training.
    "warmup_steps": 4000,         // Noam decay steps to increase the learning rate from 0 to "lr".
    "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
    "steps_plot_stats": 10,       // number of steps between embedding plots.
    "num_speakers_in_batch": 32,  // batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
    "num_loader_workers": 4,      // number of training data loader processes. Don't set it too big; 4-8 are good values.
    "wd": 0.000001,               // weight decay weight.
    "checkpoint": true,           // if true, saves checkpoints per "save_step".
    "save_step": 1000,            // number of training steps between saving training stats and checkpoints.
    "print_step": 1,              // number of steps between logging training on console.
    "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
        "input_dim": 40,
        "proj_dim": 256,
        "lstm_dim": 256,
        "num_lstm_layers": 3,
        "use_lstm_with_projection": false
    },
    "datasets":
        [
        {
            "name": "vctk",
            "path": "../../../datasets/VCTK-Corpus-removed-silence/",
            "meta_file_train": null,
            "meta_file_val": null
        }
        ]
}
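The config above uses `//` line comments inside JSON, which the standard `json` module cannot parse; the repository ships its own config loader (`load_config`) for this. Purely as a generic illustration of reading such commented JSON, assuming no string values contain `//`:

```python
import json
import re

def load_commented_json(path):
    """Strip '//' line comments before parsing. Simplification: this would also
    strip '//' occurring inside string values, so it is only a sketch."""
    with open(path, encoding="utf-8") as f:
        text = f.read()
    text = re.sub(r"//.*", "", text)
    return json.loads(text)

config = load_commented_json("config.json")
print(config["model"]["proj_dim"])  # -> 256
```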
@@ -9,7 +9,7 @@ class MyDataset(Dataset):
                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
-            ap (TTS.utils.AudioProcessor): audio processor object.
+            ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
             seq_len (int): voice segment length in seconds.
             verbose (bool): print diagnostic information.

@@ -31,7 +31,7 @@ class MyDataset(Dataset):
             print(f" | > Num speakers: {len(self.speakers)}")

     def load_wav(self, filename):
-        audio = self.ap.load_wav(filename)
+        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
         return audio

     def load_data(self, idx):
@@ -15,7 +15,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,
         'optimizer': optimizer.state_dict() if optimizer is not None else None,
         'step': current_step,
         'epoch': epoch,
-        'GE2Eloss': model_loss,
+        'loss': model_loss,
         'date': datetime.date.today().strftime("%B %d, %Y"),
     }
     torch.save(state, checkpoint_path)

@@ -29,7 +29,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
            'model': new_state_dict,
            'optimizer': optimizer.state_dict(),
            'step': current_step,
-           'GE2Eloss': model_loss,
+           'loss': model_loss,
            'date': datetime.date.today().strftime("%B %d, %Y"),
        }
        best_loss = model_loss

@@ -38,4 +38,4 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
             model_loss, bestmodel_path))
         torch.save(state, bestmodel_path)
-    return best_loss
\ No newline at end of file
+    return best_loss
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

+import numpy as np

 # adapted from https://github.com/cvqluu/GE2E-Loss
 class GE2ELoss(nn.Module):

@@ -23,6 +23,8 @@ class GE2ELoss(nn.Module):
         self.b = nn.Parameter(torch.tensor(init_b))
         self.loss_method = loss_method

+        print(' > Initialised Generalized End-to-End loss')
+
         assert self.loss_method in ["softmax", "contrast"]

         if self.loss_method == "softmax":
@@ -119,3 +121,40 @@ class GE2ELoss(nn.Module):
         cos_sim_matrix = self.w * cos_sim_matrix + self.b
         L = self.embed_loss(dvecs, cos_sim_matrix)
         return L.mean()
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+    Accepts an input of size (N, M, D)
+        where N is the number of speakers in the batch,
+        M is the number of utterances per speaker,
+        and D is the dimensionality of the embedding vector
+    Args:
+        - init_w (float): defines the initial value of w
+        - init_b (float): defines the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProtoLoss, self).__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+        print(' > Initialised Angular Prototypical loss')
+
+    def forward(self, x):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        out_anchor = torch.mean(x[:, 1:, :], 1)
+        out_positive = x[:, 0, :]
+        num_speakers = out_anchor.size()[0]
+
+        cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
+        L = self.criterion(cos_sim_matrix, label)
+        return L
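A quick shape check of the loss added above (a usage sketch; the import path assumes the post-rename package layout):

```python
import torch
from mozilla_voice_tts.speaker_encoder.losses import AngleProtoLoss

criterion = AngleProtoLoss()

# 32 speakers per batch, 10 utterances each, 256-dim embeddings: (N, M, D)
dvecs = torch.randn(32, 10, 256)

# utterance 0 is the positive; utterances 1..M-1 form each speaker's anchor centroid
loss = criterion(dvecs)  # scalar tensor
print(loss.item())
```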
@@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module):
         o, (_, _) = self.lstm(x)
         return self.linear(o)

+class LSTMWithoutProjection(nn.Module):
+    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size=input_dim,
+                            hidden_size=lstm_dim,
+                            num_layers=num_lstm_layers,
+                            batch_first=True)
+        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        _, (hidden, _) = self.lstm(x)
+        return self.relu(self.linear(hidden[-1]))
+
 class SpeakerEncoder(nn.Module):
-    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
+    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
         super().__init__()
+        self.use_lstm_with_projection = use_lstm_with_projection
         layers = []
-        layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
-        for _ in range(num_lstm_layers - 1):
-            layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
-        self.layers = nn.Sequential(*layers)
+        # choose the LSTM layer type
+        if use_lstm_with_projection:
+            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
+            for _ in range(num_lstm_layers - 1):
+                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
+            self.layers = nn.Sequential(*layers)
+        else:
+            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

         self._init_layers()

     def _init_layers(self):
@@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module):
     def forward(self, x):
         # TODO: implement state passing for lstms
         d = self.layers(x)
-        d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        if self.use_lstm_with_projection:
+            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d

     def inference(self, x):
         d = self.layers.forward(x)
-        d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        if self.use_lstm_with_projection:
+            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d

     def compute_embedding(self, x, num_frames=160, overlap=0.5):

@@ -85,4 +109,3 @@ class SpeakerEncoder(nn.Module):
             frames[cur_iter <= num_iters, :, :]
         )
         return embed / num_iters
-
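A usage sketch for the encoder with the new `use_lstm_with_projection=False` path, matching the `"model"` block of the speaker encoder config above (the import path assumes the renamed package layout):

```python
import torch
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder

model = SpeakerEncoder(input_dim=40, proj_dim=256, lstm_dim=256,
                       num_lstm_layers=3, use_lstm_with_projection=False)

mels = torch.randn(4, 160, 40)      # (batch, frames, num_mels)
embeddings = model.inference(mels)  # (4, 256), L2-normalized over dim 1
print(embeddings.shape, embeddings.norm(dim=1))  # norms ~1.0
```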
(binary image file changed: 24 KiB before, 24 KiB after)
@@ -67,6 +67,7 @@
     "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP). NOTE: currently only O1 is supported; use "O1" to activate.

     // VALIDATION
     "run_eval": true,

@@ -84,8 +85,8 @@

     // TACOTRON PRENET
     "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
-    "prenet_type": "bn", // "original" or "bn".
-    "prenet_dropout": false, // enable/disable dropout at prenet.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.

     // TACOTRON ATTENTION
     "attention_type": "original", // 'original' or 'graves'
@@ -122,33 +123,35 @@
     "max_seq_len": 153, // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "../../Mozilla-TTS/vctk-test/",

     // PHONEMES
-    "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "../../Mozilla-TTS/vctk-test/", // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning.
+    "use_external_speaker_embedding_file": false, // if true, forces the model to use external embeddings per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
+    "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, this file is loaded and its embeddings are used instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
     "use_gst": true, // use global style tokens
     "gst": { // gst parameters, used if gst is enabled
         "gst_style_input": null, // Condition the style input either on a
                                  // -> wave file [path to wave] or
                                  // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
                                  // with the dictionary being len(dict) <= len(gst_style_tokens).
        "gst_embedding_dim": 512,
        "gst_num_heads": 4,
        "gst_style_tokens": 10
     },

     // DATASETS
     "datasets": // List of datasets. They are all merged and they get different speaker_ids.
         [
             {
-                "name": "ljspeech",
-                "path": "/home/erogol/Data/LJSpeech-1.1/",
-                "meta_file_train": "metadata.csv",
+                "name": "vctk",
+                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
+                "meta_file_train": ["p225", "p234", "p238", "p245", "p248", "p261", "p294", "p302", "p326", "p335", "p347"], // for vctk: if this is a list, the listed speaker ids are excluded from training, which is useful for testing cloning with new speakers
                 "meta_file_val": null
             }
         ]
@@ -5,8 +5,8 @@ import torch
 import random
 from torch.utils.data import Dataset

-from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
-from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target
+from mozilla_voice_tts.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
+from mozilla_voice_tts.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target


 class MyDataset(Dataset):
@@ -24,13 +24,14 @@ class MyDataset(Dataset):
                  phoneme_cache_path=None,
                  phoneme_language="en-us",
                  enable_eos_bos=False,
+                 speaker_mapping=None,
                  verbose=False):
         """
         Args:
             outputs_per_step (int): number of time frames predicted per step.
             text_cleaner (str): text cleaner used for the dataset.
             compute_linear_spec (bool): compute linear spectrogram if True.
-            ap (TTS.utils.AudioProcessor): audio processor object.
+            ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
             batch_group_size (int): (0) range of batch randomization after sorting
                 sequences by length.

@@ -58,6 +59,7 @@ class MyDataset(Dataset):
         self.phoneme_cache_path = phoneme_cache_path
         self.phoneme_language = phoneme_language
         self.enable_eos_bos = enable_eos_bos
+        self.speaker_mapping = speaker_mapping
         self.verbose = verbose
         if use_phonemes and not os.path.isdir(phoneme_cache_path):
             os.makedirs(phoneme_cache_path, exist_ok=True)
@@ -127,7 +129,8 @@ class MyDataset(Dataset):
             'text': text,
             'wav': wav,
             'item_idx': self.items[idx][1],
-            'speaker_name': speaker_name
+            'speaker_name': speaker_name,
+            'wav_file_name': os.path.basename(wav_file)
         }
         return sample

@@ -191,9 +194,15 @@ class MyDataset(Dataset):
                 batch[idx]['item_idx'] for idx in ids_sorted_decreasing
             ]
             text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
+
             speaker_name = [batch[idx]['speaker_name']
                             for idx in ids_sorted_decreasing]
+            # get speaker embeddings
+            if self.speaker_mapping is not None:
+                wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing]
+                speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names]
+            else:
+                speaker_embedding = None
             # compute features
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]

@@ -224,6 +233,9 @@ class MyDataset(Dataset):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)

+            if speaker_embedding is not None:
+                speaker_embedding = torch.FloatTensor(speaker_embedding)
+
             # compute linear spectrogram
             if self.compute_linear_spec:
                 linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
@@ -234,7 +246,7 @@ class MyDataset(Dataset):
             else:
                 linear = None
             return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
-                stop_targets, item_idxs
+                stop_targets, item_idxs, speaker_embedding

         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
                          found {}".format(type(batch[0]))))
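The collate changes above index `self.speaker_mapping` by wav file basename and read its `'embedding'` key. A sketch of the mapping file this implies (only `'embedding'` is referenced by the code above; the `'name'` field and the 3-dim vectors are illustrative placeholders — real embeddings would be e.g. 256-dim):

```python
import json

# e.g. the file passed as "external_speaker_embedding_file" in the config above
speaker_mapping = {
    "p225_001.wav": {"name": "p225", "embedding": [0.012, -0.034, 0.051]},
    "p225_002.wav": {"name": "p225", "embedding": [0.008, -0.041, 0.047]},
}
with open("speakers-vctk-en.json", "w", encoding="utf-8") as f:
    json.dump(speaker_mapping, f, indent=2)

# the collate function then effectively does:
embedding = speaker_mapping["p225_001.wav"]["embedding"]
```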
@@ -2,7 +2,7 @@ import os
 from glob import glob
 import re
 import sys
-from TTS.utils.generic_utils import split_dataset
+from mozilla_voice_tts.tts.utils.generic_utils import split_dataset


 def load_meta_data(datasets):

@@ -93,9 +93,10 @@ def mozilla_de(root_path, meta_file):

 def mailabs(root_path, meta_files=None):
     """Normalizes M-AI-Labs meta data files to TTS format"""
-    speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
+    speaker_regex = re.compile(
+        "by_book/(male|female)/(?P<speaker_name>[^/]+)/")
     if meta_files is None:
-        csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
+        csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
     else:
         csv_files = meta_files
         # meta_files = [f.strip() for f in meta_files.split(",")]
@@ -115,12 +116,15 @@ def mailabs(root_path, meta_files=None):
                 if meta_files is None:
                     wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
                 else:
-                    wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
+                    wav_file = os.path.join(root_path,
+                                            folder.replace("metadata.csv", ""),
+                                            'wavs', cols[0] + '.wav')
                 if os.path.isfile(wav_file):
                     text = cols[1].strip()
                     items.append([text, wav_file, speaker_name])
                 else:
-                    raise RuntimeError("> File %s does not exist!"%(wav_file))
+                    raise RuntimeError("> File %s does not exist!" %
+                                       (wav_file))
     return items

@@ -185,7 +189,8 @@ def libri_tts(root_path, meta_files=None):
             text = cols[1]
             items.append([text, wav_file, speaker_name])
     for item in items:
-        assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
+        assert os.path.exists(
+            item[1]), f" [!] wav files don't exist - {item[1]}"
     return items


@@ -197,7 +202,8 @@ def custom_turkish(root_path, meta_file):
     with open(txt_file, 'r', encoding='utf-8') as ttf:
         for line in ttf:
             cols = line.split('|')
-            wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
+            wav_file = os.path.join(root_path, 'wavs',
+                                    cols[0].strip() + '.wav')
             if not os.path.exists(wav_file):
                 skipped_files.append(wav_file)
                 continue
@@ -205,3 +211,44 @@ def custom_turkish(root_path, meta_file):
             items.append([text, wav_file, speaker_name])
     print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
     return items
+
+
+# TODO: add the dataset link when the dataset is released publicly
+def brspeech(root_path, meta_file):
+    '''BRSpeech 3.0 beta'''
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            if line.startswith("wav_filename"):
+                continue
+            cols = line.split('|')
+            #print(cols)
+            wav_file = os.path.join(root_path, cols[0])
+            text = cols[2]
+            speaker_name = cols[3]
+            items.append([text, wav_file, speaker_name])
+    return items
+
+
+def vctk(root_path, meta_files=None, wavs_path='wav48'):
+    """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
+    test_speakers = meta_files
+    items = []
+    meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt",
+                      recursive=True)
+    for meta_file in meta_files:
+        _, speaker_id, txt_file = os.path.relpath(meta_file,
+                                                  root_path).split(os.sep)
+        file_id = txt_file.split('.')[0]
+        if isinstance(test_speakers,
+                      list):  # if it is a list, ignore these speaker ids
+            if speaker_id in test_speakers:
+                continue
+        with open(meta_file) as file_text:
+            text = file_text.readlines()[0]
+        wav_file = os.path.join(root_path, wavs_path, speaker_id,
+                                file_id + '.wav')
+        items.append([text, wav_file, speaker_id])
+
+    return items
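A usage sketch for the new `vctk` loader (it assumes the corpus is extracted at the given path; passing a speaker list as `meta_files` excludes those speakers, mirroring the list-valued `meta_file_train` config entry above; the import path assumes the post-rename module layout):

```python
from mozilla_voice_tts.tts.datasets.preprocess import vctk

# keep p225 and p234 out of the training items, e.g. reserved for cloning tests
items = vctk("../../../datasets/VCTK-Corpus-removed-silence/",
             meta_files=["p225", "p234"])

text, wav_file, speaker_id = items[0]
print(speaker_id, wav_file)
```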
@@ -1,6 +1,5 @@
 import torch
 from torch import nn
-from torch.autograd import Variable
 from torch.nn import functional as F


@@ -52,6 +51,7 @@ class LinearBN(nn.Module):


 class Prenet(nn.Module):
+    # pylint: disable=dangerous-default-value
     def __init__(self,
                  in_features,
                  prenet_type="original",
@@ -244,14 +244,14 @@ class OriginalAttention(nn.Module):
         self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)

     def init_location_attention(self, inputs):
-        B = inputs.shape[0]
-        T = inputs.shape[1]
-        self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_())
+        B = inputs.size(0)
+        T = inputs.size(1)
+        self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)

     def init_states(self, inputs):
-        B = inputs.shape[0]
-        T = inputs.shape[1]
-        self.attention_weights = Variable(inputs.data.new(B, T).zero_())
+        B = inputs.size(0)
+        T = inputs.size(1)
+        self.attention_weights = torch.zeros([B, T], device=inputs.device)
         if self.location_attention:
             self.init_location_attention(inputs)
         if self.forward_attn:
@@ -300,8 +300,8 @@ class OriginalAttention(nn.Module):

     def apply_forward_attention(self, alignment):
         # forward attention
-        fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
-                                  (1, 0, 0, 0))
+        fwd_shifted_alpha = F.pad(
+            self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
         # compute transition potentials
         alpha = ((1 - self.u) * self.alpha
                  + self.u * fwd_shifted_alpha

@@ -309,7 +309,7 @@ class OriginalAttention(nn.Module):
         # force incremental alignment
         if not self.training and self.forward_attn_mask:
             _, n = fwd_shifted_alpha.max(1)
-            val, n2 = alpha.max(1)
+            val, _ = alpha.max(1)
             for b in range(alignment.shape[0]):
                 alpha[b, n[b] + 3:] = 0
                 alpha[b, :(
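In equation form, the shifted-alpha update above implements the forward-attention recursion of https://arxiv.org/abs/1807.06736 (a reconstruction: the hunk cuts off before the trailing factor, which in the full code multiplies by the current raw alignment):

```latex
\alpha_t(n) \propto \big( (1 - u_t)\,\alpha_{t-1}(n) + u_t\,\alpha_{t-1}(n-1) + \epsilon \big)\, e_t(n)
```

Here $u_t$ is the transition-agent weight, $e_t$ the current alignment, and $\alpha$ is renormalized over $n$ each step; `fwd_shifted_alpha` realizes the $\alpha_{t-1}(n-1)$ term via the `F.pad` shift.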
@@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module):
         # x: 3D tensor [batch_size, post_conv_width,
         #               num_channels*post_conv_height]
         self.recurrence.flatten_parameters()
-        memory, out = self.recurrence(x)
+        _, out = self.recurrence(x)
         # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]

         return out.squeeze(0)

@@ -96,7 +96,7 @@ class StyleTokenLayer(nn.Module):
         self.key_dim = embedding_dim // num_heads
         self.style_tokens = nn.Parameter(
             torch.FloatTensor(num_style_tokens, self.key_dim))
-        nn.init.orthogonal_(self.style_tokens)
+        nn.init.normal_(self.style_tokens, mean=0, std=0.5)
         self.attention = MultiHeadAttention(
             query_dim=self.query_dim,
             key_dim=self.key_dim,
@@ -2,7 +2,7 @@ import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional
-from TTS.utils.generic_utils import sequence_mask
+from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask


 class L1LossMasked(nn.Module):

@@ -150,7 +150,7 @@ class GuidedAttentionLoss(torch.nn.Module):

     @staticmethod
     def _make_ga_mask(ilen, olen, sigma):
-        grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
+        grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device), torch.arange(ilen, device=ilen.device))
         grid_x, grid_y = grid_x.float(), grid_y.float()
         return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))


@@ -243,4 +243,3 @@ class TacotronLoss(torch.nn.Module):

         return_dict['loss'] = loss
         return return_dict
-
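For reference, the guided-attention mask in `_make_ga_mask` above penalizes attention that strays from the text/audio diagonal; the diff only moves the `torch.arange` calls onto the input tensors' device. A standalone restatement with plain integers:

```python
import torch

def make_ga_mask(ilen, olen, sigma=0.4):
    """Soft diagonal mask: near 0 on the diagonal band, approaching 1 far from it."""
    grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
    grid_x, grid_y = grid_x.float(), grid_y.float()
    return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * sigma ** 2))

mask = make_ga_mask(ilen=50, olen=200)       # shape (200, 50)
print(mask.min().item(), mask.max().item())  # ~0 on the diagonal, ~1 off it
```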
@@ -1,7 +1,7 @@
 # coding: utf-8
 import torch
 from torch import nn
-from .common_layers import Prenet, init_attn, Linear
+from .common_layers import Prenet, init_attn


 class BatchNormConv1d(nn.Module):

@@ -18,8 +18,8 @@ class BatchNormConv1d(nn.Module):
         activation: activation function set b/w Conv1d and BatchNorm

     Shapes:
-        - input: batch x dims
-        - output: batch x dims
+        - input: (B, D)
+        - output: (B, D)
     """

     def __init__(self,
@@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module):
         # self.init_layers()

     def init_layers(self):
-        if type(self.activation) == torch.nn.ReLU:
+        if isinstance(self.activation, torch.nn.ReLU):
             w_gain = 'relu'
-        elif type(self.activation) == torch.nn.Tanh:
+        elif isinstance(self.activation, torch.nn.Tanh):
             w_gain = 'tanh'
         elif self.activation is None:
             w_gain = 'linear'
@@ -67,12 +67,23 @@ class BatchNormConv1d(nn.Module):


 class Highway(nn.Module):
+    r"""Highway layers as explained in https://arxiv.org/abs/1505.00387
+
+    Args:
+        in_features (int): size of each input sample
+        out_feature (int): size of each output sample
+
+    Shapes:
+        - input: (B, *, H_in)
+        - output: (B, *, H_out)
+    """
+
     # TODO: Try GLU layer
-    def __init__(self, in_size, out_size):
+    def __init__(self, in_features, out_feature):
         super(Highway, self).__init__()
-        self.H = nn.Linear(in_size, out_size)
+        self.H = nn.Linear(in_features, out_feature)
         self.H.bias.data.zero_()
-        self.T = nn.Linear(in_size, out_size)
+        self.T = nn.Linear(in_features, out_feature)
         self.T.bias.data.fill_(-1)
         self.relu = nn.ReLU()
         self.sigmoid = nn.Sigmoid()
@@ -103,10 +114,10 @@ class CBHG(nn.Module):
         num_highways (int): number of highways layers

     Shapes:
-        - input: B x D x T_in
-        - output: B x T_in x D*2
+        - input: (B, C, T_in)
+        - output: (B, T_in, C*2)
     """

     #pylint: disable=dangerous-default-value
     def __init__(self,
                  in_features,
                  K=16,

@@ -195,6 +206,8 @@ class CBHG(nn.Module):


 class EncoderCBHG(nn.Module):
+    r"""CBHG module with Encoder specific arguments"""
+
     def __init__(self):
         super(EncoderCBHG, self).__init__()
         self.cbhg = CBHG(
@@ -211,7 +224,14 @@ class EncoderCBHG(nn.Module):


 class Encoder(nn.Module):
-    r"""Encapsulate Prenet and CBHG modules for encoder"""
+    r"""Stack Prenet and CBHG module for encoder
+    Args:
+        inputs (FloatTensor): embedding features
+
+    Shapes:
+        - inputs: (B, T, D_in)
+        - outputs: (B, T, 128 * 2)
+    """

     def __init__(self, in_features):
         super(Encoder, self).__init__()

@@ -219,14 +239,6 @@ class Encoder(nn.Module):
         self.cbhg = EncoderCBHG()

     def forward(self, inputs):
-        r"""
-        Args:
-            inputs (FloatTensor): embedding features
-
-        Shapes:
-            - inputs: batch x time x in_features
-            - outputs: batch x time x 128*2
-        """
         # B x T x prenet_dim
         outputs = self.prenet(inputs)
         outputs = self.cbhg(outputs.transpose(1, 2))
@@ -250,35 +262,48 @@ class PostCBHG(nn.Module):


 class Decoder(nn.Module):
-    """Decoder module.
+    """Tacotron decoder.

     Args:
-        in_features (int): input vector (encoder output) sample size.
-        memory_dim (int): memory vector (prev. time-step output) sample size.
-        r (int): number of outputs per time step.
+        in_channels (int): number of input channels.
+        frame_channels (int): number of feature frame channels.
+        r (int): number of outputs per time step (reduction rate).
         memory_size (int): size of the past window. if <= 0 memory_size = r
-        TODO: arguments
+        attn_type (string): type of attention used in decoder.
+        attn_windowing (bool): if true, define an attention window centered at the maximum
+            attention response. It provides more robust attention alignment, especially
+            at inference time.
+        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+        prenet_type (string): 'original' or 'bn'.
+        prenet_dropout (float): prenet dropout rate.
+        forward_attn (bool): if true, use the forward attention method. https://arxiv.org/abs/1807.06736
+        trans_agent (bool): if true, use the transition agent. https://arxiv.org/abs/1807.06736
+        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+        location_attn (bool): if true, use location sensitive attention.
+        attn_K (int): number of attention heads for GravesAttention.
+        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+        speaker_embedding_dim (int): size of the speaker embedding vector, for multi-speaker training.
     """

     # Pylint gets confused by PyTorch conventions here
-    #pylint: disable=attribute-defined-outside-init
+    # pylint: disable=attribute-defined-outside-init

-    def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing,
+    def __init__(self, in_channels, frame_channels, r, memory_size, attn_type, attn_windowing,
                  attn_norm, prenet_type, prenet_dropout, forward_attn,
                  trans_agent, forward_attn_mask, location_attn, attn_K,
-                 separate_stopnet, speaker_embedding_dim):
+                 separate_stopnet):
         super(Decoder, self).__init__()
         self.r_init = r
         self.r = r
-        self.in_features = in_features
+        self.in_channels = in_channels
         self.max_decoder_steps = 500
         self.use_memory_queue = memory_size > 0
         self.memory_size = memory_size if memory_size > 0 else r
-        self.memory_dim = memory_dim
+        self.frame_channels = frame_channels
         self.separate_stopnet = separate_stopnet
         self.query_dim = 256
         # memory -> |Prenet| -> processed_memory
-        prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
+        prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels
         self.prenet = Prenet(
             prenet_dim,
             prenet_type,
@@ -286,11 +311,11 @@ class Decoder(nn.Module):
             out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
         # attention_rnn generates queries for the attention mechanism
-        self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
+        self.attention_rnn = nn.GRUCell(in_channels + 128, self.query_dim)

         self.attention = init_attn(attn_type=attn_type,
                                    query_dim=self.query_dim,
-                                   embedding_dim=in_features,
+                                   embedding_dim=in_channels,
                                    attention_dim=128,
                                    location_attention=location_attn,
                                    attention_location_n_filters=32,

@@ -302,14 +327,14 @@ class Decoder(nn.Module):
                                    forward_attn_mask=forward_attn_mask,
                                    attn_K=attn_K)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
-        self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
+        self.project_to_decoder_in = nn.Linear(256 + in_channels, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
         self.decoder_rnns = nn.ModuleList(
             [nn.GRUCell(256, 256) for _ in range(2)])
         # RNN_state -> |Linear| -> mel_spec
-        self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
+        self.proj_to_mel = nn.Linear(256, frame_channels * self.r_init)
         # learn init values instead of zero init.
-        self.stopnet = StopNet(256 + memory_dim * self.r_init)
+        self.stopnet = StopNet(256 + frame_channels * self.r_init)

     def set_r(self, new_r):
         self.r = new_r
@@ -319,9 +344,9 @@ class Decoder(nn.Module):
         Reshape the spectrograms for given 'r'
         """
         # Grouping multiple frames if necessary
-        if memory.size(-1) == self.memory_dim:
+        if memory.size(-1) == self.frame_channels:
             memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
-        # Time first (T_decoder, B, memory_dim)
+        # Time first (T_decoder, B, frame_channels)
         memory = memory.transpose(0, 1)
         return memory

@@ -330,19 +355,18 @@ class Decoder(nn.Module):
         Initialization of decoder states
         """
         B = inputs.size(0)
-        T = inputs.size(1)
         # go frame as zeros matrix
         if self.use_memory_queue:
-            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
         else:
-            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels)
         # decoder states
         self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
         self.decoder_rnn_hiddens = [
             torch.zeros(1, device=inputs.device).repeat(B, 256)
             for idx in range(len(self.decoder_rnns))
         ]
-        self.context_vec = inputs.data.new(B, self.in_features).zero_()
+        self.context_vec = inputs.data.new(B, self.in_channels).zero_()
         # cache attention inputs
         self.processed_inputs = self.attention.preprocess_inputs(inputs)
@@ -352,7 +376,7 @@ class Decoder(nn.Module):
         stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
         outputs = outputs.view(
-            outputs.size(0), -1, self.memory_dim)
+            outputs.size(0), -1, self.frame_channels)
         outputs = outputs.transpose(1, 2)
         return outputs, attentions, stop_tokens

@@ -386,7 +410,7 @@ class Decoder(nn.Module):
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
-        output = output[:, : self.r * self.memory_dim]
+        output = output[:, : self.r * self.frame_channels]
         return output, stop_token, self.attention.attention_weights

     def _update_memory_input(self, new_memory):
@@ -395,17 +419,17 @@ class Decoder(nn.Module):
                 # memory queue size is larger than number of frames per decoder iter
                 self.memory_input = torch.cat([
                     new_memory, self.memory_input[:, :(
-                        self.memory_size - self.r) * self.memory_dim].clone()
+                        self.memory_size - self.r) * self.frame_channels].clone()
                 ], dim=-1)
             else:
                 # memory queue size smaller than number of frames per decoder iter
-                self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
+                self.memory_input = new_memory[:, :self.memory_size * self.frame_channels]
         else:
             # use only the last frame prediction
-            # assert new_memory.shape[-1] == self.r * self.memory_dim
-            self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
+            # assert new_memory.shape[-1] == self.r * self.frame_channels
+            self.memory_input = new_memory[:, self.frame_channels * (self.r - 1):]

-    def forward(self, inputs, memory, mask, speaker_embeddings=None):
+    def forward(self, inputs, memory, mask):
         """
         Args:
             inputs: Encoder outputs.
@@ -415,8 +439,8 @@ class Decoder(nn.Module):
             mask: Attention mask for sequence padding.

         Shapes:
-            - inputs: batch x time x encoder_out_dim
-            - memory: batch x #mel_specs x mel_spec_dim
+            - inputs: (B, T, D_out_enc)
+            - memory: (B, T_mel, D_mel)
         """
         # Run greedy decoding if memory is None
         memory = self._reshape_memory(memory)

@@ -430,8 +454,7 @@ class Decoder(nn.Module):
             if t > 0:
                 new_memory = memory[t - 1]
                 self._update_memory_input(new_memory)
-            if speaker_embeddings is not None:
-                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
+
             output, stop_token, attention = self.decode(inputs, mask)
             outputs += [output]
             attentions += [attention]

@@ -439,15 +462,12 @@ class Decoder(nn.Module):
             t += 1
         return self._parse_outputs(outputs, attentions, stop_tokens)

-    def inference(self, inputs, speaker_embeddings=None):
+    def inference(self, inputs):
         """
         Args:
             inputs: encoder outputs.
-            speaker_embeddings: speaker vectors.

         Shapes:
-            - inputs: batch x time x encoder_out_dim
-            - speaker_embeddings: batch x embed_dim
+            - inputs: (B, T, D_out_enc)
         """
         outputs = []
         attentions = []

@@ -460,8 +480,6 @@ class Decoder(nn.Module):
             if t > 0:
                 new_memory = outputs[-1]
                 self._update_memory_input(new_memory)
-            if speaker_embeddings is not None:
-                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
             output, stop_token, attention = self.decode(inputs, None)
             stop_token = torch.sigmoid(stop_token.data)
             outputs += [output]

@@ -471,14 +489,14 @@ class Decoder(nn.Module):
             if t > inputs.shape[1] / 4 and (stop_token > 0.6
                                             or attention[:, -1].item() > 0.6):
                 break
-            elif t > self.max_decoder_steps:
+            if t > self.max_decoder_steps:
                 print("   | > Decoder stopped with 'max_decoder_steps")
                 break
         return self._parse_outputs(outputs, attentions, stop_tokens)


 class StopNet(nn.Module):
-    r"""
+    r"""Stopnet signalling the decoder to stop inference.
     Args:
         in_features (int): feature dimension of input.
     """
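To make the `memory_size` queue in `_update_memory_input` above concrete, a toy restatement with hypothetical numbers (`frame_channels=1`, so each column is one frame):

```python
import torch

frame_channels, r, memory_size = 1, 2, 5
memory_input = torch.zeros(1, frame_channels * memory_size)  # go-frame queue: 5 past frames

new_memory = torch.tensor([[1.0, 2.0]])  # r=2 freshly predicted frames
# prepend the newest frames, drop the oldest; memory_size - r old frames survive
memory_input = torch.cat(
    [new_memory, memory_input[:, :(memory_size - r) * frame_channels]], dim=-1)
print(memory_input)  # tensor([[1., 2., 0., 0., 0.]])
```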
@@ -1,11 +1,24 @@
 import torch
-from torch.autograd import Variable
 from torch import nn
 from torch.nn import functional as F
 from .common_layers import init_attn, Prenet, Linear


 # NOTE: linter has a problem with the current TF release
 #pylint: disable=no-value-for-parameter
 #pylint: disable=unexpected-keyword-arg
 class ConvBNBlock(nn.Module):
+    r"""Convolutions with Batch Normalization and non-linear activation.
+
+    Args:
+        in_channels (int): number of input channels.
+        out_channels (int): number of output channels.
+        kernel_size (int): convolution kernel size.
+        activation (str): 'relu', 'tanh', None (linear).
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_out, T)
+    """
     def __init__(self, in_channels, out_channels, kernel_size, activation=None):
         super(ConvBNBlock, self).__init__()
         assert (kernel_size - 1) % 2 == 0
@@ -32,16 +45,25 @@ class ConvBNBlock(nn.Module):


 class Postnet(nn.Module):
+    r"""Tacotron2 Postnet
+
+    Args:
+        in_out_channels (int): number of output channels.
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_in, T)
+    """
-    def __init__(self, output_dim, num_convs=5):
+    def __init__(self, in_out_channels, num_convs=5):
         super(Postnet, self).__init__()
         self.convolutions = nn.ModuleList()
         self.convolutions.append(
-            ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
+            ConvBNBlock(in_out_channels, 512, kernel_size=5, activation='tanh'))
         for _ in range(1, num_convs - 1):
             self.convolutions.append(
                 ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
         self.convolutions.append(
-            ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
+            ConvBNBlock(512, in_out_channels, kernel_size=5, activation=None))

     def forward(self, x):
         o = x
@@ -51,14 +73,23 @@ class Postnet(nn.Module):


 class Encoder(nn.Module):
+    r"""Tacotron2 Encoder
+
+    Args:
+        in_out_channels (int): number of input and output channels.
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_in, T)
+    """
-    def __init__(self, output_input_dim=512):
+    def __init__(self, in_out_channels=512):
         super(Encoder, self).__init__()
         self.convolutions = nn.ModuleList()
         for _ in range(3):
             self.convolutions.append(
-                ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
-        self.lstm = nn.LSTM(output_input_dim,
-                            int(output_input_dim / 2),
+                ConvBNBlock(in_out_channels, in_out_channels, 5, 'relu'))
+        self.lstm = nn.LSTM(in_out_channels,
+                            int(in_out_channels / 2),
                             num_layers=1,
                             batch_first=True,
                             bias=True,
@@ -90,20 +121,40 @@ class Encoder(nn.Module):

 # adapted from https://github.com/NVIDIA/tacotron2/
 class Decoder(nn.Module):
+    """Tacotron2 decoder. We don't use Zoneout but Dropout between RNN layers.
+
+    Args:
+        in_channels (int): number of input channels.
+        frame_channels (int): number of feature frame channels.
+        r (int): number of outputs per time step (reduction rate).
+        memory_size (int): size of the past window. if <= 0 memory_size = r
+        attn_type (string): type of attention used in decoder.
+        attn_win (bool): if true, define an attention window centered at the maximum
+            attention response. It provides more robust attention alignment, especially
+            at inference time.
+        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+        prenet_type (string): 'original' or 'bn'.
+        prenet_dropout (float): prenet dropout rate.
+        forward_attn (bool): if true, use the forward attention method. https://arxiv.org/abs/1807.06736
+        trans_agent (bool): if true, use the transition agent. https://arxiv.org/abs/1807.06736
+        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+        location_attn (bool): if true, use location sensitive attention.
+        attn_K (int): number of attention heads for GravesAttention.
+        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+    """
     # Pylint gets confused by PyTorch conventions here
     #pylint: disable=attribute-defined-outside-init
-    def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
+    def __init__(self, in_channels, frame_channels, r, attn_type, attn_win, attn_norm,
                  prenet_type, prenet_dropout, forward_attn, trans_agent,
-                 forward_attn_mask, location_attn, attn_K, separate_stopnet,
-                 speaker_embedding_dim):
+                 forward_attn_mask, location_attn, attn_K, separate_stopnet):
         super(Decoder, self).__init__()
-        self.frame_dim = frame_dim
+        self.frame_channels = frame_channels
         self.r_init = r
         self.r = r
-        self.encoder_embedding_dim = input_dim
+        self.encoder_embedding_dim = in_channels
         self.separate_stopnet = separate_stopnet
         self.max_decoder_steps = 1000
-        self.gate_threshold = 0.5
+        self.stop_threshold = 0.5

         # model dimensions
         self.query_dim = 1024
@@ -114,20 +165,20 @@ class Decoder(nn.Module):
         self.p_decoder_dropout = 0.1

         # memory -> |Prenet| -> processed_memory
-        prenet_dim = self.frame_dim
+        prenet_dim = self.frame_channels
         self.prenet = Prenet(prenet_dim,
                              prenet_type,
                              prenet_dropout,
                              out_features=[self.prenet_dim, self.prenet_dim],
                              bias=False)

-        self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
+        self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_channels,
                                          self.query_dim,
                                          bias=True)

         self.attention = init_attn(attn_type=attn_type,
                                    query_dim=self.query_dim,
-                                   embedding_dim=input_dim,
+                                   embedding_dim=in_channels,
                                    attention_dim=128,
                                    location_attention=location_attn,
                                    attention_location_n_filters=32,
@@ -139,16 +190,16 @@ class Decoder(nn.Module):
                                    forward_attn_mask=forward_attn_mask,
                                    attn_K=attn_K)

-        self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
+        self.decoder_rnn = nn.LSTMCell(self.query_dim + in_channels,
                                        self.decoder_rnn_dim,
                                        bias=True)

-        self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
-                                        self.frame_dim * self.r_init)
+        self.linear_projection = Linear(self.decoder_rnn_dim + in_channels,
+                                        self.frame_channels * self.r_init)

         self.stopnet = nn.Sequential(
             nn.Dropout(0.1),
-            Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
+            Linear(self.decoder_rnn_dim + self.frame_channels * self.r_init,
                    1,
                    bias=True,
                    init_gain='sigmoid'))
@@ -159,8 +210,8 @@ class Decoder(nn.Module):

     def get_go_frame(self, inputs):
         B = inputs.size(0)
-        memory = torch.zeros(1, device=inputs.device).repeat(B,
-                                                             self.frame_dim * self.r)
+        memory = torch.zeros(1, device=inputs.device).repeat(
+            B, self.frame_channels * self.r)
         return memory

     def _init_states(self, inputs, mask, keep_states=False):

@@ -186,9 +237,9 @@ class Decoder(nn.Module):
         Reshape the spectrograms for given 'r'
         """
         # Grouping multiple frames if necessary
-        if memory.size(-1) == self.frame_dim:
+        if memory.size(-1) == self.frame_channels:
             memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
-        # Time first (T_decoder, B, frame_dim)
+        # Time first (T_decoder, B, frame_channels)
         memory = memory.transpose(0, 1)
         return memory

@@ -196,22 +247,22 @@ class Decoder(nn.Module):
         alignments = torch.stack(alignments).transpose(0, 1)
         stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
-        outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
+        outputs = outputs.view(outputs.size(0), -1, self.frame_channels)
         outputs = outputs.transpose(1, 2)
         return outputs, stop_tokens, alignments

     def _update_memory(self, memory):
         if len(memory.shape) == 2:
-            return memory[:, self.frame_dim * (self.r - 1):]
-        return memory[:, :, self.frame_dim * (self.r - 1):]
+            return memory[:, self.frame_channels * (self.r - 1):]
+        return memory[:, :, self.frame_channels * (self.r - 1):]

     def decode(self, memory):
         '''
         shapes:
-           - memory: B x r * self.frame_dim
+           - memory: B x r * self.frame_channels
         '''
         # self.context: B x D_en
-        # query_input: B x D_en + (r * self.frame_dim)
+        # query_input: B x D_en + (r * self.frame_channels)
         query_input = torch.cat((memory, self.context), -1)
         # self.query and self.attention_rnn_cell_state : B x D_attn_rnn
         self.query, self.attention_rnn_cell_state = self.attention_rnn(
@@ -234,25 +285,36 @@ class Decoder(nn.Module):
         # B x (D_decoder_rnn + D_en)
         decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
                                            dim=1)
-        # B x (self.r * self.frame_dim)
+        # B x (self.r * self.frame_channels)
         decoder_output = self.linear_projection(decoder_hidden_context)
-        # B x (D_decoder_rnn + (self.r * self.frame_dim))
+        # B x (D_decoder_rnn + (self.r * self.frame_channels))
         stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
         if self.separate_stopnet:
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
         # select outputs for the reduction rate self.r
-        decoder_output = decoder_output[:, :self.r * self.frame_dim]
+        decoder_output = decoder_output[:, :self.r * self.frame_channels]
         return decoder_output, self.attention.attention_weights, stop_token

-    def forward(self, inputs, memories, mask, speaker_embeddings=None):
+    def forward(self, inputs, memories, mask):
         r"""Train Decoder with teacher forcing.
         Args:
             inputs: Encoder outputs.
             memories: Feature frames for teacher-forcing.
             mask: Attention mask for sequence padding.

+        Shapes:
+            - inputs: (B, T, D_out_enc)
+            - memory: (B, T_mel, D_mel)
+            - outputs: (B, T_mel, D_mel)
+            - alignments: (B, T_in, T_out)
+            - stop_tokens: (B, T_out)
         """
         memory = self.get_go_frame(inputs).unsqueeze(0)
         memories = self._reshape_memory(memories)
         memories = torch.cat((memory, memories), dim=0)
         memories = self._update_memory(memories)
-        if speaker_embeddings is not None:
-            memories = torch.cat([memories, speaker_embeddings], dim=-1)
         memories = self.prenet(memories)

         self._init_states(inputs, mask=mask)
@@ -270,7 +332,18 @@ class Decoder(nn.Module):
                                                    outputs, stop_tokens, alignments)
         return outputs, alignments, stop_tokens

-    def inference(self, inputs, speaker_embeddings=None):
+    def inference(self, inputs):
+        r"""Decoder inference without teacher forcing, using
+        the Stopnet to stop the decoder.
+        Args:
+            inputs: Encoder outputs.
+
+        Shapes:
+            - inputs: (B, T, D_out_enc)
+            - outputs: (B, T_mel, D_mel)
+            - alignments: (B, T_in, T_out)
+            - stop_tokens: (B, T_out)
+        """
         memory = self.get_go_frame(inputs)
         memory = self._update_memory(memory)

|
|||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
while True:
|
||||
memory = self.prenet(memory)
|
||||
if speaker_embeddings is not None:
|
||||
memory = torch.cat([memory, speaker_embeddings], dim=-1)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [decoder_output.squeeze(1)]
|
||||
stop_tokens += [stop_token]
|
||||
alignments += [alignment]
|
||||
|
||||
if stop_token > 0.7 and t > inputs.shape[0] / 2:
|
||||
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
||||
break
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
|
@ -315,7 +386,6 @@ class Decoder(nn.Module):
|
|||
self.attention.init_win_idx()
|
||||
self.attention.init_states(inputs)
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
stop_flags = [True, False, False]
|
||||
while True:
|
||||
memory = self.prenet(self.memory_truncated)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
|
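For readers skimming the diff, here is a minimal, self-contained sketch of the stopping rule the new inference loop applies; the threshold, input length, and the random stand-in for `self.decode(...)` are illustrative, not the model's configured values.

```python
import torch

# Illustrative stand-ins for self.stop_threshold and self.max_decoder_steps.
stop_threshold, max_decoder_steps, num_encoder_steps = 0.5, 10000, 120

outputs, t = [], 0
while True:
    t += 1
    stop_token = torch.sigmoid(torch.randn(1))  # stand-in for the stopnet output
    outputs.append(torch.zeros(80))             # stand-in decoder frame
    # stop once the stopnet fires, but only after half the input length
    if stop_token > stop_threshold and t > num_encoder_steps // 2:
        break
    if len(outputs) == max_decoder_steps:
        print(" | > Decoder stopped with 'max_decoder_steps'")
        break
```

The `t > num_encoder_steps // 2` guard keeps a single spurious stopnet activation from truncating the output early in the utterance.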
@@ -2,9 +2,9 @@
import torch
from torch import nn

from TTS.layers.gst_layers import GST
from TTS.layers.tacotron import Decoder, Encoder, PostCBHG
from TTS.models.tacotron_abstract import TacotronAbstract
from mozilla_voice_tts.tts.layers.gst_layers import GST
from mozilla_voice_tts.tts.layers.tacotron import Decoder, Encoder, PostCBHG
from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract


class Tacotron(TacotronAbstract):

@@ -28,6 +28,9 @@ class Tacotron(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=256,
decoder_in_features=256,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=256,
gst_num_heads=4,

@@ -40,31 +43,36 @@ class Tacotron(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 256
encoder_in_features = 512 if num_speakers > 1 else 256
speaker_embedding_dim = 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base model layers
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)

# speaker embedding layers
if self.num_speakers > 1:
    if not self.embeddings_per_sample:
        speaker_embedding_dim = 256
        self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
        self.speaker_embedding.weight.data.normal_(0, 0.3)

# speaker and gst embeddings are concatenated into the decoder input
if self.num_speakers > 1:
    self.decoder_in_features += speaker_embedding_dim  # add speaker embedding dim

# embedding layer
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,

# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r,
                       memory_size, attn_type, attn_win, attn_norm,
                       prenet_type, prenet_dropout, forward_attn,
                       trans_agent, forward_attn_mask, location_attn,
                       attn_K, separate_stopnet, proj_speaker_dim)
                       attn_K, separate_stopnet)
self.postnet = PostCBHG(decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                             postnet_output_dim)
# speaker embedding layers
if num_speakers > 1:
    self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
    self.speaker_embedding.weight.data.normal_(0, 0.3)
    self.speaker_project_mel = nn.Sequential(
        nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
    self.speaker_embeddings = None
    self.speaker_embeddings_projected = None

# global style token layers
if self.gst:
    self.gst_layer = GST(num_mel=80,

@@ -77,13 +85,12 @@ class Tacotron(TacotronAbstract):
# setup DDC
if self.double_decoder_consistency:
    self.coarse_decoder = Decoder(
        decoder_in_features, decoder_output_dim, ddc_r, memory_size,
        self.decoder_in_features, decoder_output_dim, ddc_r, memory_size,
        attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
        forward_attn, trans_agent, forward_attn_mask, location_attn,
        attn_K, separate_stopnet, proj_speaker_dim)
        attn_K, separate_stopnet)


def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
    """
    Shapes:
        - characters: B x T_in

@@ -91,17 +98,9 @@ class Tacotron(TacotronAbstract):
    - mel_specs: B x T_out x D
    - speaker_ids: B x 1
    """
    self._init_states()
    input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
    # B x T_in x embed_dim
    inputs = self.embedding(characters)
    # B x speaker_embed_dim
    if speaker_ids is not None:
        self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        # B x T_in x embed_dim + speaker_embed_dim
        inputs = self._concat_speaker_embedding(inputs,
                                                self.speaker_embeddings)
    # B x T_in x encoder_in_features
    encoder_outputs = self.encoder(inputs)
    # sequence masking

@@ -110,15 +109,20 @@ class Tacotron(TacotronAbstract):
if self.gst:
    # B x gst_dim
    encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# speaker embedding
if self.num_speakers > 1:
    encoder_outputs = self._concat_speaker_embedding(
        encoder_outputs, self.speaker_embeddings)
    if not self.embeddings_per_sample:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
    else:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
    encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(
    encoder_outputs, mel_specs, input_mask,
    self.speaker_embeddings_projected)
    encoder_outputs, mel_specs, input_mask)
# sequence masking
if output_mask is not None:
    decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)

@@ -140,22 +144,22 @@ class Tacotron(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens

@torch.no_grad()
def inference(self, characters, speaker_ids=None, style_mel=None):
def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    inputs = self.embedding(characters)
    self._init_states()
    if speaker_ids is not None:
        self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        inputs = self._concat_speaker_embedding(inputs,
                                                self.speaker_embeddings)
    encoder_outputs = self.encoder(inputs)
    if self.gst and style_mel is not None:
    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
    if self.num_speakers > 1:
        encoder_outputs = self._concat_speaker_embedding(
            encoder_outputs, self.speaker_embeddings)
        if not self.embeddings_per_sample:
            # B x 1 x speaker_embed_dim
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        else:
            # B x 1 x speaker_embed_dim
            speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
    decoder_outputs, alignments, stop_tokens = self.decoder.inference(
        encoder_outputs, self.speaker_embeddings_projected)
        encoder_outputs)
    postnet_outputs = self.postnet(decoder_outputs)
    postnet_outputs = self.last_linear(postnet_outputs)
    decoder_outputs = decoder_outputs.transpose(1, 2)
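The diff does not show `_concat_speaker_embedding` itself, but the shape arithmetic it implies (`decoder_in_features += speaker_embedding_dim`) is simple to verify. A hedged sketch of that concatenation, with illustrative dimensions:

```python
import torch

B, T_in, enc_dim, spk_dim = 2, 50, 256, 256
encoder_outputs = torch.rand(B, T_in, enc_dim)
speaker_embeddings = torch.rand(B, 1, spk_dim)   # one vector per utterance

# expand the per-utterance vector over time and concatenate on the feature axis,
# which is what the decoder-input dimension bump accounts for
speaker_embeddings = speaker_embeddings.expand(B, T_in, spk_dim)
decoder_inputs = torch.cat([encoder_outputs, speaker_embeddings], dim=-1)
print(decoder_inputs.shape)  # torch.Size([2, 50, 512])
```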
@@ -1,10 +1,9 @@
import torch
from torch import nn

from TTS.layers.gst_layers import GST
from TTS.layers.tacotron2 import Decoder, Encoder, Postnet
from TTS.models.tacotron_abstract import TacotronAbstract

from mozilla_voice_tts.tts.layers.gst_layers import GST
from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet
from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract

# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):

@@ -28,6 +27,9 @@ class Tacotron2(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,

@@ -39,46 +41,48 @@ class Tacotron2(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)

# init layer dims
speaker_embedding_dim = 512 if num_speakers > 1 else 0
gst_embedding_dim = gst_embedding_dim if self.gst else 0
decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim
encoder_in_features = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# speaker embedding layer
if self.num_speakers > 1:
    if not self.embeddings_per_sample:
        speaker_embedding_dim = 512
        self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
        self.speaker_embedding.weight.data.normal_(0, 0.3)

# speaker and gst embeddings are concatenated into the decoder input
if self.num_speakers > 1:
    self.decoder_in_features += speaker_embedding_dim  # add speaker embedding dim

# embedding layer
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)

# speaker embedding layer
if num_speakers > 1:
    self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
    self.speaker_embedding.weight.data.normal_(0, 0.3)

self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
                       attn_norm, prenet_type, prenet_dropout,
                       forward_attn, trans_agent, forward_attn_mask,
                       location_attn, attn_K, separate_stopnet, proj_speaker_dim)
                       location_attn, attn_K, separate_stopnet)
self.postnet = Postnet(self.postnet_output_dim)

# global style token layers
if self.gst:
    self.gst_layer = GST(num_mel=80,
                         num_heads=gst_num_heads,
                         num_style_tokens=gst_style_tokens,
                         embedding_dim=gst_embedding_dim)
                         num_heads=self.gst_num_heads,
                         num_style_tokens=self.gst_style_tokens,
                         embedding_dim=self.gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
    self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
    self.coarse_decoder = Decoder(
        decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
        self.decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
        attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
        trans_agent, forward_attn_mask, location_attn, attn_K,
        separate_stopnet, proj_speaker_dim)
        separate_stopnet)

@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):

@@ -86,25 +90,7 @@ class Tacotron2(TacotronAbstract):
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments

def compute_gst(self, inputs, style_input):
    """ Compute global style token """
    device = inputs.device
    if isinstance(style_input, dict):
        query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
        _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
        for k_token, v_amplifier in style_input.items():
            key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
            gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
            gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
    elif style_input is None:
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
    else:
        gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
    embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1)
    return inputs, embedded_gst

def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
    # compute mask for padding
    # B x T_in_max (boolean)
    input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)

@@ -113,20 +99,18 @@ class Tacotron2(TacotronAbstract):
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)

if self.gst:
    # B x gst_dim
    encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)

if self.num_speakers > 1:
    embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
    embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
    if self.gst:
        # B x gst_dim
        encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
        encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
    if not self.embeddings_per_sample:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
    else:
        encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
else:
    if self.gst:
        # B x gst_dim
        encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
        encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        # B x 1 x speaker_embed_dim
        speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
    encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)


@@ -154,24 +138,18 @@ class Tacotron2(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens

@torch.no_grad()
def inference(self, text, speaker_ids=None, style_mel=None):
def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    embedded_inputs = self.embedding(text).transpose(1, 2)
    encoder_outputs = self.encoder.inference(embedded_inputs)

    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

    if self.num_speakers > 1:
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
        else:
            encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
    else:
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        if not self.embeddings_per_sample:
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

    decoder_outputs, alignments, stop_tokens = self.decoder.inference(
        encoder_outputs)

@@ -181,27 +159,21 @@ class Tacotron2(TacotronAbstract):
    decoder_outputs, postnet_outputs, alignments)
return decoder_outputs, postnet_outputs, alignments, stop_tokens

def inference_truncated(self, text, speaker_ids=None, style_mel=None):
def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    """
    Preserve model states for continuous inference
    """
    embedded_inputs = self.embedding(text).transpose(1, 2)
    encoder_outputs = self.encoder.inference_truncated(embedded_inputs)

    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

    if self.num_speakers > 1:
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
        else:
            encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
    else:
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        if not self.embeddings_per_sample:
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

    mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
        encoder_outputs)
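The two speaker paths this change introduces are worth seeing side by side: a learned lookup table when `embeddings_per_sample` is off, and an external per-sample vector (e.g. a d-vector from a speaker encoder) when it is on. A hedged sketch with illustrative dimensions; both branches end with the same `B x 1 x speaker_embed_dim` shape:

```python
import torch

embeddings_per_sample = True        # illustrative switch
speaker_embedding_dim = 512

if not embeddings_per_sample:
    # learned lookup table indexed by speaker id
    speaker_embedding = torch.nn.Embedding(10, speaker_embedding_dim)
    speaker_embeddings = speaker_embedding(torch.tensor([3]))[:, None]
else:
    # external per-utterance vector supplied by the caller
    external = torch.rand(1, speaker_embedding_dim)
    speaker_embeddings = torch.unsqueeze(external, 1)

print(speaker_embeddings.shape)  # torch.Size([1, 1, 512]) in both branches
```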
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
import torch
from torch import nn

from TTS.utils.generic_utils import sequence_mask
from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask


class TacotronAbstract(ABC, nn.Module):

@@ -28,6 +28,9 @@ class TacotronAbstract(ABC, nn.Module):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,

@@ -57,6 +60,9 @@ class TacotronAbstract(ABC, nn.Module):
self.location_attn = location_attn
self.attn_K = attn_K
self.separate_stopnet = separate_stopnet
self.encoder_in_features = encoder_in_features
self.decoder_in_features = decoder_in_features
self.speaker_embedding_dim = speaker_embedding_dim

# layers
self.embedding = None

@@ -64,8 +70,17 @@ class TacotronAbstract(ABC, nn.Module):
self.decoder = None
self.postnet = None

# multispeaker
if self.speaker_embedding_dim is None:
    # if speaker_embedding_dim is None, use nn.Embedding with the default speaker_embedding_dim
    self.embeddings_per_sample = False
else:
    # if speaker_embedding_dim is given, use an external speaker embedding per sample
    self.embeddings_per_sample = True

# global style token
if self.gst:
    self.decoder_in_features += gst_embedding_dim  # add gst embedding dim
    self.gst_layer = None

# model states

@@ -164,11 +179,22 @@ class TacotronAbstract(ABC, nn.Module):
self.speaker_embeddings_projected = self.speaker_project_mel(
    self.speaker_embeddings).squeeze(1)

def compute_gst(self, inputs, mel_specs):
def compute_gst(self, inputs, style_input):
    """ Compute global style token """
    # pylint: disable=not-callable
    gst_outputs = self.gst_layer(mel_specs)
    inputs = self._add_speaker_embedding(inputs, gst_outputs)
    device = inputs.device
    if isinstance(style_input, dict):
        query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
        _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
        for k_token, v_amplifier in style_input.items():
            key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
            gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
            gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
    elif style_input is None:
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
    else:
        gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
    inputs = self._concat_speaker_embedding(inputs, gst_outputs)
    return inputs

@staticmethod
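The dict branch of `compute_gst` lets a caller mix style tokens directly, mapping a token index to an "amplifier" weight. A hedged sketch of just that weighting arithmetic with plain tensors; the attention call is replaced by the raw token here, and all dimensions are illustrative:

```python
import torch

gst_embedding_dim, num_style_tokens = 512, 10
style_tokens = torch.tanh(torch.rand(num_style_tokens, gst_embedding_dim))

# style_input maps token index -> amplifier weight; negative weights subtract a style
style_input = {'0': 0.3, '3': -0.1}
gst_outputs = torch.zeros(1, 1, gst_embedding_dim)
for k_token, v_amplifier in style_input.items():
    token = style_tokens[int(k_token)].view(1, 1, -1)  # stand-in for the attention output
    gst_outputs = gst_outputs + token * v_amplifier
print(gst_outputs.shape)  # torch.Size([1, 1, 512])
```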
@@ -3,6 +3,9 @@ from tensorflow import keras
from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention

# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg

class Linear(keras.layers.Layer):
    def __init__(self, units, use_bias, **kwargs):

@@ -1,10 +1,12 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tf.utils.tf_utils import shape_list
from TTS.tf.layers.common_layers import Prenet, Attention
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper


# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(keras.layers.Layer):
    def __init__(self, filters, kernel_size, activation, **kwargs):
        super(ConvBNBlock, self).__init__(**kwargs)

@@ -1,11 +1,11 @@
import tensorflow as tf
from tensorflow import keras

from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from TTS.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list


#pylint: disable=too-many-ancestors
#pylint: disable=too-many-ancestors, abstract-method
class Tacotron2(keras.models.Model):
    def __init__(self,
                 num_chars,

@@ -105,4 +105,3 @@ class Tacotron2(keras.models.Model):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32)  #pylint: disable=unexpected-keyword-arg
self(input_ids)


@@ -1,6 +1,9 @@
import numpy as np
import tensorflow as tf

# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg

def tf_create_dummy_inputs():
    """ Create dummy inputs for TF Tacotron2 model """

@@ -1,4 +1,3 @@
import os
import datetime
import importlib
import pickle

@@ -78,7 +77,7 @@ def count_parameters(model, c):

def setup_model(num_chars, num_speakers, c, enable_tflite=False):
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
    MyModel = importlib.import_module('mozilla_voice_tts.tts.tf.models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
    if c.model.lower() in "tacotron":
        raise NotImplementedError(' [!] Tacotron model is not ready.')

@@ -39,4 +39,3 @@ def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model


@@ -28,4 +28,4 @@ def convert_tacotron2_to_tflite(model,
def load_tflite_model(tflite_path):
    tflite_model = tf.lite.Interpreter(model_path=tflite_path)
    tflite_model.allocate_tensors()
    return tflite_model
    return tflite_model
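To make the TFLite path concrete, here is a hedged sketch of driving the converted interpreter with the standard `tf.lite.Interpreter` API; the model path, input shape, and character ids are illustrative.

```python
import numpy as np
import tensorflow as tf

# Load the converted model and allocate its tensors (mirrors load_tflite_model above).
interpreter = tf.lite.Interpreter(model_path="tacotron2.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Feed a dummy character-id sequence and fetch the first output tensor.
input_ids = np.random.randint(0, 10, size=(1, 4)).astype(np.int32)
interpreter.resize_tensor_input(input_details[0]['index'], input_ids.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], input_ids)
interpreter.invoke()
outputs = interpreter.get_tensor(output_details[0]['index'])
```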
@@ -74,4 +74,3 @@ class StandardScaler():
X *= self.scale_
X += self.mean_
return X


@@ -1,15 +1,11 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse

import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from TTS.utils.generic_utils import create_experiment_folder
from torch.autograd import Variable
from torch.utils.data.sampler import Sampler


class DistributedSampler(Sampler):

@@ -108,7 +104,7 @@ def apply_gradient_allreduce(module):
for param in list(module.parameters()):

    def allreduce_hook(*_):
        Variable._execution_engine.queue_callback(allreduce_params)
        Variable._execution_engine.queue_callback(allreduce_params)  #pylint: disable=protected-access

    if param.requires_grad:
        param.register_hook(allreduce_hook)

@@ -118,61 +114,3 @@ def apply_gradient_allreduce(module):

module.register_forward_hook(set_needs_reduction)
return module


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv
    )
    args = parser.parse_args()

    # OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
    #                                     True)
    # stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()
@@ -0,0 +1,268 @@
import torch
import importlib
import numpy as np
from collections import Counter

from mozilla_voice_tts.utils.generic_utils import check_argument


def split_dataset(items):
    is_multi_speaker = False
    speakers = [item[-1] for item in items]
    is_multi_speaker = len(set(speakers)) > 1
    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
        len(items) * 0.01)
    assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
    np.random.seed(0)
    np.random.shuffle(items)
    if is_multi_speaker:
        items_eval = []
        # TODO: fix this inefficient speaker-balancing loop
        while len(items_eval) < eval_split_size:
            speakers = [item[-1] for item in items]
            speaker_counter = Counter(speakers)
            item_idx = np.random.randint(0, len(items))
            if speaker_counter[items[item_idx][-1]] > 1:
                items_eval.append(items[item_idx])
                del items[item_idx]
        return items_eval, items
    return items[:eval_split_size], items[eval_split_size:]


# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.to(sequence_length.device)
    seq_length_expand = (
        sequence_length.unsqueeze(1).expand_as(seq_range_expand))
    # B x T_max
    return seq_range_expand < seq_length_expand


def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('mozilla_voice_tts.tts.models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
    if c.model.lower() in "tacotron":
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
                        postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
                        decoder_output_dim=c.audio['num_mels'],
                        gst=c.use_gst,
                        gst_embedding_dim=c.gst['gst_embedding_dim'],
                        gst_num_heads=c.gst['gst_num_heads'],
                        gst_style_tokens=c.gst['gst_style_tokens'],
                        memory_size=c.memory_size,
                        attn_type=c.attention_type,
                        attn_win=c.windowing,
                        attn_norm=c.attention_norm,
                        prenet_type=c.prenet_type,
                        prenet_dropout=c.prenet_dropout,
                        forward_attn=c.use_forward_attn,
                        trans_agent=c.transition_agent,
                        forward_attn_mask=c.forward_attn_mask,
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
                        bidirectional_decoder=c.bidirectional_decoder,
                        double_decoder_consistency=c.double_decoder_consistency,
                        ddc_r=c.ddc_r,
                        speaker_embedding_dim=speaker_embedding_dim)
    elif c.model.lower() == "tacotron2":
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
                        postnet_output_dim=c.audio['num_mels'],
                        decoder_output_dim=c.audio['num_mels'],
                        gst=c.use_gst,
                        gst_embedding_dim=c.gst['gst_embedding_dim'],
                        gst_num_heads=c.gst['gst_num_heads'],
                        gst_style_tokens=c.gst['gst_style_tokens'],
                        attn_type=c.attention_type,
                        attn_win=c.windowing,
                        attn_norm=c.attention_norm,
                        prenet_type=c.prenet_type,
                        prenet_dropout=c.prenet_dropout,
                        forward_attn=c.use_forward_attn,
                        trans_agent=c.transition_agent,
                        forward_attn_mask=c.forward_attn_mask,
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
                        bidirectional_decoder=c.bidirectional_decoder,
                        double_decoder_consistency=c.double_decoder_consistency,
                        ddc_r=c.ddc_r,
                        speaker_embedding_dim=speaker_embedding_dim)
    return model


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
        self.iters = {}

    def __getitem__(self, key):
        return self.avg_values[key]

    def items(self):
        return self.avg_values.items()

    def add_value(self, name, init_val=0, init_iter=0):
        self.avg_values[name] = init_val
        self.iters[name] = init_iter

    def update_value(self, name, value, weighted_avg=False):
        if name not in self.avg_values:
            # add the value if it does not exist yet
            self.add_value(name, init_val=value)
        else:
            # otherwise update the existing value
            if weighted_avg:
                self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
                self.iters[name] += 1
            else:
                self.avg_values[name] = self.avg_values[name] * \
                    self.iters[name] + value
                self.iters[name] += 1
                self.avg_values[name] /= self.iters[name]

    def add_values(self, name_dict):
        for key, value in name_dict.items():
            self.add_value(key, init_val=value)

    def update_values(self, value_dict):
        for key, value in value_dict.items():
            self.update_value(key, value)


def check_config(c):
    check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
    check_argument('run_name', c, restricted=True, val_type=str)
    check_argument('run_description', c, val_type=str)

    # AUDIO
    check_argument('audio', c, restricted=True, val_type=dict)

    # audio processing parameters
    check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
    check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
    check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
    check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
    check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
    check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
    check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)

    # vocabulary parameters
    check_argument('characters', c, restricted=False, val_type=dict)
    check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)

    # normalization parameters
    check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
    check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
    check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
    check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100)
    check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
    check_argument('trim_db', c['audio'], restricted=True, val_type=int)

    # training parameters
    check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
    check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
    check_argument('r', c, restricted=True, val_type=int, min_val=1)
    check_argument('gradual_training', c, restricted=False, val_type=list)
    check_argument('loss_masking', c, restricted=True, val_type=bool)
    check_argument('apex_amp_level', c, restricted=False, val_type=str)
    # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)

    # validation parameters
    check_argument('run_eval', c, restricted=True, val_type=bool)
    check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
    check_argument('test_sentences_file', c, restricted=False, val_type=str)

    # optimizer
    check_argument('noam_schedule', c, restricted=False, val_type=bool)
    check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
    check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
    check_argument('lr', c, restricted=True, val_type=float, min_val=0)
    check_argument('wd', c, restricted=True, val_type=float, min_val=0)
    check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
    check_argument('seq_len_norm', c, restricted=True, val_type=bool)

    # tacotron prenet
    check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
    check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
    check_argument('prenet_dropout', c, restricted=True, val_type=bool)

    # attention
    check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
    check_argument('attention_heads', c, restricted=True, val_type=int)
    check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
    check_argument('windowing', c, restricted=True, val_type=bool)
    check_argument('use_forward_attn', c, restricted=True, val_type=bool)
    check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
    check_argument('transition_agent', c, restricted=True, val_type=bool)
    check_argument('location_attn', c, restricted=True, val_type=bool)
    check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
    check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
    check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)

    # stopnet
    check_argument('stopnet', c, restricted=True, val_type=bool)
    check_argument('separate_stopnet', c, restricted=True, val_type=bool)

    # tensorboard
    check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('checkpoint', c, restricted=True, val_type=bool)
    check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)

    # dataloading
    # pylint: disable=import-outside-toplevel
    from mozilla_voice_tts.tts.utils.text import cleaners
    check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
    check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
    check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
    check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
    check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
    check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
    check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)

    # paths
    check_argument('output_path', c, restricted=True, val_type=str)

    # multi-speaker and gst
    check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
    check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool)
    check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str)
    check_argument('use_gst', c, restricted=True, val_type=bool)
    check_argument('gst', c, restricted=True, val_type=dict)
    check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
    check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
    check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
    check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)

    # datasets - checking only the first entry
    check_argument('datasets', c, restricted=True, val_type=list)
    for dataset_entry in c['datasets']:
        check_argument('name', dataset_entry, restricted=True, val_type=str)
        check_argument('path', dataset_entry, restricted=True, val_type=str)
        check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
        check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
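To make the running-average bookkeeping in `KeepAverage` concrete, here is a small usage sketch; the loss name and values are illustrative. Each unweighted update computes the cumulative mean `avg = (avg * n + value) / (n + 1)`.

```python
keep_avg = KeepAverage()
keep_avg.add_value('avg_loss', init_val=0)

for value in [1.0, 0.5, 0.25]:
    # cumulative mean over all updates so far
    keep_avg.update_value('avg_loss', value)

print(keep_avg['avg_loss'])  # ~0.583 after the three updates
```

With `weighted_avg=True` the same call instead keeps an exponential moving average (`0.99 * old + 0.01 * new`), which reacts slowly to spikes and is the usual choice for noisy per-step losses.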
@@ -1,44 +1,13 @@
import os
import json
import re
import torch
import datetime


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config


def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()


def load_checkpoint(model, checkpoint_path, use_cuda=False):
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    if amp and 'amp' in state:
        amp.load_state_dict(state['amp'])
    if use_cuda:
        model.cuda()
    # set model stepsize

@@ -47,7 +16,7 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False):
return model, state


def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs):
    new_state_dict = model.state_dict()
    state = {
        'model': new_state_dict,

@@ -57,6 +26,8 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
    'date': datetime.date.today().strftime("%B %d, %Y"),
    'r': r
}
if amp_state_dict:
    state['amp'] = amp_state_dict
state.update(kwargs)
torch.save(state, output_path)
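A hedged round trip through these helpers, to show what the checkpoint file carries; the `torch.nn.Linear` stand-in, file name, and step numbers are illustrative, and only the keys shown in the diff (`model`, `date`, `r`, plus `amp` when given) are assumed.

```python
import torch

model = torch.nn.Linear(10, 10)  # stand-in for a TTS model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# save_model stores the weights plus training metadata in one file
save_model(model, optimizer, current_step=1000, epoch=5, r=2,
           output_path='checkpoint_1000.pth.tar')

# the stored dict can be inspected or restored directly
state = torch.load('checkpoint_1000.pth.tar', map_location='cpu')
model.load_state_dict(state['model'])
print(state['r'], state['date'])  # 2 and the save date
```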
@@ -1,6 +1,3 @@
import torch


def alignment_diagonal_score(alignments, binary=False):
    """
    Compute how diagonal alignment predictions are. It is useful

@@ -1,8 +1,6 @@
import os
import json

from TTS.datasets.preprocess import get_preprocessor_by_name


def make_speakers_json_path(out_path):
    """Returns conventional speakers.json location."""

@@ -12,12 +10,15 @@ def make_speakers_json_path(out_path):
def load_speaker_mapping(out_path):
    """Loads speaker mapping if already present."""
    try:
        with open(make_speakers_json_path(out_path)) as f:
        if os.path.splitext(out_path)[1] == '.json':
            json_file = out_path
        else:
            json_file = make_speakers_json_path(out_path)
        with open(json_file) as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_speaker_mapping(out_path, speaker_mapping):
    """Saves speaker mapping if not yet present."""
    speakers_json_path = make_speakers_json_path(out_path)
@@ -39,23 +39,23 @@ def numpy_to_tf(np_array, dtype):

def compute_style_mel(style_wav, ap, cuda=False):
    style_mel = torch.FloatTensor(ap.melspectrogram(
        ap.load_wav(style_wav))).unsqueeze(0)
        ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0)
    if cuda:
        return style_mel.cuda()
    return style_mel


def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None):
    if CONFIG.use_gst:
        decoder_output, postnet_output, alignments, stop_tokens = model.inference(
            inputs, style_mel=style_mel, speaker_ids=speaker_id)
            inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
    else:
        if truncated:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
                inputs, speaker_ids=speaker_id)
                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference(
                inputs, speaker_ids=speaker_id)
                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
    return decoder_output, postnet_output, alignments, stop_tokens


@@ -140,6 +140,15 @@ def id_to_torch(speaker_id, cuda=False):
return speaker_id


def embedding_to_torch(speaker_embedding, cuda=False):
    if speaker_embedding is not None:
        speaker_embedding = np.asarray(speaker_embedding)
        speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
    if cuda:
        return speaker_embedding.cuda()
    return speaker_embedding


# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
    '''Apply griffin-lim to each sample, iterating through the first dimension.

@@ -169,15 +178,16 @@ def synthesis(model,
enable_eos_bos_chars=False,  #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
speaker_embedding=None,
backend='torch'):
    """Synthesize voice for the given text.

    Args:
        model (TTS.models): model to synthesize.
        model (mozilla_voice_tts.tts.models): model to synthesize.
        text (str): target text
        CONFIG (dict): config dictionary to be loaded from config.json.
        use_cuda (bool): enable cuda.
        ap (TTS.utils.audio.AudioProcessor): audio processor to process
        ap (mozilla_voice_tts.tts.utils.audio.AudioProcessor): audio processor to process
            model outputs.
        speaker_id (int): id of speaker
        style_wav (str): Used for style embedding of GST.

@@ -200,6 +210,10 @@ def synthesis(model,
if backend == 'torch':
    if speaker_id is not None:
        speaker_id = id_to_torch(speaker_id, cuda=use_cuda)

    if speaker_embedding is not None:
        speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda)

    if not isinstance(style_mel, dict):
        style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
    inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)

@@ -216,7 +230,7 @@ def synthesis(model,
# synthesize voice
if backend == 'torch':
    decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
        model, inputs, CONFIG, truncated, speaker_id, style_mel)
        model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding)
    postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
        postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
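The new `embedding_to_torch` helper is the bridge between an external embedding (a plain list or array, e.g. loaded from an external speaker embedding file) and the tensor `synthesis()` forwards to the model. A hedged sketch; the 256-dim vector is illustrative:

```python
import numpy as np

# e.g. a d-vector read from an external speaker embedding file
speaker_embedding = np.random.rand(256).tolist()

embedding_tensor = embedding_to_torch(speaker_embedding, cuda=False)
print(embedding_tensor.shape)  # torch.Size([1, 256]) -- batch dim added by unsqueeze(0)
```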
@@ -4,10 +4,11 @@ import re
from packaging import version
import phonemizer
from phonemizer.phonemize import phonemize
from TTS.utils.text import cleaners
from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
from mozilla_voice_tts.tts.utils.text import cleaners
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
    _eos

# pylint: disable=unnecessary-comprehension
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

@@ -67,15 +67,16 @@ def remove_aux_symbols(text):
text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
return text


def replace_symbols(text):
def replace_symbols(text, lang='en'):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ',')
    text = text.replace('&', 'and')
    text = text.replace(':', ' ')
    if lang == 'en':
        text = text.replace('&', 'and')
    elif lang == 'pt':
        text = text.replace('&', ' e ')
    return text


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)

@@ -106,7 +107,6 @@ def basic_turkish_cleaners(text):
text = collapse_whitespace(text)
return text

def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)

@@ -118,6 +118,14 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text

def portuguese_cleaners(text):
    '''Basic pipeline for Portuguese text. There is no need to expand abbreviations
    and numbers; the phonemizer already does that.'''
    text = lowercase(text)
    text = replace_symbols(text, lang='pt')
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text

def phoneme_cleaners(text):
    '''Pipeline for phonemes mode, including number and abbreviation expansion.'''

@@ -31,14 +31,13 @@ def _expand_dollars(m):
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
if dollars:
    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
    return '%s %s' % (dollars, dollar_unit)
elif cents:
if cents:
    cent_unit = 'cent' if cents == 1 else 'cents'
    return '%s %s' % (cents, cent_unit)
else:
    return 'zero dollars'
return 'zero dollars'


def _expand_ordinal(m):
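A quick, hedged illustration of the new language-aware cleaner; the sample sentence is made up, and the exact output depends on the other cleaner helpers shown above.

```python
sample = "Olá; o custo é 10 & 20 -- teste"
print(portuguese_cleaners(sample))
# lowercases, maps '&' to ' e ', drops auxiliary symbols, and collapses whitespace
```

The design choice here is that number and abbreviation expansion stay out of the Portuguese pipeline because the phonemizer handles them, so the cleaner only normalizes symbols.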
@@ -3,10 +3,10 @@ import librosa
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme
from mozilla_voice_tts.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme


def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False):
    if isinstance(alignment, torch.Tensor):
        alignment_ = alignment.detach().cpu().numpy().squeeze()
    else:

@@ -24,10 +24,12 @@ def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
plt.tight_layout()
if title is not None:
    plt.title(title)
if not output_fig:
    plt.close()
return fig


def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
    if isinstance(spectrogram, torch.Tensor):
        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:

@@ -38,10 +40,12 @@ def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
if not output_fig:
    plt.close()
return fig


def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24), output_fig=False):
    if decoder_output is not None:
        num_plot = 4
    else:

@@ -91,3 +95,6 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG,
print(output_path)
fig.savefig(output_path)
plt.close()

if not output_fig:
    plt.close()
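A hedged example of the new `output_fig` flag: keep the Matplotlib figure open for further use (e.g. in a notebook) instead of letting the helper close it; the random spectrogram and file name are illustrative.

```python
import numpy as np

mel = np.random.rand(80, 120)            # illustrative spectrogram, num_mels x frames
fig = plot_spectrogram(mel, output_fig=True)
fig.savefig('mel.png')                   # figure is still alive because output_fig=True
```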
@@ -1,10 +1,10 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.io.wavfile
import scipy.signal

from TTS.utils.data import StandardScaler
from mozilla_voice_tts.tts.utils.data import StandardScaler


class AudioProcessor(object):

@@ -52,7 +52,7 @@ class AudioProcessor(object):
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain)
self.stft_pad_mode = 'reflect'
self.stft_pad_mode = stft_pad_mode
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence

@@ -123,7 +123,7 @@ class AudioProcessor(object):
if self.symmetric_norm:
    S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
    if self.clip_norm:
        S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
        S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)  # pylint: disable=invalid-unary-operand-type
    return S_norm
else:
    S_norm = self.max_norm * S_norm

@@ -148,7 +148,7 @@ class AudioProcessor(object):
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
if self.symmetric_norm:
    if self.clip_norm:
        S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
        S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)  #pylint: disable=invalid-unary-operand-type
    S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
    return S_denorm + self.ref_level_db
else:
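A quick numeric check of the symmetric denormalization line above, with illustrative values `max_norm=4`, `min_level_db=-100`, and `ref_level_db=20`: the normalized range `[-max_norm, max_norm]` maps linearly back to `[min_level_db, 0]` before the reference level is added.

```python
import numpy as np

max_norm, min_level_db, ref_level_db = 4.0, -100.0, 20.0
S_norm = np.array([-4.0, 0.0, 4.0])  # normalized input spanning the full range
S_db = ((S_norm + max_norm) * -min_level_db / (2 * max_norm)) + min_level_db
print(S_db + ref_level_db)           # [-80. -30.  20.]
```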
@@ -1,5 +1,5 @@
import datetime
from TTS.utils.io import AttrDict
from mozilla_voice_tts.utils.io import AttrDict


tcolors = AttrDict({

@@ -15,8 +15,8 @@ tcolors = AttrDict({


class ConsoleLogger():
    # TODO: merge this with TTS ConsoleLogger
    def __init__(self):
        # TODO: color code for value changes
        # use these to compare values between iterations
        self.old_train_loss_dict = None
        self.old_epoch_loss_dict = None

@@ -35,8 +35,7 @@ class ConsoleLogger():
    def print_train_start(self):
        print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")

    def print_train_step(self, batch_steps, step, global_step,
                         step_time, loader_time, lrG, lrD,
    def print_train_step(self, batch_steps, step, global_step, log_dict,
                         loss_dict, avg_loss_dict):
        indent = " | > "
        print()

@@ -48,7 +47,13 @@ class ConsoleLogger():
                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
            else:
                log_text += "{}{}: {:.5f} \n".format(indent, key, value)
        log_text += f"{indent}step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lrG: {lrG}\n{indent}lrD: {lrD}"
        for idx, (key, value) in enumerate(log_dict.items()):
            if isinstance(value, list):
                log_text += f"{indent}{key}: {value[0]:.{value[1]}f}"
            else:
                log_text += f"{indent}{key}: {value}"
            if idx < len(log_dict)-1:
                log_text += "\n"
        print(log_text, flush=True)

    # pylint: disable=unused-argument
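The new `print_train_step` signature replaces the fixed `step_time`/`loader_time`/`lrG`/`lrD` arguments with a free-form `log_dict`. Judging from the format string above, a list value encodes `[value, print_precision]`, while plain values are printed as-is. A hedged sketch of the expected call (all values illustrative):

```python
logger = ConsoleLogger()
logger.print_train_step(
    batch_steps=120, step=35, global_step=14035,
    log_dict={"step_time": [0.42, 2],      # [value, print precision]
              "loader_time": [0.01, 2],
              "current_lr": 1e-4},         # printed without formatting
    loss_dict={"G_loss": 1.234, "D_loss": 0.567},
    avg_loss_dict={"avg_G_loss": 1.301, "avg_D_loss": 0.601})
```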
@@ -0,0 +1,156 @@
import os
import glob
import shutil
import datetime
import subprocess


def get_git_branch():
    try:
        out = subprocess.check_output(["git", "branch"]).decode("utf8")
        current = next(line for line in out.split("\n")
                       if line.startswith("*"))
        current = current.replace("* ", "")  # assign the result; str.replace() does not mutate in place
    except subprocess.CalledProcessError:
        current = "inside_docker"
    return current


def get_commit_hash():
    """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
    # try:
    #     subprocess.check_output(['git', 'diff-index', '--quiet',
    #                              'HEAD'])  # Verify client is clean
    # except:
    #     raise RuntimeError(
    #         " !! Commit before training to get the commit hash.")
    try:
        commit = subprocess.check_output(
            ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    # Not copying .git folder into docker container
    except subprocess.CalledProcessError:
        commit = "0000000"
    print(' > Git Hash: {}'.format(commit))
    return commit


def create_experiment_folder(root_path, model_name, debug):
    """ Create a folder with the current date and time """
    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
    if debug:
        commit_hash = 'debug'
    else:
        commit_hash = get_commit_hash()
    output_folder = os.path.join(
        root_path, model_name + '-' + date_str + '-' + commit_hash)
    os.makedirs(output_folder, exist_ok=True)
    print(" > Experiment folder: {}".format(output_folder))
    return output_folder


def remove_experiment_folder(experiment_path):
    """Check folder if there is a checkpoint, otherwise remove the folder"""

    checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
    if not checkpoint_files:
        if os.path.exists(experiment_path):
            shutil.rmtree(experiment_path, ignore_errors=True)
            print(" ! Run is removed from {}".format(experiment_path))
    else:
        print(" ! Run is kept in {}".format(experiment_path))
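
# Usage sketch (illustrative path): create a run folder, then clean it up
# again; since no '*.pth.tar' checkpoint was written into it, it is deleted.
# run_folder = create_experiment_folder('/tmp/runs', 'melgan', debug=True)
# remove_experiment_folder(run_folder)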


def count_parameters(model):
    r"""Count number of trainable parameters in a network"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def set_init_dict(model_dict, checkpoint_state, c):
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
    for k, v in checkpoint_state.items():
        if k not in model_dict:
            print(" | > Layer missing in the model definition: {}".format(k))
    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v
        for k, v in checkpoint_state.items() if k in model_dict
    }
    # 2. filter out different size layers
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if v.numel() == model_dict[k].numel()
    }
    # 3. skip reinit layers
    if c.reinit_layers is not None:
        for reinit_layer_name in c.reinit_layers:
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if reinit_layer_name not in k
            }
    # 4. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
                                                     len(model_dict)))
    return model_dict
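
# Usage sketch for partial checkpoint restore (assumes `import torch`; the
# tensors and the 'reinit_layers' value are illustrative). Missing layers,
# size mismatches and 'reinit_layers' matches are skipped; the rest is copied.
# _model = torch.nn.Linear(4, 2)
# _ckpt = {'weight': torch.zeros(2, 4), 'bias': torch.zeros(2), 'stale': torch.zeros(3)}
# _c = type('C', (), {'reinit_layers': ['bias']})()
# _model.load_state_dict(set_init_dict(_model.state_dict(), _ckpt, _c))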


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
        self.iters = {}

    def __getitem__(self, key):
        return self.avg_values[key]

    def items(self):
        return self.avg_values.items()

    def add_value(self, name, init_val=0, init_iter=0):
        self.avg_values[name] = init_val
        self.iters[name] = init_iter

    def update_value(self, name, value, weighted_avg=False):
        if name not in self.avg_values:
            # add the value if it does not exist yet
            self.add_value(name, init_val=value)
        else:
            # else update the existing value
            if weighted_avg:
                self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
                self.iters[name] += 1
            else:
                self.avg_values[name] = self.avg_values[name] * \
                    self.iters[name] + value
                self.iters[name] += 1
                self.avg_values[name] /= self.iters[name]

    def add_values(self, name_dict):
        for key, value in name_dict.items():
            self.add_value(key, init_val=value)

    def update_values(self, value_dict):
        for key, value in value_dict.items():
            self.update_value(key, value)
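
# Usage sketch: cumulative running averages over training iterations.
# averages = KeepAverage()
# averages.add_values({'loss': 0.0})
# averages.update_values({'loss': 1.0})
# averages.update_values({'loss': 0.5})
# averages['loss']   # -> 0.75, the mean of the two updates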


def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
    if alternative in c.keys() and c[alternative] is not None:
        return
    if restricted:
        assert name in c.keys(), f' [!] {name} not defined in config.json'
    if name in c.keys():
        if max_val:
            assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
        if min_val:
            assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
        if enum_list:
            assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
        if isinstance(val_type, list):
            is_valid = False
            for typ in val_type:
                if isinstance(c[name], typ):
                    is_valid = True
            assert is_valid or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
        elif val_type:
            assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
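A hedged sketch of how `check_argument` is meant to be used on a loaded config (the keys and bounds here are illustrative, not the project's canonical checks):

```python
c = {'num_mels': 80, 'sample_rate': 22050}

check_argument('num_mels', c, restricted=True, val_type=int, min_val=10, max_val=2056)
check_argument('sample_rate', c, restricted=True, val_type=int, min_val=512, max_val=100000)
# 'mel_fmax' is absent, but the check passes because the alternative key exists:
check_argument('mel_fmax', c, val_type=float, alternative='sample_rate')
```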
@@ -0,0 +1,32 @@
import re
import json


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config
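
# Usage sketch: load_config() strips '//' comments and '\'-style line
# continuations before json.loads(), which is why the commented config files
# below parse cleanly (and why they escape URLs as "tcp:\/\/..." so the
# comment regex does not eat them).
# cfg = load_config('config.json')   # illustrative path
# cfg.num_mels                       # AttrDict allows attribute access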


def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()
@@ -2,7 +2,7 @@

import math
import torch
from torch.optim.optimizer import Optimizer, required
from torch.optim.optimizer import Optimizer


class RAdam(Optimizer):

@@ -25,7 +25,7 @@ class RAdam(Optimizer):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
    def __setstate__(self, state):  # pylint: disable=useless-super-delegation
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
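For reference, a minimal sketch of constructing the optimizer (the import path assumes the `mozilla_voice_tts` package layout introduced by this diff; hyperparameters are illustrative):

```python
import torch
from mozilla_voice_tts.utils.radam import RAdam  # assumed module path

model = torch.nn.Linear(10, 10)
optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=0.0)
```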
@@ -47,7 +47,7 @@ class TensorboardLogger(object):
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
            except:
            except RuntimeError:
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
@@ -13,13 +13,21 @@ def setup_torch_training_env(cudnn_enable, cudnn_benchmark):
    return use_cuda, num_gpus


def check_update(model, grad_clip, ignore_stopnet=False):
def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
    r'''Check model gradient against unexpected jumps and failures'''
    skip_flag = False
    if ignore_stopnet:
        grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
        if not amp_opt_params:
            grad_norm = torch.nn.utils.clip_grad_norm_(
                [param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
        else:
            grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
    else:
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        if not amp_opt_params:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        else:
            grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)

    # compatibility with different torch versions
    if isinstance(grad_norm, float):
        if np.isinf(grad_norm):
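A hedged sketch of the training-step pattern `check_update` supports; with apex/amp, gradients live on the master parameters, which is what the new `amp_opt_params` argument passes through. This assumes the function returns `(grad_norm, skip_flag)`, which is not visible in the hunk above:

```python
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
grad_norm, skip_flag = check_update(model, grad_clip=5.0)  # no amp: clips model.parameters()
if not skip_flag:
    optimizer.step()
```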
@@ -0,0 +1,39 @@
# Mozilla TTS Vocoders (Experimental)

Here are vocoder model implementations that can be combined with the other TTS models.

Currently, the following models are implemented:

- MelGAN
- MultiBand-MelGAN
- ParallelWaveGAN
- GAN-TTS (discriminator only)

It is also easy to adapt other vocoder models, as we provide a flexible and modular (but not too modular) framework.

## Training a model

You can see an example of training MelGAN with the LJSpeech dataset in this [Colab Notebook]() (coming soon).

In order to train a new model, you need to gather all wav files into a folder and set this folder as `data_path` in `config.json`.

You need to define the other relevant parameters in your `config.json` and then start training with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json```

Example config files can be found under the `tts/vocoder/configs/` folder.

You can continue a previous training run with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder```

You can fine-tune a pre-trained model with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```

Restoring a model starts a new training run in a different folder; it only restores model weights from the given checkpoint file. Continuing a training run, however, resumes in the same directory where the previous run left off.

You can also follow your training runs on TensorBoard as you do with our TTS models.

## Acknowledgement
Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work.
@@ -0,0 +1,144 @@
{
    "run_name": "multiband-melgan",
    "run_description": "multiband melgan mean-var scaling",

    // AUDIO PARAMETERS
    "audio":{
        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,      // stft window length in ms.
        "hop_length": 256,       // stft window hop-length in ms.
        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 0,    // reference level db, theoretically 20db is the sound of air.

        // Silence trimming
        "do_trim_silence": true, // enable trimming of silence as you load the audio. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,     // size of the mel spec frame.
        "mel_fmin": 50.0,   // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 1.0,   // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": "/home/erogol/Data/MozillaMerged22050/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },

    // DISTRIBUTED TRAINING
    // "distributed":{
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },

    // MODEL PARAMETERS
    "use_pqmf": true,

    // LOSS PARAMETERS
    "use_stft_loss": true,
    "use_subband_stft_loss": true,
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
    "use_feat_match_loss": false, // use only with melgan discriminators

    // loss weights
    "stft_loss_weight": 0.5,
    "subband_stft_loss_weight": 0.5,
    "mse_G_loss_weight": 2.5,
    "hinge_G_loss_weight": 2.5,
    "feat_match_loss_weight": 25,

    // multiscale stft loss parameters
    "stft_loss_params": {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240]
    },

    // subband multiscale stft loss parameters
    "subband_stft_loss_params":{
        "n_ffts": [384, 683, 171],
        "hop_lengths": [30, 60, 10],
        "win_lengths": [150, 300, 60]
    },

    "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch

    // DISCRIMINATOR
    "discriminator_model": "melgan_multiscale_discriminator",
    "discriminator_model_params":{
        "base_channels": 16,
        "max_channels": 512,
        "downsample_factors": [4, 4, 4]
    },
    "steps_to_start_discriminator": 200000, // steps required to start GAN training.

    // GENERATOR
    "generator_model": "multiband_melgan_generator",
    "generator_model_params": {
        "upsample_factors": [8, 4, 2],
        "num_res_blocks": 4
    },

    // DATASET
    "data_path": "/home/erogol/Data/MozillaMerged22050/wavs/",
    "feature_path": null,
    "seq_len": 16384,
    "pad_short": 2000,
    "conv_pad": 0,
    "use_noise_augment": false,
    "use_cache": true,

    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10,     // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

    // OPTIMIZER
    "epochs": 10000,      // total number of epochs to train.
    "wd": 0.0,            // Weight decay weight.
    "gen_clip_grad": -1,  // Generator gradient clipping threshold. Apply gradient clipping if > 0
    "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
    "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_gen_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_disc_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_disc": 1e-4,

    // TENSORBOARD and LOGGING
    "print_step": 25,    // Number of steps to log training on console.
    "print_eval": false, // If True, it prints loss values for each step in eval run.
    "save_step": 25000,  // Number of training steps expected to plot training stats on TB and save model checkpoints.
    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,     // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4, // number of evaluation data loader processes.
    "eval_split_size": 10,

    // PATHS
    "output_path": "/home/erogol/Models/Mozilla/"
}
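These commented JSON files are parsed with the `load_config` helper shown earlier, which strips the `//` comments before `json.loads`. A hedged sketch of reading this config (the file path and import path are assumptions):

```python
from mozilla_voice_tts.utils.io import load_config  # assumed module path

c = load_config('mozilla_voice_tts/vocoder/configs/multiband_melgan_config.json')  # illustrative path
print(c.generator_model)        # 'multiband_melgan_generator'
print(c.audio['sample_rate'])   # 22050
```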
@@ -0,0 +1,143 @@
{
    "run_name": "pwgan",
    "run_description": "parallel-wavegan training",

    // AUDIO PARAMETERS
    "audio":{
        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,      // stft window length in ms.
        "hop_length": 256,       // stft window hop-length in ms.
        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 0,    // reference level db, theoretically 20db is the sound of air.

        // Silence trimming
        "do_trim_silence": true, // enable trimming of silence as you load the audio. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,     // size of the mel spec frame.
        "mel_fmin": 50.0,   // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 1.0,   // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },

    // DISTRIBUTED TRAINING
    // "distributed":{
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },

    // MODEL PARAMETERS
    "use_pqmf": true,

    // LOSS PARAMETERS
    "use_stft_loss": true,
    "use_subband_stft_loss": false, // USE ONLY WITH MULTIBAND MODELS
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
    "use_feat_match_loss": false, // use only with melgan discriminators

    // loss weights
    "stft_loss_weight": 0.5,
    "subband_stft_loss_weight": 0.5,
    "mse_G_loss_weight": 2.5,
    "hinge_G_loss_weight": 2.5,
    "feat_match_loss_weight": 25,

    // multiscale stft loss parameters
    "stft_loss_params": {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240]
    },

    // subband multiscale stft loss parameters
    "subband_stft_loss_params":{
        "n_ffts": [384, 683, 171],
        "hop_lengths": [30, 60, 10],
        "win_lengths": [150, 300, 60]
    },

    "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch

    // DISCRIMINATOR
    "discriminator_model": "parallel_wavegan_discriminator",
    "discriminator_model_params":{
        "num_layers": 10
    },
    "steps_to_start_discriminator": 200000, // steps required to start GAN training.

    // GENERATOR
    "generator_model": "parallel_wavegan_generator",
    "generator_model_params": {
        "upsample_factors": [4, 4, 4, 4],
        "stacks": 3,
        "num_res_blocks": 30
    },

    // DATASET
    "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
    "feature_path": null,
    "seq_len": 25600,
    "pad_short": 2000,
    "conv_pad": 0,
    "use_noise_augment": false,
    "use_cache": true,

    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 6, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10,     // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

    // OPTIMIZER
    "epochs": 10000,      // total number of epochs to train.
    "wd": 0.0,            // Weight decay weight.
    "gen_clip_grad": -1,  // Generator gradient clipping threshold. Apply gradient clipping if > 0
    "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
    "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_gen_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_disc_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_disc": 1e-4,

    // TENSORBOARD and LOGGING
    "print_step": 25,    // Number of steps to log training on console.
    "print_eval": false, // If True, it prints loss values for each step in eval run.
    "save_step": 25000,  // Number of training steps expected to plot training stats on TB and save model checkpoints.
    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,     // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4, // number of evaluation data loader processes.
    "eval_split_size": 10,

    // PATHS
    "output_path": "/home/erogol/Models/LJSpeech/"
}
@@ -306,4 +306,4 @@ class DiscriminatorLoss(nn.Module):
            loss += hinge_D_loss

        return_dict['D_loss'] = loss
        return return_dict
        return return_dict
@@ -0,0 +1,87 @@
import torch
from torch.nn import functional as F


class ResidualBlock(torch.nn.Module):
    """Residual block module in WaveNet."""
    def __init__(self,
                 kernel_size=3,
                 res_channels=64,
                 gate_channels=128,
                 skip_channels=64,
                 aux_channels=80,
                 dropout=0.0,
                 dilation=1,
                 bias=True,
                 use_causal_conv=False):
        super(ResidualBlock, self).__init__()
        self.dropout = dropout
        # no future time stamps available
        if use_causal_conv:
            padding = (kernel_size - 1) * dilation
        else:
            assert (kernel_size -
                    1) % 2 == 0, "Even kernel sizes are not supported."
            padding = (kernel_size - 1) // 2 * dilation
        self.use_causal_conv = use_causal_conv

        # dilated conv
        self.conv = torch.nn.Conv1d(res_channels,
                                    gate_channels,
                                    kernel_size,
                                    padding=padding,
                                    dilation=dilation,
                                    bias=bias)

        # local conditioning
        if aux_channels > 0:
            self.conv1x1_aux = torch.nn.Conv1d(aux_channels,
                                               gate_channels,
                                               1,
                                               bias=False)
        else:
            self.conv1x1_aux = None

        # conv output is split into two groups
        gate_out_channels = gate_channels // 2
        self.conv1x1_out = torch.nn.Conv1d(gate_out_channels,
                                           res_channels,
                                           1,
                                           bias=bias)
        self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels,
                                            skip_channels,
                                            1,
                                            bias=bias)

    def forward(self, x, c):
        """
        x: B x D_res x T
        c: B x D_aux x T
        """
        residual = x
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv(x)

        # remove future time steps if use_causal_conv is set
        x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x

        # split into two parts for gated activation
        splitdim = 1
        xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)

        # local conditioning
        if c is not None:
            assert self.conv1x1_aux is not None
            c = self.conv1x1_aux(c)
            ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
            xa, xb = xa + ca, xb + cb

        x = torch.tanh(xa) * torch.sigmoid(xb)

        # for skip connection
        s = self.conv1x1_skip(x)

        # for residual connection
        x = (self.conv1x1_out(x) + residual) * (0.5**2)

        return x, s
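A minimal sketch exercising the block with its default channel sizes; shapes follow the `forward` docstring (`x`: B x D_res x T, `c`: B x D_aux x T), and the time length is preserved:

```python
import torch

block = ResidualBlock(kernel_size=3, res_channels=64, gate_channels=128,
                      skip_channels=64, aux_channels=80)
x = torch.randn(2, 64, 100)     # residual-channel input
c = torch.randn(2, 80, 100)     # local conditioning (e.g. mel frames)
out, skip = block(x, c)
print(out.shape, skip.shape)    # torch.Size([2, 64, 100]) torch.Size([2, 64, 100])
```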
@@ -0,0 +1,101 @@
import torch
from torch.nn import functional as F


class Stretch2d(torch.nn.Module):
    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Stretch2d, self).__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.mode = mode

    def forward(self, x):
        """
        x (Tensor): Input tensor (B, C, F, T).
        Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale)
        """
        return F.interpolate(
            x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode)


class UpsampleNetwork(torch.nn.Module):
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 upsample_factors,
                 nonlinear_activation=None,
                 nonlinear_activation_params={},
                 interpolate_mode="nearest",
                 freq_axis_kernel_size=1,
                 use_causal_conv=False,
                 ):
        super(UpsampleNetwork, self).__init__()
        self.use_causal_conv = use_causal_conv
        self.up_layers = torch.nn.ModuleList()
        for scale in upsample_factors:
            # interpolation layer
            stretch = Stretch2d(scale, 1, interpolate_mode)
            self.up_layers += [stretch]

            # conv layer
            assert (freq_axis_kernel_size - 1) % 2 == 0, "Even freq axis kernel sizes are not supported."
            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
            kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
            if use_causal_conv:
                padding = (freq_axis_padding, scale * 2)
            else:
                padding = (freq_axis_padding, scale)
            conv = torch.nn.Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
            self.up_layers += [conv]

            # nonlinear
            if nonlinear_activation is not None:
                nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)
                self.up_layers += [nonlinear]

    def forward(self, c):
        """
        c : (B, C, T_in).
        Tensor: (B, C, T_upsample)
        """
        c = c.unsqueeze(1)  # (B, 1, C, T)
        for f in self.up_layers:
            c = f(c)
        return c.squeeze(1)  # (B, C, T')


class ConvUpsample(torch.nn.Module):
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 upsample_factors,
                 nonlinear_activation=None,
                 nonlinear_activation_params={},
                 interpolate_mode="nearest",
                 freq_axis_kernel_size=1,
                 aux_channels=80,
                 aux_context_window=0,
                 use_causal_conv=False
                 ):
        super(ConvUpsample, self).__init__()
        self.aux_context_window = aux_context_window
        self.use_causal_conv = use_causal_conv and aux_context_window > 0
        # To capture wide-context information in conditional features
        kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
        # NOTE(kan-bayashi): Here do not use padding because the input is already padded
        self.conv_in = torch.nn.Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False)
        self.upsample = UpsampleNetwork(
            upsample_factors=upsample_factors,
            nonlinear_activation=nonlinear_activation,
            nonlinear_activation_params=nonlinear_activation_params,
            interpolate_mode=interpolate_mode,
            freq_axis_kernel_size=freq_axis_kernel_size,
            use_causal_conv=use_causal_conv,
        )

    def forward(self, c):
        """
        c : (B, C, T_in).
        Tensor: (B, C, T_upsampled)
        """
        c_ = self.conv_in(c)
        c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
        return self.upsample(c)
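A minimal sketch of the upsampler: with `upsample_factors=[4, 4, 4, 4]` the time axis is stretched by their product (256, i.e. the `hop_length` in the configs above), turning mel frames into sample-rate resolution:

```python
import torch

net = ConvUpsample(upsample_factors=[4, 4, 4, 4], aux_channels=80, aux_context_window=0)
c = torch.randn(2, 80, 10)   # (B, num_mels, frames)
print(net(c).shape)          # torch.Size([2, 80, 2560]) -> 10 frames * 256
```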
@@ -2,7 +2,7 @@ import torch
from torch import nn
from torch.nn.utils import weight_norm

from TTS.vocoder.layers.melgan import ResidualStack
from mozilla_voice_tts.vocoder.layers.melgan import ResidualStack


class MelganGenerator(nn.Module):

@@ -95,4 +95,3 @@ class MelganGenerator(nn.Module):
                nn.utils.remove_weight_norm(layer)
            except ValueError:
                layer.remove_weight_norm()
@@ -1,6 +1,6 @@
from torch import nn

from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
from mozilla_voice_tts.vocoder.models.melgan_discriminator import MelganDiscriminator


class MelganMultiscaleDiscriminator(nn.Module):

@@ -38,4 +38,4 @@ class MelganMultiscaleDiscriminator(nn.Module):
            scores.append(score)
            feats.append(feat)
            x = self.pooling(x)
        return scores, feats
        return scores, feats