From cd889578aaa4f7c935cd15f813f166ece343bdb9 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Wed, 3 Mar 2021 19:59:21 +0100
Subject: [PATCH 01/43] Add resample script

---
 TTS/bin/resample.py | 69 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 TTS/bin/resample.py

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
new file mode 100644
index 00000000..41eced24
--- /dev/null
+++ b/TTS/bin/resample.py
@@ -0,0 +1,69 @@
+import argparse
+import glob
+import shutil
+import librosa
+from argparse import RawTextHelpFormatter
+from multiprocessing import Pool
+from tqdm import tqdm
+
+def resample_file(filename):
+    global args
+    y, sr = librosa.load(filename, sr=args.output_sr)
+    librosa.output.write_wav(filename, y, sr)
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description='''Resample a folder recursively with librosa
+Can be used in place or create a copy of the folder as an output.\n\n'''
+
+'''
+Example run:
+    python TTS/bin/resample.py
+        --input_dir /root/LJSpeech-1.1/
+        --output_sr 22050
+        --output_dir /root/resampled_LJSpeech-1.1/
+        --n_jobs 24
+''',
+        formatter_class=RawTextHelpFormatter)
+
+    parser.add_argument('--input_dir',
+                        type=str,
+                        default=None,
+                        required=True,
+                        help='Path of the folder containing the audio files to resample')
+
+    parser.add_argument('--output_sr',
+                        type=int,
+                        default=22050,
+                        required=False,
+                        help='Sample rate to which the audio files should be resampled')
+
+    parser.add_argument('--output_dir',
+                        type=str,
+                        default=None,
+                        required=False,
+                        help='Path of the destination folder. If not defined, the operation is done in place')
+
+    parser.add_argument('--n_jobs',
+                        type=int,
+                        default=None,
+                        help='Number of threads to use, by default it uses all cores')
+
+    args = parser.parse_args()
+
+    if args.output_dir:
+        print('Recursively copying the input folder...')
+        shutil.copytree(args.input_dir, args.output_dir)
+        args.input_dir = args.output_dir
+
+    print('Resampling the audio files...')
+    audio_files = glob.glob(args.input_dir+'**/*.wav', recursive=True)
+    print(f'Found {len(audio_files)} files...')
+    with Pool(processes=args.n_jobs) as p:
+        with tqdm(total=len(audio_files)) as pbar:
+            for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
+                pbar.update()
+
+    print('Done !')
+    
\ No newline at end of file

From fba0c828cdd5e347e9f95cf478c07c350d3478e9 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Wed, 3 Mar 2021 21:50:50 +0100
Subject: [PATCH 02/43] Using path.join instead of concat

---
 TTS/bin/resample.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 41eced24..c5f2b5de 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,5 +1,6 @@
 import argparse
 import glob
+import os
 import shutil
 import librosa
 from argparse import RawTextHelpFormatter
@@ -58,7 +59,7 @@ Example run:
         args.input_dir = args.output_dir
 
     print('Resampling the audio files...')
-    audio_files = glob.glob(args.input_dir+'**/*.wav', recursive=True)
+    audio_files = glob.glob(os.path.join(args.input_dir,'**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:

From e769a959167825214e3fdfce3b639a0fef73f07e Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:44:41 +0100
Subject: [PATCH 03/43] linter + test

---
 TTS/bin/resample.py | 26 ++++++++++++--------------
 run_tests.sh        |  1 +
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index c5f2b5de..42de7080 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -16,16 +16,14 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(
         description='''Resample a folder recursively with librosa
-Can be used in place or create a copy of the folder as an output.\n\n'''
-
-'''
-Example run:
-    python TTS/bin/resample.py
-        --input_dir /root/LJSpeech-1.1/
-        --output_sr 22050
-        --output_dir /root/resampled_LJSpeech-1.1/
-        --n_jobs 24
-''',
+    Can be used in place or create a copy of the folder as an output.\n\n
+    Example run:
+        python TTS/bin/resample.py
+            --input_dir /root/LJSpeech-1.1/
+            --output_sr 22050
+            --output_dir /root/resampled_LJSpeech-1.1/
+            --n_jobs 24
+    ''',
         formatter_class=RawTextHelpFormatter)
 
     parser.add_argument('--input_dir',
@@ -33,7 +31,7 @@ Example run:
                         default=None,
                         required=True,
                         help='Path of the folder containing the audio files to resample')
-    
+
     parser.add_argument('--output_sr',
                         type=int,
                         default=22050,
@@ -45,7 +43,7 @@ Example run:
                         default=None,
                         required=False,
                         help='Path of the destination folder. If not defined, the operation is done in place')
-    
+
     parser.add_argument('--n_jobs',
                         type=int,
                         default=None,
@@ -55,11 +53,11 @@ Example run:
 
     if args.output_dir:
         print('Recursively copying the input folder...')
-        shutil.copytree(args.input_dir, args.output_dir)
+        copy_tree(args.input_dir, args.output_dir)
         args.input_dir = args.output_dir
 
     print('Resampling the audio files...')
-    audio_files = glob.glob(os.path.join(args.input_dir,'**/*.wav'), recursive=True)
+    audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:
diff --git a/run_tests.sh b/run_tests.sh
index c562027c..3972306c 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -6,6 +6,7 @@ nosetests tests -x &&\
 
 # runtime tests
 ./tests/test_demo_server.sh && \
+./tests/test_resample.sh && \
 ./tests/test_tacotron_train.sh && \
 ./tests/test_glow-tts_train.sh && \
 ./tests/test_vocoder_gan_train.sh && \

From c1742c99281876514cf889618d7517f92b702c7b Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:50:03 +0100
Subject: [PATCH 04/43] test case

---
 TTS/bin/resample.py           |  2 +-
 tests/test_resample.sh        | 16 ++++++++++++++++
 tests/test_vocoder_pqmf.py    |  4 ++--
 tests/test_vocoder_tf_pqmf.py |  4 ++--
 4 files changed, 21 insertions(+), 5 deletions(-)
 create mode 100755 tests/test_resample.sh

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 42de7080..54599b8e 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,8 +1,8 @@
 import argparse
 import glob
 import os
-import shutil
 import librosa
+from distutils.dir_util import copy_tree
 from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
diff --git a/tests/test_resample.sh b/tests/test_resample.sh
new file mode 100755
index 00000000..ddae17ad
--- /dev/null
+++ b/tests/test_resample.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -xe
+BASEDIR=$(dirname "$0")
+TARGET_SR=16000
+echo "$BASEDIR"
+#run the resample script
+python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
+#check samplerate of output
+OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
+OUT_SR=$(($OUT_SR + 0))
+if [[ $OUT_SR -ne $TARGET_SR ]]; then
+    echo "Mismatch between target and output sample rates"
+    exit 1
+fi
+#cleaning up
+rm -rf $BASEDIR/outputs/resample_tests
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 1f141dd2..94e6ed01 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -4,7 +4,7 @@ import torch
 import soundfile as sf
 from librosa.core import load
 
-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.layers.pqmf import PQMF
 
 
@@ -24,4 +24,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
+    sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr)
diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py
index a1c4f692..c80def60 100644
--- a/tests/test_vocoder_tf_pqmf.py
+++ b/tests/test_vocoder_tf_pqmf.py
@@ -4,7 +4,7 @@ import tensorflow as tf
 import soundfile as sf
 from librosa.core import load
 
-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.tf.layers.pqmf import PQMF
 
 
@@ -25,4 +25,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
+    sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr)

From a1839d32454752b2b4fe6add909cd4375302866a Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 19:56:50 +0100
Subject: [PATCH 05/43] fix french_cleaners

---
 TTS/tts/utils/text/abbreviations.py | 76 ++++++++++++++++-------------
 TTS/tts/utils/text/cleaners.py      |  4 +---
 2 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index fe4c1cdc..bc2f4830 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
 ]]
 
 # List of (regular expression, replacement) pairs for abbreviations in french:
-abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
-                    for x in [
-                        ('M', 'monsieur'),
-                        ('Mlle', 'mademoiselle'),
-                        ('Mlles', 'mesdemoiselles'),
-                        ('Mme', 'Madame'),
-                        ('Mmes', 'Mesdames'),
-                        ('N.B', 'nota bene'),
-                        ('M', 'monsieur'),
-                        ('p.c.q', 'parce que'),
-                        ('Pr', 'professeur'),
-                        ('qqch', 'quelque chose'),
-                        ('rdv', 'rendez-vous'),
-                        ('max', 'maximum'),
-                        ('min', 'minimum'),
-                        ('no', 'numéro'),
-                        ('adr', 'adresse'),
-                        ('dr', 'docteur'),
-                        ('st', 'saint'),
-                        ('co', 'companie'),
-                        ('jr', 'junior'),
-                        ('sgt', 'sergent'),
-                        ('capt', 'capitain'),
-                        ('col', 'colonel'),
-                        ('av', 'avenue'),
-                        ('av. J.-C', 'avant Jésus-Christ'),
-                        ('apr. J.-C', 'après Jésus-Christ'),
-                        ('art', 'article'),
-                        ('boul', 'boulevard'),
-                        ('c.-à-d', 'c’est-à-dire'),
-                        ('etc', 'et cetera'),
-                        ('ex', 'exemple'),
-                        ('excl', 'exclusivement'),
-                        ('boul', 'boulevard'),
-                    ]]
+abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+                for x in [
+                    ('M', 'monsieur'),
+                    ('Mlle', 'mademoiselle'),
+                    ('Mlles', 'mesdemoiselles'),
+                    ('Mme', 'Madame'),
+                    ('Mmes', 'Mesdames'),
+                    ('N.B', 'nota bene'),
+                    ('M', 'monsieur'),
+                    ('p.c.q', 'parce que'),
+                    ('Pr', 'professeur'),
+                    ('qqch', 'quelque chose'),
+                    ('rdv', 'rendez-vous'),
+                    ('max', 'maximum'),
+                    ('min', 'minimum'),
+                    ('no', 'numéro'),
+                    ('adr', 'adresse'),
+                    ('dr', 'docteur'),
+                    ('st', 'saint'),
+                    ('co', 'companie'),
+                    ('jr', 'junior'),
+                    ('sgt', 'sergent'),
+                    ('capt', 'capitain'),
+                    ('col', 'colonel'),
+                    ('av', 'avenue'),
+                    ('av. J.-C', 'avant Jésus-Christ'),
+                    ('apr. J.-C', 'après Jésus-Christ'),
+                    ('art', 'article'),
+                    ('boul', 'boulevard'),
+                    ('c.-à-d', 'c’est-à-dire'),
+                    ('etc', 'et cetera'),
+                    ('ex', 'exemple'),
+                    ('excl', 'exclusivement'),
+                    ('boul', 'boulevard'),
+                ]] + [(re.compile('\\b%s' % x[0]), x[1])
+                    for x in [
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                    ]]
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 49a25557..c7a2b91a 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -108,8 +108,8 @@ def english_cleaners(text):
 
 def french_cleaners(text):
     '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
-    text = lowercase(text)
     text = expand_abbreviations(text, lang='fr')
+    text = lowercase(text)
     text = replace_symbols(text, lang='fr')
     text = remove_aux_symbols(text)
     text = collapse_whitespace(text)
@@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str:
     text = replace_numbers_to_characters_in_text(text)
     return text
 
-
-
 def phoneme_cleaners(text):
     '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
     text = expand_numbers(text)

From 16ce4e4805962bbd8e418d51cec3814bea434673 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Sat, 6 Mar 2021 23:24:12 +0100
Subject: [PATCH 06/43] fix linter issues

---
 TTS/bin/resample.py                 |  7 ++++---
 TTS/tts/utils/text/abbreviations.py | 12 ++++++------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 54599b8e..aa3f9a37 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -7,9 +7,9 @@ from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
 
-def resample_file(filename):
-    global args
-    y, sr = librosa.load(filename, sr=args.output_sr)
+def resample_file(func_args):
+    filename, output_sr = func_args
+    y, sr = librosa.load(filename, sr=output_sr)
     librosa.output.write_wav(filename, y, sr)
 
 if __name__ == '__main__':
@@ -59,6 +59,7 @@ if __name__ == '__main__':
     print('Resampling the audio files...')
     audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
+    audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr]))
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:
             for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index bc2f4830..3cafc65b 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -59,9 +59,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                     ('excl', 'exclusivement'),
                     ('boul', 'boulevard'),
                 ]] + [(re.compile('\\b%s' % x[0]), x[1])
-                    for x in [
-                        ('Mlle', 'mademoiselle'),
-                        ('Mlles', 'mesdemoiselles'),
-                        ('Mme', 'Madame'),
-                        ('Mmes', 'Mesdames'),
-                    ]]
+                     for x in [
+                         ('Mlle', 'mademoiselle'),
+                         ('Mlles', 'mesdemoiselles'),
+                         ('Mme', 'Madame'),
+                         ('Mmes', 'Mesdames'),
+                     ]]

From 10d7f6df022a8c9e416c11bce595b0f5f2d28c0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:08:52 +0100
Subject: [PATCH 07/43] add more CI tests

---
 .github/workflows/main.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a5b22f5b..d7a24a99 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -58,3 +58,5 @@ jobs:
         ./tests/test_vocoder_wavegrad_train.sh
         ./tests/test_vocoder_wavernn_train.sh
         ./tests/test_speedy_speech_train.sh
+        ./tests/test_resample.sh
+        ./tests/test_compute_statistics.sh

From 0cdb100536e4384ef452454a85ac06de64e126b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:11:28 +0100
Subject: [PATCH 08/43] update version 0.0.11

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index eb2b4c33..de277655 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version)
 )
 
 
-version = '0.0.10.2'
+version = '0.0.11'
 cwd = os.path.dirname(os.path.abspath(__file__))
 
 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors

From b46498ca68bfda6e49a768c454220c440e4f6c3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:35:47 +0100
Subject: [PATCH 09/43] linter fix

---
 TTS/tts/utils/text/abbreviations.py | 79 ++++++++++++++++-------------
 tests/test_vocoder_pqmf.py          |  3 ++-
 tests/test_vocoder_tf_pqmf.py       |  3 ++-
 3 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index 3cafc65b..579d7dcd 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -25,43 +25,42 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
 
 # List of (regular expression, replacement) pairs for abbreviations in french:
 abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                for x in [
-                    ('M', 'monsieur'),
-                    ('Mlle', 'mademoiselle'),
-                    ('Mlles', 'mesdemoiselles'),
-                    ('Mme', 'Madame'),
-                    ('Mmes', 'Mesdames'),
-                    ('N.B', 'nota bene'),
-                    ('M', 'monsieur'),
-                    ('p.c.q', 'parce que'),
-                    ('Pr', 'professeur'),
-                    ('qqch', 'quelque chose'),
-                    ('rdv', 'rendez-vous'),
-                    ('max', 'maximum'),
-                    ('min', 'minimum'),
-                    ('no', 'numéro'),
-                    ('adr', 'adresse'),
-                    ('dr', 'docteur'),
-                    ('st', 'saint'),
-                    ('co', 'companie'),
-                    ('jr', 'junior'),
-                    ('sgt', 'sergent'),
-                    ('capt', 'capitain'),
-                    ('col', 'colonel'),
-                    ('av', 'avenue'),
-                    ('av. J.-C', 'avant Jésus-Christ'),
-                    ('apr. J.-C', 'après Jésus-Christ'),
-                    ('art', 'article'),
-                    ('boul', 'boulevard'),
-                    ('c.-à-d', 'c’est-à-dire'),
-                    ('etc', 'et cetera'),
-                    ('ex', 'exemple'),
-                    ('excl', 'exclusivement'),
-                    ('boul', 'boulevard'),
-                ]] + [(re.compile('\\b%s' % x[0]), x[1])
-                     for x in [
-                         ('Mlle', 'mademoiselle'),
-                         ('Mlles', 'mesdemoiselles'),
-                         ('Mme', 'Madame'),
-                         ('Mmes', 'Mesdames'),
-                     ]]
+                    for x in [
+                        ('M', 'monsieur'),
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                        ('N.B', 'nota bene'),
+                        ('M', 'monsieur'),
+                        ('p.c.q', 'parce que'),
+                        ('Pr', 'professeur'),
+                        ('qqch', 'quelque chose'),
+                        ('rdv', 'rendez-vous'),
+                        ('max', 'maximum'),
+                        ('min', 'minimum'),
+                        ('no', 'numéro'),
+                        ('adr', 'adresse'),
+                        ('dr', 'docteur'),
+                        ('st', 'saint'),
+                        ('co', 'companie'),
+                        ('jr', 'junior'),
+                        ('sgt', 'sergent'),
+                        ('capt', 'capitain'),
+                        ('col', 'colonel'),
+                        ('av', 'avenue'),
+                        ('av. J.-C', 'avant Jésus-Christ'),
+                        ('apr. J.-C', 'après Jésus-Christ'),
+                        ('art', 'article'),
+                        ('boul', 'boulevard'),
+                        ('c.-à-d', 'c’est-à-dire'),
+                        ('etc', 'et cetera'),
+                        ('ex', 'exemple'),
+                        ('excl', 'exclusivement'),
+                        ('boul', 'boulevard'),
+                    ]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                    ]]
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 94e6ed01..74da451f 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -24,4 +24,5 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr)
+    sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'),
+             w2_.flatten().detach(), sr)
diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py
index c80def60..16c46b2a 100644
--- a/tests/test_vocoder_tf_pqmf.py
+++ b/tests/test_vocoder_tf_pqmf.py
@@ -25,4 +25,5 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr)
+    sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'),
+             w2_.flatten(), sr)

From 9ce29d8094de4dcc756edaf7653c0e027a12c1da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 16:14:55 +0100
Subject: [PATCH 10/43] update CI config

---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d7a24a99..d59e9a6c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -35,8 +35,8 @@ jobs:
         run: cat /etc/os-release
       - name: Install dependencies
         run: |
-          sudo apt update
-          sudo apt install espeak-ng git
+          apt update
+          apt install -y espeak-ng git
       - name: Upgrade pip
         # so we can take advantage of pyproject.toml build-dependency support
         run: python3 -m pip install --upgrade pip

From eabd7e6a52cb6d15446d3c5b399f14254c6a3d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:14:53 +0100
Subject: [PATCH 11/43] fix #374

---
 .github/workflows/main.yml                    |  2 +-
 TTS/bin/train_tacotron.py                     |  2 +-
 TTS/tts/layers/losses.py                      | 22 +++++++++----------
 ...config.json => test_tacotron2_config.json} |  0
 tests/test_tacotron_train.sh                  | 14 +++++++++++-
 tests/test_train_tts.py                       |  0
 6 files changed, 25 insertions(+), 15 deletions(-)
 rename tests/inputs/{test_train_config.json => test_tacotron2_config.json} (100%)
 delete mode 100644 tests/test_train_tts.py
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d59e9a6c..afefad2c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -46,7 +46,7 @@ jobs:
         python3 setup.py egg_info
       - name: Lint check
         run: |
-          cardboardlinter -n auto
+          cardboardlinter --refspec main -n auto
       - name: Unit tests
         run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
       - name: Test scripts
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index 0887c2cc..331571d7 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -85,7 +85,7 @@ def format_data(data):
     text_input = data[0]
     text_lengths = data[1]
     speaker_names = data[2]
-    linear_input = data[3] if c.model in ["Tacotron"] else None
+    linear_input = data[3] if c.model.lower() in ["tacotron"] else None
     mel_input = data[4]
     mel_lengths = data[5]
     stop_targets = data[6]
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 50575b80..c5497054 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
                 stopnet_output, stopnet_target, output_lens, decoder_b_output,
                 alignments, alignment_lens, alignments_backwards, input_lens):
 
+        # decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
+        # the target should be set accordingly
+        postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
+
         return_dict = {}
         # remove lengths if no masking is applied
         if not self.config.loss_masking:
@@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
                 decoder_loss = self.criterion(decoder_output, mel_input,
                                               output_lens)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input,
-                                                  output_lens)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input,
-                                                  output_lens)
+                postnet_loss = self.criterion(postnet_output, postnet_target,
+                                                  output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input)
+                postnet_loss = self.criterion(postnet_output, postnet_target)
         loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
@@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):
 
         # postnet differential spectral loss
        if self.config.postnet_diff_spec_alpha > 0:
-            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
             loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
             return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss
@@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):
 
         # postnet ssim loss
         if self.config.postnet_ssim_alpha > 0:
-            postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
+            postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
             loss += postnet_ssim_loss * self.postnet_ssim_alpha
             return_dict['postnet_ssim_loss'] = postnet_ssim_loss
diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_tacotron2_config.json
similarity index 100%
rename from tests/inputs/test_train_config.json
rename to tests/inputs/test_tacotron2_config.json
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index 9268ea96..fa9930a7 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -3,7 +3,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
@@ -11,3 +11,15 @@ echo $LATEST_FOLDER
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
+
+# Tacotron2
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
diff --git a/tests/test_train_tts.py b/tests/test_train_tts.py
deleted file mode 100644
index e69de29b..00000000

From b2ecea0a049cb5444895f34f5e89ac9c657af809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:16:54 +0100
Subject: [PATCH 12/43] test config for tacotron model

---
 tests/inputs/test_tacotron_config.json | 177 +++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_config.json

diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json
new file mode 100644
index 00000000..a2fdd690
--- /dev/null
+++ b/tests/inputs/test_tacotron_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024, // stft window length in samples.
+        "hop_length": 256, // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5, // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80, // size of the mel spec frame.
+        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100, // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true, // clip normalized values into the range.
+        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size":1,
+    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0, // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0, // upper limit for gradients for clipping.
+    "epochs": 1, // total number of epochs to train.
+    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001, // Weight decay weight.
+    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
+    "attention_heads": 4, // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid", // softmax or sigmoid.
+    "windowing": false, // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+    "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7, // reduction rate for coarse decoder.
+
+    // STOPNET
+    "stopnet": true, // Train stopnet predicting the end of synthesis.
+    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+
+    // TENSORBOARD and LOGGING
+    "print_step": 1, // Number of steps to log training on console.
+    "tb_plot_step": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
+    "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true, // If true, it saves checkpoints per "save_step"
+    "keep_all_best": true, // If true, keeps all best_models after keep_after steps
+    "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
+    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 0, // number of evaluation data loader processes.
+    "batch_group_size": 0, // Number of batches to shuffle after bucketing.
+    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 153, // DATASET-RELATED: maximum text length
+    "compute_input_seq_cache": true,
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_gst": true, // use global style tokens
+    "gst": { // gst parameter if gst is enabled
+        "gst_style_input": null, // Condition the style input either on a
+        // -> wave file [path to wave] or
+        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
+        // with the dictionary being len(dict) == len(gst_style_tokens).
+        "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10
+    },
+
+    // DATASETS
+    "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From 8dacf762becf3bc63030b60c4e48a7f49e66d785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:31:26 +0100
Subject: [PATCH 13/43] CI config update

---
 .github/workflows/main.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index afefad2c..f396f959 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -35,8 +35,9 @@ jobs:
         run: cat /etc/os-release
       - name: Install dependencies
         run: |
-          apt update
-          apt install -y espeak-ng git
+          sudo apt update
+          sudo apt install -y espeak-ng git
+          sudo apt install -y python3-wheel gcc
       - name: Upgrade pip
         # so we can take advantage of pyproject.toml build-dependency support
         run: python3 -m pip install --upgrade pip

From edc5b07d75d5caba250c93e6c493726acef06dbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:41:35 +0100
Subject: [PATCH 14/43] CI config update

---
 .github/workflows/main.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f396f959..0da0228c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - master
-      - dev
   pull_request:
     types: [opened, synchronize, reopened]
 jobs:
@@ -47,7 +46,7 @@ jobs:
         python3 setup.py egg_info
       - name: Lint check
         run: |
-          cardboardlinter --refspec main -n auto
+          cardboardlinter -n auto
       - name: Unit tests
         run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
       - name: Test scripts

From d652fb4999be8c4c36d15b2efcb112741b0029bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:47:16 +0100
Subject: [PATCH 15/43] linter fix

---
 TTS/tts/layers/losses.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index c5497054..213970a7 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -313,7 +313,7 @@ class TacotronLoss(torch.nn.Module):
                                               output_lens)
             if self.postnet_alpha > 0:
                 postnet_loss = self.criterion(postnet_output, postnet_target,
-                                                  output_lens)
+                                              output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)

From f34b32b6cfd6ce64dacdccbc0ce11595e1206677 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Sat, 13 Mar 2021 00:46:53 +0100
Subject: [PATCH 16/43] force utf8

---
 TTS/utils/manage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index b2b93eac..ef77ca4e 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -43,7 +43,7 @@ class ModelManager(object):
         Args:
             file_path (str): path to .models.json.
         """
-        with open(file_path) as json_file:
+        with open(file_path, "r", encoding="utf-8") as json_file:
             self.models_dict = json.load(json_file)
 
     def list_langs(self):

From 09734887655858da59cf228d048716ac4bc40f2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Mar 2021 11:28:06 +0100
Subject: [PATCH 17/43] fix mozilla/TTS#685

---
 TTS/tts/models/tacotron_abstract.py |  3 +--
 tests/test_tacotron_train.sh        | 11 +++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py
index 10953269..22e86ee4 100644
--- a/TTS/tts/models/tacotron_abstract.py
+++ b/TTS/tts/models/tacotron_abstract.py
@@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module):
     def _backward_pass(self, mel_specs, encoder_outputs, mask):
         """ Run backwards decoder """
         decoder_outputs_b, alignments_b, _ = self.decoder_backward(
-            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
-            self.speaker_embeddings_projected)
+            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask)
         decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
         return decoder_outputs_b, alignments_b
 
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index fa9930a7..e0a0253b 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -2,6 +2,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
+
 # run training
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
@@ -12,6 +13,16 @@ CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASED
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
 
+# run Tacotron bi-directional decoder
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
 # Tacotron2
 # run training
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json

From 80befd1af5d7f52d034ddb80de0e2b7079b1ddf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Mar 2021 11:52:20 +0100
Subject: [PATCH 18/43] add missing config file

---
 tests/inputs/test_tacotron_bd_config.json | 177 ++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_bd_config.json

diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json
new file mode 100644
index 00000000..b6092f4f
--- /dev/null
+++ b/tests/inputs/test_tacotron_bd_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024, // stft window length in samples.
+        "hop_length": 256, // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5, // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80, // size of the mel spec frame.
+        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100, // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true, // clip normalized values into the range.
+        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size":1,
+    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0, // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0, // upper limit for gradients for clipping.
+    "epochs": 1, // total number of epochs to train.
+    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001, // Weight decay weight.
+    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
+    "attention_heads": 4, // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid", // softmax or sigmoid.
+    "windowing": false, // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": true, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+    "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7, // reduction rate for coarse decoder.
+
+    // STOPNET
+    "stopnet": true, // Train stopnet predicting the end of synthesis.
+    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+
+    // TENSORBOARD and LOGGING
+    "print_step": 1, // Number of steps to log training on console.
+    "tb_plot_step": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
+    "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true, // If true, it saves checkpoints per "save_step"
+    "keep_all_best": true, // If true, keeps all best_models after keep_after steps
+    "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
+    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 0, // number of evaluation data loader processes.
+    "batch_group_size": 0, // Number of batches to shuffle after bucketing.
+    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 153, // DATASET-RELATED: maximum text length
+    "compute_input_seq_cache": true,
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_gst": true, // use global style tokens
+    "gst": { // gst parameter if gst is enabled
+        "gst_style_input": null, // Condition the style input either on a
+        // -> wave file [path to wave] or
+        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
+        // with the dictionary being len(dict) == len(gst_style_tokens).
+        "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10
+    },
+
+    // DATASETS
+    "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From b09fc48a1d12e083aa066dcf60f99c0efc6ebb91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:07:29 +0100
Subject: [PATCH 19/43] bug fix

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index 6a09986c..ea195767 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -176,7 +176,7 @@ def process_args(args, model_type):
 
     _ = os.path.dirname(os.path.realpath(__file__))
 
-    if model_type in "tacotron wavegrad wavernn" and c.mixed_precision:
+    if c.mixed_precision:
         print("   >  Mixed precision mode is ON")
 
     out_path = args.continue_path

From 31935cd39e965cbbc511104856b3f4c328a3e1cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:04:34 +0100
Subject: [PATCH 20/43] bug fix in preprocessor

---
 TTS/tts/datasets/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 439a4091..eac121b8 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -35,7 +35,7 @@ def load_meta_data(datasets, eval_split=True):
         meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
         # load attention masks for duration predictor training
-        if 'meta_file_attn_mask' in dataset:
+        if 'meta_file_attn_mask' in dataset and dataset['meta_file_attn_mask'] is not None:
             meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
             for idx, ins in enumerate(meta_data_train_all):
                 attn_file = meta_data[ins[1]].strip()

From 7bda48c81e68676cf86e742367f9bc0ff8716287 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:23:35 +0100
Subject: [PATCH 21/43] fix #382

---
 TTS/tts/models/speedy_speech.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py
index 886d6fd4..101d77a0 100644
--- a/TTS/tts/models/speedy_speech.py
+++ b/TTS/tts/models/speedy_speech.py
@@ -181,8 +181,12 @@ class SpeedySpeech(nn.Module):
             x_lengths: [B]
             g: [B, C]
         """
+        # input sequence should be greater than the max convolution size
+        inference_padding = 5
+        if x.shape[1] < 13:
+            inference_padding += 13 - x.shape[1]
         # pad input to prevent dropping the last word
-        x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
+        x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode='constant', value=0)
         o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
         # duration predictor pass
         o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
From b94373afb8ff5e5ec2a65bd76cba07b90c6bee0d Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:50:03 +0100
Subject: [PATCH 25/43] test case

---
 TTS/bin/resample.py           |  2 +-
 tests/test_resample.sh        | 16 ++++++++++++++++
 tests/test_vocoder_pqmf.py    |  4 ++--
 tests/test_vocoder_tf_pqmf.py |  4 ++--
 4 files changed, 21 insertions(+), 5 deletions(-)
 create mode 100755 tests/test_resample.sh

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 42de7080..54599b8e 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,8 +1,8 @@
 import argparse
 import glob
 import os
-import shutil
 import librosa
+from distutils.dir_util import copy_tree
 from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
diff --git a/tests/test_resample.sh b/tests/test_resample.sh
new file mode 100755
index 00000000..ddae17ad
--- /dev/null
+++ b/tests/test_resample.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -xe
+BASEDIR=$(dirname "$0")
+TARGET_SR=16000
+echo "$BASEDIR"
+# run the resample script
+python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
+# check the sample rate of the output
+OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
+OUT_SR=$(($OUT_SR + 0))
+if [[ $OUT_SR -ne $TARGET_SR ]]; then
+  echo "Mismatch between target and output sample rates"
+  exit 1
+fi
+# clean up
+rm -rf $BASEDIR/outputs/resample_tests
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 1f141dd2..94e6ed01 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -4,7 +4,7 @@ import torch
 import soundfile as sf
 from librosa.core import load

-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.layers.pqmf import PQMF


@@ -24,4 +24,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
+
sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py index a1c4f692..c80def60 100644 --- a/tests/test_vocoder_tf_pqmf.py +++ b/tests/test_vocoder_tf_pqmf.py @@ -4,7 +4,7 @@ import tensorflow as tf import soundfile as sf from librosa.core import load -from tests import get_tests_path, get_tests_input_path +from tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.vocoder.tf.layers.pqmf import PQMF @@ -25,4 +25,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write('tf_pqmf_output.wav', w2_.flatten(), sr) + sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr) From 1574d8dd39396f50b0a1905c93717117ffe7a52c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 5 Mar 2021 19:56:50 +0100 Subject: [PATCH 26/43] fix french_cleaners --- TTS/tts/utils/text/abbreviations.py | 76 ++++++++++++++++------------- TTS/tts/utils/text/cleaners.py | 4 +- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index fe4c1cdc..bc2f4830 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) ]] # List of (regular expression, replacement) pairs for abbreviations in french: -abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('M', 'monsieur'), - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ('N.B', 'nota bene'), - ('M', 'monsieur'), - ('p.c.q', 'parce que'), - ('Pr', 'professeur'), - ('qqch', 'quelque chose'), - ('rdv', 'rendez-vous'), - ('max', 'maximum'), - ('min', 'minimum'), - ('no', 'numéro'), - ('adr', 'adresse'), - ('dr', 'docteur'), - ('st', 'saint'), - ('co', 'companie'), - ('jr', 'junior'), - ('sgt', 'sergent'), - ('capt', 'capitain'), - ('col', 'colonel'), - ('av', 'avenue'), - ('av. J.-C', 'avant Jésus-Christ'), - ('apr. J.-C', 'après Jésus-Christ'), - ('art', 'article'), - ('boul', 'boulevard'), - ('c.-à-d', 'c’est-à-dire'), - ('etc', 'et cetera'), - ('ex', 'exemple'), - ('excl', 'exclusivement'), - ('boul', 'boulevard'), - ]] +abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) + for x in [ + ('M', 'monsieur'), + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ('N.B', 'nota bene'), + ('M', 'monsieur'), + ('p.c.q', 'parce que'), + ('Pr', 'professeur'), + ('qqch', 'quelque chose'), + ('rdv', 'rendez-vous'), + ('max', 'maximum'), + ('min', 'minimum'), + ('no', 'numéro'), + ('adr', 'adresse'), + ('dr', 'docteur'), + ('st', 'saint'), + ('co', 'companie'), + ('jr', 'junior'), + ('sgt', 'sergent'), + ('capt', 'capitain'), + ('col', 'colonel'), + ('av', 'avenue'), + ('av. J.-C', 'avant Jésus-Christ'), + ('apr. 
J.-C', 'après Jésus-Christ'), + ('art', 'article'), + ('boul', 'boulevard'), + ('c.-à-d', 'c’est-à-dire'), + ('etc', 'et cetera'), + ('ex', 'exemple'), + ('excl', 'exclusivement'), + ('boul', 'boulevard'), + ]] + [(re.compile('\\b%s' % x[0]), x[1]) + for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 49a25557..c7a2b91a 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -108,8 +108,8 @@ def english_cleaners(text): def french_cleaners(text): '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that''' - text = lowercase(text) text = expand_abbreviations(text, lang='fr') + text = lowercase(text) text = replace_symbols(text, lang='fr') text = remove_aux_symbols(text) text = collapse_whitespace(text) @@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str: text = replace_numbers_to_characters_in_text(text) return text - - def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' text = expand_numbers(text) From 11e25a71251721ac1b6c772d3ad38ace53259edb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 6 Mar 2021 23:24:12 +0100 Subject: [PATCH 27/43] fix linter issues --- TTS/bin/resample.py | 7 ++++--- TTS/tts/utils/text/abbreviations.py | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py index 54599b8e..aa3f9a37 100644 --- a/TTS/bin/resample.py +++ b/TTS/bin/resample.py @@ -7,9 +7,9 @@ from argparse import RawTextHelpFormatter from multiprocessing import Pool from tqdm import tqdm -def resample_file(filename): - global args - y, sr = librosa.load(filename, sr=args.output_sr) +def resample_file(func_args): + filename, output_sr = func_args + y, sr = librosa.load(filename, sr=output_sr) librosa.output.write_wav(filename, y, sr) if __name__ == '__main__': @@ -59,6 +59,7 @@ if __name__ == '__main__': print('Resampling the audio files...') audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True) print(f'Found {len(audio_files)} files...') + audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr])) with Pool(processes=args.n_jobs) as p: with tqdm(total=len(audio_files)) as pbar: for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)): diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index bc2f4830..3cafc65b 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -59,9 +59,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) ('excl', 'exclusivement'), ('boul', 'boulevard'), ]] + [(re.compile('\\b%s' % x[0]), x[1]) - for x in [ - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ]] + for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] From eb071fd3b6b101e2fa54c180e9939c57d5ff7118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:08:52 +0100 Subject: [PATCH 28/43] add more CI tests --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01ef1a3f..60e61d1c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -58,3 +58,5 @@ jobs: ./tests/test_vocoder_wavegrad_train.sh ./tests/test_vocoder_wavernn_train.sh ./tests/test_speedy_speech_train.sh + ./tests/test_resample.sh + ./tests/test_compute_statistics.sh From 13041ebfa80e0e972b0b71262c7c363381452de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:11:28 +0100 Subject: [PATCH 29/43] update version 0.0.11 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f78d572..de277655 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) ) -version = '0.0.10.3' +version = '0.0.11' cwd = os.path.dirname(os.path.abspath(__file__)) class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors From bdfd1f8a8995db6d1fe9b03b01d17bb7ceb5d64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:35:47 +0100 Subject: [PATCH 30/43] linter fix --- TTS/tts/utils/text/abbreviations.py | 79 ++++++++++++++--------------- tests/test_vocoder_pqmf.py | 3 +- tests/test_vocoder_tf_pqmf.py | 3 +- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index 3cafc65b..579d7dcd 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -25,43 +25,42 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('M', 'monsieur'), - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ('N.B', 'nota bene'), - ('M', 'monsieur'), - ('p.c.q', 'parce que'), - ('Pr', 'professeur'), - ('qqch', 'quelque chose'), - ('rdv', 'rendez-vous'), - ('max', 'maximum'), - ('min', 'minimum'), - ('no', 'numéro'), - ('adr', 'adresse'), - ('dr', 'docteur'), - ('st', 'saint'), - ('co', 'companie'), - ('jr', 'junior'), - ('sgt', 'sergent'), - ('capt', 'capitain'), - ('col', 'colonel'), - ('av', 'avenue'), - ('av. J.-C', 'avant Jésus-Christ'), - ('apr. 
J.-C', 'après Jésus-Christ'), - ('art', 'article'), - ('boul', 'boulevard'), - ('c.-à-d', 'c’est-à-dire'), - ('etc', 'et cetera'), - ('ex', 'exemple'), - ('excl', 'exclusivement'), - ('boul', 'boulevard'), - ]] + [(re.compile('\\b%s' % x[0]), x[1]) - for x in [ - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ]] + for x in [ + ('M', 'monsieur'), + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ('N.B', 'nota bene'), + ('M', 'monsieur'), + ('p.c.q', 'parce que'), + ('Pr', 'professeur'), + ('qqch', 'quelque chose'), + ('rdv', 'rendez-vous'), + ('max', 'maximum'), + ('min', 'minimum'), + ('no', 'numéro'), + ('adr', 'adresse'), + ('dr', 'docteur'), + ('st', 'saint'), + ('co', 'companie'), + ('jr', 'junior'), + ('sgt', 'sergent'), + ('capt', 'capitain'), + ('col', 'colonel'), + ('av', 'avenue'), + ('av. J.-C', 'avant Jésus-Christ'), + ('apr. J.-C', 'après Jésus-Christ'), + ('art', 'article'), + ('boul', 'boulevard'), + ('c.-à-d', 'c’est-à-dire'), + ('etc', 'et cetera'), + ('ex', 'exemple'), + ('excl', 'exclusivement'), + ('boul', 'boulevard'), + ]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py index 94e6ed01..74da451f 100644 --- a/tests/test_vocoder_pqmf.py +++ b/tests/test_vocoder_pqmf.py @@ -24,4 +24,5 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr) + sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'), + w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py index c80def60..16c46b2a 100644 --- a/tests/test_vocoder_tf_pqmf.py +++ b/tests/test_vocoder_tf_pqmf.py @@ -25,4 +25,5 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr) + sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'), + w2_.flatten(), sr) From 255d5486acbe743e21f0c5197f931ffecb96d7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 16:14:55 +0100 Subject: [PATCH 31/43] update CI config --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 60e61d1c..9bd88830 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,8 +35,8 @@ jobs: run: cat /etc/os-release - name: Install dependencies run: | - sudo apt update - sudo apt install espeak-ng git + apt update + apt install -y espeak-ng git - name: Upgrade pip # so we can take advantage of pyproject.toml build-dependency support run: python3 -m pip install --upgrade pip From babc94f63fa5af6ff8e0499880d2906a0c63f153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:14:53 +0100 Subject: [PATCH 32/43] fix #374 --- .github/workflows/main.yml | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/tts/layers/losses.py | 22 +++++++++---------- ...config.json => test_tacotron2_config.json} | 0 tests/test_tacotron_train.sh | 14 +++++++++++- tests/test_train_tts.py | 0 6 files changed, 25 insertions(+), 15 deletions(-) rename tests/inputs/{test_train_config.json => test_tacotron2_config.json} (100%) delete mode 
100644 tests/test_train_tts.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9bd88830..5b7d0c03 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -46,7 +46,7 @@ jobs:
         python3 setup.py egg_info
     - name: Lint check
       run: |
-        cardboardlinter -n auto
+        cardboardlinter --refspec main -n auto
     - name: Unit tests
       run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
     - name: Test scripts
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index 0887c2cc..331571d7 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -85,7 +85,7 @@ def format_data(data):
     text_input = data[0]
     text_lengths = data[1]
     speaker_names = data[2]
-    linear_input = data[3] if c.model in ["Tacotron"] else None
+    linear_input = data[3] if c.model.lower() in ["tacotron"] else None
     mel_input = data[4]
     mel_lengths = data[5]
     stop_targets = data[6]
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 50575b80..c5497054 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
                 stopnet_output, stopnet_target, output_lens, decoder_b_output,
                 alignments, alignment_lens, alignments_backwards, input_lens):
+
+        # the decoder outputs linear spectrograms for Tacotron and mel
+        # spectrograms for Tacotron2; the target should be set accordingly
+        postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
+
         return_dict = {}
         # remove lengths if no masking is applied
         if not self.config.loss_masking:
@@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
                 decoder_loss = self.criterion(decoder_output, mel_input,
                                               output_lens)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input,
-                                                  output_lens)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input,
-                                                  output_lens)
+                postnet_loss = self.criterion(postnet_output, postnet_target,
+                                              output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input)
+                postnet_loss = self.criterion(postnet_output, postnet_target)
         loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
@@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):

         # postnet differential spectral loss
         if self.config.postnet_diff_spec_alpha > 0:
-            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
             loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
             return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss

@@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):

         # postnet ssim loss
         if self.config.postnet_ssim_alpha > 0:
-            postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
+            postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
             loss += postnet_ssim_loss * self.postnet_ssim_alpha
             return_dict['postnet_ssim_loss'] = postnet_ssim_loss

diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_tacotron2_config.json
similarity index 100%
rename from tests/inputs/test_train_config.json
rename to tests/inputs/test_tacotron2_config.json
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index 9268ea96..fa9930a7 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -3,7 +3,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
@@ -11,3 +11,15 @@ echo $LATEST_FOLDER
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
+
+# Tacotron2
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
diff --git a/tests/test_train_tts.py b/tests/test_train_tts.py
deleted file mode 100644
index e69de29b..00000000
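The losses refactor in PATCH 32 turns a repeated if/else into a single dispatch. The underlying rule is that Tacotron's postnet emits linear spectrograms while Tacotron2's emits mels, so every postnet-side loss (L1/MSE, differential spectral, SSIM) must compare against the matching ground truth. Reduced to its core (the tensor arguments are placeholders):

    import torch

    def postnet_target(model: str, linear_input: torch.Tensor,
                       mel_input: torch.Tensor) -> torch.Tensor:
        # case-insensitive match, mirroring c.model.lower() in ["tacotron"]:
        # "Tacotron" -> linear spectrogram target, anything else -> mel target
        return linear_input if model.lower() in ["tacotron"] else mel_input

    linear = torch.zeros(2, 100, 513)  # [B, T, fft_size // 2 + 1]
    mel = torch.zeros(2, 100, 80)      # [B, T, num_mels]
    assert postnet_target("Tacotron", linear, mel).shape[-1] == 513
    assert postnet_target("Tacotron2", linear, mel).shape[-1] == 80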
From 0fa3101f2a298c995f48fdb1a62fd9f2e55af003 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:16:54 +0100
Subject: [PATCH 33/43] test config for tacotron model

---
 tests/inputs/test_tacotron_config.json | 177 +++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_config.json

diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json
new file mode 100644
index 00000000..a2fdd690
--- /dev/null
+++ b/tests/inputs/test_tacotron_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in ms.
+        "hop_length": 256,       // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,   // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,     // size of the mel spec frame.
+        "mel_fmin": 0.0,    // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1,      // Batch size for training. Lower values than 32 might cause hard-to-learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size": 1,
+    "r": 7,               // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0,     // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0,      // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true,            // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,       // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25,      // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5,       // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25,      // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,                 // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0,      // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "mixed_precision": false, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. 
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
+ "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": true, + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] + +} + From 739584ec932ec240e829b806a3849eef0c37d8a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:31:26 +0100 Subject: [PATCH 34/43] CI config update --- .github/workflows/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5b7d0c03..81dbbd63 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,8 +35,9 @@ jobs: run: cat /etc/os-release - name: Install dependencies run: | - apt update - apt install -y espeak-ng git + sudo apt update + sudo apt install -y espeak-ng git + sudo apt install -y python3-wheel gcc - name: Upgrade pip # so we can take advantage of pyproject.toml build-dependency support run: python3 -m pip install --upgrade pip From ddab50b515b6e77058575e3958c4d2067257a412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:41:35 +0100 Subject: [PATCH 35/43] CI config update --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 81dbbd63..be7b77c7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,7 +4,6 @@ on: push: branches: - master - - dev pull_request: types: [opened, synchronize, reopened] jobs: @@ -47,7 +46,7 @@ jobs: python3 setup.py egg_info - name: Lint check run: | - cardboardlinter --refspec main -n auto + cardboardlinter -n auto - name: Unit tests run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker - name: Test scripts From bf0caba0bc481422745438c071ae49003cd0ea26 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:47:16 +0100 Subject: [PATCH 36/43] linter fix --- TTS/tts/layers/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index c5497054..213970a7 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -313,7 +313,7 @@ class TacotronLoss(torch.nn.Module): output_lens) if self.postnet_alpha > 0: postnet_loss = self.criterion(postnet_output, postnet_target, - output_lens) + output_lens) else: if self.decoder_alpha > 0: decoder_loss = self.criterion(decoder_output, mel_input) From a8c348ffb2d4b2f277df1319f24cfc730f69d694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 13 Mar 2021 00:46:53 +0100 Subject: [PATCH 37/43] force utf8 --- TTS/utils/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index b2b93eac..ef77ca4e 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -43,7 +43,7 @@ class ModelManager(object): Args: file_path (str): path to .models.json. """ - with open(file_path) as json_file: + with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) def list_langs(self): From aa8bb815a743d3d14fa07c62434b5cdcee87b938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Mar 2021 11:28:06 +0100 Subject: [PATCH 38/43] fix mozilla/TTS#685 --- TTS/tts/models/tacotron_abstract.py | 3 +-- tests/test_tacotron_train.sh | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 10953269..22e86ee4 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module): def _backward_pass(self, mel_specs, encoder_outputs, mask): """ Run backwards decoder """ decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask, - self.speaker_embeddings_projected) + encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask) decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() return decoder_outputs_b, alignments_b diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh index fa9930a7..e0a0253b 100755 --- a/tests/test_tacotron_train.sh +++ b/tests/test_tacotron_train.sh @@ -2,6 +2,7 @@ set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" + # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json # find the training folder @@ -12,6 +13,16 @@ CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASED # remove all the outputs rm -rf $BASEDIR/train_outputs/ +# run Tacotron bi-directional decoder +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ + # Tacotron2 # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json From 0e54ab93ee6d04fca003bfbf16a48f0f8a433b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= 
Date: Mon, 15 Mar 2021 11:52:20 +0100
Subject: [PATCH 39/43] add missing config file

---
 tests/inputs/test_tacotron_bd_config.json | 177 ++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_bd_config.json

diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json
new file mode 100644
index 00000000..b6092f4f
--- /dev/null
+++ b/tests/inputs/test_tacotron_bd_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in ms.
+        "hop_length": 256,       // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,   // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,     // size of the mel spec frame.
+        "mel_fmin": 0.0,    // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1,      // Batch size for training. Lower values than 32 might cause hard-to-learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size": 1,
+    "r": 7,               // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0,     // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0,      // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true,            // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,       // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25,      // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5,       // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25,      // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,                 // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0,      // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0,       // upper limit for gradients for clipping.
+    "epochs": 1,            // total number of epochs to train.
+    "lr": 0.0001,           // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001,         // Weight decay weight.
+    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false,  // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1,       // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
+    "prenet_type": "bn",     // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original', 'graves', 'dynamic_convolution'
+    "attention_heads": 4,         // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid",  // softmax or sigmoid.
+    "windowing": false,           // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false,    // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false,   // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false,    // enable/disable transition agent of forward attention.
+    "location_attn": true,        // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": true,       // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+ "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": true, + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. 
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From 2690ab2ee5037a4a638c3bf07492b2d5f9e0a9c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 19:15:28 +0100
Subject: [PATCH 42/43] bug fix

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index ea195767..3f6f582e 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -176,7 +176,7 @@ def process_args(args, model_type):

     _ = os.path.dirname(os.path.realpath(__file__))

-    if c.mixed_precision:
+    if 'mixed_precision' in c and c.mixed_precision:
         print(" > Mixed precision mode is ON")

     out_path = args.continue_path
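PATCH 42 hardens the check that PATCH 19 introduced: configs written before the mixed_precision flag existed simply lack the key, so membership is tested before the value is read. The same pattern in isolation (AttrDict is a stand-in for the project's attribute-style config object):

    class AttrDict(dict):
        # minimal stand-in: keys are also readable as attributes
        __getattr__ = dict.__getitem__

    def mixed_precision_on(c) -> bool:
        return 'mixed_precision' in c and c.mixed_precision

    assert mixed_precision_on(AttrDict(mixed_precision=True))
    assert not mixed_precision_on(AttrDict(mixed_precision=False))
    assert not mixed_precision_on(AttrDict())  # missing key: no AttributeError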
From 281e708f47f00a66fbf01205e870706dac2cec05 Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Tue, 16 Mar 2021 20:56:11 +0100
Subject: [PATCH 43/43] Fix for: name 'file_names' is not defined

---
 notebooks/dataset_analysis/CheckDatasetSNR.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/notebooks/dataset_analysis/CheckDatasetSNR.ipynb b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
index 0aa07343..b022e362 100644
--- a/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
+++ b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn.\n",
+    "This notebook computes the average SNR of a given Voice Dataset. If the SNR is too low, it might reduce the performance or prevent the model from learning. The SNR paper can be seen here: https://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf\n",
     "\n",
     "To use this notebook, you need:\n",
     "- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
@@ -136,7 +136,7 @@
     "snrs = [tup[0] for tup in file_snrs]\n",
     "\n",
     "error_idxs = np.where(np.isnan(snrs) == True)[0]\n",
-    "error_files = [file_names[idx] for idx in error_idxs]\n",
+    "error_files = [wav_files[idx] for idx in error_idxs]\n",
     "\n",
     "file_snrs = [i for j, i in enumerate(file_snrs) if j not in error_idxs]\n",
     "file_names = [tup[1] for tup in file_snrs]\n",
@@ -236,7 +236,7 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
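For context on the final fix: WADA SNR estimation returns NaN for files it cannot score, and the notebook collects those by index into the original wav_files list; file_names is only rebuilt afterwards from the filtered (snr, name) tuples, which is why indexing it raised the NameError in the commit subject. The corrected flow, reduced to a few lines (names follow the notebook, the data is illustrative):

    import numpy as np

    file_snrs = [(35.2, 'a.wav'), (np.nan, 'b.wav'), (28.9, 'c.wav')]
    wav_files = ['a.wav', 'b.wav', 'c.wav']

    snrs = [tup[0] for tup in file_snrs]
    error_idxs = np.where(np.isnan(snrs))[0]
    error_files = [wav_files[idx] for idx in error_idxs]  # the fixed line

    file_snrs = [t for j, t in enumerate(file_snrs) if j not in error_idxs]
    file_names = [tup[1] for tup in file_snrs]

    print(error_files)  # ['b.wav']
    print(file_names)   # ['a.wav', 'c.wav']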