From cd889578aaa4f7c935cd15f813f166ece343bdb9 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Wed, 3 Mar 2021 19:59:21 +0100
Subject: [PATCH 01/43] Add resample script

---
 TTS/bin/resample.py | 69 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 TTS/bin/resample.py

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
new file mode 100644
index 00000000..41eced24
--- /dev/null
+++ b/TTS/bin/resample.py
@@ -0,0 +1,69 @@
+import argparse
+import glob
+import shutil
+import librosa
+from argparse import RawTextHelpFormatter
+from multiprocessing import Pool
+from tqdm import tqdm
+
+def resample_file(filename):
+    global args
+    y, sr = librosa.load(filename, sr=args.output_sr)
+    librosa.output.write_wav(filename, y, sr)
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description='''Resample a folder recursively with librosa
+Can be used in place or create a copy of the folder as an output.\n\n'''
+
+'''
+Example run:
+    python TTS/bin/resample.py
+        --input_dir /root/LJSpeech-1.1/
+        --output_sr 22050
+        --output_dir /root/resampled_LJSpeech-1.1/
+        --n_jobs 24
+''',
+        formatter_class=RawTextHelpFormatter)
+
+    parser.add_argument('--input_dir',
+                        type=str,
+                        default=None,
+                        required=True,
+                        help='Path of the folder containing the audio files to resample')
+
+    parser.add_argument('--output_sr',
+                        type=int,
+                        default=22050,
+                        required=False,
+                        help='Sample rate to which the audio files should be resampled')
+
+    parser.add_argument('--output_dir',
+                        type=str,
+                        default=None,
+                        required=False,
+                        help='Path of the destination folder. If not defined, the operation is done in place')
+
+    parser.add_argument('--n_jobs',
+                        type=int,
+                        default=None,
+                        help='Number of threads to use, by default it uses all cores')
+
+    args = parser.parse_args()
+
+    if args.output_dir:
+        print('Recursively copying the input folder...')
+        shutil.copytree(args.input_dir, args.output_dir)
+        args.input_dir = args.output_dir
+
+    print('Resampling the audio files...')
+    audio_files = glob.glob(args.input_dir+'**/*.wav', recursive=True)
+    print(f'Found {len(audio_files)} files...')
+    with Pool(processes=args.n_jobs) as p:
+        with tqdm(total=len(audio_files)) as pbar:
+            for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
+                pbar.update()
+
+    print('Done !')
+    
\ No newline at end of file

From fba0c828cdd5e347e9f95cf478c07c350d3478e9 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Wed, 3 Mar 2021 21:50:50 +0100
Subject: [PATCH 02/43] Using path.join instead of concat

---
 TTS/bin/resample.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 41eced24..c5f2b5de 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,5 +1,6 @@
 import argparse
 import glob
+import os
 import shutil
 import librosa
 from argparse import RawTextHelpFormatter
@@ -58,7 +59,7 @@ Example run:
         args.input_dir = args.output_dir
 
     print('Resampling the audio files...')
-    audio_files = glob.glob(args.input_dir+'**/*.wav', recursive=True)
+    audio_files = glob.glob(os.path.join(args.input_dir,'**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:

From e769a959167825214e3fdfce3b639a0fef73f07e Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:44:41 +0100
Subject: [PATCH 03/43] linter + test

---
 TTS/bin/resample.py | 26 ++++++++++++--------------
 run_tests.sh        |  1 +
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index c5f2b5de..42de7080 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -16,16 +16,14 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(
         description='''Resample a folder recursively with librosa
-Can be used in place or create a copy of the folder as an output.\n\n'''
-
-'''
-Example run:
-    python TTS/bin/resample.py
-        --input_dir /root/LJSpeech-1.1/
-        --output_sr 22050
-        --output_dir /root/resampled_LJSpeech-1.1/
-        --n_jobs 24
-''',
+    Can be used in place or create a copy of the folder as an output.\n\n
+    Example run:
+        python TTS/bin/resample.py
+            --input_dir /root/LJSpeech-1.1/
+            --output_sr 22050
+            --output_dir /root/resampled_LJSpeech-1.1/
+            --n_jobs 24
+    ''',
         formatter_class=RawTextHelpFormatter)
 
     parser.add_argument('--input_dir',
@@ -33,7 +31,7 @@ Example run:
                         default=None,
                         required=True,
                         help='Path of the folder containing the audio files to resample')
-    
+
     parser.add_argument('--output_sr',
                         type=int,
                         default=22050,
@@ -45,7 +43,7 @@ Example run:
                         default=None,
                         required=False,
                         help='Path of the destination folder. If not defined, the operation is done in place')
-    
+
     parser.add_argument('--n_jobs',
                         type=int,
                         default=None,
@@ -55,11 +53,11 @@ Example run:
 
     if args.output_dir:
         print('Recursively copying the input folder...')
-        shutil.copytree(args.input_dir, args.output_dir)
+        copy_tree(args.input_dir, args.output_dir)
         args.input_dir = args.output_dir
 
     print('Resampling the audio files...')
-    audio_files = glob.glob(os.path.join(args.input_dir,'**/*.wav'), recursive=True)
+    audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:
diff --git a/run_tests.sh b/run_tests.sh
index c562027c..3972306c 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -6,6 +6,7 @@ nosetests tests -x &&\
 
 # runtime tests
 ./tests/test_demo_server.sh && \
+./tests/test_resample.sh && \
 ./tests/test_tacotron_train.sh && \
 ./tests/test_glow-tts_train.sh && \
 ./tests/test_vocoder_gan_train.sh && \

From c1742c99281876514cf889618d7517f92b702c7b Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:50:03 +0100
Subject: [PATCH 04/43] test case

---
 TTS/bin/resample.py           |  2 +-
 tests/test_resample.sh        | 16 ++++++++++++++++
 tests/test_vocoder_pqmf.py    |  4 ++--
 tests/test_vocoder_tf_pqmf.py |  4 ++--
 4 files changed, 21 insertions(+), 5 deletions(-)
 create mode 100755 tests/test_resample.sh

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 42de7080..54599b8e 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,8 +1,8 @@
 import argparse
 import glob
 import os
-import shutil
 import librosa
+from distutils.dir_util import copy_tree
 from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
diff --git a/tests/test_resample.sh b/tests/test_resample.sh
new file mode 100755
index 00000000..ddae17ad
--- /dev/null
+++ b/tests/test_resample.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -xe
+BASEDIR=$(dirname "$0")
+TARGET_SR=16000
+echo "$BASEDIR"
+#run the resample script
+python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
+#check samplerate of output
+OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
+OUT_SR=$(($OUT_SR + 0))
+if [[ $OUT_SR -ne $TARGET_SR ]]; then
+    echo "Mismatch between target and output sample rates"
+    exit 1
+fi
+#cleaning up
+rm -rf $BASEDIR/outputs/resample_tests
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 1f141dd2..94e6ed01 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -4,7 +4,7 @@ import torch
 import soundfile as sf
 from librosa.core import load
 
-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.layers.pqmf import PQMF
 
 
@@ -24,4 +24,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
+    sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr)
diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py
index a1c4f692..c80def60 100644
--- a/tests/test_vocoder_tf_pqmf.py
+++ b/tests/test_vocoder_tf_pqmf.py
@@ -4,7 +4,7 @@ import tensorflow as tf
 import soundfile as sf
 from librosa.core import load
 
-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.tf.layers.pqmf import PQMF
 
 
@@ -25,4 +25,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
+    sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr)

From a1839d32454752b2b4fe6add909cd4375302866a Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 19:56:50 +0100
Subject: [PATCH 05/43] fix french_cleaners

---
 TTS/tts/utils/text/abbreviations.py | 76 ++++++++++++++++-------------
 TTS/tts/utils/text/cleaners.py      |  4 +---
 2 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index fe4c1cdc..bc2f4830 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
 ]]
 
 # List of (regular expression, replacement) pairs for abbreviations in french:
-abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
-                    for x in [
-                        ('M', 'monsieur'),
-                        ('Mlle', 'mademoiselle'),
-                        ('Mlles', 'mesdemoiselles'),
-                        ('Mme', 'Madame'),
-                        ('Mmes', 'Mesdames'),
-                        ('N.B', 'nota bene'),
-                        ('M', 'monsieur'),
-                        ('p.c.q', 'parce que'),
-                        ('Pr', 'professeur'),
-                        ('qqch', 'quelque chose'),
-                        ('rdv', 'rendez-vous'),
-                        ('max', 'maximum'),
-                        ('min', 'minimum'),
-                        ('no', 'numéro'),
-                        ('adr', 'adresse'),
-                        ('dr', 'docteur'),
-                        ('st', 'saint'),
-                        ('co', 'companie'),
-                        ('jr', 'junior'),
-                        ('sgt', 'sergent'),
-                        ('capt', 'capitain'),
-                        ('col', 'colonel'),
-                        ('av', 'avenue'),
-                        ('av. J.-C', 'avant Jésus-Christ'),
-                        ('apr. J.-C', 'après Jésus-Christ'),
-                        ('art', 'article'),
-                        ('boul', 'boulevard'),
-                        ('c.-à-d', 'c’est-à-dire'),
-                        ('etc', 'et cetera'),
-                        ('ex', 'exemple'),
-                        ('excl', 'exclusivement'),
-                        ('boul', 'boulevard'),
-                    ]]
+abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+                for x in [
+                    ('M', 'monsieur'),
+                    ('Mlle', 'mademoiselle'),
+                    ('Mlles', 'mesdemoiselles'),
+                    ('Mme', 'Madame'),
+                    ('Mmes', 'Mesdames'),
+                    ('N.B', 'nota bene'),
+                    ('M', 'monsieur'),
+                    ('p.c.q', 'parce que'),
+                    ('Pr', 'professeur'),
+                    ('qqch', 'quelque chose'),
+                    ('rdv', 'rendez-vous'),
+                    ('max', 'maximum'),
+                    ('min', 'minimum'),
+                    ('no', 'numéro'),
+                    ('adr', 'adresse'),
+                    ('dr', 'docteur'),
+                    ('st', 'saint'),
+                    ('co', 'companie'),
+                    ('jr', 'junior'),
+                    ('sgt', 'sergent'),
+                    ('capt', 'capitain'),
+                    ('col', 'colonel'),
+                    ('av', 'avenue'),
+                    ('av. J.-C', 'avant Jésus-Christ'),
+                    ('apr. J.-C', 'après Jésus-Christ'),
+                    ('art', 'article'),
+                    ('boul', 'boulevard'),
+                    ('c.-à-d', 'c’est-à-dire'),
+                    ('etc', 'et cetera'),
+                    ('ex', 'exemple'),
+                    ('excl', 'exclusivement'),
+                    ('boul', 'boulevard'),
+                ]] + [(re.compile('\\b%s' % x[0]), x[1])
+                    for x in [
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                    ]]
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 49a25557..c7a2b91a 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -108,8 +108,8 @@ def english_cleaners(text):
 
 def french_cleaners(text):
     '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
-    text = lowercase(text)
     text = expand_abbreviations(text, lang='fr')
+    text = lowercase(text)
     text = replace_symbols(text, lang='fr')
     text = remove_aux_symbols(text)
     text = collapse_whitespace(text)
@@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str:
     text = replace_numbers_to_characters_in_text(text)
     return text
 
-
-
 def phoneme_cleaners(text):
     '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
     text = expand_numbers(text)

From 16ce4e4805962bbd8e418d51cec3814bea434673 Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Sat, 6 Mar 2021 23:24:12 +0100
Subject: [PATCH 06/43] fix linter issues

---
 TTS/bin/resample.py                 |  7 ++++---
 TTS/tts/utils/text/abbreviations.py | 12 ++++++------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 54599b8e..aa3f9a37 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -7,9 +7,9 @@ from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
 
-def resample_file(filename):
-    global args
-    y, sr = librosa.load(filename, sr=args.output_sr)
+def resample_file(func_args):
+    filename, output_sr = func_args
+    y, sr = librosa.load(filename, sr=output_sr)
     librosa.output.write_wav(filename, y, sr)
 
 if __name__ == '__main__':
@@ -59,6 +59,7 @@ if __name__ == '__main__':
     print('Resampling the audio files...')
     audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True)
     print(f'Found {len(audio_files)} files...')
+    audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr]))
     with Pool(processes=args.n_jobs) as p:
         with tqdm(total=len(audio_files)) as pbar:
             for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index bc2f4830..3cafc65b 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -59,9 +59,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                     ('excl', 'exclusivement'),
                     ('boul', 'boulevard'),
                 ]] + [(re.compile('\\b%s' % x[0]), x[1])
-                    for x in [
-                        ('Mlle', 'mademoiselle'),
-                        ('Mlles', 'mesdemoiselles'),
-                        ('Mme', 'Madame'),
-                        ('Mmes', 'Mesdames'),
-                    ]]
+                     for x in [
+                         ('Mlle', 'mademoiselle'),
+                         ('Mlles', 'mesdemoiselles'),
+                         ('Mme', 'Madame'),
+                         ('Mmes', 'Mesdames'),
+                     ]]

From 10d7f6df022a8c9e416c11bce595b0f5f2d28c0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:08:52 +0100
Subject: [PATCH 07/43] add more CI tests

---
 .github/workflows/main.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a5b22f5b..d7a24a99 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -58,3 +58,5 @@ jobs:
         ./tests/test_vocoder_wavegrad_train.sh
         ./tests/test_vocoder_wavernn_train.sh
         ./tests/test_speedy_speech_train.sh
+        ./tests/test_resample.sh
+        ./tests/test_compute_statistics.sh

From 0cdb100536e4384ef452454a85ac06de64e126b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:11:28 +0100
Subject: [PATCH 08/43] update version 0.0.11

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index eb2b4c33..de277655 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version)
 )
 
 
-version = '0.0.10.2'
+version = '0.0.11'
 cwd = os.path.dirname(os.path.abspath(__file__))
 
 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors

From b46498ca68bfda6e49a768c454220c440e4f6c3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 12:35:47 +0100
Subject: [PATCH 09/43] linter fix

---
 TTS/tts/utils/text/abbreviations.py | 79 ++++++++++++++++-------------
 tests/test_vocoder_pqmf.py          |  3 ++-
 tests/test_vocoder_tf_pqmf.py       |  3 ++-
 3 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py
index 3cafc65b..579d7dcd 100644
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@@ -25,43 +25,42 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
 
 # List of (regular expression, replacement) pairs for abbreviations in french:
 abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                for x in [
-                    ('M', 'monsieur'),
-                    ('Mlle', 'mademoiselle'),
-                    ('Mlles', 'mesdemoiselles'),
-                    ('Mme', 'Madame'),
-                    ('Mmes', 'Mesdames'),
-                    ('N.B', 'nota bene'),
-                    ('M', 'monsieur'),
-                    ('p.c.q', 'parce que'),
-                    ('Pr', 'professeur'),
-                    ('qqch', 'quelque chose'),
-                    ('rdv', 'rendez-vous'),
-                    ('max', 'maximum'),
-                    ('min', 'minimum'),
-                    ('no', 'numéro'),
-                    ('adr', 'adresse'),
-                    ('dr', 'docteur'),
-                    ('st', 'saint'),
-                    ('co', 'companie'),
-                    ('jr', 'junior'),
-                    ('sgt', 'sergent'),
-                    ('capt', 'capitain'),
-                    ('col', 'colonel'),
-                    ('av', 'avenue'),
-                    ('av. J.-C', 'avant Jésus-Christ'),
-                    ('apr. J.-C', 'après Jésus-Christ'),
-                    ('art', 'article'),
-                    ('boul', 'boulevard'),
-                    ('c.-à-d', 'c’est-à-dire'),
-                    ('etc', 'et cetera'),
-                    ('ex', 'exemple'),
-                    ('excl', 'exclusivement'),
-                    ('boul', 'boulevard'),
-                ]] + [(re.compile('\\b%s' % x[0]), x[1])
-                     for x in [
-                         ('Mlle', 'mademoiselle'),
-                         ('Mlles', 'mesdemoiselles'),
-                         ('Mme', 'Madame'),
-                         ('Mmes', 'Mesdames'),
-                     ]]
+                    for x in [
+                        ('M', 'monsieur'),
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                        ('N.B', 'nota bene'),
+                        ('M', 'monsieur'),
+                        ('p.c.q', 'parce que'),
+                        ('Pr', 'professeur'),
+                        ('qqch', 'quelque chose'),
+                        ('rdv', 'rendez-vous'),
+                        ('max', 'maximum'),
+                        ('min', 'minimum'),
+                        ('no', 'numéro'),
+                        ('adr', 'adresse'),
+                        ('dr', 'docteur'),
+                        ('st', 'saint'),
+                        ('co', 'companie'),
+                        ('jr', 'junior'),
+                        ('sgt', 'sergent'),
+                        ('capt', 'capitain'),
+                        ('col', 'colonel'),
+                        ('av', 'avenue'),
+                        ('av. J.-C', 'avant Jésus-Christ'),
+                        ('apr. J.-C', 'après Jésus-Christ'),
+                        ('art', 'article'),
+                        ('boul', 'boulevard'),
+                        ('c.-à-d', 'c’est-à-dire'),
+                        ('etc', 'et cetera'),
+                        ('ex', 'exemple'),
+                        ('excl', 'exclusivement'),
+                        ('boul', 'boulevard'),
+                    ]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [
+                        ('Mlle', 'mademoiselle'),
+                        ('Mlles', 'mesdemoiselles'),
+                        ('Mme', 'Madame'),
+                        ('Mmes', 'Mesdames'),
+                    ]]
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 94e6ed01..74da451f 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -24,4 +24,5 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr)
+    sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'),
+             w2_.flatten().detach(), sr)
diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py
index c80def60..16c46b2a 100644
--- a/tests/test_vocoder_tf_pqmf.py
+++ b/tests/test_vocoder_tf_pqmf.py
@@ -25,4 +25,5 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr)
+    sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'),
+             w2_.flatten(), sr)

From 9ce29d8094de4dcc756edaf7653c0e027a12c1da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 16:14:55 +0100
Subject: [PATCH 10/43] update CI config

---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d7a24a99..d59e9a6c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -35,8 +35,8 @@ jobs:
         run: cat /etc/os-release
       - name: Install dependencies
         run: |
-          sudo apt update
-          sudo apt install espeak-ng git
+          apt update
+          apt install -y espeak-ng git
       - name: Upgrade pip
         # so we can take advantage of pyproject.toml build-dependency support
         run: python3 -m pip install --upgrade pip

From eabd7e6a52cb6d15446d3c5b399f14254c6a3d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:14:53 +0100
Subject: [PATCH 11/43] fix #374

---
 .github/workflows/main.yml                    |  2 +-
 TTS/bin/train_tacotron.py                     |  2 +-
 TTS/tts/layers/losses.py                      | 22 +++++++++----------
 ...config.json => test_tacotron2_config.json} |  0
 tests/test_tacotron_train.sh                  | 14 +++++++++++-
 tests/test_train_tts.py                       |  0
 6 files changed, 25 insertions(+), 15 deletions(-)
 rename tests/inputs/{test_train_config.json => test_tacotron2_config.json} (100%)
 delete mode 100644 tests/test_train_tts.py
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d59e9a6c..afefad2c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -46,7 +46,7 @@ jobs:
         python3 setup.py egg_info
       - name: Lint check
         run: |
-          cardboardlinter -n auto
+          cardboardlinter --refspec main -n auto
       - name: Unit tests
         run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
       - name: Test scripts
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index 0887c2cc..331571d7 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -85,7 +85,7 @@ def format_data(data):
     text_input = data[0]
     text_lengths = data[1]
     speaker_names = data[2]
-    linear_input = data[3] if c.model in ["Tacotron"] else None
+    linear_input = data[3] if c.model.lower() in ["tacotron"] else None
     mel_input = data[4]
     mel_lengths = data[5]
     stop_targets = data[6]
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 50575b80..c5497054 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
                 stopnet_output, stopnet_target, output_lens, decoder_b_output,
                 alignments, alignment_lens, alignments_backwards, input_lens):
 
+        # decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
+        # the target should be set accordingly
+        postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
+
         return_dict = {}
         # remove lengths if no masking is applied
         if not self.config.loss_masking:
@@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
                 decoder_loss = self.criterion(decoder_output, mel_input,
                                               output_lens)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input,
-                                                  output_lens)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input,
-                                                  output_lens)
+                postnet_loss = self.criterion(postnet_output, postnet_target,
+                                                  output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input)
+                postnet_loss = self.criterion(postnet_output, postnet_target)
         loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
@@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):
 
         # postnet differential spectral loss
        if self.config.postnet_diff_spec_alpha > 0:
-            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
             loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
             return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss
@@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):
 
         # postnet ssim loss
         if self.config.postnet_ssim_alpha > 0:
-            postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
+            postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
             loss += postnet_ssim_loss * self.postnet_ssim_alpha
             return_dict['postnet_ssim_loss'] = postnet_ssim_loss
diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_tacotron2_config.json
similarity index 100%
rename from tests/inputs/test_train_config.json
rename to tests/inputs/test_tacotron2_config.json
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index 9268ea96..fa9930a7 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -3,7 +3,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
@@ -11,3 +11,15 @@ echo $LATEST_FOLDER
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
+
+# Tacotron2
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
diff --git a/tests/test_train_tts.py b/tests/test_train_tts.py
deleted file mode 100644
index e69de29b..00000000

From b2ecea0a049cb5444895f34f5e89ac9c657af809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:16:54 +0100
Subject: [PATCH 12/43] test config for tacotron model

---
 tests/inputs/test_tacotron_config.json | 177 +++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_config.json

diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json
new file mode 100644
index 00000000..a2fdd690
--- /dev/null
+++ b/tests/inputs/test_tacotron_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024, // stft window length in samples.
+        "hop_length": 256, // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5, // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80, // size of the mel spec frame.
+        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100, // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true, // clip normalized values into the range.
+        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size":1,
+    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0, // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0, // upper limit for gradients for clipping.
+    "epochs": 1, // total number of epochs to train.
+    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001, // Weight decay weight.
+    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
+    "attention_heads": 4, // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid", // softmax or sigmoid.
+    "windowing": false, // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+    "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7, // reduction rate for coarse decoder.
+
+    // STOPNET
+    "stopnet": true, // Train stopnet predicting the end of synthesis.
+    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+
+    // TENSORBOARD and LOGGING
+    "print_step": 1, // Number of steps to log training on console.
+    "tb_plot_step": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
+    "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true, // If true, it saves checkpoints per "save_step"
+    "keep_all_best": true, // If true, keeps all best_models after keep_after steps
+    "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
+    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 0, // number of evaluation data loader processes.
+    "batch_group_size": 0, // Number of batches to shuffle after bucketing.
+    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 153, // DATASET-RELATED: maximum text length
+    "compute_input_seq_cache": true,
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_gst": true, // use global style tokens
+    "gst": { // gst parameter if gst is enabled
+        "gst_style_input": null, // Condition the style input either on a
+        // -> wave file [path to wave] or
+        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
+        // with the dictionary being len(dict) == len(gst_style_tokens).
+        "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10
+    },
+
+    // DATASETS
+    "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From 8dacf762becf3bc63030b60c4e48a7f49e66d785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:31:26 +0100
Subject: [PATCH 13/43] CI config update

---
 .github/workflows/main.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index afefad2c..f396f959 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -35,8 +35,9 @@ jobs:
         run: cat /etc/os-release
       - name: Install dependencies
         run: |
-          apt update
-          apt install -y espeak-ng git
+          sudo apt update
+          sudo apt install -y espeak-ng git
+          sudo apt install -y python3-wheel gcc
       - name: Upgrade pip
         # so we can take advantage of pyproject.toml build-dependency support
         run: python3 -m pip install --upgrade pip

From edc5b07d75d5caba250c93e6c493726acef06dbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:41:35 +0100
Subject: [PATCH 14/43] CI config update

---
 .github/workflows/main.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f396f959..0da0228c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - master
-      - dev
   pull_request:
     types: [opened, synchronize, reopened]
 jobs:
@@ -47,7 +46,7 @@ jobs:
         python3 setup.py egg_info
       - name: Lint check
         run: |
-          cardboardlinter --refspec main -n auto
+          cardboardlinter -n auto
       - name: Unit tests
         run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
       - name: Test scripts

From d652fb4999be8c4c36d15b2efcb112741b0029bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:47:16 +0100
Subject: [PATCH 15/43] linter fix

---
 TTS/tts/layers/losses.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index c5497054..213970a7 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -313,7 +313,7 @@ class TacotronLoss(torch.nn.Module):
                                               output_lens)
             if self.postnet_alpha > 0:
                 postnet_loss = self.criterion(postnet_output, postnet_target,
-                                                  output_lens)
+                                              output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)

From f34b32b6cfd6ce64dacdccbc0ce11595e1206677 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Sat, 13 Mar 2021 00:46:53 +0100
Subject: [PATCH 16/43] force utf8

---
 TTS/utils/manage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index b2b93eac..ef77ca4e 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -43,7 +43,7 @@ class ModelManager(object):
         Args:
             file_path (str): path to .models.json.
         """
-        with open(file_path) as json_file:
+        with open(file_path, "r", encoding="utf-8") as json_file:
             self.models_dict = json.load(json_file)
 
     def list_langs(self):

From 09734887655858da59cf228d048716ac4bc40f2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Mar 2021 11:28:06 +0100
Subject: [PATCH 17/43] fix mozilla/TTS#685

---
 TTS/tts/models/tacotron_abstract.py |  3 +--
 tests/test_tacotron_train.sh        | 11 +++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py
index 10953269..22e86ee4 100644
--- a/TTS/tts/models/tacotron_abstract.py
+++ b/TTS/tts/models/tacotron_abstract.py
@@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module):
     def _backward_pass(self, mel_specs, encoder_outputs, mask):
         """ Run backwards decoder """
         decoder_outputs_b, alignments_b, _ = self.decoder_backward(
-            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
-            self.speaker_embeddings_projected)
+            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask)
         decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
         return decoder_outputs_b, alignments_b
 
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index fa9930a7..e0a0253b 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -2,6 +2,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
+
 # run training
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
@@ -12,6 +13,16 @@ CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASED
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
 
+# run Tacotron bi-directional decoder
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
 # Tacotron2
 # run training
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json

From 80befd1af5d7f52d034ddb80de0e2b7079b1ddf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 15 Mar 2021 11:52:20 +0100
Subject: [PATCH 18/43] add missing config file

---
 tests/inputs/test_tacotron_bd_config.json | 177 ++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_bd_config.json

diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json
new file mode 100644
index 00000000..b6092f4f
--- /dev/null
+++ b/tests/inputs/test_tacotron_bd_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024, // stft window length in samples.
+        "hop_length": 256, // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5, // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80, // size of the mel spec frame.
+        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100, // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true, // clip normalized values into the range.
+        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size":1,
+    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0, // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0, // upper limit for gradients for clipping.
+    "epochs": 1, // total number of epochs to train.
+    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001, // Weight decay weight.
+    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
+    "attention_heads": 4, // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid", // softmax or sigmoid.
+    "windowing": false, // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": true, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+    "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7, // reduction rate for coarse decoder.
+
+    // STOPNET
+    "stopnet": true, // Train stopnet predicting the end of synthesis.
+    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+
+    // TENSORBOARD and LOGGING
+    "print_step": 1, // Number of steps to log training on console.
+    "tb_plot_step": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
+    "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true, // If true, it saves checkpoints per "save_step"
+    "keep_all_best": true, // If true, keeps all best_models after keep_after steps
+    "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
+    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 0, // number of evaluation data loader processes.
+    "batch_group_size": 0, // Number of batches to shuffle after bucketing.
+    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 153, // DATASET-RELATED: maximum text length
+    "compute_input_seq_cache": true,
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_gst": true, // use global style tokens
+    "gst": { // gst parameter if gst is enabled
+        "gst_style_input": null, // Condition the style input either on a
+        // -> wave file [path to wave] or
+        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
+        // with the dictionary being len(dict) == len(gst_style_tokens).
+        "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10
+    },
+
+    // DATASETS
+    "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
+    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From b09fc48a1d12e083aa066dcf60f99c0efc6ebb91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:07:29 +0100
Subject: [PATCH 19/43] bug fix

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index 6a09986c..ea195767 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -176,7 +176,7 @@ def process_args(args, model_type):
 
     _ = os.path.dirname(os.path.realpath(__file__))
 
-    if model_type in "tacotron wavegrad wavernn" and c.mixed_precision:
+    if c.mixed_precision:
         print("   >  Mixed precision mode is ON")
 
     out_path = args.continue_path

From 31935cd39e965cbbc511104856b3f4c328a3e1cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:04:34 +0100
Subject: [PATCH 20/43] bug fix in preprocessor

---
 TTS/tts/datasets/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 439a4091..eac121b8 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -35,7 +35,7 @@ def load_meta_data(datasets, eval_split=True):
         meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
         # load attention masks for duration predictor training
-        if 'meta_file_attn_mask' in dataset:
+        if 'meta_file_attn_mask' in dataset and dataset['meta_file_attn_mask'] is not None:
             meta_data = dict(load_attention_mask_meta_data(dataset['meta_file_attn_mask']))
             for idx, ins in enumerate(meta_data_train_all):
                 attn_file = meta_data[ins[1]].strip()

From 7bda48c81e68676cf86e742367f9bc0ff8716287 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 17:23:35 +0100
Subject: [PATCH 21/43] fix #382

---
 TTS/tts/models/speedy_speech.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py
index 886d6fd4..101d77a0 100644
--- a/TTS/tts/models/speedy_speech.py
+++ b/TTS/tts/models/speedy_speech.py
@@ -181,8 +181,12 @@ class SpeedySpeech(nn.Module):
             x_lengths: [B]
             g: [B, C]
         """
+        # input sequence should be greater than the max convolution size
+        inference_padding = 5
+        if x.shape[1] < 13:
+            inference_padding += 13 - x.shape[1]
         # pad input to prevent dropping the last word
-        x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
+        x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode='constant', value=0)
         o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
         # duration predictor pass
         o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
From b94373afb8ff5e5ec2a65bd76cba07b90c6bee0d Mon Sep 17 00:00:00 2001
From: WeberJulian
Date: Fri, 5 Mar 2021 18:50:03 +0100
Subject: [PATCH 25/43] test case

---
 TTS/bin/resample.py           |  2 +-
 tests/test_resample.sh        | 16 ++++++++++++++++
 tests/test_vocoder_pqmf.py    |  4 ++--
 tests/test_vocoder_tf_pqmf.py |  4 ++--
 4 files changed, 21 insertions(+), 5 deletions(-)
 create mode 100755 tests/test_resample.sh

diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index 42de7080..54599b8e 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -1,8 +1,8 @@
 import argparse
 import glob
 import os
-import shutil
 import librosa
+from distutils.dir_util import copy_tree
 from argparse import RawTextHelpFormatter
 from multiprocessing import Pool
 from tqdm import tqdm
diff --git a/tests/test_resample.sh b/tests/test_resample.sh
new file mode 100755
index 00000000..ddae17ad
--- /dev/null
+++ b/tests/test_resample.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -xe
+BASEDIR=$(dirname "$0")
+TARGET_SR=16000
+echo "$BASEDIR"
+# run the resample script
+python TTS/bin/resample.py --input_dir $BASEDIR/data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR
+# check the sample rate of the output
+OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python )
+OUT_SR=$(($OUT_SR + 0))
+if [[ $OUT_SR -ne $TARGET_SR ]]; then
+  echo "Mismatch between target and output sample rates"
+  exit 1
+fi
+# clean up
+rm -rf $BASEDIR/outputs/resample_tests
diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py
index 1f141dd2..94e6ed01 100644
--- a/tests/test_vocoder_pqmf.py
+++ b/tests/test_vocoder_pqmf.py
@@ -4,7 +4,7 @@ import torch
 import soundfile as sf
 from librosa.core import load

-from tests import get_tests_path, get_tests_input_path
+from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from TTS.vocoder.layers.pqmf import PQMF


@@ -24,4 +24,4 @@ def test_pqmf():
     print(w2_.max())
     print(w2_.min())
     print(w2_.mean())
-    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
+
sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py index a1c4f692..c80def60 100644 --- a/tests/test_vocoder_tf_pqmf.py +++ b/tests/test_vocoder_tf_pqmf.py @@ -4,7 +4,7 @@ import tensorflow as tf import soundfile as sf from librosa.core import load -from tests import get_tests_path, get_tests_input_path +from tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.vocoder.tf.layers.pqmf import PQMF @@ -25,4 +25,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write('tf_pqmf_output.wav', w2_.flatten(), sr) + sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr) From 1574d8dd39396f50b0a1905c93717117ffe7a52c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 5 Mar 2021 19:56:50 +0100 Subject: [PATCH 26/43] fix french_cleaners --- TTS/tts/utils/text/abbreviations.py | 76 ++++++++++++++++------------- TTS/tts/utils/text/cleaners.py | 4 +- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index fe4c1cdc..bc2f4830 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -24,38 +24,44 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) ]] # List of (regular expression, replacement) pairs for abbreviations in french: -abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('M', 'monsieur'), - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ('N.B', 'nota bene'), - ('M', 'monsieur'), - ('p.c.q', 'parce que'), - ('Pr', 'professeur'), - ('qqch', 'quelque chose'), - ('rdv', 'rendez-vous'), - ('max', 'maximum'), - ('min', 'minimum'), - ('no', 'numéro'), - ('adr', 'adresse'), - ('dr', 'docteur'), - ('st', 'saint'), - ('co', 'companie'), - ('jr', 'junior'), - ('sgt', 'sergent'), - ('capt', 'capitain'), - ('col', 'colonel'), - ('av', 'avenue'), - ('av. J.-C', 'avant Jésus-Christ'), - ('apr. J.-C', 'après Jésus-Christ'), - ('art', 'article'), - ('boul', 'boulevard'), - ('c.-à-d', 'c’est-à-dire'), - ('etc', 'et cetera'), - ('ex', 'exemple'), - ('excl', 'exclusivement'), - ('boul', 'boulevard'), - ]] +abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) + for x in [ + ('M', 'monsieur'), + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ('N.B', 'nota bene'), + ('M', 'monsieur'), + ('p.c.q', 'parce que'), + ('Pr', 'professeur'), + ('qqch', 'quelque chose'), + ('rdv', 'rendez-vous'), + ('max', 'maximum'), + ('min', 'minimum'), + ('no', 'numéro'), + ('adr', 'adresse'), + ('dr', 'docteur'), + ('st', 'saint'), + ('co', 'companie'), + ('jr', 'junior'), + ('sgt', 'sergent'), + ('capt', 'capitain'), + ('col', 'colonel'), + ('av', 'avenue'), + ('av. J.-C', 'avant Jésus-Christ'), + ('apr. 
J.-C', 'après Jésus-Christ'), + ('art', 'article'), + ('boul', 'boulevard'), + ('c.-à-d', 'c’est-à-dire'), + ('etc', 'et cetera'), + ('ex', 'exemple'), + ('excl', 'exclusivement'), + ('boul', 'boulevard'), + ]] + [(re.compile('\\b%s' % x[0]), x[1]) + for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 49a25557..c7a2b91a 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -108,8 +108,8 @@ def english_cleaners(text): def french_cleaners(text): '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that''' - text = lowercase(text) text = expand_abbreviations(text, lang='fr') + text = lowercase(text) text = replace_symbols(text, lang='fr') text = remove_aux_symbols(text) text = collapse_whitespace(text) @@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str: text = replace_numbers_to_characters_in_text(text) return text - - def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' text = expand_numbers(text) From 11e25a71251721ac1b6c772d3ad38ace53259edb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 6 Mar 2021 23:24:12 +0100 Subject: [PATCH 27/43] fix linter issues --- TTS/bin/resample.py | 7 ++++--- TTS/tts/utils/text/abbreviations.py | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py index 54599b8e..aa3f9a37 100644 --- a/TTS/bin/resample.py +++ b/TTS/bin/resample.py @@ -7,9 +7,9 @@ from argparse import RawTextHelpFormatter from multiprocessing import Pool from tqdm import tqdm -def resample_file(filename): - global args - y, sr = librosa.load(filename, sr=args.output_sr) +def resample_file(func_args): + filename, output_sr = func_args + y, sr = librosa.load(filename, sr=output_sr) librosa.output.write_wav(filename, y, sr) if __name__ == '__main__': @@ -59,6 +59,7 @@ if __name__ == '__main__': print('Resampling the audio files...') audio_files = glob.glob(os.path.join(args.input_dir, '**/*.wav'), recursive=True) print(f'Found {len(audio_files)} files...') + audio_files = list(zip(audio_files, len(audio_files)*[args.output_sr])) with Pool(processes=args.n_jobs) as p: with tqdm(total=len(audio_files)) as pbar: for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)): diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index bc2f4830..3cafc65b 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -59,9 +59,9 @@ abbreviations_fr = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) ('excl', 'exclusivement'), ('boul', 'boulevard'), ]] + [(re.compile('\\b%s' % x[0]), x[1]) - for x in [ - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ]] + for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] From eb071fd3b6b101e2fa54c180e9939c57d5ff7118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:08:52 +0100 Subject: [PATCH 28/43] add more CI tests --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01ef1a3f..60e61d1c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -58,3 +58,5 @@ jobs: ./tests/test_vocoder_wavegrad_train.sh ./tests/test_vocoder_wavernn_train.sh ./tests/test_speedy_speech_train.sh + ./tests/test_resample.sh + ./tests/test_compute_statistics.sh From 13041ebfa80e0e972b0b71262c7c363381452de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:11:28 +0100 Subject: [PATCH 29/43] update version 0.0.11 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f78d572..de277655 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) ) -version = '0.0.10.3' +version = '0.0.11' cwd = os.path.dirname(os.path.abspath(__file__)) class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors From bdfd1f8a8995db6d1fe9b03b01d17bb7ceb5d64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 12:35:47 +0100 Subject: [PATCH 30/43] linter fix --- TTS/tts/utils/text/abbreviations.py | 79 ++++++++++++++--------------- tests/test_vocoder_pqmf.py | 3 +- tests/test_vocoder_tf_pqmf.py | 3 +- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/abbreviations.py index 3cafc65b..579d7dcd 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/abbreviations.py @@ -25,43 +25,42 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) - for x in [ - ('M', 'monsieur'), - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ('N.B', 'nota bene'), - ('M', 'monsieur'), - ('p.c.q', 'parce que'), - ('Pr', 'professeur'), - ('qqch', 'quelque chose'), - ('rdv', 'rendez-vous'), - ('max', 'maximum'), - ('min', 'minimum'), - ('no', 'numéro'), - ('adr', 'adresse'), - ('dr', 'docteur'), - ('st', 'saint'), - ('co', 'companie'), - ('jr', 'junior'), - ('sgt', 'sergent'), - ('capt', 'capitain'), - ('col', 'colonel'), - ('av', 'avenue'), - ('av. J.-C', 'avant Jésus-Christ'), - ('apr. 
J.-C', 'après Jésus-Christ'), - ('art', 'article'), - ('boul', 'boulevard'), - ('c.-à-d', 'c’est-à-dire'), - ('etc', 'et cetera'), - ('ex', 'exemple'), - ('excl', 'exclusivement'), - ('boul', 'boulevard'), - ]] + [(re.compile('\\b%s' % x[0]), x[1]) - for x in [ - ('Mlle', 'mademoiselle'), - ('Mlles', 'mesdemoiselles'), - ('Mme', 'Madame'), - ('Mmes', 'Mesdames'), - ]] + for x in [ + ('M', 'monsieur'), + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ('N.B', 'nota bene'), + ('M', 'monsieur'), + ('p.c.q', 'parce que'), + ('Pr', 'professeur'), + ('qqch', 'quelque chose'), + ('rdv', 'rendez-vous'), + ('max', 'maximum'), + ('min', 'minimum'), + ('no', 'numéro'), + ('adr', 'adresse'), + ('dr', 'docteur'), + ('st', 'saint'), + ('co', 'companie'), + ('jr', 'junior'), + ('sgt', 'sergent'), + ('capt', 'capitain'), + ('col', 'colonel'), + ('av', 'avenue'), + ('av. J.-C', 'avant Jésus-Christ'), + ('apr. J.-C', 'après Jésus-Christ'), + ('art', 'article'), + ('boul', 'boulevard'), + ('c.-à-d', 'c’est-à-dire'), + ('etc', 'et cetera'), + ('ex', 'exemple'), + ('excl', 'exclusivement'), + ('boul', 'boulevard'), + ]] + [(re.compile('\\b%s' % x[0]), x[1]) for x in [ + ('Mlle', 'mademoiselle'), + ('Mlles', 'mesdemoiselles'), + ('Mme', 'Madame'), + ('Mmes', 'Mesdames'), + ]] diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py index 94e6ed01..74da451f 100644 --- a/tests/test_vocoder_pqmf.py +++ b/tests/test_vocoder_pqmf.py @@ -24,4 +24,5 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(),'pqmf_output.wav'), w2_.flatten().detach(), sr) + sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'), + w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py index c80def60..16c46b2a 100644 --- a/tests/test_vocoder_tf_pqmf.py +++ b/tests/test_vocoder_tf_pqmf.py @@ -25,4 +25,5 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(),'tf_pqmf_output.wav'), w2_.flatten(), sr) + sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'), + w2_.flatten(), sr) From 255d5486acbe743e21f0c5197f931ffecb96d7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 16:14:55 +0100 Subject: [PATCH 31/43] update CI config --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 60e61d1c..9bd88830 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,8 +35,8 @@ jobs: run: cat /etc/os-release - name: Install dependencies run: | - sudo apt update - sudo apt install espeak-ng git + apt update + apt install -y espeak-ng git - name: Upgrade pip # so we can take advantage of pyproject.toml build-dependency support run: python3 -m pip install --upgrade pip From babc94f63fa5af6ff8e0499880d2906a0c63f153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:14:53 +0100 Subject: [PATCH 32/43] fix #374 --- .github/workflows/main.yml | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/tts/layers/losses.py | 22 +++++++++---------- ...config.json => test_tacotron2_config.json} | 0 tests/test_tacotron_train.sh | 14 +++++++++++- tests/test_train_tts.py | 0 6 files changed, 25 insertions(+), 15 deletions(-) rename tests/inputs/{test_train_config.json => test_tacotron2_config.json} (100%) delete mode 
100644 tests/test_train_tts.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9bd88830..5b7d0c03 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -46,7 +46,7 @@ jobs:
         python3 setup.py egg_info
     - name: Lint check
       run: |
-        cardboardlinter -n auto
+        cardboardlinter --refspec main -n auto
     - name: Unit tests
       run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker
     - name: Test scripts
diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py
index 0887c2cc..331571d7 100644
--- a/TTS/bin/train_tacotron.py
+++ b/TTS/bin/train_tacotron.py
@@ -85,7 +85,7 @@ def format_data(data):
     text_input = data[0]
     text_lengths = data[1]
     speaker_names = data[2]
-    linear_input = data[3] if c.model in ["Tacotron"] else None
+    linear_input = data[3] if c.model.lower() in ["tacotron"] else None
     mel_input = data[4]
     mel_lengths = data[5]
     stop_targets = data[6]
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 50575b80..c5497054 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -297,6 +297,11 @@ class TacotronLoss(torch.nn.Module):
                 stopnet_output, stopnet_target, output_lens, decoder_b_output,
                 alignments, alignment_lens, alignments_backwards, input_lens):
+
+        # the decoder outputs linear spectrograms for Tacotron and mel
+        # spectrograms for Tacotron2; the target should be set accordingly
+        postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
+
         return_dict = {}
         # remove lengths if no masking is applied
         if not self.config.loss_masking:
@@ -307,20 +312,13 @@ class TacotronLoss(torch.nn.Module):
                 decoder_loss = self.criterion(decoder_output, mel_input,
                                               output_lens)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input,
-                                                  output_lens)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input,
-                                                  output_lens)
+                postnet_loss = self.criterion(postnet_output, postnet_target,
+                                              output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)
             if self.postnet_alpha > 0:
-                if self.config.model in ["Tacotron", "TacotronGST"]:
-                    postnet_loss = self.criterion(postnet_output, linear_input)
-                else:
-                    postnet_loss = self.criterion(postnet_output, mel_input)
+                postnet_loss = self.criterion(postnet_output, postnet_target)
         loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
@@ -373,7 +371,7 @@ class TacotronLoss(torch.nn.Module):

         # postnet differential spectral loss
         if self.config.postnet_diff_spec_alpha > 0:
-            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, postnet_target, output_lens)
             loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha
             return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss

@@ -385,7 +383,7 @@ class TacotronLoss(torch.nn.Module):

         # postnet ssim loss
         if self.config.postnet_ssim_alpha > 0:
-            postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens)
+            postnet_ssim_loss = self.criterion_ssim(postnet_output, postnet_target, output_lens)
             loss += postnet_ssim_loss * self.postnet_ssim_alpha
             return_dict['postnet_ssim_loss'] = postnet_ssim_loss

diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_tacotron2_config.json
similarity index 100%
rename from tests/inputs/test_train_config.json
rename to tests/inputs/test_tacotron2_config.json
diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh
index 9268ea96..fa9930a7 100755
--- a/tests/test_tacotron_train.sh
+++ b/tests/test_tacotron_train.sh
@@ -3,7 +3,7 @@ set -xe
 BASEDIR=$(dirname "$0")
 echo "$BASEDIR"
 # run training
-CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json
 # find the training folder
 LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
 echo $LATEST_FOLDER
@@ -11,3 +11,15 @@ echo $LATEST_FOLDER
 CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
 # remove all the outputs
 rm -rf $BASEDIR/train_outputs/
+
+# Tacotron2
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
+
diff --git a/tests/test_train_tts.py b/tests/test_train_tts.py
deleted file mode 100644
index e69de29b..00000000
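The losses refactor in PATCH 32 turns a repeated if/else into a single dispatch. The underlying rule is that Tacotron's postnet emits linear spectrograms while Tacotron2's emits mels, so every postnet-side loss (L1/MSE, differential spectral, SSIM) must compare against the matching ground truth. Reduced to its core (the tensor arguments are placeholders):

    import torch

    def postnet_target(model: str, linear_input: torch.Tensor,
                       mel_input: torch.Tensor) -> torch.Tensor:
        # case-insensitive match, mirroring c.model.lower() in ["tacotron"]:
        # "Tacotron" -> linear spectrogram target, anything else -> mel target
        return linear_input if model.lower() in ["tacotron"] else mel_input

    linear = torch.zeros(2, 100, 513)  # [B, T, fft_size // 2 + 1]
    mel = torch.zeros(2, 100, 80)      # [B, T, num_mels]
    assert postnet_target("Tacotron", linear, mel).shape[-1] == 513
    assert postnet_target("Tacotron2", linear, mel).shape[-1] == 80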
From 0fa3101f2a298c995f48fdb1a62fd9f2e55af003 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 11 Mar 2021 18:16:54 +0100
Subject: [PATCH 33/43] test config for tacotron model

---
 tests/inputs/test_tacotron_config.json | 177 +++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_config.json

diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json
new file mode 100644
index 00000000..a2fdd690
--- /dev/null
+++ b/tests/inputs/test_tacotron_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in ms.
+        "hop_length": 256,       // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,   // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,     // size of the mel spec frame.
+        "mel_fmin": 0.0,    // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1,      // Batch size for training. Lower values than 32 might cause hard-to-learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size": 1,
+    "r": 7,               // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0,     // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0,      // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true,            // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,       // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25,      // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5,       // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25,      // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,                 // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0,      // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "mixed_precision": false, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. 
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
+ "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": true, + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] + +} + From 739584ec932ec240e829b806a3849eef0c37d8a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:31:26 +0100 Subject: [PATCH 34/43] CI config update --- .github/workflows/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5b7d0c03..81dbbd63 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,8 +35,9 @@ jobs: run: cat /etc/os-release - name: Install dependencies run: | - apt update - apt install -y espeak-ng git + sudo apt update + sudo apt install -y espeak-ng git + sudo apt install -y python3-wheel gcc - name: Upgrade pip # so we can take advantage of pyproject.toml build-dependency support run: python3 -m pip install --upgrade pip From ddab50b515b6e77058575e3958c4d2067257a412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:41:35 +0100 Subject: [PATCH 35/43] CI config update --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 81dbbd63..be7b77c7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,7 +4,6 @@ on: push: branches: - master - - dev pull_request: types: [opened, synchronize, reopened] jobs: @@ -47,7 +46,7 @@ jobs: python3 setup.py egg_info - name: Lint check run: | - cardboardlinter --refspec main -n auto + cardboardlinter -n auto - name: Unit tests run: nosetests tests --nocapture --processes=0 --process-timeout=20 --process-restartworker - name: Test scripts From bf0caba0bc481422745438c071ae49003cd0ea26 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 11 Mar 2021 18:47:16 +0100 Subject: [PATCH 36/43] linter fix --- TTS/tts/layers/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index c5497054..213970a7 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -313,7 +313,7 @@ class TacotronLoss(torch.nn.Module): output_lens) if self.postnet_alpha > 0: postnet_loss = self.criterion(postnet_output, postnet_target, - output_lens) + output_lens) else: if self.decoder_alpha > 0: decoder_loss = self.criterion(decoder_output, mel_input) From a8c348ffb2d4b2f277df1319f24cfc730f69d694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 13 Mar 2021 00:46:53 +0100 Subject: [PATCH 37/43] force utf8 --- TTS/utils/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index b2b93eac..ef77ca4e 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -43,7 +43,7 @@ class ModelManager(object): Args: file_path (str): path to .models.json. """ - with open(file_path) as json_file: + with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) def list_langs(self): From aa8bb815a743d3d14fa07c62434b5cdcee87b938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 15 Mar 2021 11:28:06 +0100 Subject: [PATCH 38/43] fix mozilla/TTS#685 --- TTS/tts/models/tacotron_abstract.py | 3 +-- tests/test_tacotron_train.sh | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 10953269..22e86ee4 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -149,8 +149,7 @@ class TacotronAbstract(ABC, nn.Module): def _backward_pass(self, mel_specs, encoder_outputs, mask): """ Run backwards decoder """ decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask, - self.speaker_embeddings_projected) + encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask) decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() return decoder_outputs_b, alignments_b diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh index fa9930a7..e0a0253b 100755 --- a/tests/test_tacotron_train.sh +++ b/tests/test_tacotron_train.sh @@ -2,6 +2,7 @@ set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" + # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_config.json # find the training folder @@ -12,6 +13,16 @@ CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASED # remove all the outputs rm -rf $BASEDIR/train_outputs/ +# run Tacotron bi-directional decoder +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron_bd_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ + # Tacotron2 # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_tacotron2_config.json From 0e54ab93ee6d04fca003bfbf16a48f0f8a433b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= 
Date: Mon, 15 Mar 2021 11:52:20 +0100
Subject: [PATCH 39/43] add missing config file

---
 tests/inputs/test_tacotron_bd_config.json | 177 ++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 tests/inputs/test_tacotron_bd_config.json

diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json
new file mode 100644
index 00000000..b6092f4f
--- /dev/null
+++ b/tests/inputs/test_tacotron_bd_config.json
@@ -0,0 +1,177 @@
+{
+    "model": "Tacotron",
+    "run_name": "test_sample_dataset_run",
+    "run_description": "sample dataset test run",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in ms.
+        "hop_length": 256,       // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,   // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,     // size of the mel spec frame.
+        "mel_fmin": 0.0,    // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 20.0,
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1,      // Batch size for training. Lower values than 32 might cause hard-to-learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size": 1,
+    "r": 7,               // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 4]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "ga_alpha": 10.0,     // weight for guided attention loss. If > 0, guided attention is enabled.
+    "mixed_precision": false,
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0,      // Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // LOSS SETTINGS
+    "loss_masking": true,            // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,       // original decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25,      // original postnet loss weight. If > 0, it is enabled
+    "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
+    "decoder_ssim_alpha": 0.5,       // decoder ssim loss weight. If > 0, it is enabled
+    "postnet_ssim_alpha": 0.25,      // postnet ssim loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,                 // weight for guided attention loss. If > 0, guided attention is enabled.
+    "stopnet_pos_weight": 15.0,      // pos class weight for stopnet loss since there are way more negative samples than positive samples.
+
+    // OPTIMIZER
+    "noam_schedule": false, // use noam warmup and lr schedule.
+    "grad_clip": 1.0,       // upper limit for gradients for clipping.
+    "epochs": 1,            // total number of epochs to train.
+    "lr": 0.0001,           // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001,         // Weight decay weight.
+    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false,  // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
+
+    // TACOTRON PRENET
+    "memory_size": -1,       // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
+    "prenet_type": "bn",     // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.
+
+    // TACOTRON ATTENTION
+    "attention_type": "original", // 'original', 'graves', 'dynamic_convolution'
+    "attention_heads": 4,         // number of attention heads (only for 'graves')
+    "attention_norm": "sigmoid",  // softmax or sigmoid.
+    "windowing": false,           // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false,    // if it uses forward attention. In general, it aligns faster.
+    "forward_attn_mask": false,   // Additional masking forcing monotonicity only in eval mode.
+    "transition_agent": false,    // enable/disable transition agent of forward attention.
+    "location_attn": true,        // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+    "bidirectional_decoder": true,       // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+ "double_decoder_consistency": false, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": true, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": true, + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST. + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. 
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+
+}
+

From 2690ab2ee5037a4a638c3bf07492b2d5f9e0a9c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 16 Mar 2021 19:15:28 +0100
Subject: [PATCH 42/43] bug fix

---
 TTS/utils/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py
index ea195767..3f6f582e 100644
--- a/TTS/utils/arguments.py
+++ b/TTS/utils/arguments.py
@@ -176,7 +176,7 @@ def process_args(args, model_type):

     _ = os.path.dirname(os.path.realpath(__file__))

-    if c.mixed_precision:
+    if 'mixed_precision' in c and c.mixed_precision:
         print(" > Mixed precision mode is ON")

     out_path = args.continue_path
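PATCH 42 hardens the check that PATCH 19 introduced: configs written before the mixed_precision flag existed simply lack the key, so membership is tested before the value is read. The same pattern in isolation (AttrDict is a stand-in for the project's attribute-style config object):

    class AttrDict(dict):
        # minimal stand-in: keys are also readable as attributes
        __getattr__ = dict.__getitem__

    def mixed_precision_on(c) -> bool:
        return 'mixed_precision' in c and c.mixed_precision

    assert mixed_precision_on(AttrDict(mixed_precision=True))
    assert not mixed_precision_on(AttrDict(mixed_precision=False))
    assert not mixed_precision_on(AttrDict())  # missing key: no AttributeError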
From 281e708f47f00a66fbf01205e870706dac2cec05 Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Tue, 16 Mar 2021 20:56:11 +0100
Subject: [PATCH 43/43] Fix for: name 'file_names' is not defined

---
 notebooks/dataset_analysis/CheckDatasetSNR.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/notebooks/dataset_analysis/CheckDatasetSNR.ipynb b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
index 0aa07343..b022e362 100644
--- a/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
+++ b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook computes the average SNR a given Voice Dataset. If the SNR is too low, that might reduce the performance or prevent model to learn.\n",
+    "This notebook computes the average SNR of a given Voice Dataset. If the SNR is too low, it might reduce the performance or prevent the model from learning. The SNR paper can be seen here: https://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf\n",
     "\n",
     "To use this notebook, you need:\n",
     "- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
@@ -136,7 +136,7 @@
     "snrs = [tup[0] for tup in file_snrs]\n",
     "\n",
     "error_idxs = np.where(np.isnan(snrs) == True)[0]\n",
-    "error_files = [file_names[idx] for idx in error_idxs]\n",
+    "error_files = [wav_files[idx] for idx in error_idxs]\n",
     "\n",
     "file_snrs = [i for j, i in enumerate(file_snrs) if j not in error_idxs]\n",
     "file_names = [tup[1] for tup in file_snrs]\n",
@@ -236,7 +236,7 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
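For context on the final fix: WADA SNR estimation returns NaN for files it cannot score, and the notebook collects those by index into the original wav_files list; file_names is only rebuilt afterwards from the filtered (snr, name) tuples, which is why indexing it raised the NameError in the commit subject. The corrected flow, reduced to a few lines (names follow the notebook, the data is illustrative):

    import numpy as np

    file_snrs = [(35.2, 'a.wav'), (np.nan, 'b.wav'), (28.9, 'c.wav')]
    wav_files = ['a.wav', 'b.wav', 'c.wav']

    snrs = [tup[0] for tup in file_snrs]
    error_idxs = np.where(np.isnan(snrs))[0]
    error_files = [wav_files[idx] for idx in error_idxs]  # the fixed line

    file_snrs = [t for j, t in enumerate(file_snrs) if j not in error_idxs]
    file_names = [tup[1] for tup in file_snrs]

    print(error_files)  # ['b.wav']
    print(file_names)   # ['a.wav', 'c.wav']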