From 443659a7317870d15acd4579be9f63fa92053fcf Mon Sep 17 00:00:00 2001 From: Eren Date: Thu, 6 Sep 2018 15:31:07 +0200 Subject: [PATCH 1/4] reverse post-cbhg size --- layers/tacotron.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 336f2d9e..b0327917 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -249,10 +249,10 @@ class PostCBHG(nn.Module): self.cbhg = CBHG( mel_dim, K=8, - conv_bank_features=128, - conv_projections=[256, mel_dim], - highway_features=128, - gru_features=128, + conv_bank_features=80, + conv_projections=[160, mel_dim], + highway_features=80, + gru_features=80, num_highways=4) def forward(self, x): return self.cbhg(x) From c78ec7dc363088fad42d6e78693e565c47a051ff Mon Sep 17 00:00:00 2001 From: Eren Date: Thu, 6 Sep 2018 15:43:55 +0200 Subject: [PATCH 2/4] remove redundant variables from tests --- tests/loader_tests.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/loader_tests.py b/tests/loader_tests.py index f5e6b9d5..a53cf635 100644 --- a/tests/loader_tests.py +++ b/tests/loader_tests.py @@ -26,9 +26,7 @@ class TestLJSpeechDataset(unittest.TestCase): ref_level_db=c.ref_level_db, num_freq=c.num_freq, power=c.power, - preemphasis=c.preemphasis, - min_mel_freq=c.min_mel_freq, - max_mel_freq=c.max_mel_freq) + preemphasis=c.preemphasis) def test_loader(self): if ok_ljspeech: @@ -169,9 +167,7 @@ class TestKusalDataset(unittest.TestCase): ref_level_db=c.ref_level_db, num_freq=c.num_freq, power=c.power, - preemphasis=c.preemphasis, - min_mel_freq=c.min_mel_freq, - max_mel_freq=c.max_mel_freq) + preemphasis=c.preemphasis) def test_loader(self): if ok_kusal: From 28ef4a7acba9d577bae80e87571788d4405f5508 Mon Sep 17 00:00:00 2001 From: Eren Date: Fri, 7 Sep 2018 09:53:30 +0200 Subject: [PATCH 3/4] README update --- README.md | 59 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index cc578d26..a4de5159 100644 --- a/README.md +++ b/README.md @@ -58,35 +58,44 @@ You can also enjoy Tensorboard, if you point the Tensorboard argument```--logdir Example ```config.json```: ``` { - "model_name": "my-model", // used in the experiment folder name - "num_mels": 80, - "num_freq": 1025, - "sample_rate": 20000, - "frame_length_ms": 50, - "frame_shift_ms": 12.5, - "preemphasis": 0.97, - "min_level_db": -100, - "ref_level_db": 20, - "embedding_size": 256, - "text_cleaner": "english_cleaners", + "model_name": "TTS", + "model_description": "what is going on with this model.", + "audio_processor": "audio", //audio or audio_lws + "num_mels": 80, + "num_freq": 1025, + "sample_rate": 22000, + "frame_length_ms": 50, + "frame_shift_ms": 12.5, + "preemphasis": 0.97, + "min_level_db": -100, + "ref_level_db": 20, + "embedding_size": 256, + "text_cleaner": "english_cleaners", - "epochs": 1000, - "lr": 0.002, - "warmup_steps": 4000, - "batch_size": 32, - "eval_batch_size":32, - "r": 5, + "num_loader_workers": 4, - "griffin_lim_iters": 60, - "power": 1.5, + "epochs": 1000, + "lr": 0.002, + "warmup_steps": 4000, + "lr_decay": 0.5, + "decay_step": 100000, + "batch_size": 32, + "eval_batch_size":-1, + "r": 5, - "num_loader_workers": 8, + "griffin_lim_iters": 60, + "power": 1.5, - "checkpoint": true, - "save_step": 376, - "data_path": "/my/training/data/path", - "min_seq_len": 0, - "output_path": "/my/experiment/folder/path" + "checkpoint": true, + "save_step": 25000, + "print_step": 10, + "run_eval": false, + "data_path": "root/path/to/your/data", + "meta_file_train": "metadata.csv", + "meta_file_val": null, + "dataset": "LJSpeech", + "min_seq_len": 0, + "output_path": "../keep/" } ``` From 48035a329f386ac3394794dcac584d27428c2423 Mon Sep 17 00:00:00 2001 From: Eren Date: Fri, 7 Sep 2018 09:53:40 +0200 Subject: [PATCH 4/4] Remove redundant import --- server/synthesizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 534b1313..815f45a8 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -4,7 +4,6 @@ import librosa import torch import scipy import numpy as np -import soundfile as sf from TTS.utils.text import text_to_sequence from TTS.utils.generic_utils import load_config from TTS.utils.audio import AudioProcessor