From 72aa88fa069ecc750a2adc69945c611453c7e322 Mon Sep 17 00:00:00 2001
From: Eren G
Date: Wed, 8 Aug 2018 18:45:02 +0200
Subject: [PATCH] Formatting and printing more about the model

---
 config.json             |  4 ++--
 config_kusal.json       |  4 ++--
 tests/tacotron_tests.py | 10 +++++-----
 train.py                | 11 ++++++-----
 utils/audio.py          | 16 ++++++++++------
 5 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/config.json b/config.json
index 86cf0a01..f352f7ca 100644
--- a/config.json
+++ b/config.json
@@ -3,7 +3,7 @@
     "audio_processor": "audio",
     "num_mels": 80,
     "num_freq": 1025,
-    "sample_rate": 22050,
+    "sample_rate": 22000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
@@ -21,7 +21,7 @@
     "eval_batch_size":-1,
     "r": 5,
 
-    "griffin_lim_iters": 60,
+    "griffin_lim_iters": 50,
     "power": 1.5,
 
     "num_loader_workers": 8,
diff --git a/config_kusal.json b/config_kusal.json
index cd300f4d..c116d171 100644
--- a/config_kusal.json
+++ b/config_kusal.json
@@ -3,7 +3,7 @@
     "audio_processor": "audio",
     "num_mels": 80,
     "num_freq": 1025,
-    "sample_rate": 22050,
+    "sample_rate": 22000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
@@ -21,7 +21,7 @@
     "eval_batch_size":-1,
     "r": 5,
 
-    "griffin_lim_iters": 60,
+    "griffin_lim_iters": 50,
     "power": 1.5,
 
    "num_loader_workers": 8,
diff --git a/tests/tacotron_tests.py b/tests/tacotron_tests.py
index 52a5dfcd..37f36fa6 100644
--- a/tests/tacotron_tests.py
+++ b/tests/tacotron_tests.py
@@ -62,8 +62,8 @@ class TacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(),
                                     model_ref.parameters()):
             # ignore pre-higway layer since it works conditional
-            if count not in [148, 59]:
-                assert (param != param_ref).any(
-                ), "param {} with shape {} not updated!! \n{}\n{}".format(
-                    count, param.shape, param, param_ref)
-            count += 1
+            # if count not in [145, 59]:
+            assert (param != param_ref).any(
+            ), "param {} with shape {} not updated!! \n{}\n{}".format(
+                count, param.shape, param, param_ref)
+            count += 1
\ No newline at end of file
diff --git a/train.py b/train.py
index 84b64242..23ac7b93 100644
--- a/train.py
+++ b/train.py
@@ -37,7 +37,7 @@ def train(model, criterion, criterion_st, data_loader, optimizer, optimizer_st,
     avg_step_time = 0
     print(" | > Epoch {}/{}".format(epoch, c.epochs), flush=True)
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
-    batch_n_iter = len(data_loader.dataset) / c.batch_size
+    batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()
 
@@ -321,13 +321,14 @@ def evaluate(model, criterion, criterion_st, data_loader, ap, current_step):
     # test sentences
     ap.griffin_lim_iters = 60
     for idx, test_sentence in enumerate(test_sentences):
+        try:
             wav, linear_spec, alignments = synthesis(model, ap, test_sentence,
                                                      use_cuda, c.text_cleaner)
-        try:
-            wav_name = 'TestSentences/{}'.format(idx)
-            tb.add_audio(
-                wav_name, wav, current_step, sample_rate=c.sample_rate)
+            wav_name = 'TestSentences/{}'.format(idx)
+            tb.add_audio(
+                wav_name, wav, current_step, sample_rate=c.sample_rate)
         except:
+            print(" !! Error creating Test Sentence -", idx)
             pass
         align_img = alignments[0].data.cpu().numpy()
         linear_spec = plot_spectrogram(linear_spec, ap)
diff --git a/utils/audio.py b/utils/audio.py
index 92528e24..d62f9f71 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -23,6 +23,7 @@ class AudioProcessor(object):
                  max_mel_freq,
                  griffin_lim_iters=None):
 
+        print(" > Setting up Audio Processor...")
         self.sample_rate = sample_rate
         self.num_mels = num_mels
         self.min_level_db = min_level_db
@@ -36,11 +37,12 @@ class AudioProcessor(object):
         self.max_mel_freq = max_mel_freq
         self.griffin_lim_iters = griffin_lim_iters
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+        if preemphasis == 0:
+            print(" | > Preemphasis is disabled.")
 
     def save_wav(self, wav, path):
         wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        librosa.output.write_wav(
-            path, wav.astype(np.int16), self.sample_rate)
+        librosa.output.write_wav(path, wav.astype(np.int16), self.sample_rate)
 
     def _linear_to_mel(self, spectrogram):
         global _mel_basis
@@ -64,6 +66,10 @@ class AudioProcessor(object):
         n_fft = (self.num_freq - 1) * 2
         hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
         win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
+        hop_length = 256
+        win_length = 1024
+        print(" | > fft size: {}, hop length: {}, win length: {}".format(
+            n_fft, hop_length, win_length))
         return n_fft, hop_length, win_length
 
     def _amp_to_db(self, x):
@@ -123,13 +129,11 @@ class AudioProcessor(object):
         return self._normalize(S)
 
     def _stft(self, y):
-        n_fft, hop_length, win_length = self._stft_parameters()
         return librosa.stft(
-            y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
+            y=y, n_fft=self.n_fft, hop_length=self.hop_length,
+            win_length=self.win_length)
 
     def _istft(self, y):
-        _, hop_length, win_length = self._stft_parameters()
-        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+        return librosa.istft(y, hop_length=self.hop_length,
+                             win_length=self.win_length)
 
     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
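
Note on the _stft_parameters hunk: at the new 22000 Hz sample rate the
millisecond-based formulas give a 275-sample hop and an 1100-sample window,
which the added lines then override with nearby powers of two. A quick
sketch of that arithmetic, using the config values from this patch (not
part of the diff itself):

    # Values from config.json after this patch.
    sample_rate = 22000
    frame_shift_ms = 12.5
    frame_length_ms = 50
    num_freq = 1025

    n_fft = (num_freq - 1) * 2                                # 2048
    hop_length = int(frame_shift_ms / 1000.0 * sample_rate)   # 275
    win_length = int(frame_length_ms / 1000.0 * sample_rate)  # 1100

    # ...which the patch then replaces with hardcoded powers of two:
    hop_length = 256   # overrides 275
    win_length = 1024  # overrides 1100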
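
The griffin_lim_iters values touched here (50 in the configs, 60 forced in
evaluate()) set the iteration count for Griffin-Lim phase reconstruction.
For reference, a minimal sketch of the standard algorithm using the same
librosa STFT parameters as above; this is an illustration, not the repo's
implementation:

    import numpy as np
    import librosa

    def griffin_lim(S, n_fft=2048, hop_length=256, win_length=1024, n_iters=50):
        # S is a linear magnitude spectrogram; start from random phase.
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        y = librosa.istft(S * angles, hop_length=hop_length,
                          win_length=win_length)
        for _ in range(n_iters):
            # Keep the target magnitude, adopt the phase of the current estimate.
            D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                             win_length=win_length)
            angles = np.exp(1j * np.angle(D))
            y = librosa.istft(S * angles, hop_length=hop_length,
                              win_length=win_length)
        return y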