From d96690f83f5bc804f532b6c2d5cca208de401e7a Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Fri, 2 Nov 2018 17:27:31 +0100
Subject: [PATCH] Config updates and add sigmoid to mel network again

---
 config.json                  | 6 ++++--
 datasets/TTSDatasetCached.py | 5 +++--
 layers/tacotron.py           | 1 +
 utils/audio.py               | 6 +++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/config.json b/config.json
index 1491601d..b9214573 100644
--- a/config.json
+++ b/config.json
@@ -4,6 +4,7 @@
 
     "audio":{
         "audio_processor": "audio",     // to use dictate different audio processors, if available.
+        // Audio processing parameters
         "num_mels": 80,         // size of the mel spec frame.
         "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectogram frame.
         "sample_rate": 22050,   // wav sample-rate. If different than the original data, it is resampled.
@@ -14,6 +15,7 @@
         "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
         "power": 1.5,           // value to sharpen wav signals after GL algorithm.
         "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+        // Normalization parameters
         "signal_norm": true,    // normalize the spec values in range [0, 1]
         "symmetric_norm": false, // move normalization to range [-1, 1]
         "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
@@ -22,7 +24,7 @@
         "mel_fmax": null        // maximum freq level for mel-spec. Tune for dataset!!
     },
 
-    "embedding_size": 256,
+    "embedding_size": 256,
     "text_cleaner": "english_cleaners",
     "epochs": 1000,
     "lr": 0.0015,
@@ -36,7 +38,7 @@
 
     "print_step": 10,
     "run_eval": true,
-    "data_path": "../../Data/LJSpeech-1.1/tts_cache",  // can overwritten from command argument
+    "data_path": "../../Data/LJSpeech-1.1/",  // can overwritten from command argument
     "meta_file_train": "metadata_train.csv", // metafile for training dataloader
     "meta_file_val": "metadata_val.csv",     // metafile for validation dataloader
     "data_loader": "TTSDataset",             // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]
diff --git a/datasets/TTSDatasetCached.py b/datasets/TTSDatasetCached.py
index 57a58f55..b5c6d4ce 100644
--- a/datasets/TTSDatasetCached.py
+++ b/datasets/TTSDatasetCached.py
@@ -24,6 +24,7 @@ class MyDataset(Dataset):
                  min_seq_len=0,
                  **kwargs
                  ):
+        self.ap = ap
         self.root_path = root_path
         self.batch_group_size = batch_group_size
         self.feat_dir = os.path.join(root_path, 'loader_data')
@@ -38,7 +39,7 @@ class MyDataset(Dataset):
 
     def load_wav(self, filename):
         try:
-            audio = librosa.core.load(filename, sr=self.sample_rate)
+            audio = self.ap.load_wav(filename)
             return audio
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
@@ -90,7 +91,7 @@ class MyDataset(Dataset):
         if wav_name.split('.')[-1] == 'npy':
             wav = self.load_np(wav_name)
         else:
-            wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
+            wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         mel = self.load_np(mel_name)
         linear = self.load_np(linear_name)
         sample = {
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 749d7cb3..83d0d28b 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -416,6 +416,7 @@ class Decoder(nn.Module):
             decoder_output = decoder_input
             # predict mel vectors from decoder vectors
             output = self.proj_to_mel(decoder_output)
+            output = torch.sigmoid(output)
             # predict stop token
             stopnet_input = torch.cat([decoder_input, output], -1)
             stop_token = self.stopnet(stopnet_input)
diff --git a/utils/audio.py b/utils/audio.py
index a82eaeba..8b961099 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -48,11 +48,10 @@ class AudioProcessor(object):
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        if preemphasis == 0:
-            print(" | > Preemphasis is deactive.")
         print(" | > Audio Processor attributes.")
         members = vars(self)
-        pprint(members)
+        for key, value in members.items():
+            print(" | > {}:{}".format(key, value))
 
     def save_wav(self, wav, path):
         wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@@ -226,6 +225,7 @@ class AudioProcessor(object):
 
     def load_wav(self, filename, encode=False):
         x, sr = librosa.load(filename, sr=self.sample_rate)
+        # sr, x = io.wavfile.read(filename)
         assert self.sample_rate == sr
         return x
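
Note: the torch.sigmoid added in layers/tacotron.py bounds the decoder's mel prediction to (0, 1), which lines up with the [0, 1] spectrogram normalization chosen in config.json ("signal_norm": true, "symmetric_norm": false, "max_norm": 1). The snippet below is a minimal standalone sketch of that pairing, not the repo's Decoder; proj_to_mel and decoder_output are hypothetical stand-ins, 80 matches "num_mels" from the config, and the 256 decoder width is illustrative only.

import torch

# Sketch (hypothetical names): with targets normalized into [0, 1],
# a sigmoid on the mel projection keeps predictions in the targets' range.
proj_to_mel = torch.nn.Linear(256, 80)   # 80 mel channels as in "num_mels"; 256 is illustrative
decoder_output = torch.randn(4, 256)     # fake batch of decoder states
output = torch.sigmoid(proj_to_mel(decoder_output))
print(output.min().item() >= 0.0, output.max().item() <= 1.0)  # both True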