Config updates and add sigmoid to mel network again

This commit is contained in:
Eren Golge 2018-11-02 17:27:31 +01:00
parent 4681f935b4
commit d96690f83f
4 changed files with 11 additions and 7 deletions

View File

@@ -4,6 +4,7 @@
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
@@ -14,6 +15,7 @@
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": false, // move normalization to range [-1, 1]
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
@@ -22,7 +24,7 @@
"mel_fmax": null // maximum freq level for mel-spec. Tune for dataset!!
},
"embedding_size": 256,
"embedding_size": 256,
"text_cleaner": "english_cleaners",
"epochs": 1000,
"lr": 0.0015,
@@ -36,7 +38,7 @@
"print_step": 10,
"run_eval": true,
"data_path": "../../Data/LJSpeech-1.1/tts_cache", // can overwritten from command argument
"data_path": "../../Data/LJSpeech-1.1/", // can overwritten from command argument
"meta_file_train": "metadata_train.csv", // metafile for training dataloader
"meta_file_val": "metadata_val.csv", // metafile for validation dataloader
"data_loader": "TTSDataset", // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]

View File

@@ -24,6 +24,7 @@ class MyDataset(Dataset):
min_seq_len=0,
**kwargs
):
self.ap = ap
self.root_path = root_path
self.batch_group_size = batch_group_size
self.feat_dir = os.path.join(root_path, 'loader_data')
@@ -38,7 +39,7 @@ class MyDataset(Dataset):
def load_wav(self, filename):
try:
audio = librosa.core.load(filename, sr=self.sample_rate)
audio = self.ap.load_wav(filename)
return audio
except RuntimeError as e:
print(" !! Cannot read file : {}".format(filename))
@@ -90,7 +91,7 @@ class MyDataset(Dataset):
if wav_name.split('.')[-1] == 'npy':
wav = self.load_np(wav_name)
else:
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
mel = self.load_np(mel_name)
linear = self.load_np(linear_name)
sample = {
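
The hunks above route wav loading through the shared audio processor and drop the tuple indexing. A small stand-alone sketch of why the [0] goes away (the class name and file path are illustrative placeholders, not this repo's code):

import librosa
import numpy as np

# librosa.load returns a (samples, sample_rate) tuple, hence the old [0] indexing
y, sr = librosa.load("example.wav", sr=22050)   # "example.wav" is a placeholder path
wav_old = np.asarray(y, dtype=np.float32)

# an AudioProcessor-style helper returns only the samples, so __getitem__
# can cast the array directly
class LoaderSketch(object):
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
    def load_wav(self, filename):
        x, sr = librosa.load(filename, sr=self.sample_rate)
        return x

ap = LoaderSketch(22050)
wav_new = np.asarray(ap.load_wav("example.wav"), dtype=np.float32)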

View File

@@ -416,6 +416,7 @@ class Decoder(nn.Module):
decoder_output = decoder_input
# predict mel vectors from decoder vectors
output = self.proj_to_mel(decoder_output)
output = torch.sigmoid(output)
# predict stop token
stopnet_input = torch.cat([decoder_input, output], -1)
stop_token = self.stopnet(stopnet_input)
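
The new sigmoid keeps each predicted mel frame in (0, 1), the same range the targets occupy when spectrograms are normalized with "signal_norm": true and "max_norm": 1. A minimal stand-alone sketch of this output step; proj_to_mel and stopnet are stand-in modules with illustrative sizes, not the repo's real layers:

import torch

batch, decoder_dim, mel_dim = 2, 256, 80
decoder_output = torch.randn(batch, decoder_dim)
proj_to_mel = torch.nn.Linear(decoder_dim, mel_dim)
stopnet = torch.nn.Sequential(torch.nn.Linear(decoder_dim + mel_dim, 1),
                              torch.nn.Sigmoid())

# predict mel vectors from decoder vectors, bounded to (0, 1) by the sigmoid
output = torch.sigmoid(proj_to_mel(decoder_output))
# predict stop token from the decoder state and the bounded mel frame
stopnet_input = torch.cat([decoder_output, output], -1)
stop_token = stopnet(stopnet_input)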

View File

@@ -48,11 +48,10 @@ class AudioProcessor(object):
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
if preemphasis == 0:
print(" | > Preemphasis is deactive.")
print(" | > Audio Processor attributes.")
members = vars(self)
pprint(members)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@@ -226,6 +225,7 @@ class AudioProcessor(object):
def load_wav(self, filename, encode=False):
x, sr = librosa.load(filename, sr=self.sample_rate)
# sr, x = io.wavfile.read(filename)
assert self.sample_rate == sr
return x
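
Since librosa.load is asked to resample to self.sample_rate, the new assert is a sanity check that the returned rate matches the configured one. Below is a trimmed stand-in showing that contract together with the new attribute-printing loop; the class name and default rate are illustrative, not the repo's AudioProcessor.

import librosa

class AudioProcessorSketch(object):
    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
        # print attributes one per line, as the new loop above does
        for key, value in vars(self).items():
            print(" | > {}:{}".format(key, value))

    def load_wav(self, filename):
        # librosa resamples to the requested rate, so sr should always
        # equal self.sample_rate here
        x, sr = librosa.load(filename, sr=self.sample_rate)
        assert self.sample_rate == sr
        return x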