Config updates and add sigmoid to mel network again

This commit is contained in:
Eren Golge 2018-11-02 17:27:31 +01:00
parent 4681f935b4
commit d96690f83f
4 changed files with 11 additions and 7 deletions

View File

@@ -4,6 +4,7 @@
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
@@ -14,6 +15,7 @@
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": false, // move normalization to range [-1, 1]
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
@@ -22,7 +24,7 @@
"mel_fmax": null // maximum freq level for mel-spec. Tune for dataset!!
},
"embedding_size": 256,
"embedding_size": 256,
"text_cleaner": "english_cleaners",
"epochs": 1000,
"lr": 0.0015,
@@ -36,7 +38,7 @@
"print_step": 10,
"run_eval": true,
"data_path": "../../Data/LJSpeech-1.1/tts_cache", // can overwritten from command argument
"data_path": "../../Data/LJSpeech-1.1/", // can overwritten from command argument
"meta_file_train": "metadata_train.csv", // metafile for training dataloader
"meta_file_val": "metadata_val.csv", // metafile for validation dataloader
"data_loader": "TTSDataset", // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]

View File

@@ -24,6 +24,7 @@ class MyDataset(Dataset):
min_seq_len=0,
**kwargs
):
self.ap = ap
self.root_path = root_path
self.batch_group_size = batch_group_size
self.feat_dir = os.path.join(root_path, 'loader_data')
@@ -38,7 +39,7 @@ class MyDataset(Dataset):
def load_wav(self, filename):
try:
audio = librosa.core.load(filename, sr=self.sample_rate)
audio = self.ap.load_wav(filename)
return audio
except RuntimeError as e:
print(" !! Cannot read file : {}".format(filename))
@@ -90,7 +91,7 @@ class MyDataset(Dataset):
if wav_name.split('.')[-1] == 'npy':
wav = self.load_np(wav_name)
else:
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
mel = self.load_np(mel_name)
linear = self.load_np(linear_name)
sample = {
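
The hunks above route wav loading through the shared audio processor and drop the tuple indexing. A small stand-alone sketch of why the [0] goes away (the class name and file path are illustrative placeholders, not this repo's code):

import librosa
import numpy as np

# librosa.load returns a (samples, sample_rate) tuple, hence the old [0] indexing
y, sr = librosa.load("example.wav", sr=22050)   # "example.wav" is a placeholder path
wav_old = np.asarray(y, dtype=np.float32)

# an AudioProcessor-style helper returns only the samples, so __getitem__
# can cast the array directly
class LoaderSketch(object):
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
    def load_wav(self, filename):
        x, sr = librosa.load(filename, sr=self.sample_rate)
        return x

ap = LoaderSketch(22050)
wav_new = np.asarray(ap.load_wav("example.wav"), dtype=np.float32)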

View File

@@ -416,6 +416,7 @@ class Decoder(nn.Module):
decoder_output = decoder_input
# predict mel vectors from decoder vectors
output = self.proj_to_mel(decoder_output)
output = torch.sigmoid(output)
# predict stop token
stopnet_input = torch.cat([decoder_input, output], -1)
stop_token = self.stopnet(stopnet_input)
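
The new sigmoid keeps each predicted mel frame in (0, 1), the same range the targets occupy when spectrograms are normalized with "signal_norm": true and "max_norm": 1. A minimal stand-alone sketch of this output step; proj_to_mel and stopnet are stand-in modules with illustrative sizes, not the repo's real layers:

import torch

batch, decoder_dim, mel_dim = 2, 256, 80
decoder_output = torch.randn(batch, decoder_dim)
proj_to_mel = torch.nn.Linear(decoder_dim, mel_dim)
stopnet = torch.nn.Sequential(torch.nn.Linear(decoder_dim + mel_dim, 1),
                              torch.nn.Sigmoid())

# predict mel vectors from decoder vectors, bounded to (0, 1) by the sigmoid
output = torch.sigmoid(proj_to_mel(decoder_output))
# predict stop token from the decoder state and the bounded mel frame
stopnet_input = torch.cat([decoder_output, output], -1)
stop_token = stopnet(stopnet_input)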

View File

@@ -48,11 +48,10 @@ class AudioProcessor(object):
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
if preemphasis == 0:
print(" | > Preemphasis is deactive.")
print(" | > Audio Processor attributes.")
members = vars(self)
pprint(members)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@@ -226,6 +225,7 @@ class AudioProcessor(object):
def load_wav(self, filename, encode=False):
x, sr = librosa.load(filename, sr=self.sample_rate)
# sr, x = io.wavfile.read(filename)
assert self.sample_rate == sr
return x
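
Since librosa.load is asked to resample to self.sample_rate, the new assert is a sanity check that the returned rate matches the configured one. Below is a trimmed stand-in showing that contract together with the new attribute-printing loop; the class name and default rate are illustrative, not the repo's AudioProcessor.

import librosa

class AudioProcessorSketch(object):
    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
        # print attributes one per line, as the new loop above does
        for key, value in vars(self).items():
            print(" | > {}:{}".format(key, value))

    def load_wav(self, filename):
        # librosa resamples to the requested rate, so sr should always
        # equal self.sample_rate here
        x, sr = librosa.load(filename, sr=self.sample_rate)
        assert self.sample_rate == sr
        return x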