mirror of https://github.com/coqui-ai/TTS.git
Config updates and add sigmoid to mel network again
This commit is contained in:
parent
4681f935b4
commit
d96690f83f
|
@ -4,6 +4,7 @@
|
|||
|
||||
"audio":{
|
||||
"audio_processor": "audio", // selects which audio processor to use, if different ones are available.
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
||||
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
|
||||
|
@ -14,6 +15,7 @@
|
|||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
|
@ -22,7 +24,7 @@
|
|||
"mel_fmax": null // maximum freq level for mel-spec. Tune for dataset!!
|
||||
},
|
||||
|
||||
"embedding_size": 256,
|
||||
"embedding_size": 256,
|
||||
"text_cleaner": "english_cleaners",
|
||||
"epochs": 1000,
|
||||
"lr": 0.0015,
|
||||
|
@ -36,7 +38,7 @@
|
|||
"print_step": 10,
|
||||
|
||||
"run_eval": true,
|
||||
"data_path": "../../Data/LJSpeech-1.1/tts_cache", // can be overwritten by a command-line argument
|
||||
"data_path": "../../Data/LJSpeech-1.1/", // can be overwritten by a command-line argument
|
||||
"meta_file_train": "metadata_train.csv", // metafile for training dataloader
|
||||
"meta_file_val": "metadata_val.csv", // metafile for validation dataloader
|
||||
"data_loader": "TTSDataset", // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]
|
||||
|
|
|
@ -24,6 +24,7 @@ class MyDataset(Dataset):
|
|||
min_seq_len=0,
|
||||
**kwargs
|
||||
):
|
||||
self.ap = ap
|
||||
self.root_path = root_path
|
||||
self.batch_group_size = batch_group_size
|
||||
self.feat_dir = os.path.join(root_path, 'loader_data')
|
||||
|
@ -38,7 +39,7 @@ class MyDataset(Dataset):
|
|||
|
||||
def load_wav(self, filename):
|
||||
try:
|
||||
audio = librosa.core.load(filename, sr=self.sample_rate)
|
||||
audio = self.ap.load_wav(filename)
|
||||
return audio
|
||||
except RuntimeError as e:
|
||||
print(" !! Cannot read file : {}".format(filename))
|
||||
|
@ -90,7 +91,7 @@ class MyDataset(Dataset):
|
|||
if wav_name.split('.')[-1] == 'npy':
|
||||
wav = self.load_np(wav_name)
|
||||
else:
|
||||
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
|
||||
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
|
||||
mel = self.load_np(mel_name)
|
||||
linear = self.load_np(linear_name)
|
||||
sample = {
|
||||
|
|
|
@ -416,6 +416,7 @@ class Decoder(nn.Module):
|
|||
decoder_output = decoder_input
|
||||
# predict mel vectors from decoder vectors
|
||||
output = self.proj_to_mel(decoder_output)
|
||||
output = torch.sigmoid(output)
|
||||
# predict stop token
|
||||
stopnet_input = torch.cat([decoder_input, output], -1)
|
||||
stop_token = self.stopnet(stopnet_input)
|
||||
|
|
|
@ -48,11 +48,10 @@ class AudioProcessor(object):
|
|||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||
self.clip_norm = clip_norm
|
||||
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
|
||||
if preemphasis == 0:
|
||||
print(" | > Preemphasis is deactive.")
|
||||
print(" | > Audio Processor attributes.")
|
||||
members = vars(self)
|
||||
pprint(members)
|
||||
for key, value in members.items():
|
||||
print(" | > {}:{}".format(key, value))
|
||||
|
||||
def save_wav(self, wav, path):
|
||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||
|
@ -226,6 +225,7 @@ class AudioProcessor(object):
|
|||
|
||||
def load_wav(self, filename, encode=False):
|
||||
x, sr = librosa.load(filename, sr=self.sample_rate)
|
||||
# sr, x = io.wavfile.read(filename)
|
||||
assert self.sample_rate == sr
|
||||
return x
|
||||
|
||||
|
|
Loading…
Reference in New Issue