mirror of https://github.com/coqui-ai/TTS.git

Config updates and add sigmoid to mel network again

This commit is contained in:
parent 4681f935b4
commit d96690f83f
@@ -4,6 +4,7 @@
     "audio":{
         "audio_processor": "audio", // used to dictate different audio processors, if available.
+        // Audio processing parameters
         "num_mels": 80,          // size of the mel spec frame.
         "num_freq": 1025,        // number of stft frequency levels. Size of the linear spectrogram frame.
         "sample_rate": 22050,    // wav sample-rate. If different than the original data, it is resampled.
@@ -14,6 +15,7 @@
         "ref_level_db": 20,      // reference level db, theoretically 20db is the sound of air.
         "power": 1.5,            // value to sharpen wav signals after GL algorithm.
         "griffin_lim_iters": 60, // number of Griffin-Lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+        // Normalization parameters
         "signal_norm": true,     // normalize the spec values in range [0, 1]
         "symmetric_norm": false, // move normalization to range [-1, 1]
         "max_norm": 1,           // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
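A minimal sketch of how these three flags typically combine when a spectrogram is normalized. The function name and the `min_level_db` default are assumptions for illustration; the full config is not shown in this diff:

    import numpy as np

    def normalize_spec(S_db, min_level_db=-100, max_norm=1.0,
                       signal_norm=True, symmetric_norm=False):
        # hypothetical helper; `min_level_db` is assumed, not shown in this diff
        if not signal_norm:
            return S_db
        # scale db values into [0, 1]
        S_norm = np.clip((S_db - min_level_db) / -min_level_db, 0.0, 1.0)
        if symmetric_norm:
            # shift to [-max_norm, max_norm]
            return (2.0 * S_norm - 1.0) * max_norm
        return S_norm * max_norm  # [0, max_norm]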
@@ -36,7 +38,7 @@
     "print_step": 10,

     "run_eval": true,
-    "data_path": "../../Data/LJSpeech-1.1/tts_cache", // can be overwritten from the command line
+    "data_path": "../../Data/LJSpeech-1.1/", // can be overwritten from the command line
     "meta_file_train": "metadata_train.csv",  // metafile for training dataloader
     "meta_file_val": "metadata_val.csv",      // metafile for validation dataloader
     "data_loader": "TTSDataset",              // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]
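Note that data_path moves from the precomputed tts_cache folder to the raw LJSpeech-1.1 directory, presumably so wav files can be read directly through the audio processor (see the dataset hunks below).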
@@ -24,6 +24,7 @@ class MyDataset(Dataset):
                  min_seq_len=0,
                  **kwargs
                  ):
+        self.ap = ap
         self.root_path = root_path
         self.batch_group_size = batch_group_size
         self.feat_dir = os.path.join(root_path, 'loader_data')
@@ -38,7 +39,7 @@ class MyDataset(Dataset):

     def load_wav(self, filename):
         try:
-            audio = librosa.core.load(filename, sr=self.sample_rate)
+            audio = self.ap.load_wav(filename)
             return audio
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
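This hunk and the next one replace the dataset's direct librosa call with the shared audio processor passed in through `ap`. A minimal sketch of the delegation pattern, using stand-in classes rather than the repo's actual AudioProcessor and MyDataset:

    import librosa

    class SimpleAP:
        """Stand-in for the repo's AudioProcessor (hypothetical, reduced)."""
        def __init__(self, sample_rate=22050):
            self.sample_rate = sample_rate

        def load_wav(self, filename):
            x, sr = librosa.load(filename, sr=self.sample_rate)
            return x

    class MiniDataset:
        """Stand-in for MyDataset, reduced to the wav-loading path."""
        def __init__(self, ap):
            self.ap = ap  # one shared processor instead of per-class librosa calls

        def load_wav(self, filename):
            try:
                return self.ap.load_wav(filename)
            except RuntimeError:
                print(" !! Cannot read file : {}".format(filename))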
@@ -90,7 +91,7 @@ class MyDataset(Dataset):
         if wav_name.split('.')[-1] == 'npy':
             wav = self.load_np(wav_name)
         else:
-            wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
+            wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         mel = self.load_np(mel_name)
         linear = self.load_np(linear_name)
         sample = {
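The dropped `[0]` index follows from the loader change above: `librosa.core.load` returns an `(audio, sample_rate)` tuple, so the old code had to take the first element, while `ap.load_wav` returns the audio array directly.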
@@ -416,6 +416,7 @@ class Decoder(nn.Module):
         decoder_output = decoder_input
         # predict mel vectors from decoder vectors
         output = self.proj_to_mel(decoder_output)
+        output = torch.sigmoid(output)
         # predict stop token
         stopnet_input = torch.cat([decoder_input, output], -1)
         stop_token = self.stopnet(stopnet_input)
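This is the change named in the commit message: the projected mel frame is passed through a sigmoid, bounding each value to (0, 1) to match the `"signal_norm": true` spectrogram range configured above. A self-contained sketch of the output step; the dimensions and the stopnet structure are illustrative assumptions, not the model's actual definitions:

    import torch
    import torch.nn as nn

    decoder_dim, mel_dim = 256, 80  # illustrative sizes, not the model's
    proj_to_mel = nn.Linear(decoder_dim, mel_dim)
    stopnet = nn.Sequential(nn.Linear(decoder_dim + mel_dim, 1), nn.Sigmoid())

    decoder_output = torch.randn(1, decoder_dim)
    output = proj_to_mel(decoder_output)
    output = torch.sigmoid(output)  # mel frame now lies in (0, 1)
    stopnet_input = torch.cat([decoder_output, output], -1)
    stop_token = stopnet(stopnet_input)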
@@ -48,11 +48,10 @@ class AudioProcessor(object):
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        if preemphasis == 0:
-            print(" | > Preemphasis is deactive.")
         print(" | > Audio Processor attributes.")
         members = vars(self)
-        pprint(members)
+        for key, value in members.items():
+            print(" | > {}:{}".format(key, value))

     def save_wav(self, wav, path):
         wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
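Replacing `pprint(members)` with an explicit loop gives every attribute the same ` | > ` log prefix the class uses elsewhere, so the attribute dump lines up with the rest of the console output. The separate "Preemphasis is deactive" message is dropped, presumably because the loop now prints the preemphasis value along with everything else.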
@@ -226,6 +225,7 @@ class AudioProcessor(object):

     def load_wav(self, filename, encode=False):
         x, sr = librosa.load(filename, sr=self.sample_rate)
+        # sr, x = io.wavfile.read(filename)
         assert self.sample_rate == sr
         return x
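One note on the assert: because `sr` is passed explicitly, `librosa.load` resamples on read and returns the requested rate, so the check mainly guards against a misconfigured `sample_rate` rather than mismatched files. A quick illustration; the filename is a placeholder:

    import librosa

    # librosa resamples to the requested rate when `sr` is given,
    # so the returned rate matches the request.
    x, sr = librosa.load("example.wav", sr=22050)  # "example.wav" is a placeholder
    assert sr == 22050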