mirror of https://github.com/coqui-ai/TTS.git

Formating and printing more about the model

parent 2506cd136a
commit 72aa88fa06
@@ -3,7 +3,7 @@
     "audio_processor": "audio",
     "num_mels": 80,
     "num_freq": 1025,
-    "sample_rate": 22050,
+    "sample_rate": 22000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
@@ -21,7 +21,7 @@
     "eval_batch_size":-1,
     "r": 5,

-    "griffin_lim_iters": 60,
+    "griffin_lim_iters": 50,
     "power": 1.5,

     "num_loader_workers": 8,
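A note on the config hunks above: sample_rate together with frame_shift_ms and frame_length_ms determines the STFT hop and window sizes used later in this commit. A minimal standalone sketch of that mapping (plain arithmetic, not the project's code):

    # Sketch: derive STFT sizes from the config values above.
    sample_rate = 22000       # "sample_rate" after this commit
    frame_shift_ms = 12.5     # "frame_shift_ms"
    frame_length_ms = 50      # "frame_length_ms"
    num_freq = 1025           # "num_freq"

    n_fft = (num_freq - 1) * 2                                # 2048
    hop_length = int(frame_shift_ms / 1000.0 * sample_rate)   # 275
    win_length = int(frame_length_ms / 1000.0 * sample_rate)  # 1100
    print(n_fft, hop_length, win_length)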
@@ -62,8 +62,8 @@ class TacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(),
                                     model_ref.parameters()):
             # ignore pre-higway layer since it works conditional
-            if count not in [148, 59]:
+            # if count not in [145, 59]:
             assert (param != param_ref).any(
             ), "param {} with shape {} not updated!! \n{}\n{}".format(
                 count, param.shape, param, param_ref)
             count += 1
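The test above follows a common pattern for checking that backpropagation reaches every layer: deep-copy the model, run one training step, then assert that each parameter moved. A minimal self-contained sketch of the same idea (using a stand-in linear model, not the repo's Tacotron):

    import copy
    import torch

    model = torch.nn.Linear(4, 2)        # stand-in for the real model
    model_ref = copy.deepcopy(model)     # frozen reference copy
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # one training step on random data
    loss = model(torch.randn(8, 4)).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # every parameter should now differ from its reference copy
    for count, (param, param_ref) in enumerate(
            zip(model.parameters(), model_ref.parameters())):
        assert (param != param_ref).any(), "param {} not updated".format(count)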
train.py
@@ -37,7 +37,7 @@ def train(model, criterion, criterion_st, data_loader, optimizer, optimizer_st,
     avg_step_time = 0
     print(" | > Epoch {}/{}".format(epoch, c.epochs), flush=True)
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
-    batch_n_iter = len(data_loader.dataset) / c.batch_size
+    batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()

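The batch_n_iter fix matters because "/" under Python 3 is true division and returns a float; wrapping it in int() truncates toward zero, so any final partial batch is not counted. A quick illustration:

    >>> len_dataset, batch_size = 1000, 32
    >>> len_dataset / batch_size
    31.25
    >>> int(len_dataset / batch_size)
    31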
@@ -321,13 +321,14 @@ def evaluate(model, criterion, criterion_st, data_loader, ap, current_step):
     # test sentences
     ap.griffin_lim_iters = 60
     for idx, test_sentence in enumerate(test_sentences):
+        try:
         wav, linear_spec, alignments = synthesis(model, ap, test_sentence,
                                                  use_cuda, c.text_cleaner)
-        try:
             wav_name = 'TestSentences/{}'.format(idx)
             tb.add_audio(
                 wav_name, wav, current_step, sample_rate=c.sample_rate)
         except:
+            print(" !! Error as creating Test Sentence -", idx)
             pass
         align_img = alignments[0].data.cpu().numpy()
         linear_spec = plot_spectrogram(linear_spec, ap)
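This evaluate() change widens the try block so a failure inside synthesis() itself is also caught, and the new print makes the failure visible instead of passing silently. One caveat: a bare "except:" also swallows KeyboardInterrupt; catching Exception is the narrower, more conventional form. A sketch of that variant (synthesize_test_sentence is a hypothetical helper, not the repo's API):

    try:
        wav = synthesize_test_sentence(test_sentence)  # hypothetical helper
    except Exception as err:                           # narrower than bare except
        print(" !! Error as creating Test Sentence -", idx, err)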
@@ -23,6 +23,7 @@ class AudioProcessor(object):
                  max_mel_freq,
                  griffin_lim_iters=None):

+        print(" > Setting up Audio Processor...")
         self.sample_rate = sample_rate
         self.num_mels = num_mels
         self.min_level_db = min_level_db
@@ -36,11 +37,12 @@ class AudioProcessor(object):
         self.max_mel_freq = max_mel_freq
         self.griffin_lim_iters = griffin_lim_iters
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+        if preemphasis == 0:
+            print(" | > Preemphasis is deactive.")

     def save_wav(self, wav, path):
         wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        librosa.output.write_wav(
-            path, wav.astype(np.int16), self.sample_rate)
+        librosa.output.write_wav(path, wav.astype(np.int16), self.sample_rate)

     def _linear_to_mel(self, spectrogram):
         global _mel_basis
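The save_wav() reflow above is behavior-preserving: the surrounding code scales a float waveform so its peak fills the int16 range before writing. A standalone sketch of the same scaling (written against scipy.io.wavfile, since librosa.output.write_wav was removed in later librosa releases):

    import numpy as np
    from scipy.io import wavfile

    def save_wav(wav, path, sample_rate=22000):
        # scale peak amplitude to the int16 range, guarding against silence
        wav = wav * (32767 / max(0.01, np.max(np.abs(wav))))
        wavfile.write(path, sample_rate, wav.astype(np.int16))

    save_wav(np.sin(np.linspace(0, 100, 22000)), "/tmp/test.wav")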
@@ -64,6 +66,10 @@ class AudioProcessor(object):
         n_fft = (self.num_freq - 1) * 2
         hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
         win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
+        hop_length = 256
+        win_length = 1024
+        print(" | > fft size: {}, hop length: {}, win length: {}".format(
+            n_fft, hop_length, win_length))
         return n_fft, hop_length, win_length

     def _amp_to_db(self, x):
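The two added assignments pin hop_length and win_length to power-of-two values, overriding the sizes just computed from the config (275 and 1100 at 22000 Hz). The print then reports the pinned values. Note that changing the hop also changes the spectrogram frame rate; a quick check of the effect:

    sample_rate = 22000
    for hop in (275, 256):
        print("hop {} -> {:.1f} frames/sec".format(hop, sample_rate / hop))
    # hop 275 -> 80.0 frames/sec
    # hop 256 -> 85.9 frames/sec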
@@ -123,13 +129,11 @@ class AudioProcessor(object):
         return self._normalize(S)

     def _stft(self, y):
-        n_fft, hop_length, win_length = self._stft_parameters()
         return librosa.stft(
-            y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
+            y=y, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length)

     def _istft(self, y):
-        _, hop_length, win_length = self._stft_parameters()
-        return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length)
+        return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length)

     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
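Computing n_fft, hop_length, and win_length once in __init__ (see the earlier hunk) and reusing the cached attributes here keeps _stft and _istft consistent and avoids recomputing the parameters on every call. A minimal round-trip sketch with librosa, assuming the sizes pinned above:

    import librosa
    import numpy as np

    y = np.random.randn(22000).astype(np.float32)  # 1 second of noise
    D = librosa.stft(y=y, n_fft=2048, hop_length=256, win_length=1024)
    y_hat = librosa.istft(D, hop_length=256, win_length=1024)
    print(D.shape)      # (1025, n_frames)
    print(y_hat.shape)  # roughly the original length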