mirror of https://github.com/coqui-ai/TTS.git
Bug fix on the attention module and a new notebook to experiment with spectrogram reconstruction
commit 1320d5344a
parent 9a22f5d085
File diff suppressed because one or more lines are too long
config.json | 10
@@ -1,8 +1,8 @@
 {
     "num_mels": 80,
-    "num_freq": 1024,
+    "num_freq": 1025,
     "sample_rate": 20000,
-    "frame_length_ms": 50.0,
+    "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
     "min_level_db": -100,
@@ -12,11 +12,11 @@
     "text_cleaner": "english_cleaners",

     "epochs": 2000,
-    "lr": 0.001,
-    "lr_patience": 2,
+    "lr": 0.003,
+    "lr_patience": 5,
     "lr_decay": 0.5,
     "batch_size": 256,
-    "griffinf_lim_iters": 60,
+    "griffin_lim_iters": 60,
     "power": 1.5,
     "r": 5,
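Note on the num_freq change: 1025 is the natural value if the FFT size is derived from it with the (num_freq - 1) * 2 convention used in Keith Ito's tacotron, which this code follows — an assumption, since _stft_parameters() itself is not shown in this diff. A minimal sketch of the arithmetic:

num_freq = 1025
n_fft = (num_freq - 1) * 2           # 2048: a power of two, so the FFT is fast
assert n_fft // 2 + 1 == num_freq    # an STFT with n_fft=2048 yields 1025 bins
assert (1024 - 1) * 2 == 2046        # the old value implied a non-power-of-two FFT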
Binary file not shown.
@@ -20,6 +20,8 @@
     "power": 1.5,
     "r": 5,

+    "num_loader_workers": 16,
+
     "save_step": 1,
     "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
     "output_path": "result",
Binary file not shown.
Binary file not shown.
@@ -73,7 +73,8 @@ class AttentionWrapper(nn.Module):
             alignment.data.masked_fill_(mask, self.score_mask_value)

         # Normalize attention weight
-        alignment = F.softmax(alignment, dim=0)
+        alignment = F.softmax(alignment, dim=-1)  # TODO: might be buggy
+        print(alignment.size())

         # Attention context vector
         # (batch, 1, dim)
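Note: the dim argument is the substantive fix here. With an alignment of shape (batch, max_time), softmax over dim=0 normalizes each encoder step across the batch, while dim=-1 makes each example's weights a proper distribution over encoder steps. A toy check — the shape is an assumption inferred from the surrounding comments:

import torch
import torch.nn.functional as F

alignment = torch.randn(2, 4)         # assumed shape: (batch, max_time)
wrong = F.softmax(alignment, dim=0)   # columns sum to 1: mixes examples across the batch
fixed = F.softmax(alignment, dim=-1)  # rows sum to 1: weights over encoder steps
print(wrong.sum(dim=0))   # tensor([1., 1., 1., 1.])
print(fixed.sum(dim=-1))  # tensor([1., 1.])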
Binary file not shown.
@@ -2,7 +2,7 @@
 import torch
 from torch.autograd import Variable
 from torch import nn
-from utils.text.symbols import symbols
+from TTS.utils.text.symbols import symbols
 from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG


 class Tacotron(nn.Module):
File diff suppressed because one or more lines are too long
@@ -0,0 +1,51 @@
+import io
+import librosa
+import torch
+import numpy as np
+from TTS.utils.text import text_to_sequence
+from matplotlib import pylab as plt
+
+hop_length = 250
+
+
+def create_speech(m, s, CONFIG, use_cuda, ap):
+    text_cleaner = [CONFIG.text_cleaner]
+    seq = np.array(text_to_sequence(s, text_cleaner))
+
+    # mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32)
+
+    if use_cuda:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
+        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda()
+    else:
+        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0)
+        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)
+
+    mel_out, linear_out, alignments = m.forward(chars_var)
+    linear_out = linear_out[0].data.cpu().numpy()
+    alignment = alignments[0].cpu().data.numpy()
+    spec = ap._denormalize(linear_out)
+    wav = ap.inv_spectrogram(linear_out.T)
+    wav = wav[:ap.find_endpoint(wav)]
+    out = io.BytesIO()
+    ap.save_wav(wav, out)
+    return wav, alignment, spec
+
+
+def visualize(alignment, spectrogram, CONFIG):
+    label_fontsize = 16
+    plt.figure(figsize=(16, 16))
+
+    plt.subplot(2, 1, 1)
+    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
+    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
+    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    plt.colorbar()
+
+    plt.subplot(2, 1, 2)
+    librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
+                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    plt.xlabel("Time", fontsize=label_fontsize)
+    plt.ylabel("Hz", fontsize=label_fontsize)
+    plt.tight_layout()
+    plt.colorbar()
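A sketch of how the new notebook might drive these helpers. The model construction, checkpoint path and its "model" key, the AudioProcessor arguments, and the CONFIG stand-in are all illustrative assumptions, not part of this commit:

import torch
from types import SimpleNamespace
from TTS.models.tacotron import Tacotron
from TTS.utils.audio import AudioProcessor                  # module path assumed

CONFIG = SimpleNamespace(text_cleaner="english_cleaners",   # stand-in for the loaded config.json
                         sample_rate=20000, num_mels=80)

model = Tacotron(embedding_dim=256, linear_dim=1025, mel_dim=80, r=5)  # args assumed
checkpoint = torch.load("checkpoint.pth", map_location="cpu")          # path assumed
model.load_state_dict(checkpoint["model"])                             # key assumed
model.eval()

ap = AudioProcessor(sample_rate=CONFIG.sample_rate, num_mels=CONFIG.num_mels)  # args assumed
wav, alignment, spec = create_speech(model, "Hello world.", CONFIG, use_cuda=False, ap=ap)
visualize(alignment, spec, CONFIG)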
synthesis.py | 16
@@ -38,17 +38,11 @@ def main(args):

     # Sentences for generation
     sentences = [
-        "And it is worth mention in passing that, as an example of fine typography,",
-        # From July 8, 2017 New York Times:
-        'Scientists at the CERN laboratory say they have discovered a new particle.',
-        'There’s a way to measure the acute emotional intelligence that has never gone out of style.',
-        'President Trump met with other leaders at the Group of 20 conference.',
-        'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.',
-        # From Google's Tacotron example page:
-        'Generative adversarial network or variational auto-encoder.',
-        'The buses aren\'t the problem, they actually provide a solution.',
-        'Does the quick brown fox jump over the lazy dog?',
-        'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
+        "I try my best to translate text to speech. But I know I need more work.",
+        "The new Firefox, Fast for good.",
+        "Technology is continually providing us with new ways to create and publish stories.",
+        "For these stories to achieve their full impact, it requires tools.",
+        "I am an alien and I am here to destroy your world."
     ]

     # Synthesis and save to wav files
train.py | 41
@@ -111,6 +111,8 @@ def main(args):
     progbar = Progbar(len(dataset) / c.batch_size)

     for i, data in enumerate(dataloader):
+        start_time = time.time()
+
         text_input = data[0]
         magnitude_input = data[1]
         mel_input = data[2]
@@ -128,42 +130,40 @@ def main(args):

         if use_cuda:
             text_input_var = Variable(torch.from_numpy(text_input).type(
-                torch.cuda.LongTensor), requires_grad=False).cuda()
+                torch.cuda.LongTensor)).cuda()
             mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                torch.cuda.FloatTensor), requires_grad=False).cuda()
+                torch.cuda.FloatTensor)).cuda()
             mel_spec_var = Variable(torch.from_numpy(mel_input).type(
-                torch.cuda.FloatTensor), requires_grad=False).cuda()
+                torch.cuda.FloatTensor)).cuda()
             linear_spec_var = Variable(torch.from_numpy(magnitude_input)
-                .type(torch.cuda.FloatTensor), requires_grad=False).cuda()
+                .type(torch.cuda.FloatTensor)).cuda()

         else:
             text_input_var = Variable(torch.from_numpy(text_input).type(
-                torch.LongTensor), requires_grad=False)
+                torch.LongTensor))
             mel_input_var = Variable(torch.from_numpy(mel_input).type(
-                torch.FloatTensor), requires_grad=False)
+                torch.FloatTensor))
             mel_spec_var = Variable(torch.from_numpy(
-                mel_input).type(torch.FloatTensor), requires_grad=False)
+                mel_input).type(torch.FloatTensor))
             linear_spec_var = Variable(torch.from_numpy(
-                magnitude_input).type(torch.FloatTensor),
-                requires_grad=False)
+                magnitude_input).type(torch.FloatTensor))

         mel_output, linear_output, alignments =\
             model.forward(text_input_var, mel_input_var)

         mel_loss = criterion(mel_output, mel_spec_var)
-        linear_loss = torch.abs(linear_output - linear_spec_var)
-        linear_loss = 0.5 * \
-            torch.mean(linear_loss) + 0.5 * \
-            torch.mean(linear_loss[:, :n_priority_freq, :])
+        # linear_loss = torch.abs(linear_output - linear_spec_var)
+        # linear_loss = 0.5 * \
+        #     torch.mean(linear_loss) + 0.5 * \
+        #     torch.mean(linear_loss[:, :n_priority_freq, :])
+        linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
+            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
+                              linear_spec_var[:, :, :n_priority_freq])
         loss = mel_loss + linear_loss
-        loss = loss.cuda()
+        # loss = loss.cuda()

-        start_time = time.time()

         loss.backward()
-        nn.utils.clip_grad_norm(model.parameters(), 1.)
+        grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)

         optimizer.step()

         step_time = time.time() - start_time
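Note: the rewritten linear loss keeps Tacotron's trick of giving 50% weight to the low-frequency band (which carries most speech energy), but computes both terms through criterion so everything stays on one autograd graph; the new indexing [:, :, :n_priority_freq] also slices the last axis, matching this repo's (batch, frames, freq) layout, where the old commented version sliced the middle one. A standalone sketch — n_priority_freq is computed here the way the original Tacotron code does, which is an assumption about this repo:

import torch
from torch import nn

criterion = nn.L1Loss()
sample_rate, num_freq = 20000, 1025
n_priority_freq = int(3000 / (sample_rate * 0.5) * num_freq)  # bins below ~3 kHz (assumed heuristic)

linear_output = torch.randn(2, 100, num_freq)    # (batch, frames, freq bins)
linear_spec_var = torch.randn(2, 100, num_freq)

linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                      linear_spec_var[:, :, :n_priority_freq])
print(float(linear_loss))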
@@ -171,7 +171,8 @@ def main(args):

         progbar.update(i+1, values=[('total_loss', loss.data[0]),
                                     ('linear_loss', linear_loss.data[0]),
-                                    ('mel_loss', mel_loss.data[0])])
+                                    ('mel_loss', mel_loss.data[0]),
+                                    ('grad_norm', grad_norm)])

         tb.add_scalar('Train/TotalLoss', loss.data[0], current_step)
         tb.add_scalar('Train/LinearLoss', linear_loss.data[0],
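Logging grad_norm works because clip_grad_norm returns the total parameter-gradient norm computed before clipping (the function survives in later PyTorch as clip_grad_norm_). A minimal demonstration:

import torch
from torch import nn

model = nn.Linear(4, 2)
model(torch.randn(8, 4)).sum().backward()

# The return value is the pre-clipping total norm: a cheap training-health signal
# even on steps where no clipping actually occurs.
grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(float(grad_norm))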
@@ -81,10 +81,10 @@ class AudioProcessor(object):

     def inv_spectrogram(self, spectrogram):
         '''Converts spectrogram to waveform using librosa'''
-        S = _denormalize(spectrogram)
-        S = _db_to_amp(S + self.ref_level_db)  # Convert back to linear
+        S = self._denormalize(spectrogram)
+        S = self._db_to_amp(S + self.ref_level_db)  # Convert back to linear
         # Reconstruct phase
-        return inv_preemphasis(_griffin_lim(S ** self.power))
+        return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))


     def _griffin_lim(self, S):
@@ -93,18 +93,13 @@ class AudioProcessor(object):
         '''
         angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
         S_complex = np.abs(S).astype(np.complex)
-        y = _istft(S_complex * angles)
+        y = self._istft(S_complex * angles)
         for i in range(self.griffin_lim_iters):
-            angles = np.exp(1j * np.angle(_stft(y)))
-            y = _istft(S_complex * angles)
+            angles = np.exp(1j * np.angle(self._stft(y)))
+            y = self._istft(S_complex * angles)
         return y

-    def _istft(self, y):
-        _, hop_length, win_length = _stft_parameters()
-        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
-
-
     def melspectrogram(self, y):
         D = self._stft(self.apply_preemphasis(y))
         S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
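For reference, the Griffin-Lim loop above in self-contained form: start from random phase, then alternate between taking the phase of the STFT of the current estimate and re-imposing the known magnitudes. The window parameters below are illustrative, not the repo's:

import numpy as np
import librosa

def griffin_lim(S, n_iter=60, hop_length=250, win_length=1000):
    """Recover a waveform from a magnitude spectrogram S of shape
    (1 + n_fft/2, frames) by iterative phase estimation."""
    n_fft = (S.shape[0] - 1) * 2
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    y = librosa.istft(S * angles, hop_length=hop_length, win_length=win_length)
    for _ in range(n_iter):
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        angles = np.exp(1j * np.angle(D))   # keep estimated phase, discard magnitude
        y = librosa.istft(S * angles, hop_length=hop_length, win_length=win_length)
    return y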
@@ -115,11 +110,15 @@ class AudioProcessor(object):
         n_fft, hop_length, win_length = self._stft_parameters()
         return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

+    def _istft(self, y):
+        _, hop_length, win_length = self._stft_parameters()
+        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+
+
     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
         hop_length = int(window_length / 4)
-        threshold = _db_to_amp(threshold_db)
+        threshold = self._db_to_amp(threshold_db)
         for x in range(hop_length, len(wav) - window_length, hop_length):
             if np.max(wav[x:x + window_length]) < threshold:
                 return x + hop_length
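find_endpoint converts its dB threshold to linear amplitude once, then slides a window and returns the first position where the whole window stays below it. The conversion it relies on is the standard one; this matches the usual _db_to_amp definition, assumed here since its body is not in the diff:

import numpy as np

def db_to_amp(db):
    return np.power(10.0, db * 0.05)   # amp = 10 ** (db / 20)

print(db_to_amp(-40))   # 0.01: anything below 1% of full scale counts as silence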
@@ -3,7 +3,9 @@ import numpy as np

 def pad_data(x, length):
     _pad = 0
-    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
+    return np.pad(x, (0, length - x.shape[0]),
+                  mode='constant',
+                  constant_values=_pad)


 def prepare_data(inputs):
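The reflowed pad_data pads 1-D sequences on the right with zeros, which is presumably what lets prepare_data stack a ragged batch into one array. A small usage sketch:

import numpy as np

def pad_data(x, length):
    _pad = 0
    return np.pad(x, (0, length - x.shape[0]),
                  mode='constant',
                  constant_values=_pad)

# Pad a ragged batch of 1-D sequences to the longest length, then stack.
batch = [np.array([1, 2, 3]), np.array([4, 5])]
max_len = max(x.shape[0] for x in batch)
print(np.stack([pad_data(x, max_len) for x in batch]))
# [[1 2 3]
#  [4 5 0]]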