From 5e148038be5971f2c7c811d46a1d7b28c759ecda Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 9 Jan 2020 15:56:09 +0100
Subject: [PATCH 1/5] simpler gmm attention implementation

---
 config.json             |  2 +-
 layers/common_layers.py | 15 ++++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/config.json b/config.json
index 91863c4c..d23246a7 100644
--- a/config.json
+++ b/config.json
@@ -109,7 +109,7 @@ [
             {
                 "name": "ljspeech",
-                "path": "/data5/ro/shared/data/keithito/LJSpeech-1.1/",
+                "path": "/root/LJSpeech-1.1/",
                 // "path": "/home/erogol/Data/LJSpeech-1.1",
                 "meta_file_train": "metadata_train.csv",
                 "meta_file_val": "metadata_val.csv"
             }
diff --git a/layers/common_layers.py b/layers/common_layers.py
index 8b8ff073..112760b3 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -132,7 +132,7 @@ class GravesAttention(nn.Module):

     def init_states(self, inputs):
         if self.J is None or inputs.shape[1] > self.J.shape[-1]:
-            self.J = torch.arange(0, inputs.shape[1]).to(inputs.device).expand([inputs.shape[0], self.K, inputs.shape[1]])
+            self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5
         self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
         self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

@@ -164,17 +164,14 @@ class GravesAttention(nn.Module):
         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
         g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps

-        # each B x K x T_in
-        g_t = g_t.unsqueeze(2).expand(g_t.size(0),
-                                      g_t.size(1),
-                                      inputs.size(1))
-        sig_t = sig_t.unsqueeze(2).expand_as(g_t)
-        mu_t_ = mu_t.unsqueeze(2).expand_as(g_t)
-        j = self.J[:g_t.size(0), :, :inputs.size(1)]
+        j = self.J[:inputs.size(1)+1]

         # attention weights
-        phi_t = g_t * torch.exp(-0.5 * (mu_t_ - j)**2 / (sig_t**2))
+        phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2))
+
+        # discritize attention weights
         alpha_t = self.COEF * torch.sum(phi_t, 1)
+        alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]

         # apply masking
         if mask is not None:

From e5bf2719bdfd23be8c118276c1009853d1b146ca Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 14 Jan 2020 13:22:23 +0100
Subject: [PATCH 2/5] graves attention as in melnet paper

---
 layers/common_layers.py | 15 ++++++++-------
 utils/measures.py       | 11 +++++++++--
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/layers/common_layers.py b/layers/common_layers.py
index 112760b3..64a3d201 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -131,8 +131,8 @@ class GravesAttention(nn.Module):
         torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)

     def init_states(self, inputs):
-        if self.J is None or inputs.shape[1] > self.J.shape[-1]:
-            self.J = torch.arange(0, inputs.shape[1]+1).to(inputs.device) + 0.5
+        if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
+            self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5
         self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
         self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

@@ -160,24 +160,25 @@ class GravesAttention(nn.Module):

         # attention GMM parameters
         sig_t = torch.nn.functional.softplus(b_t) + self.eps
-
         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
         g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps
-
         j = self.J[:inputs.size(1)+1]

         # attention weights
-        phi_t = g_t.unsqueeze(-1) * torch.exp(-0.5 * (mu_t.unsqueeze(-1) - j)**2 / (sig_t.unsqueeze(-1)**2))
+        phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))

         # discritize attention weights
-        alpha_t = self.COEF * torch.sum(phi_t, 1)
+        alpha_t = torch.sum(phi_t, 1)
         alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
+        alpha_t[alpha_t == 0] = 1e-8

         # apply masking
         if mask is not None:
             alpha_t.data.masked_fill_(~mask, self._mask_value)

         context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
+        # for better visualization
+        # self.attention_weights = torch.clamp(alpha_t, min=0)
         self.attention_weights = alpha_t
         self.mu_prev = mu_t
         return context
@@ -350,7 +351,7 @@ class OriginalAttention(nn.Module):
         if self.forward_attn:
             alignment = self.apply_forward_attention(alignment)
             self.alpha = alignment
-
+
         context = torch.bmm(alignment.unsqueeze(1), inputs)
         context = context.squeeze(1)
         self.attention_weights = alignment
diff --git a/utils/measures.py b/utils/measures.py
index a76a2225..01d25695 100644
--- a/utils/measures.py
+++ b/utils/measures.py
@@ -1,11 +1,18 @@
+import torch

-def alignment_diagonal_score(alignments):
+
+def alignment_diagonal_score(alignments, binary=False):
     """
     Compute how diagonal alignment predictions are. It is useful
     to measure the alignment consistency of a model
     Args:
         alignments (torch.Tensor): batch of alignments.
+        binary (bool): if True, ignore scores and consider attention
+            as a binary mask.
     Shape:
         alignments : batch x decoder_steps x encoder_steps
     """
-    return alignments.max(dim=1)[0].mean(dim=1).mean(dim=0).item()
+    maxs = alignments.max(dim=1)[0]
+    if binary:
+        maxs[maxs > 0] = 1
+    return maxs.mean(dim=1).mean(dim=0).item()
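
Note on the measure changed in PATCH 2: alignment_diagonal_score averages the per-encoder-step maxima of the alignment matrix, and with binary=True every nonzero maximum is clamped to 1, so the result becomes the fraction of encoder steps that receive any attention at all. A minimal usage sketch, assuming it is run from the repository root; the alignment tensor below is random and purely illustrative:

    import torch
    from utils.measures import alignment_diagonal_score

    # batch x decoder_steps x encoder_steps; each decoder step attends over encoder steps
    alignments = torch.softmax(torch.randn(4, 50, 30), dim=2)

    soft_score = alignment_diagonal_score(alignments)               # mean of per-encoder-step maxima
    hard_score = alignment_diagonal_score(alignments, binary=True)  # fraction of encoder steps with nonzero attention
    print(soft_score, hard_score)  # hard_score is 1.0 here since softmax output is strictly positive
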
From 3d59e61c6025f077cb0bc9d44dc830f83656080b Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 15 Jan 2020 01:53:27 +0100
Subject: [PATCH 3/5] graves v2

---
 config.json             |  2 +-
 layers/common_layers.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config.json b/config.json
index d23246a7..115f4dc6 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
     "model": "Tacotron2",          // one of the model in models/
-    "run_name": "ljspeech-graves",
+    "run_name": "ljspeech-gravesv2",
     "run_description": "tacotron2 wuth graves attention",

     // AUDIO PARAMETERS
diff --git a/layers/common_layers.py b/layers/common_layers.py
index 64a3d201..1337977a 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -113,7 +113,7 @@ class GravesAttention(nn.Module):

     def __init__(self, query_dim, K):
         super(GravesAttention, self).__init__()
-        self._mask_value = 0.0
+        self._mask_value = 1e-8
         self.K = K
         # self.attention_alignment = 0.05
         self.eps = 1e-5
@@ -160,12 +160,14 @@ class GravesAttention(nn.Module):

         # attention GMM parameters
         sig_t = torch.nn.functional.softplus(b_t) + self.eps
+
         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
         g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps
+
         j = self.J[:inputs.size(1)+1]

         # attention weights
-        phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.exp((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
+        phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))

         # discritize attention weights
         alpha_t = torch.sum(phi_t, 1)
@@ -177,8 +179,6 @@ class GravesAttention(nn.Module):
             alpha_t.data.masked_fill_(~mask, self._mask_value)

         context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
-        # for better visualization
-        # self.attention_weights = torch.clamp(alpha_t, min=0)
         self.attention_weights = alpha_t
         self.mu_prev = mu_t
         return context
@@ -351,7 +351,7 @@ class OriginalAttention(nn.Module):
         if self.forward_attn:
             alignment = self.apply_forward_attention(alignment)
             self.alpha = alignment
-
+
         context = torch.bmm(alignment.unsqueeze(1), inputs)
         context = context.squeeze(1)
         self.attention_weights = alignment
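
The J buffer introduced in PATCH 1 holds the token boundaries 0.5, 1.5, 2.5, ..., and the PATCH 2 form of phi_t evaluates each mixture component's cumulative distribution at those boundaries: 1 / (1 + exp((mu - j) / sig)) is the logistic CDF, i.e. sigmoid((j - mu) / sig). Differencing adjacent entries, alpha_t[:, 1:] - alpha_t[:, :-1], then assigns every input token the probability mass falling inside its unit interval, so with normalized mixture weights the per-token weights are non-negative and sum to at most one. A standalone sketch of that discretization, with made-up shapes and plain softmax mixture weights (the division of g_t by sig_t still present above is removed in PATCH 4 below):

    import torch

    B, K, T = 2, 5, 11   # batch, mixture components, encoder steps (illustrative)
    g_t = torch.softmax(torch.randn(B, K), dim=-1)                  # mixture weights, sum to 1
    sig_t = torch.nn.functional.softplus(torch.randn(B, K)) + 1e-5  # scales
    mu_t = torch.nn.functional.softplus(torch.randn(B, K)) * 3      # means; the module accumulates these via mu_prev + softplus(k_t)

    j = torch.arange(0, T + 1).float() + 0.5   # token boundaries, as in self.J

    # logistic CDF of every component at every boundary: B x K x (T+1)
    cdf = torch.sigmoid((j - mu_t.unsqueeze(-1)) / sig_t.unsqueeze(-1))

    # mixture CDF per boundary, then per-token mass by differencing adjacent boundaries
    phi_t = (g_t.unsqueeze(-1) * cdf).sum(dim=1)   # B x (T+1)
    alpha_t = phi_t[:, 1:] - phi_t[:, :-1]         # B x T
    print(alpha_t.shape, alpha_t.sum(dim=1))       # non-negative rows summing to <= 1
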
From a391a7f859463744d7f67d42f2e475945cd91336 Mon Sep 17 00:00:00 2001
From: root
Date: Sat, 18 Jan 2020 00:33:51 +0100
Subject: [PATCH 4/5] stop dividing g_t with sig_t and commenting

---
 layers/common_layers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/layers/common_layers.py b/layers/common_layers.py
index 1337977a..fbedc2b9 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -127,8 +127,8 @@ class GravesAttention(nn.Module):
         self.init_layers()

     def init_layers(self):
-        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
-        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)
+        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)  # bias mean
+        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)  # bias std

     def init_states(self, inputs):
         if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
@@ -162,7 +162,7 @@
         sig_t = torch.nn.functional.softplus(b_t) + self.eps

         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
-        g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps
+        g_t = torch.softmax(g_t, dim=-1) + self.eps

         j = self.J[:inputs.size(1)+1]

From eb63c95d979a0156af95122479d92c2ebf3609e1 Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 27 Jan 2020 15:42:56 +0100
Subject: [PATCH 5/5] bug fixes

---
 utils/audio.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/audio.py b/utils/audio.py
index 05694dce..82e5aa47 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -66,12 +66,11 @@
         return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec))

     def _build_mel_basis(self, ):
-        n_fft = (self.num_freq - 1) * 2
         if self.mel_fmax is not None:
             assert self.mel_fmax <= self.sample_rate // 2
         return librosa.filters.mel(
             self.sample_rate,
-            n_fft,
+            self.n_fft,
             n_mels=self.num_mels,
             fmin=self.mel_fmin,
             fmax=self.mel_fmax)
@@ -197,6 +196,7 @@
             n_fft=self.n_fft,
             hop_length=self.hop_length,
             win_length=self.win_length,
+            pad_mode='constant'
         )

     def _istft(self, y):
@@ -217,7 +217,7 @@
         margin = int(self.sample_rate * 0.01)
         wav = wav[margin:-margin]
         return librosa.effects.trim(
-            wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
+            wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0]

     @staticmethod
     def mulaw_encode(wav, qc):
@@ -247,7 +247,7 @@
             print(f' [!] File cannot be trimmed for silence - {filename}')
         assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
         if self.sound_norm:
-            x = x / abs(x.max()) * 0.9
+            x = x / abs(x).max() * 0.9
         return x

     @staticmethod
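
One of the PATCH 5 fixes, the sound normalization change from abs(x.max()) to abs(x).max(), matters whenever the loudest sample of a signal is negative: the old expression scales by the magnitude of the largest signed value instead of the true peak. A small standalone check with illustrative samples:

    import numpy as np

    x = np.array([0.2, -0.8, 0.1], dtype=np.float32)   # loudest sample is negative

    old = x / abs(x.max()) * 0.9   # divides by 0.2 -> [0.9, -3.6, 0.45], far outside [-1, 1]
    new = x / abs(x).max() * 0.9   # divides by 0.8 -> [0.225, -0.9, 0.1125]

    print(old)
    print(new)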