From a12bf6c23e3f0c6f4bf4b50cc8c67172f6b2b86b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 29 Apr 2019 11:35:52 +0200 Subject: [PATCH 01/12] compute update --- .compute | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.compute b/.compute index 5ef7df1d..358722b6 100644 --- a/.compute +++ b/.compute @@ -1,12 +1,14 @@ #!/bin/bash -ls ${SHARED_DIR}/data/mozilla/Judy/ yes | apt-get install sox yes | apt-get install ffmpeg -soxi /data/ro/shared/data/mozilla/Judy/batch6/wavs_no_processing/6_126.wav -pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl yes | apt-get install espeak +yes | apt-get install tmux +yes | apt-get install zsh +pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl +wget https://www.dropbox.com/s/rowimjoosnmdgj4/best_model_4702.pth.tar?dl=0 -O best_model_4702.pth.tar +wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh +sudo sh install.sh python3 setup.py develop -# wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz -# tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR} -# python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ --restore_path ${USER_DIR}/best_model_4583.pth.tar -python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/ +# python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/Mozilla/ --restore_path ${USER_DIR}/best_model_4700.pth.tar +# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/ +while true; do sleep 1000000; done From 3ea34c6488058a232d0be76732595883c049afca Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 29 Apr 2019 11:36:40 +0200 Subject: [PATCH 02/12] config update. add location attn option --- config_cluster.json | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/config_cluster.json b/config_cluster.json index d8b066d7..e651fe25 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { - "run_name": "mozilla-nomask-fattn-bn", - "run_description": "Finetune 4702 orignal -> bn prenet - Mozilla with prenet bn, no mask, batch group size 0", + "run_name": "mozilla-fattn-agent-masking", + "run_description": "Original prenet,fattn and fattn agent with loss masking.", "audio":{ // Audio processing parameters @@ -40,10 +40,11 @@ "windowing": false, // Enables attention windowing. Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "bn", // ONLY TACOTRON2 - "original" or "bn". - "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "loss_masking": false, // enable / disable loss masking against the sequence padding. + "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". + "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "transition_agent": true, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
+ "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "batch_size": 24, // Batch size for training. Lower values than 32 might cause hard to learn attention. From e2439fde9abfc36b823f4d88b093180b20e7d42d Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 29 Apr 2019 11:37:01 +0200 Subject: [PATCH 03/12] make location attention optional and keep all attention weights in attention class --- layers/tacotron2.py | 95 ++++++++++++++++++++++++++---------------- models/tacotron2.py | 4 +- utils/generic_utils.py | 3 +- 3 files changed, 62 insertions(+), 40 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index df05e5ad..2fa6d06f 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -120,7 +120,7 @@ class LocationLayer(nn.Module): class Attention(nn.Module): - def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, + def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, location_attention, attention_location_n_filters, attention_location_kernel_size, windowing, norm, forward_attn, trans_agent): super(Attention, self).__init__() @@ -131,37 +131,64 @@ class Attention(nn.Module): self.v = Linear(attention_dim, 1, bias=True) if trans_agent: self.ta = nn.Linear(attention_dim + embedding_dim, 1, bias=True) - self.location_layer = LocationLayer(attention_location_n_filters, - attention_location_kernel_size, - attention_dim) + if location_attention: + self.location_layer = LocationLayer(attention_location_n_filters, + attention_location_kernel_size, + attention_dim) self._mask_value = -float("inf") self.windowing = windowing self.win_idx = None self.norm = norm self.forward_attn = forward_attn self.trans_agent = trans_agent + self.location_attention = location_attention def init_win_idx(self): self.win_idx = -1 self.win_back = 2 self.win_front = 6 - def init_forward_attn_state(self, inputs): - """ - Init forward attention states - """ + def init_forward_attn(self, inputs): B = inputs.shape[0] T = inputs.shape[1] self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1] + 1e-7 ], dim=1).to(inputs.device) self.u = (0.5 * torch.ones([B, 1])).to(inputs.device) - def get_attention(self, query, processed_inputs, attention_cat): + def init_location_attention(self, inputs): + B = inputs.shape[0] + T = inputs.shape[1] + self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_()) + + def init_states(self, inputs): + B = inputs.shape[0] + T = inputs.shape[1] + self.attention_weights = Variable(inputs.data.new(B, T).zero_()) + if self.location_attention: + self.init_location_attention(inputs) + if self.forward_attn: + self.init_forward_attn(inputs) + if self.windowing: + self.init_win_idx() + + def update_location_attention(self, alignments): + self.attention_weights_cum += alignments + + def get_location_attention(self, query, processed_inputs): + attention_cat = torch.cat((self.attention_weights.unsqueeze(1), + self.attention_weights_cum.unsqueeze(1)), + dim=1) processed_query = self.query_layer(query.unsqueeze(1)) processed_attention_weights = self.location_layer(attention_cat) energies = self.v( torch.tanh(processed_query + processed_attention_weights + - processed_inputs)) + processed_inputs)) + energies = energies.squeeze(-1) + return energies, processed_query + def 
get_attention(self, query, processed_inputs): + processed_query = self.query_layer(query.unsqueeze(1)) + energies = self.v( + torch.tanh(processed_query +processed_inputs)) energies = energies.squeeze(-1) return energies, processed_query @@ -192,13 +219,16 @@ class Attention(nn.Module): if self.trans_agent: ta_input = torch.cat([context, processed_query.squeeze(1)], dim=-1) self.u = torch.sigmoid(self.ta(ta_input)) - return context, self.alpha, alignment + return context, self.alpha def forward(self, attention_hidden_state, inputs, processed_inputs, - attention_cat, mask): - attention, processed_query = self.get_attention( - attention_hidden_state, processed_inputs, attention_cat) - + mask): + if self.location_attention: + attention, processed_query = self.get_location_attention( + attention_hidden_state, processed_inputs) + else: + attention, processed_query = self.get_attention( + attention_hidden_state, processed_inputs) # apply masking if mask is not None: attention.data.masked_fill_(1 - mask, self._mask_value) @@ -213,13 +243,15 @@ class Attention(nn.Module): attention).sum(dim=1).unsqueeze(1) else: raise RuntimeError("Unknown value for attention norm type") + if self.location_attention: + self.update_location_attention(alignment) # apply forward attention if enabled if self.forward_attn: - return self.apply_forward_attention(inputs, alignment, processed_query) + context, self.attention_weights = self.apply_forward_attention(inputs, alignment, processed_query) else: context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) - return context, alignment, alignment + return context class Postnet(nn.Module): @@ -289,7 +321,7 @@ class Encoder(nn.Module): # adapted from https://github.com/NVIDIA/tacotron2/ class Decoder(nn.Module): - def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent): + def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent, location_attn): super(Decoder, self).__init__() self.mel_channels = inputs_dim self.r = r @@ -308,8 +340,8 @@ class Decoder(nn.Module): self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, self.attention_rnn_dim) - self.attention_layer = Attention(self.attention_rnn_dim, in_features, - 128, 32, 31, attn_win, attn_norm, forward_attn, trans_agent) + self.attention_layer = Attention(self.attention_rnn_dim, in_features, 128, location_attn, + 32, 31, attn_win, attn_norm, forward_attn, trans_agent) self.decoder_rnn = nn.LSTMCell(self.attention_rnn_dim + in_features, self.decoder_rnn_dim, 1) @@ -351,9 +383,6 @@ class Decoder(nn.Module): self.context = Variable( inputs.data.new(B, self.encoder_embedding_dim).zero_()) - - self.attention_weights = Variable(inputs.data.new(B, T).zero_()) - self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_()) self.inputs = inputs self.processed_inputs = self.attention_layer.inputs_layer(inputs) @@ -384,14 +413,10 @@ class Decoder(nn.Module): self.attention_cell = F.dropout( self.attention_cell, self.p_attention_dropout, self.training) - attention_cat = torch.cat((self.attention_weights.unsqueeze(1), - self.attention_weights_cum.unsqueeze(1)), - dim=1) - self.context, self.attention_weights, alignments = self.attention_layer( + self.context = self.attention_layer( self.attention_hidden, self.inputs, self.processed_inputs, - attention_cat, self.mask) + self.mask) - self.attention_weights_cum += alignments memory = torch.cat( (self.attention_hidden, self.context), -1) 
self.decoder_hidden, self.decoder_cell = self.decoder_rnn( @@ -410,7 +435,7 @@ class Decoder(nn.Module): stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1) gate_prediction = self.stopnet(stopnet_input) - return decoder_output, gate_prediction, self.attention_weights + return decoder_output, gate_prediction, self.attention_layer.attention_weights def forward(self, inputs, memories, mask): memory = self.get_go_frame(inputs).unsqueeze(0) @@ -419,8 +444,7 @@ class Decoder(nn.Module): memories = self.prenet(memories) self._init_states(inputs, mask=mask) - if self.attention_layer.forward_attn: - self.attention_layer.init_forward_attn_state(inputs) + self.attention_layer.init_states(inputs) outputs, stop_tokens, alignments = [], [], [] while len(outputs) < memories.size(0) - 1: @@ -441,8 +465,7 @@ class Decoder(nn.Module): self._init_states(inputs, mask=None) self.attention_layer.init_win_idx() - if self.attention_layer.forward_attn: - self.attention_layer.init_forward_attn_state(inputs) + self.attention_layer.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [False, False, False] @@ -484,9 +507,7 @@ class Decoder(nn.Module): else: self._init_states(inputs, mask=None, keep_states=True) - self.attention_layer.init_win_idx() - if self.attention_layer.forward_attn: - self.attention_layer.init_forward_attn_state(inputs) + self.attention_layer.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [False, False, False] stop_count = 0 diff --git a/models/tacotron2.py b/models/tacotron2.py index 2e7c857b..c492c7b1 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -9,7 +9,7 @@ from utils.generic_utils import sequence_mask # TODO: match function arguments with tacotron class Tacotron2(nn.Module): - def __init__(self, num_chars, r, attn_win=False, attn_norm="softmax", prenet_type="original", forward_attn=False, trans_agent=False): + def __init__(self, num_chars, r, attn_win=False, attn_norm="softmax", prenet_type="original", forward_attn=False, trans_agent=False, location_attn=True): super(Tacotron2, self).__init__() self.n_mel_channels = 80 self.n_frames_per_step = r @@ -18,7 +18,7 @@ class Tacotron2(nn.Module): val = sqrt(3.0) * std # uniform bounds for std self.embedding.weight.data.uniform_(-val, val) self.encoder = Encoder(512) - self.decoder = Decoder(512, self.n_mel_channels, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent) + self.decoder = Decoder(512, self.n_mel_channels, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent, location_attn) self.postnet = Postnet(self.n_mel_channels) def shape_outputs(self, mel_outputs, mel_outputs_postnet, alignments): diff --git a/utils/generic_utils.py b/utils/generic_utils.py index f22c4a3a..19d93888 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -263,5 +263,6 @@ def setup_model(num_chars, c): attn_norm=c.attention_norm, prenet_type=c.prenet_type, forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent) + trans_agent=c.transition_agent, + location_attn=c.location_attn) return model \ No newline at end of file From 8f6721a1fff5d9832f3c3d51b744796a9b883517 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 30 Apr 2019 10:47:48 +0200 Subject: [PATCH 04/12] config update --- config_cluster.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config_cluster.json b/config_cluster.json index e651fe25..48167649 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { 
"run_name": "mozilla-fattn-agent-masking", - "run_description": "Original prenet,fattn and fattn agent with loss masking.", + "run_description": "finetune 4706 with prenet BN. Loss masking, fattn.", "audio":{ // Audio processing parameters @@ -42,8 +42,8 @@ "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "transition_agent": true, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. + "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. @@ -66,7 +66,7 @@ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
From afb5a17221a2cfc48a808f1dee8c5612320cdf93 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 30 Apr 2019 10:59:29 +0200 Subject: [PATCH 05/12] bug fix --- layers/tacotron2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 2fa6d06f..0826ccc6 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -251,6 +251,7 @@ class Attention(nn.Module): else: context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) + self.attention_weights = alignment return context From e5b5ca8dff8b74c59a4ea03698f1d3a9f2ec6dd9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 6 May 2019 16:14:02 +0200 Subject: [PATCH 06/12] config update --- .compute | 6 +++--- config_cluster.json | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.compute b/.compute index 358722b6..fe1c69c7 100644 --- a/.compute +++ b/.compute @@ -5,10 +5,10 @@ yes | apt-get install espeak yes | apt-get install tmux yes | apt-get install zsh pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl -wget https://www.dropbox.com/s/rowimjoosnmdgj4/best_model_4702.pth.tar?dl=0 -O best_model_4702.pth.tar +# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop -# python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/Mozilla/ --restore_path ${USER_DIR}/best_model_4700.pth.tar +python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaAll/Mozilla/ --restore_path ${USER_DIR}/checkpoint_123000_4761.pth.tar # python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/ -while true; do sleep 1000000; done +# while true; do sleep 1000000; done diff --git a/config_cluster.json b/config_cluster.json index 48167649..9b68e40c 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { - "run_name": "mozilla-fattn-agent-masking", - "run_description": "finetune 4706 with prenet BN. Loss masking, fattn.", + "run_name": "mozilla-fattn-stopnet", + "run_description": "Finetune 4761 with BN", "audio":{ // Audio processing parameters @@ -40,14 +40,14 @@ "windowing": false, // Enables attention windowing. Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". + "prenet_type": "bn", // ONLY TACOTRON2 - "original" or "bn". "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "loss_masking": true, // enable / disable loss masking against the sequence padding. + "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "loss_masking": false, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. 
- "batch_size": 24, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. "eval_batch_size":16, "r": 1, // Number of frames to predict for step. "wd": 0.000001, // Weight decay weight. @@ -60,12 +60,12 @@ "run_eval": true, "test_delay_epochs": 1, //Until attention is aligned, testing only wastes computation time. "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument - "meta_file_train": "metadata.txt", // DATASET-RELATED: metafile for training dataloader. + "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader. "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader. "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. + "output_path": "/media/erogol/data_ssd/Data/models/mozilla_models/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. From fe14947b0eefc6a660cf080591861aebc7ca93f6 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 6 May 2019 16:43:32 +0200 Subject: [PATCH 07/12] config update --- config_cluster.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_cluster.json b/config_cluster.json index 9b68e40c..7b0f19ec 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,5 +1,5 @@ { - "run_name": "mozilla-fattn-stopnet", + "run_name": "mozilla-fattn-no_loc", "run_description": "Finetune 4761 with BN", "audio":{ From 820d18c9226ca451ceb79306a839d2fba4174658 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 12 May 2019 17:34:57 +0200 Subject: [PATCH 08/12] make dropout at prenet optional --- .compute | 2 +- config_cluster.json | 7 ++++--- layers/tacotron2.py | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.compute b/.compute index fe1c69c7..d2d37fc1 100644 --- a/.compute +++ b/.compute @@ -9,6 +9,6 @@ pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop -python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaAll/Mozilla/ --restore_path ${USER_DIR}/checkpoint_123000_4761.pth.tar +python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaAll2/Mozilla/ --restore_path ${USER_DIR}/checkpoint_123000_4761.pth.tar # python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/ # while true; do sleep 1000000; done diff --git a/config_cluster.json b/config_cluster.json index 7b0f19ec..11c2415f 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { - "run_name": "mozilla-fattn-no_loc", - "run_description": "Finetune 4761 with BN", + 
"run_name": "mozilla-fattn", + "run_description": "Finetune 4761 with BN + Dropout. It is to compare to 4780 and see how dropout behaves with BN.", "audio":{ // Audio processing parameters @@ -41,6 +41,7 @@ "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "bn", // ONLY TACOTRON2 - "original" or "bn". + "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. @@ -65,7 +66,7 @@ "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "/media/erogol/data_ssd/Data/models/mozilla_models/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. 
diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 0826ccc6..175a8ef3 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -53,9 +53,10 @@ class LinearBN(nn.Module): class Prenet(nn.Module): - def __init__(self, in_features, prenet_type, out_features=[256, 256]): + def __init__(self, in_features, prenet_type, prenet_dropout, out_features=[256, 256]): super(Prenet, self).__init__() self.prenet_type = prenet_type + self.prenet_dropout = prenet_dropout in_features = [in_features] + out_features[:-1] if prenet_type == "bn": self.layers = nn.ModuleList([ @@ -70,9 +71,9 @@ class Prenet(nn.Module): def forward(self, x): for linear in self.layers: - if self.prenet_type == "original": + if self.prenet_dropout: x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training) - elif self.prenet_type == "bn": + else: x = F.relu(linear(x)) return x From 6331bccefc2094ab1f4d2d4056671a88b18b637d Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 12 May 2019 17:35:31 +0200 Subject: [PATCH 09/12] make dropout optional #2 --- layers/tacotron2.py | 6 +++--- models/tacotron2.py | 4 ++-- utils/generic_utils.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 175a8ef3..c0aeafda 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -323,7 +323,7 @@ class Encoder(nn.Module): # adapted from https://github.com/NVIDIA/tacotron2/ class Decoder(nn.Module): - def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent, location_attn): + def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, location_attn): super(Decoder, self).__init__() self.mel_channels = inputs_dim self.r = r @@ -336,7 +336,7 @@ class Decoder(nn.Module): self.p_attention_dropout = 0.1 self.p_decoder_dropout = 0.1 - self.prenet = Prenet(self.mel_channels * r, prenet_type, + self.prenet = Prenet(self.mel_channels * r, prenet_type, prenet_dropout, [self.prenet_dim, self.prenet_dim]) self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, @@ -485,7 +485,7 @@ class Decoder(nn.Module): stop_flags[2] = t > inputs.shape[1] * 2 if all(stop_flags): stop_count += 1 - if stop_count > 2: + if stop_count > 5: break elif len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") diff --git a/models/tacotron2.py b/models/tacotron2.py index c492c7b1..e9ce1a1b 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -9,7 +9,7 @@ from utils.generic_utils import sequence_mask # TODO: match function arguments with tacotron class Tacotron2(nn.Module): - def __init__(self, num_chars, r, attn_win=False, attn_norm="softmax", prenet_type="original", forward_attn=False, trans_agent=False, location_attn=True): + def __init__(self, num_chars, r, attn_win=False, attn_norm="softmax", prenet_type="original", prenet_dropout=True, forward_attn=False, trans_agent=False, location_attn=True): super(Tacotron2, self).__init__() self.n_mel_channels = 80 self.n_frames_per_step = r @@ -18,7 +18,7 @@ class Tacotron2(nn.Module): val = sqrt(3.0) * std # uniform bounds for std self.embedding.weight.data.uniform_(-val, val) self.encoder = Encoder(512) - self.decoder = Decoder(512, self.n_mel_channels, r, attn_win, attn_norm, prenet_type, forward_attn, trans_agent, location_attn) + self.decoder = Decoder(512, self.n_mel_channels, r, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, location_attn) self.postnet =
Postnet(self.n_mel_channels) def shape_outputs(self, mel_outputs, mel_outputs_postnet, alignments): diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 19d93888..902affba 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -262,6 +262,7 @@ def setup_model(num_chars, c): attn_win=c.windowing, attn_norm=c.attention_norm, prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, forward_attn=c.use_forward_attn, trans_agent=c.transition_agent, location_attn=c.location_attn) From 5e679f746db2d9b49bdae43f90b0fa8b260f488d Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 12 May 2019 17:35:44 +0200 Subject: [PATCH 10/12] save figures in visualize if set --- utils/visual.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/utils/visual.py b/utils/visual.py index b259bdd9..2924b034 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -30,14 +30,14 @@ def plot_spectrogram(linear_output, audio): return fig -def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None): +def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None): if spectrogram is not None: num_plot = 4 else: num_plot = 3 label_fontsize = 16 - plt.figure(figsize=(8, 24)) + fig = plt.figure(figsize=(8, 24)) plt.subplot(num_plot, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) @@ -69,3 +69,7 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() + + if output_path: + print(output_path) + fig.savefig(output_path) From 2b60f9a73124c4342b7c4925580762ae9faeaf52 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 12 May 2019 17:39:12 +0200 Subject: [PATCH 11/12] Fix trans agent implementation in relation to the paper.
Use query vector instead of processed_query --- layers/tacotron2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index c0aeafda..1833b0eb 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -131,7 +131,7 @@ class Attention(nn.Module): embedding_dim, attention_dim, bias=False, init_gain='tanh') self.v = Linear(attention_dim, 1, bias=True) if trans_agent: - self.ta = nn.Linear(attention_dim + embedding_dim, 1, bias=True) + self.ta = nn.Linear(attention_rnn_dim + embedding_dim, 1, bias=True) if location_attention: self.location_layer = LocationLayer(attention_location_n_filters, attention_location_kernel_size, attention_dim) @@ -208,7 +208,7 @@ class Attention(nn.Module): self.win_idx = torch.argmax(attention, 1).long()[0].item() return attention - def apply_forward_attention(self, inputs, alignment, processed_query): + def apply_forward_attention(self, inputs, alignment, query): # forward attention prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device) alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-8) * alignment @@ -218,7 +218,7 @@ class Attention(nn.Module): context = context.squeeze(1) # compute transition agent if self.trans_agent: - ta_input = torch.cat([context, processed_query.squeeze(1)], dim=-1) + ta_input = torch.cat([context, query.squeeze(1)], dim=-1) self.u = torch.sigmoid(self.ta(ta_input)) return context, self.alpha @@ -248,7 +248,7 @@ class Attention(nn.Module): self.update_location_attention(alignment) # apply forward attention if enabled if self.forward_attn: - context, self.attention_weights = self.apply_forward_attention(inputs, alignment, processed_query) + context, self.attention_weights = self.apply_forward_attention(inputs, alignment, attention_hidden_state) else: context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) From bb2b705e01b0eeac80cfdeeb751864629dbd5671 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 14 May 2019 13:53:26 +0200 Subject: [PATCH 12/12] small bug fixes --- layers/tacotron2.py | 1 + utils/visual.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 1833b0eb..6bb08ce1 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -509,6 +509,7 @@ class Decoder(nn.Module): else: self._init_states(inputs, mask=None, keep_states=True) + self.attention_layer.init_win_idx() self.attention_layer.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [False, False, False] stop_count = 0 diff --git a/utils/visual.py b/utils/visual.py index 2924b034..9fd7a790 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -46,6 +46,7 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON if CONFIG.use_phonemes: seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars) text = sequence_to_phoneme(seq) + print(text) plt.yticks(range(len(text)), list(text)) plt.colorbar() @@ -73,3 +74,4 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON if output_path: print(output_path) fig.savefig(output_path) + plt.close()
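For reference, PATCH 11 matters because the transition agent's linear layer is now sized attention_rnn_dim + embedding_dim, so it must be fed the raw attention-RNN hidden state rather than the 128-d processed query. Below is a minimal sketch of one forward-attention step with the transition agent, mirroring apply_forward_attention after the fix; the widths used (1024-d attention-RNN state, 512-d encoder outputs, T=6 encoder steps) are illustrative assumptions.

# Illustrative sketch, not part of the patches: one forward-attention step with the
# transition agent as of PATCH 11. Dimensions are assumptions, not taken from the repo.
import torch
import torch.nn.functional as F

B, T = 2, 6
attention_rnn_dim, embedding_dim = 1024, 512
ta = torch.nn.Linear(attention_rnn_dim + embedding_dim, 1)   # sized for the raw query

inputs = torch.randn(B, T, embedding_dim)                    # encoder outputs
alignment = torch.softmax(torch.randn(B, T), dim=1)          # normalized energies for this step
alpha = F.pad(torch.ones(B, 1), (0, T - 1), value=1e-7)      # previous forward weights (init state)
u = 0.5 * torch.ones(B, 1)                                   # previous transition probability
query = torch.randn(B, attention_rnn_dim)                    # attention-RNN hidden state

# forward attention: either stay on the current encoder step or move one step ahead
prev_alpha = F.pad(alpha[:, :-1], (1, 0))
alpha = ((1 - u) * alpha + u * prev_alpha + 1e-8) * alignment
alpha = alpha / alpha.sum(dim=1, keepdim=True)

context = torch.bmm(alpha.unsqueeze(1), inputs).squeeze(1)   # [B, embedding_dim]
# transition agent input is [context ; query], not the 128-d processed query
u = torch.sigmoid(ta(torch.cat([context, query], dim=-1)))
print(alpha.shape, context.shape, u.shape)                   # (B, T), (B, 512), (B, 1)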