diff --git a/config.json b/config.json
index 01860746..ebb0187d 100644
--- a/config.json
+++ b/config.json
@@ -12,7 +12,7 @@
     "text_cleaner": "english_cleaners",
     "epochs": 2000,
-    "lr": 0.003,
+    "lr": 0.001,
     "batch_size": 180,
     "r": 5,
diff --git a/layers/tacotron.py b/layers/tacotron.py
index b977c51e..c43d3dd3 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -307,9 +307,13 @@ class Decoder(nn.Module):
             else:
                 # combine prev. model output and prev. real target
                 memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
+                memory_input = torch.nn.functional.dropout(memory_input,
+                                                           0.1,
+                                                           training=True)
                 # add a random noise
-                memory_input += torch.autograd.Variable(
-                    torch.randn(memory_input.size())).type_as(memory_input)
+                noise = torch.autograd.Variable(
+                    memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
+                memory_input = memory_input + noise
             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -360,5 +364,5 @@ class Decoder(nn.Module):
         return outputs, alignments


-def is_end_of_frames(output, eps=0.1): #0.2
+def is_end_of_frames(output, eps=0.2): #0.2
     return (output.data <= eps).all()
diff --git a/train.py b/train.py
index 0d432cce..99b47a9b 100644
--- a/train.py
+++ b/train.py
@@ -90,9 +90,6 @@ def main(args):
     # onnx.export(model, dummy_input, model_proto_path, verbose=True)
     # tb.add_graph_onnx(model_proto_path)

-    if use_cuda:
-        model = nn.DataParallel(model.cuda())
-
     optimizer = optim.Adam(model.parameters(), lr=c.lr)

     if args.restore_step:
@@ -103,10 +100,20 @@
         print("\n > Model restored from step %d\n" % args.restore_step)
         start_epoch = checkpoint['step'] // len(dataloader)
         best_loss = checkpoint['linear_loss']
-    else:
+    elif args.restore_path:
+        checkpoint = torch.load(args.restore_path)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        print("\n > Model restored from step %d\n" % checkpoint['step'])
+        start_epoch = checkpoint['step'] // len(dataloader)
+        best_loss = checkpoint['linear_loss']
+    else:
         start_epoch = 0
         print("\n > Starting a new training")

+    if use_cuda:
+        model = nn.DataParallel(model.cuda())
+
     num_params = count_parameters(model)
     print(" | > Model has {} parameters".format(num_params))
@@ -142,9 +149,9 @@
            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
-           current_lr = lr_decay(c.lr, current_step)
-           for params_group in optimizer.param_groups:
-               params_group['lr'] = current_lr
+           # current_lr = lr_decay(c.lr, current_step)
+           # for params_group in optimizer.param_groups:
+           #     params_group['lr'] = current_lr

            optimizer.zero_grad()
@@ -192,7 +199,7 @@
            # loss = loss.cuda()
            loss.backward()
-           grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
+           grad_norm = nn.utils.clip_grad_norm(model.parameters(), 0.5)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index ca32060c..0877056b 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -7,6 +7,7 @@
 import datetime
 import json
 import torch
 import numpy as np
+from collections import OrderedDict

 class AttrDict(dict):
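
Note on the layers/tacotron.py change: the decoder now regularizes the frame it feeds back into the prenet by averaging the previous prediction with the previous ground-truth frame, forcing dropout on, and adding unit-variance Gaussian noise. Below is a minimal standalone sketch of that idea in current PyTorch idioms; the old Variable API is dropped, and the function name and tensor shapes are illustrative rather than part of the repository.

    import torch
    import torch.nn.functional as F

    def regularize_decoder_input(prev_output, prev_target, dropout_p=0.1, noise_std=1.0):
        # Average the previous model output with the previous ground-truth frame.
        memory_input = (prev_output + prev_target) / 2.0
        # training=True mirrors the diff: the dropout mask is applied even when
        # the enclosing module is in eval mode.
        memory_input = F.dropout(memory_input, p=dropout_p, training=True)
        # Gaussian noise created with randn_like lands on the same device/dtype
        # as the input, matching what memory_input.data.new(...) achieved.
        noise = torch.randn_like(memory_input) * noise_std
        return memory_input + noise

    # Example: a batch of 32 previous frames with 80 mel channels.
    frame = regularize_decoder_input(torch.zeros(32, 80), torch.zeros(32, 80))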
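Note on the train.py change: the nn.DataParallel wrapping now happens after the checkpoint is loaded. The diff does not state why, but a plausible reading is that the wrapper prefixes every state-dict key with "module.", so a checkpoint saved from an unwrapped model must be restored before wrapping. A rough sketch of the resulting pattern; the checkpoint keys ('model', 'optimizer', 'step') come from the diff, while the helper itself and its signature are hypothetical.

    import torch
    import torch.nn as nn
    import torch.optim as optim

    def restore_then_parallelize(model, lr, restore_path=None, use_cuda=True):
        optimizer = optim.Adam(model.parameters(), lr=lr)
        start_step = 0
        if restore_path is not None:
            checkpoint = torch.load(restore_path, map_location="cpu")
            # Load into the bare model: an nn.DataParallel wrapper would rename
            # the keys with a "module." prefix and they would no longer match.
            model.load_state_dict(checkpoint["model"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            start_step = checkpoint["step"]
            print(" > Model restored from step %d" % start_step)
        else:
            print(" > Starting a new training")
        if use_cuda and torch.cuda.is_available():
            model = nn.DataParallel(model.cuda())
        return model, optimizer, start_step

Creating the optimizer before the .cuda() call follows the ordering in the diff; it works because Module.cuda() moves parameters in place rather than replacing the Parameter objects the optimizer holds.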