diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py
index 182e58fb..909f4299 100644
--- a/TTS/bin/train_vocoder_gan.py
+++ b/TTS/bin/train_vocoder_gan.py
@@ -42,6 +42,7 @@ def setup_loader(ap, is_val=False, verbose=False):
                              hop_len=ap.hop_length,
                              pad_short=c.pad_short,
                              conv_pad=c.conv_pad,
+                             return_pairs=c.diff_samples_for_G_and_D if 'diff_samples_for_G_and_D' in c else False,
                              is_training=not is_val,
                              return_segments=not is_val,
                              use_noise_augment=c.use_noise_augment,
@@ -62,25 +63,19 @@ def setup_loader(ap, is_val=False, verbose=False):
 
 def format_data(data):
     if isinstance(data[0], list):
-        # setup input data
-        c_G, x_G = data[0]
-        c_D, x_D = data[1]
-
-        # dispatch data to GPU
+        x_G, y_G = data[0]
+        x_D, y_D = data[1]
         if use_cuda:
-            c_G = c_G.cuda(non_blocking=True)
             x_G = x_G.cuda(non_blocking=True)
-            c_D = c_D.cuda(non_blocking=True)
+            y_G = y_G.cuda(non_blocking=True)
             x_D = x_D.cuda(non_blocking=True)
-
-        return c_G, x_G, c_D, x_D
-
-    # return a whole audio segment
-    co, x = data
+            y_D = y_D.cuda(non_blocking=True)
+        return x_G, y_G, x_D, y_D
+    x, y = data
     if use_cuda:
-        co = co.cuda(non_blocking=True)
         x = x.cuda(non_blocking=True)
-    return co, x, None, None
+        y = y.cuda(non_blocking=True)
+    return x, y, None, None
 
 
 def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
@@ -143,13 +138,20 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
                 if D_out_real is None:
                     feats_real = None
                 else:
+                    # we don't need scores for real samples for training G since they are always 1
                     _, feats_real = D_out_real
             else:
                 scores_fake = D_out_fake
 
         # compute losses
-        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
-                                  feats_real, y_hat_sub, y_G_sub)
+        loss_G_dict = criterion_G(y_hat=y_hat,
+                                  y=y_G,
+                                  scores_fake=scores_fake,
+                                  feats_fake=feats_fake,
+                                  feats_real=feats_real,
+                                  y_hat_sub=y_hat_sub,
+                                  y_sub=y_G_sub)
+
         loss_G = loss_G_dict['G_loss']
 
         # optimizer generator
@@ -174,16 +176,22 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
         ##############################
         if global_step >= c.steps_to_start_discriminator:
             # discriminator pass
-            with torch.no_grad():
-                y_hat = model_G(c_D)
+            if c.diff_samples_for_G_and_D:
+                # use a different sample than generator
+                with torch.no_grad():
+                    y_hat = model_G(c_D)
 
-            # PQMF formatting
-            if y_hat.shape[1] > 1:
-                y_hat = model_G.pqmf_synthesis(y_hat)
+                # PQMF formatting
+                if y_hat.shape[1] > 1:
+                    y_hat = model_G.pqmf_synthesis(y_hat)
+            else:
+                # use the same samples as generator
+                c_D = c_G.clone()
+                y_D = y_G.clone()
 
             # run D with or without cond. features
             if len(signature(model_D.forward).parameters) == 2:
-                D_out_fake = model_D(y_hat.detach(), c_D)
+                D_out_fake = model_D(y_hat.detach().clone(), c_D)
                 D_out_real = model_D(y_D, c_D)
             else:
                 D_out_fake = model_D(y_hat.detach())
@@ -191,12 +199,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
 
             # format D outputs
             if isinstance(D_out_fake, tuple):
+                # model_D returns scores and features
                 scores_fake, feats_fake = D_out_fake
                 if D_out_real is None:
                     scores_real, feats_real = None, None
                 else:
                     scores_real, feats_real = D_out_real
             else:
+                # model_D returns only scores
                 scores_fake = D_out_fake
                 scores_real = D_out_real
 
@@ -283,6 +293,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
                                       {'train/audio': sample_voice},
                                       c.audio["sample_rate"])
         end_time = time.time()
+        torch.cuda.empty_cache()
 
     # print epoch stats
     c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
@@ -422,6 +433,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
             if c.print_eval:
                 c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
 
+    torch.cuda.empty_cache()
+
+
     if args.rank == 0:
         # compute spectrograms
         figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
diff --git a/TTS/vocoder/configs/modified_hifigan.json b/TTS/vocoder/configs/modified_hifigan.json
index e945635a..5851d2b4 100644
--- a/TTS/vocoder/configs/modified_hifigan.json
+++ b/TTS/vocoder/configs/modified_hifigan.json
@@ -15,6 +15,7 @@
         "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
         "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
         "log_func": "np.log",
+        "do_sound_norm": true,
 
         // Silence trimming
         "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
@@ -89,6 +90,7 @@
     //     "downsample_factors":[4, 4, 4]
     // },
     "steps_to_start_discriminator": 0,      // steps required to start GAN trainining.1
+    "diff_samples_for_G_and_D": false,      // draw a new sample from the dataset for the D pass.
 
     // GENERATOR
     "generator_model": "hifigan_generator",
diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py
index 1ab2c974..c101791a 100644
--- a/TTS/vocoder/datasets/gan_dataset.py
+++ b/TTS/vocoder/datasets/gan_dataset.py
@@ -20,6 +20,7 @@ class GANDataset(Dataset):
                  hop_len,
                  pad_short,
                  conv_pad=2,
+                 return_pairs=False,
                  is_training=True,
                  return_segments=True,
                  use_noise_augment=False,
@@ -33,6 +34,7 @@ class GANDataset(Dataset):
         self.hop_len = hop_len
         self.pad_short = pad_short
         self.conv_pad = conv_pad
+        self.return_pairs = return_pairs
         self.is_training = is_training
         self.return_segments = return_segments
         self.use_cache = use_cache
@@ -65,11 +67,17 @@ class GANDataset(Dataset):
 
     def __getitem__(self, idx):
         """ Return different items for Generator and Discriminator and cache acoustic features """
+
+        # set the seed differently for each worker
+        random.seed(torch.utils.data.get_worker_info().seed)
+
         if self.return_segments:
-            idx2 = self.G_to_D_mappings[idx]
             item1 = self.load_item(idx)
-            item2 = self.load_item(idx2)
-            return item1, item2
+            if self.return_pairs:
+                idx2 = self.G_to_D_mappings[idx]
+                item2 = self.load_item(idx2)
+                return item1, item2
+            return item1
         item1 = self.load_item(idx)
         return item1
 
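
Note on the new `diff_samples_for_G_and_D` option: when it is false (the default), `GANDataset.__getitem__` returns a single (feature, audio) item and `train()` reuses the generator batch for the discriminator pass (`c_D = c_G.clone()`, `y_D = y_G.clone()`); when it is true, the dataset returns an independent second item via `G_to_D_mappings` and the discriminator branch re-runs `model_G` under `torch.no_grad()`. The sketch below illustrates only that branching with a hypothetical stand-in dataset; `_ToyGANDataset`, the random tensors, and the hard-coded shapes are not part of the patch, and the real feature extraction, caching, and per-worker seeding are omitted.

import random
import torch
from torch.utils.data import Dataset, DataLoader

class _ToyGANDataset(Dataset):
    """Hypothetical stand-in mirroring only the return_pairs branching of GANDataset."""
    def __init__(self, size=8, return_segments=True, return_pairs=False):
        self.size = size
        self.return_segments = return_segments
        self.return_pairs = return_pairs
        # independent index mapping for the discriminator item, as in GANDataset
        self.G_to_D_mappings = list(range(size))
        random.shuffle(self.G_to_D_mappings)

    def load_item(self, idx):
        # placeholder for the real on-the-fly (feature, audio) computation
        feats = torch.randn(80, 32)        # e.g. a mel-spectrogram segment
        audio = torch.randn(1, 32 * 256)   # matching audio segment
        return feats, audio

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        if self.return_segments:
            item1 = self.load_item(idx)
            if self.return_pairs:
                # draw a different sample for the discriminator pass
                item2 = self.load_item(self.G_to_D_mappings[idx])
                return item1, item2
            return item1
        return self.load_item(idx)

# return_pairs=False -> one (feats, audio) group per batch; format_data() returns
#                       (x, y, None, None) and train() clones it for the D pass.
# return_pairs=True  -> two groups per batch; format_data() returns (x_G, y_G, x_D, y_D).
loader = DataLoader(_ToyGANDataset(return_pairs=True), batch_size=2)
(feats_G, audio_G), (feats_D, audio_D) = next(iter(loader))
print(feats_G.shape, audio_D.shape)  # torch.Size([2, 80, 32]) torch.Size([2, 1, 8192])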