From c03768bb537c6f4deeabaf8cd1941991821c66ef Mon Sep 17 00:00:00 2001
From: Eren G??lge <egolge@coqui.ai>
Date: Mon, 26 Jun 2023 17:16:26 +0200
Subject: [PATCH] Make style

---
 TTS/tts/configs/fast_speech_config.py   |  2 +-
 TTS/tts/configs/fastspeech2_config.py   |  2 +-
 TTS/tts/configs/speedy_speech_config.py | 42 +++++++++++++------------
 TTS/tts/layers/losses.py                | 11 +++++--
 tests/tts_tests/test_tacotron_model.py  |  3 +-
 5 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py
index 16a76e21..af6c2db6 100644
--- a/TTS/tts/configs/fast_speech_config.py
+++ b/TTS/tts/configs/fast_speech_config.py
@@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
 
     # multi-speaker settings
     num_speakers: int = 0
diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py
index 68a3eec2..d179617f 100644
--- a/TTS/tts/configs/fastspeech2_config.py
+++ b/TTS/tts/configs/fastspeech2_config.py
@@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
 
     # multi-speaker settings
     num_speakers: int = 0
diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py
index 4bf5101f..bf8517df 100644
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # set model args as SpeedySpeech
-    model_args: ForwardTTSArgs = ForwardTTSArgs(
-        use_pitch=False,
-        encoder_type="residual_conv_bn",
-        encoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 13,
-        },
-        decoder_type="residual_conv_bn",
-        decoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4, 8] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 17,
-        },
-        out_channels=80,
-        hidden_channels=128,
-        positional_encoding=True,
-        detach_duration_predictor=True,
+    model_args: ForwardTTSArgs = field(
+        default_factory=lambda: ForwardTTSArgs(
+            use_pitch=False,
+            encoder_type="residual_conv_bn",
+            encoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 13,
+            },
+            decoder_type="residual_conv_bn",
+            decoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4, 8] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 17,
+            },
+            out_channels=80,
+            hidden_channels=128,
+            positional_encoding=True,
+            detach_duration_predictor=True,
+        )
     )
 
     # multi-speaker settings
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index e12abf20..de5f408c 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
 
     def __init__(self, pos_weight: float = None):
         super().__init__()
-        self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
+        self.register_buffer("pos_weight", torch.tensor([pos_weight]))
 
     def forward(self, x, target, length):
         """
@@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
             mask = sequence_mask(sequence_length=length, max_len=target.size(1))
             num_items = mask.sum()
             loss = functional.binary_cross_entropy_with_logits(
-                x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
+                x.masked_select(mask),
+                target.masked_select(mask),
+                pos_weight=self.pos_weight.to(x.device),
+                reduction="sum",
             )
         else:
-            loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
+            loss = functional.binary_cross_entropy_with_logits(
+                x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
+            )
             num_items = torch.numel(x)
         loss = loss / num_items
         return loss
diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py
index 07351a6a..906ec3d0 100644
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor
 
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda" if use_cuda else "cpu")
 
 config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
 
@@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
             batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
         )
         batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
         model = Tacotron(config).to(device)
         criterion = model.get_criterion()
         optimizer = model.get_optimizer()