From c03768bb537c6f4deeabaf8cd1941991821c66ef Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 26 Jun 2023 17:16:26 +0200 Subject: [PATCH] Make style --- TTS/tts/configs/fast_speech_config.py | 2 +- TTS/tts/configs/fastspeech2_config.py | 2 +- TTS/tts/configs/speedy_speech_config.py | 42 +++++++++++++------------ TTS/tts/layers/losses.py | 11 +++++-- tests/tts_tests/test_tacotron_model.py | 3 +- 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index 16a76e21..af6c2db6 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig): base_model: str = "forward_tts" # model specific params - model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False) + model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False)) # multi-speaker settings num_speakers: int = 0 diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index 68a3eec2..d179617f 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig): base_model: str = "forward_tts" # model specific params - model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True) + model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True)) # multi-speaker settings num_speakers: int = 0 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 4bf5101f..bf8517df 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig): base_model: str = "forward_tts" # set model args as SpeedySpeech - model_args: ForwardTTSArgs = ForwardTTSArgs( - use_pitch=False, - encoder_type="residual_conv_bn", - 
encoder_params={ - "kernel_size": 4, - "dilations": 4 * [1, 2, 4] + [1], - "num_conv_blocks": 2, - "num_res_blocks": 13, - }, - decoder_type="residual_conv_bn", - decoder_params={ - "kernel_size": 4, - "dilations": 4 * [1, 2, 4, 8] + [1], - "num_conv_blocks": 2, - "num_res_blocks": 17, - }, - out_channels=80, - hidden_channels=128, - positional_encoding=True, - detach_duration_predictor=True, + model_args: ForwardTTSArgs = field( + default_factory=lambda: ForwardTTSArgs( + use_pitch=False, + encoder_type="residual_conv_bn", + encoder_params={ + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13, + }, + decoder_type="residual_conv_bn", + decoder_params={ + "kernel_size": 4, + "dilations": 4 * [1, 2, 4, 8] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 17, + }, + out_channels=80, + hidden_channels=128, + positional_encoding=True, + detach_duration_predictor=True, + ) ) # multi-speaker settings diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index e12abf20..de5f408c 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -165,7 +165,7 @@ class BCELossMasked(nn.Module): def __init__(self, pos_weight: float = None): super().__init__() - self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False) + self.register_buffer("pos_weight", torch.tensor([pos_weight])) def forward(self, x, target, length): """ @@ -191,10 +191,15 @@ class BCELossMasked(nn.Module): mask = sequence_mask(sequence_length=length, max_len=target.size(1)) num_items = mask.sum() loss = functional.binary_cross_entropy_with_logits( - x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum" + x.masked_select(mask), + target.masked_select(mask), + pos_weight=self.pos_weight.to(x.device), + reduction="sum", ) else: - loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum") + loss = 
functional.binary_cross_entropy_with_logits( + x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum" + ) num_items = torch.numel(x) loss = loss / num_items return loss diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 07351a6a..906ec3d0 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor torch.manual_seed(1) use_cuda = torch.cuda.is_available() -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +device = torch.device("cuda" if use_cuda else "cpu") config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80) @@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase): batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1 ) batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze() - model = Tacotron(config).to(device) criterion = model.get_criterion() optimizer = model.get_optimizer()