diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index f46c73a9..e766ed6a 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -20,6 +20,11 @@ class DurationPredictor(nn.Module): def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): super().__init__() + + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # class arguments self.in_channels = in_channels self.filter_channels = hidden_channels diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 7c25156a..120d0944 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -185,10 +185,14 @@ class StochasticDurationPredictor(nn.Module): dropout_p: float, num_flows=4, cond_channels=0, - language_emb_dim=None, + language_emb_dim=0, ): super().__init__() + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # condition encoder text self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ccd742b1..6b1dd325 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -321,7 +321,7 @@ class Vits(BaseTTS): if args.use_sdp: self.duration_predictor = StochasticDurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 192, 3, args.dropout_p_duration_predictor, @@ -331,7 +331,7 @@ class Vits(BaseTTS): ) else: self.duration_predictor = DurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 256, 3, args.dropout_p_duration_predictor,