diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 0b72fbd6..0e91c1f3 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -304,7 +304,12 @@ class Vits(BaseTTS):

         if args.use_sdp:
             self.duration_predictor = StochasticDurationPredictor(
-                args.hidden_channels, 192, 3, args.dropout_p_duration_predictor, 4, cond_channels=self.embedded_speaker_dim
+                args.hidden_channels,
+                192,
+                3,
+                args.dropout_p_duration_predictor,
+                4,
+                cond_channels=self.embedded_speaker_dim,
             )
         else:
             self.duration_predictor = DurationPredictor(
diff --git a/docs/source/models/glow_tts.md b/docs/source/models/glow_tts.md
new file mode 100644
index 00000000..66171abd
--- /dev/null
+++ b/docs/source/models/glow_tts.md
@@ -0,0 +1,22 @@
+# Glow TTS
+
+Glow TTS is a normalizing flow model for text-to-speech. It is built on the generic Glow model previously
+used in computer vision and vocoder models. It uses "monotonic alignment search" (MAS) to find the
+text-to-speech alignment and uses the output to train a separate duration predictor network for faster inference.
+
+## Important resources & papers
+- GlowTTS: https://arxiv.org/abs/2005.11129
+- Glow (Generative Flow with Invertible 1x1 Convolutions): https://arxiv.org/abs/1807.03039
+- Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
+
+## GlowTTS Config
+```{eval-rst}
+.. autoclass:: TTS.tts.configs.glow_tts_config.GlowTTSConfig
+    :members:
+```
+
+## GlowTTS Model
+```{eval-rst}
+.. autoclass:: TTS.tts.models.glow_tts.GlowTTS
+    :members:
+```
diff --git a/docs/source/models/vits.md b/docs/source/models/vits.md
new file mode 100644
index 00000000..5c0e92f6
--- /dev/null
+++ b/docs/source/models/vits.md
@@ -0,0 +1,50 @@
+# VITS
+
+VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech) is an
+end-to-end (encoder and vocoder trained together) TTS model that takes advantage of SOTA DL techniques like
+GANs, VAEs, and Normalizing Flows. It does not require external alignment annotations and learns the
+text-to-audio alignment using MAS, as explained in the paper. The model architecture combines the GlowTTS
+encoder and the HiFiGAN vocoder. It is a feed-forward model with a 67.12x real-time factor on a GPU.
+
+## Important resources & papers
+- VITS: https://arxiv.org/pdf/2106.06103.pdf
+- Neural Spline Flows: https://arxiv.org/abs/1906.04032
+- Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf
+- Generative Adversarial Networks: https://arxiv.org/abs/1406.2661
+- HiFiGAN: https://arxiv.org/abs/2010.05646
+- Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
+
+## VitsConfig
+```{eval-rst}
+.. autoclass:: TTS.tts.configs.vits_config.VitsConfig
+    :members:
+```
+
+## VitsArgs
+```{eval-rst}
+.. autoclass:: TTS.tts.models.vits.VitsArgs
+    :members:
+```
+
+## Vits Model
+```{eval-rst}
+.. autoclass:: TTS.tts.models.vits.Vits
+    :members:
+```
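+
+## Example use
+
+A minimal usage sketch, not taken from the repository: it assumes that `Vits` accepts its `VitsConfig` in the
+constructor and that the `VitsArgs` fields are exposed through the config's `model_args` attribute.
+
+```python
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.models.vits import Vits
+
+# Build a default config and override a couple of fields.
+config = VitsConfig(batch_size=16)
+config.model_args.use_sdp = True  # enable the stochastic duration predictor (assumed attribute path)
+
+# Instantiate the model from the config.
+model = Vits(config)
+```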