From d0ec4b91e5cc393fe29431fbe2c6c047fc5d5e4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Tue, 7 Dec 2021 12:55:45 +0000
Subject: [PATCH] Update Tacotron models

---
 TTS/tts/models/base_tacotron.py | 22 +++++++++++++++--
 TTS/tts/models/tacotron.py      | 43 +++++++++++++++++++++++---------
 TTS/tts/models/tacotron2.py     | 44 +++++++++++++++++++++++----------
 3 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py
index ca8f3bb9..54939c61 100644
--- a/TTS/tts/models/base_tacotron.py
+++ b/TTS/tts/models/base_tacotron.py
@@ -9,6 +9,8 @@ from torch import nn
 from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.helpers import sequence_mask
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.generic_utils import format_aux_input
 from TTS.utils.io import load_fsspec
 from TTS.utils.training import gradual_training_scheduler
@@ -17,8 +19,14 @@ from TTS.utils.training import gradual_training_scheduler
 class BaseTacotron(BaseTTS):
     """Base class shared by Tacotron and Tacotron2"""
 
-    def __init__(self, config: Coqpit):
-        super().__init__(config)
+    def __init__(
+        self,
+        config: "TacotronConfig",
+        ap: "AudioProcessor",
+        tokenizer: "TTSTokenizer",
+        speaker_manager: SpeakerManager = None,
+    ):
+        super().__init__(config, ap, tokenizer, speaker_manager)
 
         # pass all config fields as class attributes
         for key in config:
@@ -107,6 +115,16 @@ class BaseTacotron(BaseTTS):
         """Get the model criterion used in training."""
         return TacotronLoss(self.config)
 
+    @staticmethod
+    def init_from_config(config: Coqpit):
+        """Initialize model from config."""
+        from TTS.utils.audio import AudioProcessor
+
+        ap = AudioProcessor.init_from_config(config)
+        tokenizer = TTSTokenizer.init_from_config(config)
+        speaker_manager = SpeakerManager.init_from_config(config)
+        return BaseTacotron(config, ap, tokenizer, speaker_manager)
+
     #############################
     # COMMON COMPUTE FUNCTIONS
     #############################
diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
index 4e46d252..8341f5bb 100644
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@@ -1,7 +1,8 @@
 # coding: utf-8
 
+from typing import Dict, List, Union
+
 import torch
-from coqpit import Coqpit
 from torch import nn
 from torch.cuda.amp.autocast_mode import autocast
 
@@ -10,6 +11,7 @@ from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
 from TTS.tts.models.base_tacotron import BaseTacotron
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 
 
@@ -24,12 +26,15 @@ class Tacotron(BaseTacotron):
             a multi-speaker model. Defaults to None.
     """
 
-    def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
-        super().__init__(config)
+    def __init__(
+        self,
+        config: "TacotronConfig",
+        ap: "AudioProcessor" = None,
+        tokenizer: "TTSTokenizer" = None,
+        speaker_manager: SpeakerManager = None,
+    ):
 
-        self.speaker_manager = speaker_manager
-        chars, self.config, _ = self.get_characters(config)
-        config.num_chars = self.num_chars = len(chars)
+        super().__init__(config, ap, tokenizer, speaker_manager)
 
         # pass all config fields to `self`
         # for fewer code change
@@ -302,16 +307,30 @@ class Tacotron(BaseTacotron):
     def train_log(
         self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
     ) -> None:  # pylint: disable=no-self-use
-        ap = assets["audio_processor"]
-        figures, audios = self._create_logs(batch, outputs, ap)
+        figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
-        logger.train_audios(steps, audios, ap.sample_rate)
+        logger.train_audios(steps, audios, self.ap.sample_rate)
 
     def eval_step(self, batch: dict, criterion: nn.Module):
         return self.train_step(batch, criterion)
 
     def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
-        ap = assets["audio_processor"]
-        figures, audios = self._create_logs(batch, outputs, ap)
+        figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.eval_figures(steps, figures)
-        logger.eval_audios(steps, audios, ap.sample_rate)
+        logger.eval_audios(steps, audios, self.ap.sample_rate)
+
+    @staticmethod
+    def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None):
+        """Initiate model from config
+
+        Args:
+            config (TacotronConfig): Model config.
+            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
+                Defaults to None.
+        """
+        from TTS.utils.audio import AudioProcessor
+
+        ap = AudioProcessor.init_from_config(config)
+        tokenizer, new_config = TTSTokenizer.init_from_config(config)
+        speaker_manager = SpeakerManager.init_from_config(config, samples)
+        return Tacotron(new_config, ap, tokenizer, speaker_manager)
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py
index ead3bf2b..d4e665e3 100644
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@@ -1,9 +1,8 @@
 # coding: utf-8
 
-from typing import Dict
+from typing import Dict, List, Union
 
 import torch
-from coqpit import Coqpit
 from torch import nn
 from torch.cuda.amp.autocast_mode import autocast
 
@@ -12,6 +11,7 @@ from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
 from TTS.tts.models.base_tacotron import BaseTacotron
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 
 
@@ -40,12 +40,16 @@ class Tacotron2(BaseTacotron):
             Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None.
     """
 
-    def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):
-        super().__init__(config)
+    def __init__(
+        self,
+        config: "Tacotron2Config",
+        ap: "AudioProcessor" = None,
+        tokenizer: "TTSTokenizer" = None,
+        speaker_manager: SpeakerManager = None,
+    ):
+
+        super().__init__(config, ap, tokenizer, speaker_manager)
 
-        self.speaker_manager = speaker_manager
-        chars, self.config, _ = self.get_characters(config)
-        config.num_chars = len(chars)
         self.decoder_output_dim = config.out_channels
 
         # pass all config fields to `self`
@@ -325,16 +329,30 @@ class Tacotron2(BaseTacotron):
         self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
     ) -> None:  # pylint: disable=no-self-use
         """Log training progress."""
-        ap = assets["audio_processor"]
-        figures, audios = self._create_logs(batch, outputs, ap)
+        figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.train_figures(steps, figures)
-        logger.train_audios(steps, audios, ap.sample_rate)
+        logger.train_audios(steps, audios, self.ap.sample_rate)
 
     def eval_step(self, batch: dict, criterion: nn.Module):
         return self.train_step(batch, criterion)
 
     def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
-        ap = assets["audio_processor"]
-        figures, audios = self._create_logs(batch, outputs, ap)
+        figures, audios = self._create_logs(batch, outputs, self.ap)
         logger.eval_figures(steps, figures)
-        logger.eval_audios(steps, audios, ap.sample_rate)
+        logger.eval_audios(steps, audios, self.ap.sample_rate)
+
+    @staticmethod
+    def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None):
+        """Initiate model from config
+
+        Args:
+            config (Tacotron2Config): Model config.
+            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
+                Defaults to None.
+        """
+        from TTS.utils.audio import AudioProcessor
+
+        ap = AudioProcessor.init_from_config(config)
+        tokenizer, new_config = TTSTokenizer.init_from_config(config)
+        speaker_manager = SpeakerManager.init_from_config(new_config, samples)
+        return Tacotron2(new_config, ap, tokenizer, speaker_manager)