From e1accb6e2842c9e4c65162dffeac9d3d0535132b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 3 Jan 2022 17:44:57 +0100 Subject: [PATCH 1/6] Fix train_tts.py and uncomment code (#1051) * Fix SE loading and language embedding logic * remove trailing white space * Uncomment resmapling code for SCL --- TTS/bin/train_tts.py | 16 ++++++++++++---- TTS/tts/models/vits.py | 22 +++++++--------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 3360a940..0f8c4760 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,4 +1,5 @@ import os +import torch from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config from TTS.trainer import Trainer, TrainingArgs @@ -53,15 +54,22 @@ def main(): else: config.num_speakers = speaker_manager.num_speakers elif check_config_and_model_args(config, "use_d_vector_file", True): - speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) + if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True): + speaker_manager = SpeakerManager( + d_vectors_file_path=config.model_args.d_vector_file, + encoder_model_path=config.model_args.speaker_encoder_model_path, + encoder_config_path=config.model_args.speaker_encoder_config_path, + use_cuda=torch.cuda.is_available(), + ) + else: + speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) + config.num_speakers = speaker_manager.num_speakers if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers - else: - config.num_speakers = speaker_manager.num_speakers else: speaker_manager = None - if hasattr(config, "use_language_embedding") and config.use_language_embedding: + if check_config_and_model_args(config, "use_language_embedding", True): language_manager = LanguageManager(config=config) if hasattr(config, "model_args"): config.model_args.num_languages = language_manager.num_languages diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 8b09fdf9..b2e4be9e 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -5,7 +5,7 @@ from typing import Dict, List, Tuple import torch -# import torchaudio +import torchaudio from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -419,21 +419,12 @@ class Vits(BaseTTS): hasattr(self.speaker_manager.speaker_encoder, "audio_config") and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): - # TODO: change this with torchaudio Resample - raise RuntimeError( - " [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!".format( - self.config.audio["sample_rate"], - self.speaker_manager.speaker_encoder.audio_config["sample_rate"], - ) - ) - # pylint: disable=W0101,W0105 - """ self.audio_transform = torchaudio.transforms.Resample( + self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], ) - else: - self.audio_transform = None - """ + else: + self.audio_transform = None def _init_speaker_embedding(self): # pylint: disable=attribute-defined-outside-init @@ -458,6 +449,7 @@ class Vits(BaseTTS): self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) if self.args.use_language_embedding and self.language_manager: + print(" > initialization of language-embedding layers.") self.num_languages = self.language_manager.num_languages self.embedded_language_dim = self.args.embedded_language_dim self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) @@ -643,8 +635,8 @@ class Vits(BaseTTS): # resample audio to speaker encoder sample_rate # pylint: disable=W0105 - """if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch)""" + if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch) pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) From e778bad626d94457833853bbfbe286b5d1a442fb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 6 Jan 2022 15:07:27 +0100 Subject: [PATCH 2/6] Add argument to enable dp speaker conditioning --- TTS/tts/models/vits.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index b2e4be9e..cb349ca2 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -171,6 +171,9 @@ class VitsArgs(Coqpit): speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". + condition_dp_on_speaker (bool): + Condition the duration predictor on the speaker embedding. Defaults to True. + freeze_encoder (bool): Freeze the encoder weigths during training. Defaults to False. 
@@ -233,6 +236,7 @@ class VitsArgs(Coqpit): use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" + condition_dp_on_speaker: bool = True freeze_encoder: bool = False freeze_DP: bool = False freeze_PE: bool = False @@ -349,7 +353,7 @@ class Vits(BaseTTS): 3, args.dropout_p_duration_predictor, 4, - cond_channels=self.embedded_speaker_dim, + cond_channels=self.embedded_speaker_dim if self.args.condition_dp_on_speaker else 0, language_emb_dim=self.embedded_language_dim, ) else: @@ -358,7 +362,7 @@ class Vits(BaseTTS): 256, 3, args.dropout_p_duration_predictor, - cond_channels=self.embedded_speaker_dim, + cond_channels=self.embedded_speaker_dim if self.args.condition_dp_on_speaker else 0, language_emb_dim=self.embedded_language_dim, ) @@ -595,12 +599,15 @@ class Vits(BaseTTS): # duration predictor attn_durations = attn.sum(3) + g_dp = None + if self.args.condition_dp_on_speaker: + g_dp = g.detach() if self.args.detach_dp_input and g is not None else g if self.args.use_sdp: loss_duration = self.duration_predictor( x.detach() if self.args.detach_dp_input else x, x_mask, attn_durations, - g=g.detach() if self.args.detach_dp_input and g is not None else g, + g=g_dp, lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = loss_duration / torch.sum(x_mask) @@ -609,7 +616,7 @@ class Vits(BaseTTS): log_durations = self.duration_predictor( x.detach() if self.args.detach_dp_input else x, x_mask, - g=g.detach() if self.args.detach_dp_input and g is not None else g, + g=g_dp, lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) @@ -685,10 +692,10 @@ class Vits(BaseTTS): if self.args.use_sdp: logw = self.duration_predictor( - x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb + x, x_mask, g=g if self.args.condition_dp_on_speaker else None, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb ) else: - logw = self.duration_predictor(x, x_mask, g=g, lang_emb=lang_emb) + logw = self.duration_predictor(x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb) w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) From c7f5e005e17cd80f9aeba5f5b119430dfa193c4f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 4 Jan 2022 10:06:57 +0100 Subject: [PATCH 3/6] Compute embedding for new audios only --- TTS/bin/compute_embeddings.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 83a5aeae..2ac18651 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -29,6 +29,7 @@ parser.add_argument( help="Path to dataset config file.", ) parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.") +parser.add_argument("--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None) parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--eval", type=bool, help="compute eval.", default=True) @@ -40,7 +41,7 @@ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_spli wav_files = meta_data_train + meta_data_eval speaker_manager = SpeakerManager( - encoder_model_path=args.model_path, 
encoder_config_path=args.config_path, use_cuda=args.use_cuda + encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda ) # compute speaker embeddings @@ -52,11 +53,15 @@ for idx, wav_file in enumerate(tqdm(wav_files)): else: speaker_name = None - # extract the embedding - embedd = speaker_manager.compute_d_vector_from_clip(wav_file) + wav_file_name = os.path.basename(wav_file) + if args.old_file is not None and wav_file_name in speaker_manager.clip_ids: + # get the embedding from the old file + embedd = speaker_manager.get_d_vector_by_clip(wav_file_name) + else: + # extract the embedding + embedd = speaker_manager.compute_d_vector_from_clip(wav_file) # create speaker_mapping if target dataset is defined - wav_file_name = os.path.basename(wav_file) speaker_mapping[wav_file_name] = {} speaker_mapping[wav_file_name]["name"] = speaker_name speaker_mapping[wav_file_name]["embedding"] = embedd From 0860d73cf804a99eb89e08133c1a6ee3f1383f4f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Feb 2022 12:14:54 -0300 Subject: [PATCH 4/6] Remove Tensorflow requeriment (#1225) * Remove TF modules * Remove TF unit tests * Remove TF vocoder modules * Remove TF convert scripts * Remove TF requirement * Remove the Docs TF instructions * Remove TF inference support --- Makefile | 1 - README.md | 11 +- TTS/bin/convert_melgan_tflite.py | 25 -- TTS/bin/convert_melgan_torch_to_tf.py | 105 ----- TTS/bin/convert_tacotron2_tflite.py | 30 -- TTS/bin/convert_tacotron2_torch_to_tf.py | 187 -------- TTS/tts/layers/tacotron/tacotron2.py | 1 - TTS/tts/tf/README.md | 20 - TTS/tts/tf/__init__.py | 0 TTS/tts/tf/layers/tacotron/__init__.py | 0 TTS/tts/tf/layers/tacotron/common_layers.py | 301 ------------- TTS/tts/tf/layers/tacotron/tacotron2.py | 322 ------------- TTS/tts/tf/models/tacotron2.py | 116 ----- TTS/tts/tf/utils/convert_torch_to_tf_utils.py | 87 ---- TTS/tts/tf/utils/generic_utils.py | 105 ----- TTS/tts/tf/utils/io.py | 45 -- TTS/tts/tf/utils/tf_utils.py | 8 - TTS/tts/tf/utils/tflite.py | 27 -- TTS/tts/utils/synthesis.py | 117 +---- TTS/vocoder/tf/layers/melgan.py | 54 --- TTS/vocoder/tf/layers/pqmf.py | 60 --- TTS/vocoder/tf/models/melgan_generator.py | 133 ------ .../tf/models/multiband_melgan_generator.py | 65 --- TTS/vocoder/tf/utils/__init__.py | 0 .../tf/utils/convert_torch_to_tf_utils.py | 47 -- TTS/vocoder/tf/utils/generic_utils.py | 36 -- TTS/vocoder/tf/utils/io.py | 31 -- TTS/vocoder/tf/utils/tflite.py | 27 -- docs/source/converting_torch_to_tf.md | 21 - docs/source/index.md | 1 - docs/source/installation.md | 6 - ...l_Converting_PyTorch_to_TF_to_TFlite.ipynb | 425 ------------------ requirements.tf.txt | 1 - setup.py | 5 +- tests/tts_tests/test_tacotron2_tf_model.py | 156 ------- .../test_vocoder_tf_melgan_generator.py | 19 - tests/vocoder_tests/test_vocoder_tf_pqmf.py | 31 -- 37 files changed, 19 insertions(+), 2607 deletions(-) delete mode 100644 TTS/bin/convert_melgan_tflite.py delete mode 100644 TTS/bin/convert_melgan_torch_to_tf.py delete mode 100644 TTS/bin/convert_tacotron2_tflite.py delete mode 100644 TTS/bin/convert_tacotron2_torch_to_tf.py delete mode 100644 TTS/tts/tf/README.md delete mode 100644 TTS/tts/tf/__init__.py delete mode 100644 TTS/tts/tf/layers/tacotron/__init__.py delete mode 100644 TTS/tts/tf/layers/tacotron/common_layers.py delete mode 100644 TTS/tts/tf/layers/tacotron/tacotron2.py delete mode 100644 TTS/tts/tf/models/tacotron2.py delete mode 100644 
TTS/tts/tf/utils/convert_torch_to_tf_utils.py delete mode 100644 TTS/tts/tf/utils/generic_utils.py delete mode 100644 TTS/tts/tf/utils/io.py delete mode 100644 TTS/tts/tf/utils/tf_utils.py delete mode 100644 TTS/tts/tf/utils/tflite.py delete mode 100644 TTS/vocoder/tf/layers/melgan.py delete mode 100644 TTS/vocoder/tf/layers/pqmf.py delete mode 100644 TTS/vocoder/tf/models/melgan_generator.py delete mode 100644 TTS/vocoder/tf/models/multiband_melgan_generator.py delete mode 100644 TTS/vocoder/tf/utils/__init__.py delete mode 100644 TTS/vocoder/tf/utils/convert_torch_to_tf_utils.py delete mode 100644 TTS/vocoder/tf/utils/generic_utils.py delete mode 100644 TTS/vocoder/tf/utils/io.py delete mode 100644 TTS/vocoder/tf/utils/tflite.py delete mode 100644 docs/source/converting_torch_to_tf.md delete mode 100644 notebooks/Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb delete mode 100644 requirements.tf.txt delete mode 100644 tests/tts_tests/test_tacotron2_tf_model.py delete mode 100644 tests/vocoder_tests/test_vocoder_tf_melgan_generator.py delete mode 100644 tests/vocoder_tests/test_vocoder_tf_pqmf.py diff --git a/Makefile b/Makefile index 32b4638b..2632dbab 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,6 @@ system-deps: ## install linux system deps dev-deps: ## install development deps pip install -r requirements.dev.txt - pip install -r requirements.tf.txt doc-deps: ## install docs dependencies pip install -r docs/requirements.txt diff --git a/README.md b/README.md index 4686ac67..e7774888 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,6 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models - Detailed training logs on the terminal and Tensorboard. - Support for Multi-speaker TTS. - Efficient, flexible, lightweight but feature complete `Trainer API`. -- Ability to convert PyTorch models to Tensorflow 2.0 and TFLite for inference. - Released and read-to-use models. - Tools to curate Text2Speech datasets under```dataset_analysis```. - Utilities to use and test your models. @@ -113,17 +112,11 @@ If you are only interested in [synthesizing speech](https://tts.readthedocs.io/e pip install TTS ``` -By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. - -```bash -pip install TTS[tf] -``` - If you plan to code or train models, clone 🐸TTS and install it locally. ```bash git clone https://github.com/coqui-ai/TTS -pip install -e .[all,dev,notebooks,tf] # Select the relevant extras +pip install -e .[all,dev,notebooks] # Select the relevant extras ``` If you are on Ubuntu (Debian), you can also run following commands for installation. @@ -204,12 +197,10 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht |- train*.py (train your target model.) |- distribute.py (train your TTS model using Multiple GPUs.) |- compute_statistics.py (compute dataset statistics for normalization.) - |- convert*.py (convert target torch model to TF.) |- ... |- tts/ (text to speech models) |- layers/ (model layer definitions) |- models/ (model definitions) - |- tf/ (Tensorflow 2 utilities and model implementations) |- utils/ (model specific utilities.) |- speaker_encoder/ (Speaker Encoder models.) 
|- (same) diff --git a/TTS/bin/convert_melgan_tflite.py b/TTS/bin/convert_melgan_tflite.py deleted file mode 100644 index a3a3fb66..00000000 --- a/TTS/bin/convert_melgan_tflite.py +++ /dev/null @@ -1,25 +0,0 @@ -# Convert Tensorflow Tacotron2 model to TF-Lite binary - -import argparse - -from TTS.utils.io import load_config -from TTS.vocoder.tf.utils.generic_utils import setup_generator -from TTS.vocoder.tf.utils.io import load_checkpoint -from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite - -parser = argparse.ArgumentParser() -parser.add_argument("--tf_model", type=str, help="Path to target torch model to be converted to TF.") -parser.add_argument("--config_path", type=str, help="Path to config file of torch model.") -parser.add_argument("--output_path", type=str, help="path to tflite output binary.") -args = parser.parse_args() - -# Set constants -CONFIG = load_config(args.config_path) - -# load the model -model = setup_generator(CONFIG) -model.build_inference() -model = load_checkpoint(model, args.tf_model) - -# create tflite model -tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path) diff --git a/TTS/bin/convert_melgan_torch_to_tf.py b/TTS/bin/convert_melgan_torch_to_tf.py deleted file mode 100644 index c1fb8498..00000000 --- a/TTS/bin/convert_melgan_torch_to_tf.py +++ /dev/null @@ -1,105 +0,0 @@ -import argparse -import os -from difflib import SequenceMatcher - -import numpy as np -import tensorflow as tf -import torch - -from TTS.utils.io import load_config, load_fsspec -from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import ( - compare_torch_tf, - convert_tf_name, - transfer_weights_torch_to_tf, -) -from TTS.vocoder.tf.utils.generic_utils import setup_generator as setup_tf_generator -from TTS.vocoder.tf.utils.io import save_checkpoint -from TTS.vocoder.utils.generic_utils import setup_generator - -# prevent GPU use -os.environ["CUDA_VISIBLE_DEVICES"] = "" - -# define args -parser = argparse.ArgumentParser() -parser.add_argument("--torch_model_path", type=str, help="Path to target torch model to be converted to TF.") -parser.add_argument("--config_path", type=str, help="Path to config file of torch model.") -parser.add_argument("--output_path", type=str, help="path to output file including file name to save TF model.") -args = parser.parse_args() - -# load model config -config_path = args.config_path -c = load_config(config_path) -num_speakers = 0 - -# init torch model -model = setup_generator(c) -checkpoint = load_fsspec(args.torch_model_path, map_location=torch.device("cpu")) -state_dict = checkpoint["model"] -model.load_state_dict(state_dict) -model.remove_weight_norm() -state_dict = model.state_dict() - -# init tf model -model_tf = setup_tf_generator(c) - -common_sufix = "/.ATTRIBUTES/VARIABLE_VALUE" -# get tf_model graph by passing an input -# B x D x T -dummy_input = tf.random.uniform((7, 80, 64), dtype=tf.float32) -mel_pred = model_tf(dummy_input, training=False) - -# get tf variables -tf_vars = model_tf.weights - -# match variable names with fuzzy logic -torch_var_names = list(state_dict.keys()) -tf_var_names = [we.name for we in model_tf.weights] -var_map = [] -for tf_name in tf_var_names: - # skip re-mapped layer names - if tf_name in [name[0] for name in var_map]: - continue - tf_name_edited = convert_tf_name(tf_name) - ratios = [SequenceMatcher(None, torch_name, tf_name_edited).ratio() for torch_name in torch_var_names] - max_idx = np.argmax(ratios) - matching_name = torch_var_names[max_idx] - del torch_var_names[max_idx] - 
var_map.append((tf_name, matching_name)) - -# pass weights -tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict) - -# Compare TF and TORCH models -# check embedding outputs -model.eval() -dummy_input_torch = torch.ones((1, 80, 10)) -dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy()) -dummy_input_tf = tf.transpose(dummy_input_tf, perm=[0, 2, 1]) -dummy_input_tf = tf.expand_dims(dummy_input_tf, 2) - -out_torch = model.layers[0](dummy_input_torch) -out_tf = model_tf.model_layers[0](dummy_input_tf) -out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :] - -assert compare_torch_tf(out_torch, out_tf_) < 1e-5 - -for i in range(1, len(model.layers)): - print(f"{i} -> {model.layers[i]} vs {model_tf.model_layers[i]}") - out_torch = model.layers[i](out_torch) - out_tf = model_tf.model_layers[i](out_tf) - out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :] - diff = compare_torch_tf(out_torch, out_tf_) - assert diff < 1e-5, diff - -torch.manual_seed(0) -dummy_input_torch = torch.rand((1, 80, 100)) -dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy()) -model.inference_padding = 0 -model_tf.inference_padding = 0 -output_torch = model.inference(dummy_input_torch) -output_tf = model_tf(dummy_input_tf, training=False) -assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(output_torch, output_tf) - -# save tf model -save_checkpoint(model_tf, checkpoint["step"], checkpoint["epoch"], args.output_path) -print(" > Model conversion is successfully completed :).") diff --git a/TTS/bin/convert_tacotron2_tflite.py b/TTS/bin/convert_tacotron2_tflite.py deleted file mode 100644 index 327d0ae8..00000000 --- a/TTS/bin/convert_tacotron2_tflite.py +++ /dev/null @@ -1,30 +0,0 @@ -# Convert Tensorflow Tacotron2 model to TF-Lite binary - -import argparse - -from TTS.tts.tf.utils.generic_utils import setup_model -from TTS.tts.tf.utils.io import load_checkpoint -from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite -from TTS.tts.utils.text.symbols import phonemes, symbols -from TTS.utils.io import load_config - -parser = argparse.ArgumentParser() -parser.add_argument("--tf_model", type=str, help="Path to target torch model to be converted to TF.") -parser.add_argument("--config_path", type=str, help="Path to config file of torch model.") -parser.add_argument("--output_path", type=str, help="path to tflite output binary.") -args = parser.parse_args() - -# Set constants -CONFIG = load_config(args.config_path) - -# load the model -c = CONFIG -num_speakers = 0 -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c, enable_tflite=True) -model.build_inference() -model = load_checkpoint(model, args.tf_model) -model.decoder.set_max_decoder_steps(1000) - -# create tflite model -tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path) diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py deleted file mode 100644 index 78c6b362..00000000 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ /dev/null @@ -1,187 +0,0 @@ -import argparse -import os -import sys -from difflib import SequenceMatcher -from pprint import pprint - -import numpy as np -import tensorflow as tf -import torch - -from TTS.tts.models import setup_model -from TTS.tts.tf.models.tacotron2 import Tacotron2 -from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf -from TTS.tts.tf.utils.generic_utils 
import save_checkpoint -from TTS.tts.utils.text.symbols import phonemes, symbols -from TTS.utils.io import load_config, load_fsspec - -sys.path.append("/home/erogol/Projects") -os.environ["CUDA_VISIBLE_DEVICES"] = "" - - -parser = argparse.ArgumentParser() -parser.add_argument("--torch_model_path", type=str, help="Path to target torch model to be converted to TF.") -parser.add_argument("--config_path", type=str, help="Path to config file of torch model.") -parser.add_argument("--output_path", type=str, help="path to output file including file name to save TF model.") -args = parser.parse_args() - -# load model config -config_path = args.config_path -c = load_config(config_path) -num_speakers = 0 - -# init torch model -model = setup_model(c) -checkpoint = load_fsspec(args.torch_model_path, map_location=torch.device("cpu")) -state_dict = checkpoint["model"] -model.load_state_dict(state_dict) - -# init tf model -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model_tf = Tacotron2( - num_chars=num_chars, - num_speakers=num_speakers, - r=model.decoder.r, - out_channels=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, -) - -# set initial layer mapping - these are not captured by the below heuristic approach -# TODO: set layer names so that we can remove these manual matching -common_sufix = "/.ATTRIBUTES/VARIABLE_VALUE" -var_map = [ - ("embedding/embeddings:0", "embedding.weight"), - ("encoder/lstm/forward_lstm/lstm_cell_1/kernel:0", "encoder.lstm.weight_ih_l0"), - ("encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0", "encoder.lstm.weight_hh_l0"), - ("encoder/lstm/backward_lstm/lstm_cell_2/kernel:0", "encoder.lstm.weight_ih_l0_reverse"), - ("encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0", "encoder.lstm.weight_hh_l0_reverse"), - ("encoder/lstm/forward_lstm/lstm_cell_1/bias:0", ("encoder.lstm.bias_ih_l0", "encoder.lstm.bias_hh_l0")), - ( - "encoder/lstm/backward_lstm/lstm_cell_2/bias:0", - ("encoder.lstm.bias_ih_l0_reverse", "encoder.lstm.bias_hh_l0_reverse"), - ), - ("attention/v/kernel:0", "decoder.attention.v.linear_layer.weight"), - ("decoder/linear_projection/kernel:0", "decoder.linear_projection.linear_layer.weight"), - ("decoder/stopnet/kernel:0", "decoder.stopnet.1.linear_layer.weight"), -] - -# %% -# get tf_model graph -model_tf.build_inference() - -# get tf variables -tf_vars = model_tf.weights - -# match variable names with fuzzy logic -torch_var_names = list(state_dict.keys()) -tf_var_names = [we.name for we in model_tf.weights] -for tf_name in tf_var_names: - # skip re-mapped layer names - if tf_name in [name[0] for name in var_map]: - continue - tf_name_edited = convert_tf_name(tf_name) - ratios = [SequenceMatcher(None, torch_name, tf_name_edited).ratio() for torch_name in torch_var_names] - max_idx = np.argmax(ratios) - matching_name = torch_var_names[max_idx] - del torch_var_names[max_idx] - var_map.append((tf_name, matching_name)) - -pprint(var_map) -pprint(torch_var_names) - -# pass weights -tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict) - -# Compare TF and TORCH models -# %% -# check embedding outputs 
-model.eval() -input_ids = torch.randint(0, 24, (1, 128)).long() - -o_t = model.embedding(input_ids) -o_tf = model_tf.embedding(input_ids.detach().numpy()) -assert abs(o_t.detach().numpy() - o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - o_tf.numpy()).sum() - -# compare encoder outputs -oo_en = model.encoder.inference(o_t.transpose(1, 2)) -ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False) -assert compare_torch_tf(oo_en, ooo_en) < 1e-5 - -# pylint: disable=redefined-builtin -# compare decoder.attention_rnn -inp = torch.rand([1, 768]) -inp_tf = inp.numpy() -model.decoder._init_states(oo_en, mask=None) # pylint: disable=protected-access -output, cell_state = model.decoder.attention_rnn(inp) -states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, states[2], training=False) -assert compare_torch_tf(output, output_tf).mean() < 1e-5 - -query = output -inputs = torch.rand([1, 128, 512]) -query_tf = query.detach().numpy() -inputs_tf = inputs.numpy() - -# compare decoder.attention -model.decoder.attention.init_states(inputs) -processes_inputs = model.decoder.attention.preprocess_inputs(inputs) -loc_attn, proc_query = model.decoder.attention.get_location_attention(query, processes_inputs) -context = model.decoder.attention(query, inputs, processes_inputs, None) - -attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1] -model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf)) -loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states) -context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False) - -assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5 -assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5 -assert compare_torch_tf(context, context_tf) < 1e-5 - -# compare decoder.decoder_rnn -input = torch.rand([1, 1536]) -input_tf = input.numpy() -model.decoder._init_states(oo_en, mask=None) # pylint: disable=protected-access -output, cell_state = model.decoder.decoder_rnn(input, [model.decoder.decoder_hidden, model.decoder.decoder_cell]) -states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, states[3], training=False) -assert abs(input - input_tf).mean() < 1e-5 -assert compare_torch_tf(output, output_tf).mean() < 1e-5 - -# compare decoder.linear_projection -input = torch.rand([1, 1536]) -input_tf = input.numpy() -output = model.decoder.linear_projection(input) -output_tf = model_tf.decoder.linear_projection(input_tf, training=False) -assert compare_torch_tf(output, output_tf) < 1e-5 - -# compare decoder outputs -model.decoder.max_decoder_steps = 100 -model_tf.decoder.set_max_decoder_steps(100) -output, align, stop = model.decoder.inference(oo_en) -states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False) -assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4 - -# compare the whole model output -outputs_torch = model.inference(input_ids) -outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy())) -print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean()) -assert compare_torch_tf(outputs_torch[2][:, 50, :], outputs_tf[2][:, 50, :]) < 1e-5 -assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4 - -# %% -# save tf model 
-save_checkpoint(model_tf, None, checkpoint["step"], checkpoint["epoch"], checkpoint["r"], args.output_path) -print(" > Model conversion is successfully completed :).") diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index 9c33623e..c79b7099 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -6,7 +6,6 @@ from .attentions import init_attn from .common_layers import Linear, Prenet -# NOTE: linter has a problem with the current TF release # pylint: disable=no-value-for-parameter # pylint: disable=unexpected-keyword-arg class ConvBNBlock(nn.Module): diff --git a/TTS/tts/tf/README.md b/TTS/tts/tf/README.md deleted file mode 100644 index 0f9d58e9..00000000 --- a/TTS/tts/tf/README.md +++ /dev/null @@ -1,20 +0,0 @@ -## Utilities to Convert Models to Tensorflow2 -Here there are experimental utilities to convert trained Torch models to Tensorflow (2.2>=). - -Converting Torch models to TF enables all the TF toolkit to be used for better deployment and device specific optimizations. - -Note that we do not plan to share training scripts for Tensorflow in near future. But any contribution in that direction would be more than welcome. - -To see how you can use TF model at inference, check the notebook. - -This is an experimental release. If you encounter an error, please put an issue or in the best send a PR but you are mostly on your own. - - -### Converting a Model -- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments. - -### Known issues ans limitations -- We use a custom model load/save mechanism which enables us to store model related information with models weights. (Similar to Torch). However, it is prone to random errors. -- Current TF model implementation is slightly slower than Torch model. Hopefully, it'll get better with improving TF support for eager mode and ```tf.function```. -- TF implementation of Tacotron2 only supports regular Tacotron2 as in the paper. -- You can only convert models trained after TF model implementation since model layers has been updated in Torch model. 
diff --git a/TTS/tts/tf/__init__.py b/TTS/tts/tf/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/TTS/tts/tf/layers/tacotron/__init__.py b/TTS/tts/tf/layers/tacotron/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/TTS/tts/tf/layers/tacotron/common_layers.py b/TTS/tts/tf/layers/tacotron/common_layers.py deleted file mode 100644 index a6b87981..00000000 --- a/TTS/tts/tf/layers/tacotron/common_layers.py +++ /dev/null @@ -1,301 +0,0 @@ -import tensorflow as tf -from tensorflow import keras -from tensorflow.python.ops import math_ops - -# from tensorflow_addons.seq2seq import BahdanauAttention - -# NOTE: linter has a problem with the current TF release -# pylint: disable=no-value-for-parameter -# pylint: disable=unexpected-keyword-arg - - -class Linear(keras.layers.Layer): - def __init__(self, units, use_bias, **kwargs): - super().__init__(**kwargs) - self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name="linear_layer") - self.activation = keras.layers.ReLU() - - def call(self, x): - """ - shapes: - x: B x T x C - """ - return self.activation(self.linear_layer(x)) - - -class LinearBN(keras.layers.Layer): - def __init__(self, units, use_bias, **kwargs): - super().__init__(**kwargs) - self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name="linear_layer") - self.batch_normalization = keras.layers.BatchNormalization( - axis=-1, momentum=0.90, epsilon=1e-5, name="batch_normalization" - ) - self.activation = keras.layers.ReLU() - - def call(self, x, training=None): - """ - shapes: - x: B x T x C - """ - out = self.linear_layer(x) - out = self.batch_normalization(out, training=training) - return self.activation(out) - - -class Prenet(keras.layers.Layer): - def __init__(self, prenet_type, prenet_dropout, units, bias, **kwargs): - super().__init__(**kwargs) - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.linear_layers = [] - if prenet_type == "bn": - self.linear_layers += [ - LinearBN(unit, use_bias=bias, name=f"linear_layer_{idx}") for idx, unit in enumerate(units) - ] - elif prenet_type == "original": - self.linear_layers += [ - Linear(unit, use_bias=bias, name=f"linear_layer_{idx}") for idx, unit in enumerate(units) - ] - else: - raise RuntimeError(" [!] 
Unknown prenet type.") - if prenet_dropout: - self.dropout = keras.layers.Dropout(rate=0.5) - - def call(self, x, training=None): - """ - shapes: - x: B x T x C - """ - for linear in self.linear_layers: - if self.prenet_dropout: - x = self.dropout(linear(x), training=training) - else: - x = linear(x) - return x - - -def _sigmoid_norm(score): - attn_weights = tf.nn.sigmoid(score) - attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True) - return attn_weights - - -class Attention(keras.layers.Layer): - """TODO: implement forward_attention - TODO: location sensitive attention - TODO: implement attention windowing""" - - def __init__( - self, - attn_dim, - use_loc_attn, - loc_attn_n_filters, - loc_attn_kernel_size, - use_windowing, - norm, - use_forward_attn, - use_trans_agent, - use_forward_attn_mask, - **kwargs, - ): - super().__init__(**kwargs) - self.use_loc_attn = use_loc_attn - self.loc_attn_n_filters = loc_attn_n_filters - self.loc_attn_kernel_size = loc_attn_kernel_size - self.use_windowing = use_windowing - self.norm = norm - self.use_forward_attn = use_forward_attn - self.use_trans_agent = use_trans_agent - self.use_forward_attn_mask = use_forward_attn_mask - self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name="query_layer/linear_layer") - self.inputs_layer = tf.keras.layers.Dense( - attn_dim, use_bias=False, name=f"{self.name}/inputs_layer/linear_layer" - ) - self.v = tf.keras.layers.Dense(1, use_bias=True, name="v/linear_layer") - if use_loc_attn: - self.location_conv1d = keras.layers.Conv1D( - filters=loc_attn_n_filters, - kernel_size=loc_attn_kernel_size, - padding="same", - use_bias=False, - name="location_layer/location_conv1d", - ) - self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name="location_layer/location_dense") - if norm == "softmax": - self.norm_func = tf.nn.softmax - elif norm == "sigmoid": - self.norm_func = _sigmoid_norm - else: - raise ValueError("Unknown value for attention norm type") - - def init_states(self, batch_size, value_length): - states = [] - if self.use_loc_attn: - attention_cum = tf.zeros([batch_size, value_length]) - attention_old = tf.zeros([batch_size, value_length]) - states = [attention_cum, attention_old] - if self.use_forward_attn: - alpha = tf.concat([tf.ones([batch_size, 1]), tf.zeros([batch_size, value_length])[:, :-1] + 1e-7], 1) - states.append(alpha) - return tuple(states) - - def process_values(self, values): - """cache values for decoder iterations""" - # pylint: disable=attribute-defined-outside-init - self.processed_values = self.inputs_layer(values) - self.values = values - - def get_loc_attn(self, query, states): - """compute location attention, query layer and - unnorm. 
attention weights""" - attention_cum, attention_old = states[:2] - attn_cat = tf.stack([attention_old, attention_cum], axis=2) - - processed_query = self.query_layer(tf.expand_dims(query, 1)) - processed_attn = self.location_dense(self.location_conv1d(attn_cat)) - score = self.v(tf.nn.tanh(self.processed_values + processed_query + processed_attn)) - score = tf.squeeze(score, axis=2) - return score, processed_query - - def get_attn(self, query): - """compute query layer and unnormalized attention weights""" - processed_query = self.query_layer(tf.expand_dims(query, 1)) - score = self.v(tf.nn.tanh(self.processed_values + processed_query)) - score = tf.squeeze(score, axis=2) - return score, processed_query - - def apply_score_masking(self, score, mask): # pylint: disable=no-self-use - """ignore sequence paddings""" - padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) - # Bias so padding positions do not contribute to attention distribution. - score -= 1.0e9 * math_ops.cast(padding_mask, dtype=tf.float32) - return score - - def apply_forward_attention(self, alignment, alpha): # pylint: disable=no-self-use - # forward attention - fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0) - # compute transition potentials - new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment - # renormalize attention weights - new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True) - return new_alpha - - def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None): - states = [] - if self.use_loc_attn: - states = [old_states[0] + scores_norm, attn_weights] - if self.use_forward_attn: - states.append(new_alpha) - return tuple(states) - - def call(self, query, states): - """ - shapes: - query: B x D - """ - if self.use_loc_attn: - score, _ = self.get_loc_attn(query, states) - else: - score, _ = self.get_attn(query) - - # TODO: masking - # if mask is not None: - # self.apply_score_masking(score, mask) - # attn_weights shape == (batch_size, max_length, 1) - - # normalize attention scores - scores_norm = self.norm_func(score) - attn_weights = scores_norm - - # apply forward attention - new_alpha = None - if self.use_forward_attn: - new_alpha = self.apply_forward_attention(attn_weights, states[-1]) - attn_weights = new_alpha - - # update states tuple - # states = (cum_attn_weights, attn_weights, new_alpha) - states = self.update_states(states, scores_norm, attn_weights, new_alpha) - - # context_vector shape after sum == (batch_size, hidden_size) - context_vector = tf.matmul( - tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False - ) - context_vector = tf.squeeze(context_vector, axis=1) - return context_vector, attn_weights, states - - -# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b): -# dtype = processed_query.dtype -# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1] -# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2]) - - -# class LocationSensitiveAttention(BahdanauAttention): -# def __init__(self, -# units, -# memory=None, -# memory_sequence_length=None, -# normalize=False, -# probability_fn="softmax", -# kernel_initializer="glorot_uniform", -# dtype=None, -# name="LocationSensitiveAttention", -# location_attention_filters=32, -# location_attention_kernel_size=31): - -# super( self).__init__(units=units, -# memory=memory, -# memory_sequence_length=memory_sequence_length, -# 
normalize=normalize, -# probability_fn='softmax', ## parent module default -# kernel_initializer=kernel_initializer, -# dtype=dtype, -# name=name) -# if probability_fn == 'sigmoid': -# self.probability_fn = lambda score, _: self._sigmoid_normalization(score) -# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False) -# self.location_dense = keras.layers.Dense(units, use_bias=False) -# # self.v = keras.layers.Dense(1, use_bias=True) - -# def _location_sensitive_score(self, processed_query, keys, processed_loc): -# processed_query = tf.expand_dims(processed_query, 1) -# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2]) - -# def _location_sensitive(self, alignment_cum, alignment_old): -# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2) -# return self.location_dense(self.location_conv(alignment_cat)) - -# def _sigmoid_normalization(self, score): -# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True) - -# # def _apply_masking(self, score, mask): -# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) -# # # Bias so padding positions do not contribute to attention distribution. -# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) -# # return score - -# def _calculate_attention(self, query, state): -# alignment_cum, alignment_old = state[:2] -# processed_query = self.query_layer( -# query) if self.query_layer else query -# processed_loc = self._location_sensitive(alignment_cum, alignment_old) -# score = self._location_sensitive_score( -# processed_query, -# self.keys, -# processed_loc) -# alignment = self.probability_fn(score, state) -# alignment_cum = alignment_cum + alignment -# state[0] = alignment_cum -# state[1] = alignment -# return alignment, state - -# def compute_context(self, alignments): -# expanded_alignments = tf.expand_dims(alignments, 1) -# context = tf.matmul(expanded_alignments, self.values) -# context = tf.squeeze(context, [1]) -# return context - -# # def call(self, query, state): -# # alignment, next_state = self._calculate_attention(query, state) -# # return alignment, next_state diff --git a/TTS/tts/tf/layers/tacotron/tacotron2.py b/TTS/tts/tf/layers/tacotron/tacotron2.py deleted file mode 100644 index 1fe679d2..00000000 --- a/TTS/tts/tf/layers/tacotron/tacotron2.py +++ /dev/null @@ -1,322 +0,0 @@ -import tensorflow as tf -from tensorflow import keras - -from TTS.tts.tf.layers.tacotron.common_layers import Attention, Prenet -from TTS.tts.tf.utils.tf_utils import shape_list - - -# NOTE: linter has a problem with the current TF release -# pylint: disable=no-value-for-parameter -# pylint: disable=unexpected-keyword-arg -class ConvBNBlock(keras.layers.Layer): - def __init__(self, filters, kernel_size, activation, **kwargs): - super().__init__(**kwargs) - self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding="same", name="convolution1d") - self.batch_normalization = keras.layers.BatchNormalization( - axis=2, momentum=0.90, epsilon=1e-5, name="batch_normalization" - ) - self.dropout = keras.layers.Dropout(rate=0.5, name="dropout") - self.activation = keras.layers.Activation(activation, name="activation") - - def call(self, x, training=None): - o = self.convolution1d(x) - o = self.batch_normalization(o, training=training) - o = self.activation(o) - o = self.dropout(o, training=training) - return o - - -class Postnet(keras.layers.Layer): - def 
__init__(self, output_filters, num_convs, **kwargs): - super().__init__(**kwargs) - self.convolutions = [] - self.convolutions.append(ConvBNBlock(512, 5, "tanh", name="convolutions_0")) - for idx in range(1, num_convs - 1): - self.convolutions.append(ConvBNBlock(512, 5, "tanh", name=f"convolutions_{idx}")) - self.convolutions.append(ConvBNBlock(output_filters, 5, "linear", name=f"convolutions_{idx+1}")) - - def call(self, x, training=None): - o = x - for layer in self.convolutions: - o = layer(o, training=training) - return o - - -class Encoder(keras.layers.Layer): - def __init__(self, output_input_dim, **kwargs): - super().__init__(**kwargs) - self.convolutions = [] - for idx in range(3): - self.convolutions.append(ConvBNBlock(output_input_dim, 5, "relu", name=f"convolutions_{idx}")) - self.lstm = keras.layers.Bidirectional( - keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name="lstm" - ) - - def call(self, x, training=None): - o = x - for layer in self.convolutions: - o = layer(o, training=training) - o = self.lstm(o) - return o - - -class Decoder(keras.layers.Layer): - # pylint: disable=unused-argument - def __init__( - self, - frame_dim, - r, - attn_type, - use_attn_win, - attn_norm, - prenet_type, - prenet_dropout, - use_forward_attn, - use_trans_agent, - use_forward_attn_mask, - use_location_attn, - attn_K, - separate_stopnet, - speaker_emb_dim, - enable_tflite, - **kwargs, - ): - super().__init__(**kwargs) - self.frame_dim = frame_dim - self.r_init = tf.constant(r, dtype=tf.int32) - self.r = tf.constant(r, dtype=tf.int32) - self.output_dim = r * self.frame_dim - self.separate_stopnet = separate_stopnet - self.enable_tflite = enable_tflite - - # layer constants - self.max_decoder_steps = tf.constant(1000, dtype=tf.int32) - self.stop_thresh = tf.constant(0.5, dtype=tf.float32) - - # model dimensions - self.query_dim = 1024 - self.decoder_rnn_dim = 1024 - self.prenet_dim = 256 - self.attn_dim = 128 - self.p_attention_dropout = 0.1 - self.p_decoder_dropout = 0.1 - - self.prenet = Prenet(prenet_type, prenet_dropout, [self.prenet_dim, self.prenet_dim], bias=False, name="prenet") - self.attention_rnn = keras.layers.LSTMCell( - self.query_dim, - use_bias=True, - name="attention_rnn", - ) - self.attention_rnn_dropout = keras.layers.Dropout(0.5) - - # TODO: implement other attn options - self.attention = Attention( - attn_dim=self.attn_dim, - use_loc_attn=True, - loc_attn_n_filters=32, - loc_attn_kernel_size=31, - use_windowing=False, - norm=attn_norm, - use_forward_attn=use_forward_attn, - use_trans_agent=use_trans_agent, - use_forward_attn_mask=use_forward_attn_mask, - name="attention", - ) - self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name="decoder_rnn") - self.decoder_rnn_dropout = keras.layers.Dropout(0.5) - self.linear_projection = keras.layers.Dense(self.frame_dim * r, name="linear_projection/linear_layer") - self.stopnet = keras.layers.Dense(1, name="stopnet/linear_layer") - - def set_max_decoder_steps(self, new_max_steps): - self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32) - - def set_r(self, new_r): - self.r = tf.constant(new_r, dtype=tf.int32) - self.output_dim = self.frame_dim * new_r - - def build_decoder_initial_states(self, batch_size, memory_dim, memory_length): - zero_frame = tf.zeros([batch_size, self.frame_dim]) - zero_context = tf.zeros([batch_size, memory_dim]) - attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) - decoder_rnn_state = 
self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) - attention_states = self.attention.init_states(batch_size, memory_length) - return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states - - def step(self, prenet_next, states, memory_seq_length=None, training=None): - _, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states - attention_rnn_input = tf.concat([prenet_next, context_next], -1) - attention_rnn_output, attention_rnn_state = self.attention_rnn( - attention_rnn_input, attention_rnn_state, training=training - ) - attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training) - context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training) - decoder_rnn_input = tf.concat([attention_rnn_output, context], -1) - decoder_rnn_output, decoder_rnn_state = self.decoder_rnn( - decoder_rnn_input, decoder_rnn_state, training=training - ) - decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training) - linear_projection_input = tf.concat([decoder_rnn_output, context], -1) - output_frame = self.linear_projection(linear_projection_input, training=training) - stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1) - stopnet_output = self.stopnet(stopnet_input, training=training) - output_frame = output_frame[:, : self.r * self.frame_dim] - states = ( - output_frame[:, self.frame_dim * (self.r - 1) :], - context, - attention_rnn_state, - decoder_rnn_state, - attention_states, - ) - return output_frame, stopnet_output, states, attention - - def decode(self, memory, states, frames, memory_seq_length=None): - B, _, _ = shape_list(memory) - num_iter = shape_list(frames)[1] // self.r - # init states - frame_zero = tf.expand_dims(states[0], 1) - frames = tf.concat([frame_zero, frames], axis=1) - outputs = tf.TensorArray(dtype=tf.float32, size=num_iter) - attentions = tf.TensorArray(dtype=tf.float32, size=num_iter) - stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter) - # pre-computes - self.attention.process_values(memory) - prenet_output = self.prenet(frames, training=True) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions): - prenet_next = prenet_output[:, step] - output, stop_token, states, attention = self.step(prenet_next, states, memory_seq_length) - outputs = outputs.write(step, output) - attentions = attentions.write(step, attention) - stop_tokens = stop_tokens.write(step, stop_token) - return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions - - _, memory, _, states, outputs, stop_tokens, attentions = tf.while_loop( - lambda *arg: True, - _body, - loop_vars=(step_count, memory, prenet_output, states, outputs, stop_tokens, attentions), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=num_iter, - ) - - outputs = outputs.stack() - attentions = attentions.stack() - stop_tokens = stop_tokens.stack() - outputs = tf.transpose(outputs, [1, 0, 2]) - attentions = tf.transpose(attentions, [1, 0, 2]) - stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) - stop_tokens = tf.squeeze(stop_tokens, axis=2) - outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - def decode_inference(self, memory, states): - B, _, _ = shape_list(memory) - # init states - outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - 
attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - - # pre-computes - self.attention.process_values(memory) - - # iter vars - stop_flag = tf.constant(False, dtype=tf.bool) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag): - frame_next = states[0] - prenet_next = self.prenet(frame_next, training=False) - output, stop_token, states, attention = self.step(prenet_next, states, None, training=False) - stop_token = tf.math.sigmoid(stop_token) - outputs = outputs.write(step, output) - attentions = attentions.write(step, attention) - stop_tokens = stop_tokens.write(step, stop_token) - stop_flag = tf.greater(stop_token, self.stop_thresh) - stop_flag = tf.reduce_all(stop_flag) - return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag - - cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) - _, memory, states, outputs, stop_tokens, attentions, stop_flag = tf.while_loop( - cond, - _body, - loop_vars=(step_count, memory, states, outputs, stop_tokens, attentions, stop_flag), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=self.max_decoder_steps, - ) - - outputs = outputs.stack() - attentions = attentions.stack() - stop_tokens = stop_tokens.stack() - - outputs = tf.transpose(outputs, [1, 0, 2]) - attentions = tf.transpose(attentions, [1, 0, 2]) - stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) - stop_tokens = tf.squeeze(stop_tokens, axis=2) - outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - def decode_inference_tflite(self, memory, states): - """Inference with TF-Lite compatibility. 
It assumes - batch_size is 1""" - # init states - # dynamic_shape is not supported in TFLite - outputs = tf.TensorArray( - dtype=tf.float32, - size=self.max_decoder_steps, - element_shape=tf.TensorShape([self.output_dim]), - clear_after_read=False, - dynamic_size=False, - ) - # stop_flags = tf.TensorArray(dtype=tf.bool, - # size=self.max_decoder_steps, - # element_shape=tf.TensorShape( - # []), - # clear_after_read=False, - # dynamic_size=False) - attentions = () - stop_tokens = () - - # pre-computes - self.attention.process_values(memory) - - # iter vars - stop_flag = tf.constant(False, dtype=tf.bool) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, states, outputs, stop_flag): - frame_next = states[0] - prenet_next = self.prenet(frame_next, training=False) - output, stop_token, states, _ = self.step(prenet_next, states, None, training=False) - stop_token = tf.math.sigmoid(stop_token) - stop_flag = tf.greater(stop_token, self.stop_thresh) - stop_flag = tf.reduce_all(stop_flag) - # stop_flags = stop_flags.write(step, tf.logical_not(stop_flag)) - - outputs = outputs.write(step, tf.reshape(output, [-1])) - return step + 1, memory, states, outputs, stop_flag - - cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) - step_count, memory, states, outputs, stop_flag = tf.while_loop( - cond, - _body, - loop_vars=(step_count, memory, states, outputs, stop_flag), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=self.max_decoder_steps, - ) - - outputs = outputs.stack() - outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter - outputs = tf.expand_dims(outputs, axis=[0]) - outputs = tf.transpose(outputs, [1, 0, 2]) - outputs = tf.reshape(outputs, [1, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - def call(self, memory, states, frames=None, memory_seq_length=None, training=False): - if training: - return self.decode(memory, states, frames, memory_seq_length) - if self.enable_tflite: - return self.decode_inference_tflite(memory, states) - return self.decode_inference(memory, states) diff --git a/TTS/tts/tf/models/tacotron2.py b/TTS/tts/tf/models/tacotron2.py deleted file mode 100644 index 7a1d695d..00000000 --- a/TTS/tts/tf/models/tacotron2.py +++ /dev/null @@ -1,116 +0,0 @@ -import tensorflow as tf -from tensorflow import keras - -from TTS.tts.tf.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet -from TTS.tts.tf.utils.tf_utils import shape_list - - -# pylint: disable=too-many-ancestors, abstract-method -class Tacotron2(keras.models.Model): - def __init__( - self, - num_chars, - num_speakers, - r, - out_channels=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - attn_K=4, - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - separate_stopnet=True, - bidirectional_decoder=False, - enable_tflite=False, - ): - super().__init__() - self.r = r - self.decoder_output_dim = decoder_output_dim - self.out_channels = out_channels - self.bidirectional_decoder = bidirectional_decoder - self.num_speakers = num_speakers - self.speaker_embed_dim = 256 - self.enable_tflite = enable_tflite - - self.embedding = keras.layers.Embedding(num_chars, 512, name="embedding") - self.encoder = Encoder(512, name="encoder") - # TODO: most of the decoder args have no use at the momment - self.decoder = Decoder( - decoder_output_dim, - r, - 
attn_type=attn_type, - use_attn_win=attn_win, - attn_norm=attn_norm, - prenet_type=prenet_type, - prenet_dropout=prenet_dropout, - use_forward_attn=forward_attn, - use_trans_agent=trans_agent, - use_forward_attn_mask=forward_attn_mask, - use_location_attn=location_attn, - attn_K=attn_K, - separate_stopnet=separate_stopnet, - speaker_emb_dim=self.speaker_embed_dim, - name="decoder", - enable_tflite=enable_tflite, - ) - self.postnet = Postnet(out_channels, 5, name="postnet") - - @tf.function(experimental_relax_shapes=True) - def call(self, characters, text_lengths=None, frames=None, training=None): - if training: - return self.training(characters, text_lengths, frames) - if not training: - return self.inference(characters) - raise RuntimeError(" [!] Set model training mode True or False") - - def training(self, characters, text_lengths, frames): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=True) - encoder_output = self.encoder(embedding_vectors, training=True) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder( - encoder_output, decoder_states, frames, text_lengths, training=True - ) - postnet_frames = self.postnet(decoder_frames, training=True) - output_frames = decoder_frames + postnet_frames - return decoder_frames, output_frames, attentions, stop_tokens - - def inference(self, characters): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=False) - encoder_output = self.encoder(embedding_vectors, training=False) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False) - postnet_frames = self.postnet(decoder_frames, training=False) - output_frames = decoder_frames + postnet_frames - print(output_frames.shape) - return decoder_frames, output_frames, attentions, stop_tokens - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, None], dtype=tf.int32), - ], - ) - def inference_tflite(self, characters): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=False) - encoder_output = self.encoder(embedding_vectors, training=False) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False) - postnet_frames = self.postnet(decoder_frames, training=False) - output_frames = decoder_frames + postnet_frames - print(output_frames.shape) - return decoder_frames, output_frames, attentions, stop_tokens - - def build_inference( - self, - ): - # TODO: issue https://github.com/PyCQA/pylint/issues/3613 - input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) # pylint: disable=unexpected-keyword-arg - self(input_ids) diff --git a/TTS/tts/tf/utils/convert_torch_to_tf_utils.py b/TTS/tts/tf/utils/convert_torch_to_tf_utils.py deleted file mode 100644 index 2c615a7d..00000000 --- a/TTS/tts/tf/utils/convert_torch_to_tf_utils.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -import tensorflow as tf - -# NOTE: linter has a problem with the current TF release -# pylint: disable=no-value-for-parameter -# pylint: disable=unexpected-keyword-arg - - -def tf_create_dummy_inputs(): - """Create dummy inputs for TF Tacotron2 model""" - batch_size = 4 - max_input_length = 32 - max_mel_length = 128 - pad = 1 - n_chars = 24 - input_ids = 
tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32) - input_lengths = np.random.randint(0, high=max_input_length + 1 + pad, size=[batch_size]) - input_lengths[-1] = max_input_length - input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32) - mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80]) - mel_lengths = np.random.randint(0, high=max_mel_length + 1 + pad, size=[batch_size]) - mel_lengths[-1] = max_mel_length - mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32) - return input_ids, input_lengths, mel_outputs, mel_lengths - - -def compare_torch_tf(torch_tensor, tf_tensor): - """Compute the average absolute difference b/w torch and tf tensors""" - return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean() - - -def convert_tf_name(tf_name): - """Convert certain patterns in TF layer names to Torch patterns""" - tf_name_tmp = tf_name - tf_name_tmp = tf_name_tmp.replace(":0", "") - tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_1/recurrent_kernel", "/weight_hh_l0") - tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_2/kernel", "/weight_ih_l1") - tf_name_tmp = tf_name_tmp.replace("/recurrent_kernel", "/weight_hh") - tf_name_tmp = tf_name_tmp.replace("/kernel", "/weight") - tf_name_tmp = tf_name_tmp.replace("/gamma", "/weight") - tf_name_tmp = tf_name_tmp.replace("/beta", "/bias") - tf_name_tmp = tf_name_tmp.replace("/", ".") - return tf_name_tmp - - -def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): - """Transfer weigths from torch state_dict to TF variables""" - print(" > Passing weights from Torch to TF ...") - for tf_var in tf_vars: - torch_var_name = var_map_dict[tf_var.name] - print(f" | > {tf_var.name} <-- {torch_var_name}") - # if tuple, it is a bias variable - if not isinstance(torch_var_name, tuple): - torch_layer_name = ".".join(torch_var_name.split(".")[-2:]) - torch_weight = state_dict[torch_var_name] - if "convolution1d/kernel" in tf_var.name or "conv1d/kernel" in tf_var.name: - # out_dim, in_dim, filter -> filter, in_dim, out_dim - numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy() - elif "lstm_cell" in tf_var.name and "kernel" in tf_var.name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - # if variable is for bidirectional lstm and it is a bias vector there - # needs to be pre-defined two matching torch bias vectors - elif "_lstm/lstm_cell_" in tf_var.name and "bias" in tf_var.name: - bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name] - assert len(bias_vectors) == 2 - numpy_weight = bias_vectors[0] + bias_vectors[1] - elif "rnn" in tf_var.name and "kernel" in tf_var.name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - elif "rnn" in tf_var.name and "bias" in tf_var.name: - bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key] - assert len(bias_vectors) == 2 - numpy_weight = bias_vectors[0] + bias_vectors[1] - elif "linear_layer" in torch_layer_name and "weight" in torch_var_name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - else: - numpy_weight = torch_weight.detach().cpu().numpy() - assert np.all( - tf_var.shape == numpy_weight.shape - ), f" [!] 
weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" - tf.keras.backend.set_value(tf_var, numpy_weight) - return tf_vars - - -def load_tf_vars(model_tf, tf_vars): - for tf_var in tf_vars: - model_tf.get_layer(tf_var.name).set_weights(tf_var) - return model_tf diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py deleted file mode 100644 index 681a9457..00000000 --- a/TTS/tts/tf/utils/generic_utils.py +++ /dev/null @@ -1,105 +0,0 @@ -import datetime -import importlib -import pickle - -import fsspec -import numpy as np -import tensorflow as tf - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): - state = { - "model": model.weights, - "optimizer": optimizer, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - "r": r, - } - state.update(kwargs) - with fsspec.open(output_path, "wb") as f: - pickle.dump(state, f) - - -def load_checkpoint(model, checkpoint_path): - with fsspec.open(checkpoint_path, "rb") as f: - checkpoint = pickle.load(f) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - try: - chkp_var_value = chkp_var_dict[layer_name] - except KeyError: - class_name = list(chkp_var_dict.keys())[0].split("/")[0] - layer_name = f"{class_name}/{layer_name}" - chkp_var_value = chkp_var_dict[layer_name] - - tf.keras.backend.set_value(tf_var, chkp_var_value) - if "r" in checkpoint.keys(): - model.decoder.set_r(checkpoint["r"]) - return model - - -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.max() - batch_size = sequence_length.size(0) - seq_range = np.empty([0, max_len], dtype=np.int8) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - seq_range_expand = seq_range_expand.type_as(sequence_length) - seq_length_expand = sequence_length.unsqueeze(1).expand_as(seq_range_expand) - # B x T_max - return seq_range_expand < seq_length_expand - - -# @tf.custom_gradient -def check_gradient(x, grad_clip): - x_normed = tf.clip_by_norm(x, grad_clip) - grad_norm = tf.norm(grad_clip) - return x_normed, grad_norm - - -def count_parameters(model, c): - try: - return model.count_params() - except RuntimeError: - input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype("int32")) - input_lengths = np.random.randint(100, 129, (8,)) - input_lengths[-1] = 128 - input_lengths = tf.convert_to_tensor(input_lengths.astype("int32")) - mel_spec = np.random.rand(8, 2 * c.r, c.audio["num_mels"]).astype("float32") - mel_spec = tf.convert_to_tensor(mel_spec) - speaker_ids = np.random.randint(0, 5, (8,)) if c.use_speaker_embedding else None - _ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids) - return model.count_params() - - -def setup_model(num_chars, num_speakers, c, enable_tflite=False): - print(" > Using model: {}".format(c.model)) - MyModel = importlib.import_module("TTS.tts.tf.models." + c.model.lower()) - MyModel = getattr(MyModel, c.model) - if c.model.lower() in "tacotron": - raise NotImplementedError(" [!] 
Tacotron model is not ready.") - # tacotron2 - model = MyModel( - num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - out_channels=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - enable_tflite=enable_tflite, - ) - return model diff --git a/TTS/tts/tf/utils/io.py b/TTS/tts/tf/utils/io.py deleted file mode 100644 index de6acff9..00000000 --- a/TTS/tts/tf/utils/io.py +++ /dev/null @@ -1,45 +0,0 @@ -import datetime -import pickle - -import fsspec -import tensorflow as tf - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): - state = { - "model": model.weights, - "optimizer": optimizer, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - "r": r, - } - state.update(kwargs) - with fsspec.open(output_path, "wb") as f: - pickle.dump(state, f) - - -def load_checkpoint(model, checkpoint_path): - with fsspec.open(checkpoint_path, "rb") as f: - checkpoint = pickle.load(f) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - try: - chkp_var_value = chkp_var_dict[layer_name] - except KeyError: - class_name = list(chkp_var_dict.keys())[0].split("/")[0] - layer_name = f"{class_name}/{layer_name}" - chkp_var_value = chkp_var_dict[layer_name] - - tf.keras.backend.set_value(tf_var, chkp_var_value) - if "r" in checkpoint.keys(): - model.decoder.set_r(checkpoint["r"]) - return model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model diff --git a/TTS/tts/tf/utils/tf_utils.py b/TTS/tts/tf/utils/tf_utils.py deleted file mode 100644 index 558936d5..00000000 --- a/TTS/tts/tf/utils/tf_utils.py +++ /dev/null @@ -1,8 +0,0 @@ -import tensorflow as tf - - -def shape_list(x): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] diff --git a/TTS/tts/tf/utils/tflite.py b/TTS/tts/tf/utils/tflite.py deleted file mode 100644 index 2f76aa50..00000000 --- a/TTS/tts/tf/utils/tflite.py +++ /dev/null @@ -1,27 +0,0 @@ -import fsspec -import tensorflow as tf - - -def convert_tacotron2_to_tflite(model, output_path=None, experimental_converter=True): - """Convert Tensorflow Tacotron2 model to TFLite. 
Save a binary file if output_path is - provided, else return TFLite model.""" - - concrete_function = model.inference_tflite.get_concrete_function() - converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_function]) - converter.experimental_new_converter = experimental_converter - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] - tflite_model = converter.convert() - print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.") - if output_path is not None: - # same model binary if outputpath is provided - with fsspec.open(output_path, "wb") as f: - f.write(tflite_model) - return None - return tflite_model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 24b747be..b2ea4208 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,19 +1,11 @@ -import os from typing import Dict import numpy as np -import pkg_resources import torch from torch import nn from .text import phoneme_to_sequence, text_to_sequence -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" - -installed = {pkg.key for pkg in pkg_resources.working_set} # pylint: disable=not-an-iterable -if "tensorflow" in installed or "tensorflow-gpu" in installed: - import tensorflow as tf - def text_to_seq(text, CONFIG, custom_symbols=None, language=None): text_cleaner = [CONFIG.text_cleaner] @@ -51,13 +43,6 @@ def numpy_to_torch(np_array, dtype, cuda=False): return tensor -def numpy_to_tf(np_array, dtype): - if np_array is None: - return None - tensor = tf.convert_to_tensor(np_array, dtype=dtype) - return tensor - - def compute_style_mel(style_wav, ap, cuda=False): style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) if cuda: @@ -103,53 +88,6 @@ def run_model_torch( return outputs -def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): - if CONFIG.gst and style_mel is not None: - raise NotImplementedError(" [!] GST inference not implemented for TF") - if speaker_id is not None: - raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") - # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model(inputs, training=False) - return decoder_output, postnet_output, alignments, stop_tokens - - -def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): - if CONFIG.gst and style_mel is not None: - raise NotImplementedError(" [!] GST inference not implemented for TfLite") - if speaker_id is not None: - raise NotImplementedError(" [!] 
Multi-Speaker not implemented for TfLite") - # get input and output details - input_details = model.get_input_details() - output_details = model.get_output_details() - # reshape input tensor for the new input shape - model.resize_tensor_input(input_details[0]["index"], inputs.shape) - model.allocate_tensors() - detail = input_details[0] - # input_shape = detail['shape'] - model.set_tensor(detail["index"], inputs) - # run the model - model.invoke() - # collect outputs - decoder_output = model.get_tensor(output_details[0]["index"]) - postnet_output = model.get_tensor(output_details[1]["index"]) - # tflite model only returns feature frames - return decoder_output, postnet_output, None, None - - -def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): - postnet_output = postnet_output[0].numpy() - decoder_output = decoder_output[0].numpy() - alignment = alignments[0].numpy() - stop_tokens = stop_tokens[0].numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - -def parse_outputs_tflite(postnet_output, decoder_output): - postnet_output = postnet_output[0] - decoder_output = decoder_output[0] - return postnet_output, decoder_output - - def trim_silence(wav, ap): return wav[: ap.find_endpoint(wav)] @@ -213,7 +151,6 @@ def synthesis( d_vector=None, language_id=None, language_name=None, - backend="torch", ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to the vocoder model. @@ -254,9 +191,6 @@ def synthesis( language_name (str): Language name corresponding to the language code used by the phonemizer. Defaults to None. - - backend (str): - tf or torch. Defaults to "torch". """ # GST processing style_mel = None @@ -270,44 +204,27 @@ def synthesis( custom_symbols = model.make_symbols(CONFIG) # preprocess the given text text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols, language=language_name) - # pass tensors to backend - if backend == "torch": - if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=use_cuda) - if d_vector is not None: - d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) - if language_id is not None: - language_id = id_to_torch(language_id, cuda=use_cuda) + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + + if language_id is not None: + language_id = id_to_torch(language_id, cuda=use_cuda) + + if not isinstance(style_mel, dict): + style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) + text_inputs = text_inputs.unsqueeze(0) - if not isinstance(style_mel, dict): - style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) - text_inputs = text_inputs.unsqueeze(0) - elif backend in ["tf", "tflite"]: - # TODO: handle speaker id for tf model - style_mel = numpy_to_tf(style_mel, tf.float32) - text_inputs = numpy_to_tf(text_inputs, tf.int32) - text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice - if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) - model_outputs = outputs["model_outputs"] - model_outputs = model_outputs[0].data.cpu().numpy() - alignments = outputs["alignments"] - elif backend == "tf": - decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( 
- model, text_inputs, CONFIG, speaker_id, style_mel - ) - model_outputs, decoder_output, alignments, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens - ) - elif backend == "tflite": - decoder_output, postnet_output, alignments, stop_tokens = run_model_tflite( - model, text_inputs, CONFIG, speaker_id, style_mel - ) - model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) + model_outputs = outputs["model_outputs"] + model_outputs = model_outputs[0].data.cpu().numpy() + alignments = outputs["alignments"] + # convert outputs to numpy # plot results wav = None diff --git a/TTS/vocoder/tf/layers/melgan.py b/TTS/vocoder/tf/layers/melgan.py deleted file mode 100644 index 90bce6f1..00000000 --- a/TTS/vocoder/tf/layers/melgan.py +++ /dev/null @@ -1,54 +0,0 @@ -import tensorflow as tf - - -class ReflectionPad1d(tf.keras.layers.Layer): - def __init__(self, padding): - super().__init__() - self.padding = padding - - def call(self, x): - return tf.pad(x, [[0, 0], [self.padding, self.padding], [0, 0], [0, 0]], "REFLECT") - - -class ResidualStack(tf.keras.layers.Layer): - def __init__(self, channels, num_res_blocks, kernel_size, name): - super().__init__(name=name) - - assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." - base_padding = (kernel_size - 1) // 2 - - self.blocks = [] - num_layers = 2 - for idx in range(num_res_blocks): - layer_kernel_size = kernel_size - layer_dilation = layer_kernel_size ** idx - layer_padding = base_padding * layer_dilation - block = [ - tf.keras.layers.LeakyReLU(0.2), - ReflectionPad1d(layer_padding), - tf.keras.layers.Conv2D( - filters=channels, - kernel_size=(kernel_size, 1), - dilation_rate=(layer_dilation, 1), - use_bias=True, - padding="valid", - name=f"blocks.{idx}.{num_layers}", - ), - tf.keras.layers.LeakyReLU(0.2), - tf.keras.layers.Conv2D( - filters=channels, kernel_size=(1, 1), use_bias=True, name=f"blocks.{idx}.{num_layers + 2}" - ), - ] - self.blocks.append(block) - self.shortcuts = [ - tf.keras.layers.Conv2D(channels, kernel_size=1, use_bias=True, name=f"shortcuts.{i}") - for i in range(num_res_blocks) - ] - - def call(self, x): - for block, shortcut in zip(self.blocks, self.shortcuts): - res = shortcut(x) - for layer in block: - x = layer(x) - x += res - return x diff --git a/TTS/vocoder/tf/layers/pqmf.py b/TTS/vocoder/tf/layers/pqmf.py deleted file mode 100644 index 042f2f08..00000000 --- a/TTS/vocoder/tf/layers/pqmf.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np -import tensorflow as tf -from scipy import signal as sig - - -class PQMF(tf.keras.layers.Layer): - def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): - super().__init__() - # define filter coefficient - self.N = N - self.taps = taps - self.cutoff = cutoff - self.beta = beta - - QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta)) - H = np.zeros((N, len(QMF))) - G = np.zeros((N, len(QMF))) - for k in range(N): - constant_factor = (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2)) - phase = (-1) ** k * np.pi / 4 - H[k] = 2 * QMF * np.cos(constant_factor + phase) - - G[k] = 2 * QMF * np.cos(constant_factor - phase) - - # [N, 1, taps + 1] == [filter_width, in_channels, out_channels] - self.H = np.transpose(H[:, None, :], (2, 1, 0)).astype("float32") - self.G = np.transpose(G[None, :, :], (2, 1, 0)).astype("float32") - - # filter for downsampling & upsampling 
- updown_filter = np.zeros((N, N, N), dtype=np.float32) - for k in range(N): - updown_filter[0, k, k] = 1.0 - self.updown_filter = updown_filter.astype(np.float32) - - def analysis(self, x): - """ - x : :math:`[B, 1, T]` - """ - x = tf.transpose(x, perm=[0, 2, 1]) - x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) - x = tf.nn.conv1d(x, self.H, stride=1, padding="VALID") - x = tf.nn.conv1d(x, self.updown_filter, stride=self.N, padding="VALID") - x = tf.transpose(x, perm=[0, 2, 1]) - return x - - def synthesis(self, x): - """ - x : B x D x T - """ - x = tf.transpose(x, perm=[0, 2, 1]) - x = tf.nn.conv1d_transpose( - x, - self.updown_filter * self.N, - strides=self.N, - output_shape=(tf.shape(x)[0], tf.shape(x)[1] * self.N, self.N), - ) - x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) - x = tf.nn.conv1d(x, self.G, stride=1, padding="VALID") - x = tf.transpose(x, perm=[0, 2, 1]) - return x diff --git a/TTS/vocoder/tf/models/melgan_generator.py b/TTS/vocoder/tf/models/melgan_generator.py deleted file mode 100644 index 09ee9530..00000000 --- a/TTS/vocoder/tf/models/melgan_generator.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging -import os - -import tensorflow as tf - -from TTS.vocoder.tf.layers.melgan import ReflectionPad1d, ResidualStack - -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # FATAL -logging.getLogger("tensorflow").setLevel(logging.FATAL) - -from TTS.vocoder.tf.layers.melgan import ReflectionPad1d, ResidualStack - - -# pylint: disable=too-many-ancestors -# pylint: disable=abstract-method -class MelganGenerator(tf.keras.models.Model): - """Melgan Generator TF implementation dedicated for inference with no - weight norm""" - - def __init__( - self, - in_channels=80, - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=(8, 8, 2, 2), - res_kernel=3, - num_res_blocks=3, - ): - super().__init__() - - self.in_channels = in_channels - - # assert model parameters - assert (proj_kernel - 1) % 2 == 0, " [!] proj_kernel should be an odd number." 
- - # setup additional model parameters - base_padding = (proj_kernel - 1) // 2 - act_slope = 0.2 - self.inference_padding = 2 - - # initial layer - self.initial_layer = [ - ReflectionPad1d(base_padding), - tf.keras.layers.Conv2D( - filters=base_channels, kernel_size=(proj_kernel, 1), strides=1, padding="valid", use_bias=True, name="1" - ), - ] - num_layers = 3 # count number of layers for layer naming - - # upsampling layers and residual stacks - self.upsample_layers = [] - for idx, upsample_factor in enumerate(upsample_factors): - layer_out_channels = base_channels // (2 ** (idx + 1)) - layer_filter_size = upsample_factor * 2 - layer_stride = upsample_factor - # layer_output_padding = upsample_factor % 2 - self.upsample_layers += [ - tf.keras.layers.LeakyReLU(act_slope), - tf.keras.layers.Conv2DTranspose( - filters=layer_out_channels, - kernel_size=(layer_filter_size, 1), - strides=(layer_stride, 1), - padding="same", - # output_padding=layer_output_padding, - use_bias=True, - name=f"{num_layers}", - ), - ResidualStack( - channels=layer_out_channels, - num_res_blocks=num_res_blocks, - kernel_size=res_kernel, - name=f"layers.{num_layers + 1}", - ), - ] - num_layers += num_res_blocks - 1 - - self.upsample_layers += [tf.keras.layers.LeakyReLU(act_slope)] - - # final layer - self.final_layers = [ - ReflectionPad1d(base_padding), - tf.keras.layers.Conv2D( - filters=out_channels, kernel_size=(proj_kernel, 1), use_bias=True, name=f"layers.{num_layers + 1}" - ), - tf.keras.layers.Activation("tanh"), - ] - - # self.model_layers = tf.keras.models.Sequential(self.initial_layer + self.upsample_layers + self.final_layers, name="layers") - self.model_layers = self.initial_layer + self.upsample_layers + self.final_layers - - @tf.function(experimental_relax_shapes=True) - def call(self, c, training=False): - """ - c : :math:`[B, C, T]` - """ - if training: - raise NotImplementedError() - return self.inference(c) - - def inference(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - # o = self.model_layers(c) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - return o[:, :, 0, :] - - def build_inference(self): - x = tf.random.uniform((1, self.in_channels, 4), dtype=tf.float32) - self(x, training=False) - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, None, None], dtype=tf.float32), - ], - ) - def inference_tflite(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - # o = self.model_layers(c) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - return o[:, :, 0, :] diff --git a/TTS/vocoder/tf/models/multiband_melgan_generator.py b/TTS/vocoder/tf/models/multiband_melgan_generator.py deleted file mode 100644 index 24d899b2..00000000 --- a/TTS/vocoder/tf/models/multiband_melgan_generator.py +++ /dev/null @@ -1,65 +0,0 @@ -import tensorflow as tf - -from TTS.vocoder.tf.layers.pqmf import PQMF -from TTS.vocoder.tf.models.melgan_generator import MelganGenerator - - -# pylint: disable=too-many-ancestors -# pylint: disable=abstract-method -class MultibandMelganGenerator(MelganGenerator): - def __init__( - self, - 
in_channels=80, - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=(2, 8, 2, 2), - res_kernel=3, - num_res_blocks=3, - ): - super().__init__( - in_channels=in_channels, - out_channels=out_channels, - proj_kernel=proj_kernel, - base_channels=base_channels, - upsample_factors=upsample_factors, - res_kernel=res_kernel, - num_res_blocks=num_res_blocks, - ) - self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) - - def pqmf_analysis(self, x): - return self.pqmf_layer.analysis(x) - - def pqmf_synthesis(self, x): - return self.pqmf_layer.synthesis(x) - - def inference(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - o = self.pqmf_layer.synthesis(o[:, :, 0, :]) - return o - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, 80, None], dtype=tf.float32), - ], - ) - def inference_tflite(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - o = self.pqmf_layer.synthesis(o[:, :, 0, :]) - return o diff --git a/TTS/vocoder/tf/utils/__init__.py b/TTS/vocoder/tf/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/TTS/vocoder/tf/utils/convert_torch_to_tf_utils.py b/TTS/vocoder/tf/utils/convert_torch_to_tf_utils.py deleted file mode 100644 index 453d8b78..00000000 --- a/TTS/vocoder/tf/utils/convert_torch_to_tf_utils.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import tensorflow as tf - - -def compare_torch_tf(torch_tensor, tf_tensor): - """Compute the average absolute difference b/w torch and tf tensors""" - return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean() - - -def convert_tf_name(tf_name): - """Convert certain patterns in TF layer names to Torch patterns""" - tf_name_tmp = tf_name - tf_name_tmp = tf_name_tmp.replace(":0", "") - tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_1/recurrent_kernel", "/weight_hh_l0") - tf_name_tmp = tf_name_tmp.replace("/forward_lstm/lstm_cell_2/kernel", "/weight_ih_l1") - tf_name_tmp = tf_name_tmp.replace("/recurrent_kernel", "/weight_hh") - tf_name_tmp = tf_name_tmp.replace("/kernel", "/weight") - tf_name_tmp = tf_name_tmp.replace("/gamma", "/weight") - tf_name_tmp = tf_name_tmp.replace("/beta", "/bias") - tf_name_tmp = tf_name_tmp.replace("/", ".") - return tf_name_tmp - - -def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): - """Transfer weigths from torch state_dict to TF variables""" - print(" > Passing weights from Torch to TF ...") - for tf_var in tf_vars: - torch_var_name = var_map_dict[tf_var.name] - print(f" | > {tf_var.name} <-- {torch_var_name}") - # if tuple, it is a bias variable - if "kernel" in tf_var.name: - torch_weight = state_dict[torch_var_name] - numpy_weight = torch_weight.permute([2, 1, 0]).numpy()[:, None, :, :] - if "bias" in tf_var.name: - torch_weight = state_dict[torch_var_name] - numpy_weight = torch_weight - assert np.all( - tf_var.shape == numpy_weight.shape - ), f" [!] 
weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" - tf.keras.backend.set_value(tf_var, numpy_weight) - return tf_vars - - -def load_tf_vars(model_tf, tf_vars): - for tf_var in tf_vars: - model_tf.get_layer(tf_var.name).set_weights(tf_var) - return model_tf diff --git a/TTS/vocoder/tf/utils/generic_utils.py b/TTS/vocoder/tf/utils/generic_utils.py deleted file mode 100644 index 94364ab4..00000000 --- a/TTS/vocoder/tf/utils/generic_utils.py +++ /dev/null @@ -1,36 +0,0 @@ -import importlib -import re - - -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.tf.models." + c.generator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in "melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "melgan_fb_generator": - pass - if c.generator_model in "multiband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - return model diff --git a/TTS/vocoder/tf/utils/io.py b/TTS/vocoder/tf/utils/io.py deleted file mode 100644 index 3de8adab..00000000 --- a/TTS/vocoder/tf/utils/io.py +++ /dev/null @@ -1,31 +0,0 @@ -import datetime -import pickle - -import fsspec -import tensorflow as tf - - -def save_checkpoint(model, current_step, epoch, output_path, **kwargs): - """Save TF Vocoder model""" - state = { - "model": model.weights, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - with fsspec.open(output_path, "wb") as f: - pickle.dump(state, f) - - -def load_checkpoint(model, checkpoint_path): - """Load TF Vocoder model""" - with fsspec.open(checkpoint_path, "rb") as f: - checkpoint = pickle.load(f) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - chkp_var_value = chkp_var_dict[layer_name] - tf.keras.backend.set_value(tf_var, chkp_var_value) - return model diff --git a/TTS/vocoder/tf/utils/tflite.py b/TTS/vocoder/tf/utils/tflite.py deleted file mode 100644 index 876739fd..00000000 --- a/TTS/vocoder/tf/utils/tflite.py +++ /dev/null @@ -1,27 +0,0 @@ -import fsspec -import tensorflow as tf - - -def convert_melgan_to_tflite(model, output_path=None, experimental_converter=True): - """Convert Tensorflow MelGAN model to TFLite. 
Save a binary file if output_path is - provided, else return TFLite model.""" - - concrete_function = model.inference_tflite.get_concrete_function() - converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_function]) - converter.experimental_new_converter = experimental_converter - converter.optimizations = [] - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] - tflite_model = converter.convert() - print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.") - if output_path is not None: - # same model binary if outputpath is provided - with fsspec.open(output_path, "wb") as f: - f.write(tflite_model) - return None - return tflite_model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model diff --git a/docs/source/converting_torch_to_tf.md b/docs/source/converting_torch_to_tf.md deleted file mode 100644 index 20a0be6b..00000000 --- a/docs/source/converting_torch_to_tf.md +++ /dev/null @@ -1,21 +0,0 @@ -# Converting Torch to TF 2 - -Currently, 🐸TTS supports the vanilla Tacotron2 and MelGAN models in TF 2.It does not support advanced attention methods and other small tricks used by the Torch models. You can convert any Torch model trained after v0.0.2. - -You can also export TF 2 models to TFLite for even faster inference. - -## How to convert from Torch to TF 2.0 -Make sure you installed Tensorflow v2.2. It is not installed by default by :frog: TTS. - -All the TF related code stays under ```tf``` folder. - -To convert a **compatible** Torch model, run the following command with the right arguments: - -```bash -python TTS/bin/convert_tacotron2_torch_to_tf.py\ - --torch_model_path /path/to/torch/model.pth.tar \ - --config_path /path/to/model/config.json\ - --output_path /path/to/output/tf/model -``` - -This will create a TF model file. Notice that our model format is not compatible with the official TF checkpoints. We created our custom format to match Torch checkpoints we use. Therefore, use the ```load_checkpoint``` and ```save_checkpoint``` functions provided under ```TTS.tf.generic_utils```. diff --git a/docs/source/index.md b/docs/source/index.md index 756cea8e..9dc5bfce 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -27,7 +27,6 @@ formatting_your_dataset what_makes_a_good_dataset tts_datasets - converting_torch_to_tf .. toctree:: :maxdepth: 2 diff --git a/docs/source/installation.md b/docs/source/installation.md index 6532ee8e..0122271d 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -12,12 +12,6 @@ You can install from PyPI as follows: pip install TTS # from PyPI ``` -By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. 
- -```bash -pip install TTS[tf] -``` - Or install from Github: ```bash diff --git a/notebooks/Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb b/notebooks/Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb deleted file mode 100644 index 8a25132c..00000000 --- a/notebooks/Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb +++ /dev/null @@ -1,425 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Converting Pytorch models to Tensorflow and TFLite by CoquiTTS" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "This is a tutorial demonstrating Coqui TTS capabilities to convert \n", - "trained PyTorch models to Tensorflow and Tflite.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MBJjGYnoEo4v" - }, - "source": [ - "# Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download TF Models and configs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "b461952f-8507-4dd2-af06-4e6b8692765d", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n", - "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "f67c3138-bda0-4b3e-ffcc-647f9feec23e", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", - "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n", - "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "3IGvvCRMEwqn" - }, - "source": [ - "# Model Conversion PyTorch -> TF -> TFLite" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tLhz8SAf8Pgp" - }, - "source": [ - "## Converting PyTorch to Tensorflow\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "Xsrvr_WQ8Ib5", - "outputId": "dae96616-e5f7-41b6-cdb9-5026cfcd3214", - "tags": [] - }, - "outputs": [], - "source": [ - "# convert TTS model to Tensorflow\n", - "!python ../TTS/bin/convert_tacotron2_torch_to_tf.py --config_path data/config.json --torch_model_path data/tts_model.pth.tar --output_path data/tts_model_tf.pkl" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "VJ4NA5If9ljv", - "outputId": "1520dca8-1db8-4e07-bc0c-b1d5941c775e", - "tags": [] - }, - "outputs": [], - "source": [ - "# convert Vocoder model to Tensorflow\n", - "!python ../TTS/bin/convert_melgan_torch_to_tf.py --config_path data/config_vocoder.json --torch_model_path data/vocoder_model.pth.tar --output_path data/vocoder_model_tf.pkl" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": 
"text", - "id": "7d5vTkBZ-BYQ" - }, - "source": [ - "## Converting Tensorflow to TFLite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 927 - }, - "colab_type": "code", - "id": "33hTfpuU99cg", - "outputId": "8a0e5be1-23a2-4128-ee37-8232adcb8ff0", - "tags": [] - }, - "outputs": [], - "source": [ - "# convert TTS model to TFLite\n", - "!python ../TTS/bin/convert_tacotron2_tflite.py --config_path data/config.json --tf_model data/tts_model_tf.pkl --output_path data/tts_model.tflite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 364 - }, - "colab_type": "code", - "id": "e00Hm75Y-wZ2", - "outputId": "42381b05-3c9d-44f0-dac7-d81efd95eadf", - "tags": [] - }, - "outputs": [], - "source": [ - "# convert Vocoder model to TFLite\n", - "!python ../TTS/bin/convert_melgan_tflite.py --config_path data/config_vocoder.json --tf_model data/vocoder_model_tf.pkl --output_path data/vocoder_model.tflite" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "# Run Inference with TFLite " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def run_vocoder(mel_spec):\n", - " vocoder_inputs = mel_spec[None, :, :]\n", - " # get input and output details\n", - " input_details = vocoder_model.get_input_details()\n", - " # reshape input tensor for the new input shape\n", - " vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n", - " vocoder_model.allocate_tensors()\n", - " detail = input_details[0]\n", - " vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n", - " # run the model\n", - " vocoder_model.invoke()\n", - " # collect outputs\n", - " output_details = vocoder_model.get_output_details()\n", - " waveform = vocoder_model.get_tensor(output_details[0]['index'])\n", - " return waveform \n", - "\n", - "\n", - "def tts(model, text, CONFIG, p):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", - " backend='tflite')\n", - " waveform = run_vocoder(mel_postnet_spec.T)\n", - " waveform = waveform[0, 0]\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load TF Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tts.tf.utils.tflite import load_tflite_model\n", - "from TTS.tts.tf.utils.io import load_checkpoint\n", - 
"from TTS.utils.io import load_config\n", - "from TTS.tts.utils.text.symbols import symbols, phonemes\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"data/tts_model.tflite\"\n", - "TTS_CONFIG = \"data/config.json\"\n", - "VOCODER_MODEL = \"data/vocoder_model.tflite\"\n", - "VOCODER_CONFIG = \"data/config_vocoder.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "21cda136-de87-4d55-fd46-7d5306103d90", - "tags": [] - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8fLoI4ipqMeS" - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the models\n", - "model = load_tflite_model(TTS_MODEL)\n", - "vocoder_model = load_tflite_model(VOCODER_MODEL)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Sample Sentence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": "535c2df1-c27c-458b-e14b-41a977635aa1", - "tags": [] - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/requirements.tf.txt b/requirements.tf.txt deleted file mode 100644 index 8e256a90..00000000 --- a/requirements.tf.txt +++ /dev/null @@ -1 +0,0 @@ -tensorflow==2.5.0 diff --git a/setup.py b/setup.py index 95f0841b..1d4dbf1c 100644 --- a/setup.py 
+++ b/setup.py @@ -65,9 +65,7 @@ with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f: requirements_notebooks = f.readlines() with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f: requirements_dev = f.readlines() -with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f: - requirements_tf = f.readlines() -requirements_all = requirements_dev + requirements_notebooks + requirements_tf +requirements_all = requirements_dev + requirements_notebooks with open("README.md", "r", encoding="utf-8") as readme_file: README = readme_file.read() @@ -116,7 +114,6 @@ setup( "all": requirements_all, "dev": requirements_dev, "notebooks": requirements_notebooks, - "tf": requirements_tf, }, python_requires=">=3.6.0, <3.10", entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, diff --git a/tests/tts_tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py deleted file mode 100644 index fb1efcde..00000000 --- a/tests/tts_tests/test_tacotron2_tf_model.py +++ /dev/null @@ -1,156 +0,0 @@ -import os -import unittest - -import numpy as np -import tensorflow as tf -import torch - -from TTS.tts.configs.tacotron2_config import Tacotron2Config -from TTS.tts.tf.models.tacotron2 import Tacotron2 -from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model - -tf.get_logger().setLevel("INFO") - - -# pylint: disable=unused-variable - -torch.manual_seed(1) -use_cuda = torch.cuda.is_available() -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -c = Tacotron2Config() - - -class TacotronTFTrainTest(unittest.TestCase): - @staticmethod - def generate_dummy_inputs(): - chars_seq = torch.randint(0, 24, (8, 128)).long().to(device) - chars_seq_lengths = torch.randint(100, 128, (8,)).long().to(device) - chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_lengths = torch.randint(20, 30, (8,)).long().to(device) - stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8,)).long().to(device) - - chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy()) - chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy()) - mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy()) - return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids - - @unittest.skipIf(use_cuda, " [!] 
Skip Test: TfLite conversion does not work on GPU.") - def test_train_step(self): - """test forward pass""" - ( - chars_seq, - chars_seq_lengths, - mel_spec, - mel_postnet_spec, - mel_lengths, - stop_targets, - speaker_ids, - ) = self.generate_dummy_inputs() - - for idx in mel_lengths: - stop_targets[:, int(idx.item()) :, 0] = 1.0 - - stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5) - # training pass - output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) - - # check model output shapes - assert np.all(output[0].shape == mel_spec.shape) - assert np.all(output[1].shape == mel_spec.shape) - assert output[2].shape[2] == chars_seq.shape[1] - assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r) - assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) - - # inference pass - output = model(chars_seq, training=False) - - @unittest.skipIf(use_cuda, " [!] Skip Test: TfLite conversion does not work on GPU.") - def test_forward_attention( - self, - ): - ( - chars_seq, - chars_seq_lengths, - mel_spec, - mel_postnet_spec, - mel_lengths, - stop_targets, - speaker_ids, - ) = self.generate_dummy_inputs() - - for idx in mel_lengths: - stop_targets[:, int(idx.item()) :, 0] = 1.0 - - stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True) - # training pass - output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) - - # check model output shapes - assert np.all(output[0].shape == mel_spec.shape) - assert np.all(output[1].shape == mel_spec.shape) - assert output[2].shape[2] == chars_seq.shape[1] - assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r) - assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) - - # inference pass - output = model(chars_seq, training=False) - - @unittest.skipIf(use_cuda, " [!] 
Skip Test: TfLite conversion does not work on GPU.") - def test_tflite_conversion( - self, - ): # pylint:disable=no-self-use - model = Tacotron2( - num_chars=24, - num_speakers=0, - r=3, - out_channels=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=0, - separate_stopnet=True, - bidirectional_decoder=False, - enable_tflite=True, - ) - model.build_inference() - convert_tacotron2_to_tflite(model, output_path="test_tacotron2.tflite", experimental_converter=True) - # init tflite model - tflite_model = load_tflite_model("test_tacotron2.tflite") - # fake input - inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) # pylint:disable=unexpected-keyword-arg - # run inference - # get input and output details - input_details = tflite_model.get_input_details() - output_details = tflite_model.get_output_details() - # reshape input tensor for the new input shape - tflite_model.resize_tensor_input( - input_details[0]["index"], inputs.shape - ) # pylint:disable=unexpected-keyword-arg - tflite_model.allocate_tensors() - detail = input_details[0] - input_shape = detail["shape"] - tflite_model.set_tensor(detail["index"], inputs) - # run the tflite_model - tflite_model.invoke() - # collect outputs - decoder_output = tflite_model.get_tensor(output_details[0]["index"]) - postnet_output = tflite_model.get_tensor(output_details[1]["index"]) - # remove tflite binary - os.remove("test_tacotron2.tflite") diff --git a/tests/vocoder_tests/test_vocoder_tf_melgan_generator.py b/tests/vocoder_tests/test_vocoder_tf_melgan_generator.py deleted file mode 100644 index 225ceaf5..00000000 --- a/tests/vocoder_tests/test_vocoder_tf_melgan_generator.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest - -import numpy as np -import tensorflow as tf -import torch - -from TTS.vocoder.tf.models.melgan_generator import MelganGenerator - -use_cuda = torch.cuda.is_available() - - -@unittest.skipIf(use_cuda, " [!] Skip Test: Loosy TF support.") -def test_melgan_generator(): - hop_length = 256 - model = MelganGenerator() - # pylint: disable=no-value-for-parameter - dummy_input = tf.random.uniform((4, 80, 64)) - output = model(dummy_input, training=False) - assert np.all(output.shape == (4, 1, 64 * hop_length)), output.shape diff --git a/tests/vocoder_tests/test_vocoder_tf_pqmf.py b/tests/vocoder_tests/test_vocoder_tf_pqmf.py deleted file mode 100644 index 6acb20d9..00000000 --- a/tests/vocoder_tests/test_vocoder_tf_pqmf.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import unittest - -import soundfile as sf -import tensorflow as tf -import torch -from librosa.core import load - -from tests import get_tests_input_path, get_tests_output_path, get_tests_path -from TTS.vocoder.tf.layers.pqmf import PQMF - -TESTS_PATH = get_tests_path() -WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -use_cuda = torch.cuda.is_available() - - -@unittest.skipIf(use_cuda, " [!] 
Skip Test: Loosy TF support.") -def test_pqmf(): - w, sr = load(WAV_FILE) - - layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) - w, sr = load(WAV_FILE) - w2 = tf.convert_to_tensor(w[None, None, :]) - b2 = layer.analysis(w2) - w2_ = layer.synthesis(b2) - w2_ = w2.numpy() - - print(w2_.max()) - print(w2_.min()) - print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "tf_pqmf_output.wav"), w2_.flatten(), sr) From 5e3f499a69555eb1aaffefed79f0c132ef57d59b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 11 Feb 2022 13:27:59 +0100 Subject: [PATCH 5/6] Fix #1187 (#1227) --- TTS/vocoder/configs/parallel_wavegan_config.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index a89b1f3f..f536ba98 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -70,11 +70,11 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): lr_scheduler_gen (torch.optim.Scheduler): Learning rate scheduler for the generator. Defaults to `ExponentialLR`. lr_scheduler_gen_params (dict): - Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.5, "step_size": 200000, "last_epoch": -1}`. lr_scheduler_disc (torch.optim.Scheduler): Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`. lr_scheduler_dict_params (dict): - Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.5, "step_size": 200000, "last_epoch": -1}`. """ model: str = "parallel_wavegan" @@ -124,7 +124,8 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): lr_disc: float = 0.0002 # Initial learning rate. 
optimizer: str = "AdamW" optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) - lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html - lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html - lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) + lr_scheduler_gen: str = "StepLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.5, "step_size": 200000, "last_epoch": -1}) + lr_scheduler_disc: str = "StepLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.5, "step_size": 200000, "last_epoch": -1}) + scheduler_after_epoch: bool = False From 127118c6378168e3d36a1e5d19ede777fd20684f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 11 Feb 2022 23:03:43 +0100 Subject: [PATCH 6/6] Update TTS.tts formatters (#1228) * Return Dict from tts formatters * Make style --- TTS/bin/compute_embeddings.py | 9 ++- TTS/bin/compute_statistics.py | 10 ++-- TTS/bin/find_unique_chars.py | 1 + TTS/bin/find_unique_phonemes.py | 5 ++ TTS/bin/train_tts.py | 1 + TTS/speaker_encoder/dataset.py | 10 ++-- TTS/speaker_encoder/models/resnet.py | 2 +- TTS/speaker_encoder/utils/generic_utils.py | 6 +- TTS/tts/datasets/__init__.py | 12 ++-- TTS/tts/datasets/dataset.py | 60 ++++++++----------- TTS/tts/datasets/formatters.py | 36 +++++------ TTS/tts/layers/generic/normalization.py | 2 +- TTS/tts/layers/generic/wavenet.py | 2 +- TTS/tts/layers/glow_tts/encoder.py | 2 +- TTS/tts/layers/glow_tts/transformer.py | 4 +- TTS/tts/layers/losses.py | 4 +- TTS/tts/layers/tacotron/gst_layers.py | 2 +- TTS/tts/layers/vits/networks.py | 2 +- .../vits/stochastic_duration_predictor.py | 6 +- TTS/tts/models/glow_tts.py | 8 +-- TTS/tts/models/vits.py | 22 ++++--- TTS/tts/utils/languages.py | 2 +- TTS/tts/utils/speakers.py | 4 +- TTS/tts/utils/ssim.py | 6 +- TTS/utils/audio.py | 24 ++++---- TTS/utils/download.py | 2 +- TTS/utils/training.py | 4 +- .../configs/parallel_wavegan_config.py | 4 +- TTS/vocoder/datasets/wavernn_dataset.py | 2 +- TTS/vocoder/layers/lvc_block.py | 4 +- TTS/vocoder/layers/melgan.py | 2 +- TTS/vocoder/layers/parallel_wavegan.py | 2 +- TTS/vocoder/models/hifigan_generator.py | 2 +- TTS/vocoder/models/melgan_generator.py | 2 +- .../models/parallel_wavegan_discriminator.py | 2 +- .../models/parallel_wavegan_generator.py | 2 +- TTS/vocoder/models/univnet_generator.py | 2 +- TTS/vocoder/models/wavegrad.py | 8 +-- TTS/vocoder/models/wavernn.py | 2 +- tests/data_tests/test_dataset_formatters.py | 10 ++-- tests/vocoder_tests/test_vocoder_wavernn.py | 2 +- 41 files changed, 153 insertions(+), 141 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 2ac18651..50817154 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -29,7 +29,9 @@ parser.add_argument( help="Path to dataset config file.", ) parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.") -parser.add_argument("--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None) +parser.add_argument( + 
"--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None +) parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--eval", type=bool, help="compute eval.", default=True) @@ -41,7 +43,10 @@ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_spli wav_files = meta_data_train + meta_data_eval speaker_manager = SpeakerManager( - encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda + encoder_model_path=args.model_path, + encoder_config_path=args.config_path, + d_vectors_file_path=args.old_file, + use_cuda=args.use_cuda, ) # compute speaker embeddings diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index e1974ae7..3ab7ea7a 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -51,7 +51,7 @@ def main(): N = 0 for item in tqdm(dataset_items): # compute features - wav = ap.load_wav(item if isinstance(item, str) else item[1]) + wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"]) linear = ap.spectrogram(wav) mel = ap.melspectrogram(wav) @@ -59,13 +59,13 @@ def main(): N += mel.shape[1] mel_sum += mel.sum(1) linear_sum += linear.sum(1) - mel_square_sum += (mel ** 2).sum(axis=1) - linear_square_sum += (linear ** 2).sum(axis=1) + mel_square_sum += (mel**2).sum(axis=1) + linear_square_sum += (linear**2).sum(axis=1) mel_mean = mel_sum / N - mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2) + mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2) linear_mean = linear_sum / N - linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2) + linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2) output_file_path = args.out_path stats = {} diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 437c2d60..fb98bab5 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -24,6 +24,7 @@ def main(): # load all datasets train_items, eval_items = load_tts_samples(c.datasets, eval_split=True) + items = train_items + eval_items texts = "".join(item[0] for item in items) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d3143ca3..02a783c7 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -43,6 +43,11 @@ def main(): items = train_items + eval_items print("Num items:", len(items)) + is_lang_def = all(item["language"] for item in items) + + if not c.phoneme_language or not is_lang_def: + raise ValueError("Phoneme language must be defined in config.") + phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) phones = [] for ph in phonemes: diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 0f8c4760..a7ce8ef3 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,4 +1,5 @@ import os + import torch from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 5b0fee22..28a23e2f 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -78,12 +78,12 @@ class SpeakerEncoderDataset(Dataset): mel = self.ap.melspectrogram(wav).astype("float32") # sample seq_len - assert text.size > 0, self.items[idx][1] - assert wav.size > 0, self.items[idx][1] + assert text.size > 0, self.items[idx]["audio_file"] + 
assert wav.size > 0, self.items[idx]["audio_file"] sample = { "mel": mel, - "item_idx": self.items[idx][1], + "item_idx": self.items[idx]["audio_file"], "speaker_name": speaker_name, } return sample @@ -91,8 +91,8 @@ class SpeakerEncoderDataset(Dataset): def __parse_items(self): self.speaker_to_utters = {} for i in self.items: - path_ = i[1] - speaker_ = i[2] + path_ = i["audio_file"] + speaker_ = i["speaker_name"] if speaker_ in self.speaker_to_utters.keys(): self.speaker_to_utters[speaker_].append(path_) else: diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index d6c3dad4..a799fc52 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -229,7 +229,7 @@ class ResNetSpeakerEncoder(nn.Module): x = torch.sum(x * w, dim=2) elif self.encoder_type == "ASP": mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5)) x = torch.cat((mu, sg), 1) x = x.view(x.size()[0], -1) diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index b8aa4093..4ab4e923 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -113,7 +113,7 @@ class AugmentWAV(object): def additive_noise(self, noise_type, audio): - clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) + clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) noise_list = random.sample( self.noise_list[noise_type], @@ -135,7 +135,7 @@ class AugmentWAV(object): self.additive_noise_config[noise_type]["min_snr_in_db"], self.additive_noise_config[noise_type]["max_num_noises"], ) - noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) + noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4) noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio if noises_wav is None: @@ -154,7 +154,7 @@ class AugmentWAV(object): rir_file = random.choice(self.rir_files) rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) - rir = rir / np.sqrt(np.sum(rir ** 2)) + rir = rir / np.sqrt(np.sum(rir**2)) return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] def apply_one(self, audio): diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 40eed7e3..455413fa 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -75,14 +75,14 @@ def load_tts_samples( formatter = _get_formatter_by_name(name) # load train set meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers) - meta_data_train = [[*item, language] for item in meta_data_train] + meta_data_train = [{**item, **{"language": language}} for item in meta_data_train] print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers) - meta_data_eval = [[*item, language] for item in meta_data_eval] + meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval] else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval @@ -91,12 +91,12 @@ def load_tts_samples( if dataset.meta_file_attn_mask: meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): - attn_file = meta_data[ins[1]].strip() - 
meta_data_train_all[idx].append(attn_file) + attn_file = meta_data[ins["audio_file"]].strip() + meta_data_train_all[idx].update({"alignment_file": attn_file}) if meta_data_eval_all: for idx, ins in enumerate(meta_data_eval_all): - attn_file = meta_data[ins[1]].strip() - meta_data_eval_all[idx].append(attn_file) + attn_file = meta_data[ins["audio_file"]].strip() + meta_data_eval_all[idx].update({"alignment_file": attn_file}) # set none for the next iter formatter = None return meta_data_train_all, meta_data_eval_all diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 2f20c865..546f012d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -21,7 +21,7 @@ class TTSDataset(Dataset): text_cleaner: list, compute_linear_spec: bool, ap: AudioProcessor, - meta_data: List[List], + meta_data: List[Dict], compute_f0: bool = False, f0_cache_path: str = None, characters: Dict = None, @@ -54,7 +54,7 @@ class TTSDataset(Dataset): ap (TTS.tts.utils.AudioProcessor): Audio processor object. - meta_data (list): List of dataset instances. + meta_data (list): List of dataset samples. compute_f0 (bool): compute f0 if True. Defaults to False. @@ -199,15 +199,9 @@ class TTSDataset(Dataset): def load_data(self, idx): item = self.items[idx] + raw_text = item["text"] - if len(item) == 5: - text, wav_file, speaker_name, language_name, attn_file = item - else: - text, wav_file, speaker_name, language_name = item - attn = None - raw_text = text - - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) + wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32) # apply noise for augmentation if self.use_noise_augment: @@ -216,12 +210,12 @@ class TTSDataset(Dataset): if not self.input_seq_computed: if self.use_phonemes: text = self._load_or_generate_phoneme_sequence( - wav_file, - text, + item["audio_file"], + item["text"], self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, - language_name if language_name else self.phoneme_language, + item["language"] if item["language"] else self.phoneme_language, self.custom_symbols, self.characters, self.add_blank, @@ -229,7 +223,7 @@ class TTSDataset(Dataset): else: text = np.asarray( text_to_sequence( - text, + item["text"], [self.cleaners], custom_symbols=self.custom_symbols, tp=self.characters, @@ -238,11 +232,12 @@ class TTSDataset(Dataset): dtype=np.int32, ) - assert text.size > 0, self.items[idx][1] - assert wav.size > 0, self.items[idx][1] + assert text.size > 0, self.items[idx]["audio_file"] + assert wav.size > 0, self.items[idx]["audio_file"] - if "attn_file" in locals(): - attn = np.load(attn_file) + attn = None + if "alignment_file" in item: + attn = np.load(item["alignment_file"]) if len(text) > self.max_seq_len: # return a different sample if the phonemized @@ -252,7 +247,7 @@ class TTSDataset(Dataset): pitch = None if self.compute_f0: - pitch = self.pitch_extractor.load_or_compute_pitch(self.ap, wav_file, self.f0_cache_path) + pitch = self.pitch_extractor.load_or_compute_pitch(self.ap, item["audio_file"], self.f0_cache_path) pitch = self.pitch_extractor.normalize_pitch(pitch.astype(np.float32)) sample = { @@ -261,10 +256,10 @@ class TTSDataset(Dataset): "wav": wav, "pitch": pitch, "attn": attn, - "item_idx": self.items[idx][1], - "speaker_name": speaker_name, - "language_name": language_name, - "wav_file_name": os.path.basename(wav_file), + "item_idx": item["audio_file"], + "speaker_name": item["speaker_name"], + "language_name": item["language"], + "wav_file_name": 
os.path.basename(item["audio_file"]), } return sample @@ -272,11 +267,10 @@ class TTSDataset(Dataset): def _phoneme_worker(args): item = args[0] func_args = args[1] - text, wav_file, *_ = item func_args[3] = ( - item[3] if item[3] else func_args[3] + item["language"] if "language" in item and item["language"] else func_args[3] ) # override phoneme language if specified by the dataset formatter - phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) + phonemes = TTSDataset._load_or_generate_phoneme_sequence(item["audio_file"], item["text"], *func_args) return phonemes def compute_input_seq(self, num_workers=0): @@ -286,10 +280,9 @@ class TTSDataset(Dataset): if self.verbose: print(" | > Computing input sequences ...") for idx, item in enumerate(tqdm.tqdm(self.items)): - text, *_ = item sequence = np.asarray( text_to_sequence( - text, + item["text"], [self.cleaners], custom_symbols=self.custom_symbols, tp=self.characters, @@ -337,10 +330,10 @@ class TTSDataset(Dataset): if by_audio_len: lengths = [] for item in self.items: - lengths.append(os.path.getsize(item[1]) / 16 * 8) # assuming 16bit audio + lengths.append(os.path.getsize(item["audio_file"]) / 16 * 8) # assuming 16bit audio lengths = np.array(lengths) else: - lengths = np.array([len(ins[0]) for ins in self.items]) + lengths = np.array([len(ins["text"]) for ins in self.items]) idxs = np.argsort(lengths) new_items = [] @@ -555,7 +548,7 @@ class PitchExtractor: def __init__( self, - items: List[List], + items: List[Dict], verbose=False, ): self.items = items @@ -614,10 +607,9 @@ class PitchExtractor: item = args[0] ap = args[1] cache_path = args[2] - _, wav_file, *_ = item - pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) + pitch_file = PitchExtractor.create_pitch_file_path(item["audio_file"], cache_path) if not os.path.exists(pitch_file): - pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) + pitch = PitchExtractor._compute_and_save_pitch(ap, item["audio_file"], pitch_file) return pitch return None diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 1f23f85e..28eb0e0f 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -24,7 +24,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") text = cols[1] - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -39,7 +39,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument wav_file = cols[1].strip() text = cols[0].strip() wav_file = os.path.join(root_path, "wavs", wav_file) - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -55,7 +55,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume text = cols[1].strip() folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL" wav_file = os.path.join(root_path, folder_name, wav_file) - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -101,7 +101,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav") if os.path.isfile(wav_file): text = cols[1].strip() - 
items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) else: # M-AI-Labs have some missing samples, so just print the warning print("> File %s does not exist!" % (wav_file)) @@ -119,7 +119,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2] - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -133,7 +133,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2] - items.append([text, wav_file, f"ljspeech-{idx}"]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{idx}"}) return items @@ -150,7 +150,7 @@ def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-arg if not os.path.exists(wav_file): print(f" [!] {wav_file} in metafile does not exist. Skipping...") continue - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -165,7 +165,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") text = cols[1] - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -179,7 +179,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) text = cols[1] - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -193,7 +193,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] wav_file = os.path.join(root_path, "wavn", utt_id + ".wav") - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items @@ -213,7 +213,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): if speaker_name in ignored_speakers: continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) - items.append([text, wav_file, "MCV_" + speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "MCV_" + speaker_name}) return items @@ -240,7 +240,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_name in ignored_speakers: continue - items.append([text, wav_file, "LTTS_" + speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"}) for item in items: assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" return items @@ -259,7 +259,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar skipped_files.append(wav_file) continue text = cols[1].strip() - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) print(f" [!] {len(skipped_files)} files skipped. 
They don't exist...") return items @@ -281,7 +281,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - items.append([text, wav_file, speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_id}) return items @@ -299,7 +299,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) return items @@ -334,7 +334,7 @@ def mls(root_path, meta_files=None, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker in ignored_speakers: continue - items.append([text, wav_file, "MLS_" + speaker]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "MLS_" + speaker}) return items @@ -404,7 +404,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) - items.append([text, wav_path, speaker_name]) + items.append({"text": text, "audio_file": wav_path, "speaker_name": speaker_name}) return items @@ -418,5 +418,5 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2].replace(" ", "") - items.append([text, wav_file, speaker_name]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items diff --git a/TTS/tts/layers/generic/normalization.py b/TTS/tts/layers/generic/normalization.py index 4766c77d..c0270e40 100644 --- a/TTS/tts/layers/generic/normalization.py +++ b/TTS/tts/layers/generic/normalization.py @@ -113,7 +113,7 @@ class ActNorm(nn.Module): denom = torch.sum(x_mask, [0, 2]) m = torch.sum(x * x_mask, [0, 2]) / denom m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom - v = m_sq - (m ** 2) + v = m_sq - (m**2) logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) diff --git a/TTS/tts/layers/generic/wavenet.py b/TTS/tts/layers/generic/wavenet.py index 0c87e9df..aeb45c7b 100644 --- a/TTS/tts/layers/generic/wavenet.py +++ b/TTS/tts/layers/generic/wavenet.py @@ -65,7 +65,7 @@ class WN(torch.nn.Module): self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") # intermediate layers for i in range(num_layers): - dilation = dilation_rate ** i + dilation = dilation_rate**i padding = int((kernel_size * dilation - dilation) / 2) in_layer = torch.nn.Conv1d( hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index 36ed668b..3b43e527 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -101,7 +101,7 @@ class Encoder(nn.Module): self.encoder_type = encoder_type # embedding layer self.emb = nn.Embedding(num_chars, hidden_channels) - nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) # init encoder module if encoder_type.lower() == "rel_pos_transformer": if use_prenet: diff --git 
a/TTS/tts/layers/glow_tts/transformer.py b/TTS/tts/layers/glow_tts/transformer.py index ba6aa1e2..0f837abf 100644 --- a/TTS/tts/layers/glow_tts/transformer.py +++ b/TTS/tts/layers/glow_tts/transformer.py @@ -88,7 +88,7 @@ class RelativePositionMultiHeadAttention(nn.Module): # relative positional encoding layers if rel_attn_window_size is not None: n_heads_rel = 1 if heads_share else num_heads - rel_stddev = self.k_channels ** -0.5 + rel_stddev = self.k_channels**-0.5 emb_rel_k = nn.Parameter( torch.randn(n_heads_rel, rel_attn_window_size * 2 + 1, self.k_channels) * rel_stddev ) @@ -235,7 +235,7 @@ class RelativePositionMultiHeadAttention(nn.Module): batch, heads, length, _ = x.size() # padd along column x = F.pad(x, [0, length - 1, 0, 0, 0, 0, 0, 0]) - x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)]) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, [length, 0, 0, 0, 0, 0]) x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 7de45041..d770a536 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -218,7 +218,7 @@ class GuidedAttentionLoss(torch.nn.Module): def _make_ga_mask(ilen, olen, sigma): grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen)) grid_x, grid_y = grid_x.float(), grid_y.float() - return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma ** 2))) + return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2))) @staticmethod def _make_masks(ilens, olens): @@ -665,7 +665,7 @@ class VitsDiscriminatorLoss(nn.Module): dr = dr.float() dg = dg.float() real_loss = torch.mean((1 - dr) ** 2) - fake_loss = torch.mean(dg ** 2) + fake_loss = torch.mean(dg**2) loss += real_loss + fake_loss real_losses.append(real_loss.item()) fake_losses.append(fake_loss.item()) diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index 01a81e0b..7d751bc0 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -141,7 +141,7 @@ class MultiHeadAttention(nn.Module): # score = softmax(QK^T / (d_k ** 0.5)) scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k] - scores = scores / (self.key_dim ** 0.5) + scores = scores / (self.key_dim**0.5) scores = F.softmax(scores, dim=3) # out = score * V diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index ef426ace..7c225344 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -57,7 +57,7 @@ class TextEncoder(nn.Module): self.emb = nn.Embedding(n_vocab, hidden_channels) - nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) if language_emb_dim: hidden_channels += language_emb_dim diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 120d0944..738ee341 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -33,7 +33,7 @@ class DilatedDepthSeparableConv(nn.Module): self.norms_1 = nn.ModuleList() self.norms_2 = nn.ModuleList() for i in range(num_layers): - dilation = kernel_size ** i + dilation = kernel_size**i padding = (kernel_size * dilation - dilation) // 2 self.convs_sep.append( 
nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding) @@ -264,7 +264,7 @@ class StochasticDurationPredictor(nn.Module): # posterior encoder - neg log likelihood logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) nll_posterior_encoder = ( - torch.sum(-0.5 * (math.log(2 * math.pi) + (noise ** 2)) * x_mask, [1, 2]) - logdet_tot_q + torch.sum(-0.5 * (math.log(2 * math.pi) + (noise**2)) * x_mask, [1, 2]) - logdet_tot_q ) z0 = torch.log(torch.clamp_min(z0, 1e-5)) * x_mask @@ -279,7 +279,7 @@ class StochasticDurationPredictor(nn.Module): z = torch.flip(z, [1]) # flow layers - neg log likelihood - nll_flow_layers = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot + nll_flow_layers = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot return nll_flow_layers + nll_posterior_encoder flows = list(reversed(self.flows)) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index c1e4c2ac..7dbfdd09 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -206,9 +206,9 @@ class GlowTTS(BaseTTS): with torch.no_grad(): o_scale = torch.exp(-2 * o_log_scale) logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) @@ -255,9 +255,9 @@ class GlowTTS(BaseTTS): # find the alignment path between z and encoder output o_scale = torch.exp(-2 * o_log_scale) logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index cb349ca2..ae24a99e 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,7 +4,6 @@ from itertools import chain from typing import Dict, List, Tuple import torch - import torchaudio from coqpit import Coqpit from torch import nn @@ -424,9 +423,9 @@ class Vits(BaseTTS): and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): self.audio_transform = torchaudio.transforms.Resample( - orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], - ) + orig_freq=self.audio_config["sample_rate"], + 
new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + ) else: self.audio_transform = None @@ -591,9 +590,9 @@ class Vits(BaseTTS): with torch.no_grad(): o_scale = torch.exp(-2 * logs_p) logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) + logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p**2)]) logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp4 = torch.sum(-0.5 * (m_p**2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp2 + logp3 + logp1 + logp4 attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() @@ -692,10 +691,17 @@ class Vits(BaseTTS): if self.args.use_sdp: logw = self.duration_predictor( - x, x_mask, g=g if self.args.condition_dp_on_speaker else None, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb + x, + x_mask, + g=g if self.args.condition_dp_on_speaker else None, + reverse=True, + noise_scale=self.inference_noise_scale_dp, + lang_emb=lang_emb, ) else: - logw = self.duration_predictor(x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb) + logw = self.duration_predictor( + x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb + ) w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index fc7eec57..a4f41be5 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -113,7 +113,7 @@ def _set_file_path(path): def get_language_weighted_sampler(items: list): - language_names = np.array([item[3] for item in items]) + language_names = np.array([item["language"] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 07076d90..441296ac 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -118,7 +118,7 @@ class SpeakerManager: Returns: Tuple[Dict, int]: speaker IDs and number of speakers. 
""" - speakers = sorted({item[2] for item in items}) + speakers = sorted({item["speaker_name"] for item in items}) speaker_ids = {name: i for i, name in enumerate(speakers)} num_speakers = len(speaker_ids) return speaker_ids, num_speakers @@ -414,7 +414,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, def get_speaker_weighted_sampler(items: list): - speaker_names = np.array([item[2] for item in items]) + speaker_names = np.array([item["speaker_name"] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 883efdb8..ab2c6991 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -8,7 +8,7 @@ from torch.autograd import Variable def gaussian(window_size, sigma): - gauss = torch.Tensor([exp(-((x - window_size // 2) ** 2) / float(2 * sigma ** 2)) for x in range(window_size)]) + gauss = torch.Tensor([exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2)) for x in range(window_size)]) return gauss / gauss.sum() @@ -33,8 +33,8 @@ def _ssim(img1, img2, window, window_size, channel, size_average=True): sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 - C1 = 0.01 ** 2 - C2 = 0.03 ** 2 + C1 = 0.01**2 + C2 = 0.03**2 ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 25f93c34..0253f918 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -142,10 +142,10 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method ) M = o[:, :, :, 0] P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8)) if self.power is not None: - S = S ** self.power + S = S**self.power if self.use_mel: S = torch.matmul(self.mel_basis.to(x), S) @@ -634,8 +634,8 @@ class AudioProcessor(object): S = self._db_to_amp(S) # Reconstruct phase if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" @@ -643,8 +643,8 @@ class AudioProcessor(object): S = self._db_to_amp(D) S = self._mel_to_linear(S) # Convert back to linear if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - return self._griffin_lim(S ** self.power) + return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) + return self._griffin_lim(S**self.power) def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: """Convert a full scale linear spectrogram output of a network to a melspectrogram. 
@@ -781,7 +781,7 @@ class AudioProcessor(object): @staticmethod def _rms_norm(wav, db_level=-27): r = 10 ** (db_level / 20) - a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2)) + a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2)) return wav * a def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: @@ -853,7 +853,7 @@ class AudioProcessor(object): @staticmethod def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: - mu = 2 ** qc - 1 + mu = 2**qc - 1 # wav_abs = np.minimum(np.abs(wav), 1.0) signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) # Quantize signal to the specified number of levels. @@ -865,13 +865,13 @@ class AudioProcessor(object): @staticmethod def mulaw_decode(wav, qc): """Recovers waveform from quantized values.""" - mu = 2 ** qc - 1 + mu = 2**qc - 1 x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) return x @staticmethod def encode_16bits(x): - return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16) + return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16) @staticmethod def quantize(x: np.ndarray, bits: int) -> np.ndarray: @@ -884,12 +884,12 @@ class AudioProcessor(object): Returns: np.ndarray: Quantized waveform. """ - return (x + 1.0) * (2 ** bits - 1) / 2 + return (x + 1.0) * (2**bits - 1) / 2 @staticmethod def dequantize(x, bits): """Dequantize a waveform from the given number of bits.""" - return 2 * x / (2 ** bits - 1) - 1 + return 2 * x / (2**bits - 1) - 1 def _log(x, base): diff --git a/TTS/utils/download.py b/TTS/utils/download.py index 241a106b..de9b31a7 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -128,7 +128,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> while True: # Read by chunk to avoid filling memory - chunk = file_obj.read(1024 ** 2) + chunk = file_obj.read(1024**2) if not chunk: break hash_func.update(chunk) diff --git a/TTS/utils/training.py b/TTS/utils/training.py index aa5651c5..9f01b310 100644 --- a/TTS/utils/training.py +++ b/TTS/utils/training.py @@ -39,7 +39,7 @@ class NoamLR(torch.optim.lr_scheduler._LRScheduler): def get_lr(self): step = max(self.last_epoch, 1) return [ - base_lr * self.warmup_steps ** 0.5 * min(step * self.warmup_steps ** -1.5, step ** -0.5) + base_lr * self.warmup_steps**0.5 * min(step * self.warmup_steps**-1.5, step**-0.5) for base_lr in self.base_lrs ] @@ -63,7 +63,7 @@ def lr_decay(init_lr, global_step, warmup_steps): It is only being used by the Speaker Encoder trainer.""" warmup_steps = float(warmup_steps) step = global_step + 1.0 - lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) + lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5, step**-0.5) return lr diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index f536ba98..7845dd6b 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -127,5 +127,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): lr_scheduler_gen: str = "StepLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.5, "step_size": 200000, "last_epoch": -1}) lr_scheduler_disc: str = "StepLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html - lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.5, "step_size": 200000, "last_epoch": -1}) + 
lr_scheduler_disc_params: dict = field( + default_factory=lambda: {"gamma": 0.5, "step_size": 200000, "last_epoch": -1} + ) scheduler_after_epoch: bool = False diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index d648b68c..2c771cf0 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -111,7 +111,7 @@ class WaveRNNDataset(Dataset): elif isinstance(self.mode, int): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) - x_input = 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + x_input = 2 * coarse[:, : self.seq_len].float() / (2**self.mode - 1.0) - 1.0 y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) return x_input, mels, y_coarse diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 0e29ee3c..8913a113 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -126,9 +126,9 @@ class LVCBlock(torch.nn.Module): ) for i in range(conv_layers): - padding = (3 ** i) * int((conv_kernel_size - 1) / 2) + padding = (3**i) * int((conv_kernel_size - 1) / 2) conv = torch.nn.Conv1d( - in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i + in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3**i ) self.convs.append(conv) diff --git a/TTS/vocoder/layers/melgan.py b/TTS/vocoder/layers/melgan.py index 7fd999d9..4bb328e9 100644 --- a/TTS/vocoder/layers/melgan.py +++ b/TTS/vocoder/layers/melgan.py @@ -12,7 +12,7 @@ class ResidualStack(nn.Module): self.blocks = nn.ModuleList() for idx in range(num_res_blocks): layer_kernel_size = kernel_size - layer_dilation = layer_kernel_size ** idx + layer_dilation = layer_kernel_size**idx layer_padding = base_padding * layer_dilation self.blocks += [ nn.Sequential( diff --git a/TTS/vocoder/layers/parallel_wavegan.py b/TTS/vocoder/layers/parallel_wavegan.py index 889e8aa6..51142e5e 100644 --- a/TTS/vocoder/layers/parallel_wavegan.py +++ b/TTS/vocoder/layers/parallel_wavegan.py @@ -72,6 +72,6 @@ class ResidualBlock(torch.nn.Module): s = self.conv1x1_skip(x) # for residual connection - x = (self.conv1x1_out(x) + residual) * (0.5 ** 2) + x = (self.conv1x1_out(x) + residual) * (0.5**2) return x, s diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 4ce743b3..fc15f3af 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -207,7 +207,7 @@ class HifiganGenerator(torch.nn.Module): self.ups.append( weight_norm( ConvTranspose1d( - upsample_initial_channel // (2 ** i), + upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index e60baa9d..80b47870 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -36,7 +36,7 @@ class MelganGenerator(nn.Module): # upsampling layers and residual stacks for idx, upsample_factor in enumerate(upsample_factors): - layer_in_channels = base_channels // (2 ** idx) + layer_in_channels = base_channels // (2**idx) layer_out_channels = base_channels // (2 ** (idx + 1)) layer_filter_size = upsample_factor * 2 layer_stride = upsample_factor diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 9cc1061c..adf1bdae 100644 --- 
a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -35,7 +35,7 @@ class ParallelWaveganDiscriminator(nn.Module): if i == 0: dilation = 1 else: - dilation = i if dilation_factor == 1 else dilation_factor ** i + dilation = i if dilation_factor == 1 else dilation_factor**i conv_in_channels = conv_channels padding = (kernel_size - 1) // 2 * dilation conv_layer = [ diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index b8e78d03..ee9d8ad5 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -142,7 +142,7 @@ class ParallelWaveganGenerator(torch.nn.Module): self.apply(_apply_weight_norm) @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2 ** x): + def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): assert layers % stacks == 0 layers_per_cycle = layers // stacks dilations = [dilation(i % layers_per_cycle) for i in range(layers)] diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 8a66c537..2ee28c7b 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -130,7 +130,7 @@ class UnivnetGenerator(torch.nn.Module): self.apply(_apply_weight_norm) @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2 ** x): + def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): assert layers % stacks == 0 layers_per_cycle = layers // stacks dilations = [dilation(i % layers_per_cycle) for i in range(layers)] diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index ed4f4b37..00142c91 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -153,7 +153,7 @@ class Wavegrad(BaseVocoder): noise_scale = l_a + torch.rand(y_0.shape[0]).to(y_0) * (l_b - l_a) noise_scale = noise_scale.unsqueeze(1) noise = torch.randn_like(y_0) - noisy_audio = noise_scale * y_0 + (1.0 - noise_scale ** 2) ** 0.5 * noise + noisy_audio = noise_scale * y_0 + (1.0 - noise_scale**2) ** 0.5 * noise return noise.unsqueeze(1), noisy_audio.unsqueeze(1), noise_scale[:, 0] def compute_noise_level(self, beta): @@ -161,8 +161,8 @@ class Wavegrad(BaseVocoder): self.num_steps = len(beta) alpha = 1 - beta alpha_hat = np.cumprod(alpha) - noise_level = np.concatenate([[1.0], alpha_hat ** 0.5], axis=0) - noise_level = alpha_hat ** 0.5 + noise_level = np.concatenate([[1.0], alpha_hat**0.5], axis=0) + noise_level = alpha_hat**0.5 # pylint: disable=not-callable self.beta = torch.tensor(beta.astype(np.float32)) @@ -170,7 +170,7 @@ class Wavegrad(BaseVocoder): self.alpha_hat = torch.tensor(alpha_hat.astype(np.float32)) self.noise_level = torch.tensor(noise_level.astype(np.float32)) - self.c1 = 1 / self.alpha ** 0.5 + self.c1 = 1 / self.alpha**0.5 self.c2 = (1 - self.alpha) / (1 - self.alpha_hat) ** 0.5 self.sigma = ((1.0 - self.alpha_hat[:-1]) / (1.0 - self.alpha_hat[1:]) * self.beta[1:]) ** 0.5 diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 1977efb6..b5b2343a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -225,7 +225,7 @@ class Wavernn(BaseVocoder): super().__init__(config) if isinstance(self.args.mode, int): - self.n_classes = 2 ** self.args.mode + self.n_classes = 2**self.args.mode elif self.args.mode == "mold": 
self.n_classes = 3 * 10 elif self.args.mode == "gauss": diff --git a/tests/data_tests/test_dataset_formatters.py b/tests/data_tests/test_dataset_formatters.py index bd83002c..30fb79a8 100644 --- a/tests/data_tests/test_dataset_formatters.py +++ b/tests/data_tests/test_dataset_formatters.py @@ -5,13 +5,13 @@ from tests import get_tests_input_path from TTS.tts.datasets.formatters import common_voice -class TestPreprocessors(unittest.TestCase): +class TestTTSFormatters(unittest.TestCase): def test_common_voice_preprocessor(self): # pylint: disable=no-self-use root_path = get_tests_input_path() meta_file = "common_voice.tsv" items = common_voice(root_path, meta_file) - assert items[0][0] == "The applicants are invited for coffee and visa is given immediately." - assert items[0][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav") + assert items[0]["text"] == "The applicants are invited for coffee and visa is given immediately." + assert items[0]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav") - assert items[-1][0] == "Competition for limited resources has also resulted in some local conflicts." - assert items[-1][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav") + assert items[-1]["text"] == "Competition for limited resources has also resulted in some local conflicts." + assert items[-1]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav") diff --git a/tests/vocoder_tests/test_vocoder_wavernn.py b/tests/vocoder_tests/test_vocoder_wavernn.py index d4a7b8dd..966ea3dd 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn.py +++ b/tests/vocoder_tests/test_vocoder_wavernn.py @@ -46,6 +46,6 @@ def test_wavernn(): config.model_args.mode = 4 model = Wavernn(config) output = model(dummy_x, dummy_m) - assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape + assert np.all(output.shape == (2, 1280, 2**4)), output.shape output = model.inference(dummy_y, True, 5500, 550) assert np.all(output.shape == (256 * (y_size - 1),))
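For readers of PATCH 6/6, a minimal, self-contained sketch of the sample format that the TTS.tts formatters return after this change: each sample is a dict with named keys ("text", "audio_file", "speaker_name") instead of a positional list, and load_tts_samples() merges a "language" key into each dict. The toy formatter, metadata layout, and file names below are hypothetical illustrations, not code from the patches; only the key names come from the diff above.

# Sketch of the dict-based sample format introduced by "Update TTS.tts formatters".
# The formatter and dataset layout here are made up for illustration.
import os
import tempfile


def toy_formatter(root_path, meta_file, **kwargs):
    """Mimic an LJSpeech-style formatter: return one dict per sample."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.strip().split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            items.append({"text": cols[1], "audio_file": wav_file, "speaker_name": "ljspeech"})
    return items


if __name__ == "__main__":
    # Build a throwaway dataset directory so the sketch runs end to end.
    root = tempfile.mkdtemp()
    with open(os.path.join(root, "metadata.csv"), "w", encoding="utf-8") as f:
        f.write("LJ001-0001|Printing, in the only sense with which we are at present concerned.\n")

    samples = toy_formatter(root, "metadata.csv")
    # Downstream code (TTSDataset, SpeakerManager, the weighted samplers) now reads
    # named keys such as sample["audio_file"] and sample["speaker_name"];
    # load_tts_samples() additionally adds a "language" key per dataset config.
    print(samples[0]["text"], samples[0]["audio_file"], samples[0]["speaker_name"])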