diff --git a/.travis.yml b/.travis.yml
index e2f77491..645f9861 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ matrix:
env: TEST_SUITE="lint"
- name: "Unit tests"
python: "3.6"
- install: pip install --quiet -r requirements.txt
+ install: pip install --quiet -r requirements_tests.txt
env: TEST_SUITE="unittest"
script: ./.travis/script
diff --git a/README.md b/README.md
index 918328b2..7d9884b0 100644
--- a/README.md
+++ b/README.md
@@ -9,16 +9,25 @@ TTS includes two different model implementations which are based on [Tacotron](h
If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
+[](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)
+
## TTS Performance

[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
-## Utilities under this Project
-- Deep Learning based Text2Speech model.
-- ```dataset_analysis```: Tools to curate a Text2Speech dataset.
-- ```speaker_encoder```: Speaker Encoder model computing embedding vectors for voice files.
-- ```server```: Basic server implementation with packaging.
+## Features
+- High-performance Text2Speech models for PyTorch and TensorFlow 2.0.
+- High-performance Speaker Encoder to compute speaker embeddings efficiently.
+- Integration with various Neural Vocoders (PWGAN, MelGAN, WaveRNN).
+- Released pre-trained models.
+- Efficient training code for PyTorch (TensorFlow 2.0 support coming soon).
+- Code to convert PyTorch models to TensorFlow 2.0.
+- Detailed training analysis on console and Tensorboard.
+- Tools to curate Text2Speech datasets under ```dataset_analysis```.
+- Demo server for model testing.
+- Notebooks for extensive model benchmarking.
+- Modular (but not too much) code base enabling easy testing of new ideas.
## Requirements and Installation
Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
diff --git a/best_model_config.json b/best_model_config.json
deleted file mode 100644
index c62e88cb..00000000
--- a/best_model_config.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
- "model_name": "best-model",
- "num_mels": 80,
- "num_freq": 1025,
- "sample_rate": 20000,
- "frame_length_ms": 50,
- "frame_shift_ms": 12.5,
- "preemphasis": 0.97,
- "min_level_db": -100,
- "ref_level_db": 20,
- "embedding_size": 256,
- "text_cleaner": "english_cleaners",
-
- "epochs": 1000,
- "lr": 0.002,
- "warmup_steps": 4000,
- "batch_size": 32,
- "eval_batch_size":32,
- "r": 5,
-
- "griffin_lim_iters": 60,
- "power": 1.5,
-
- "num_loader_workers": 8,
-
- "checkpoint": true,
- "save_step": 376,
- "data_path": "/run/shm/erogol/LJSpeech-1.0",
- "min_seq_len": 0,
- "output_path": "/data/shared/erogol_models/"
-}
diff --git a/compute_statistics.py b/compute_statistics.py
new file mode 100755
index 00000000..bbedf7af
--- /dev/null
+++ b/compute_statistics.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import argparse
+
+import numpy as np
+from tqdm import tqdm
+
+from TTS.datasets.preprocess import load_meta_data
+from TTS.utils.generic_utils import load_config
+from TTS.utils.audio import AudioProcessor
+
+def main():
+ """Run preprocessing process."""
+ parser = argparse.ArgumentParser(
+        description="Compute mean and variance of spectrogram features.")
+ parser.add_argument("--config_path", type=str, required=True,
+ help="TTS config file path.")
+    parser.add_argument("--out_path", required=True, type=str,
+                        help="directory to save the output file.")
+ args = parser.parse_args()
+
+ # load config
+ CONFIG = load_config(args.config_path)
+ CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
+ CONFIG.audio['stats_path'] = None # discard pre-defined stats
+
+ # load audio processor
+ ap = AudioProcessor(**CONFIG.audio)
+
+ # load the meta data of target dataset
+ dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data
+ print(f" > There are {len(dataset_items)} files.")
+
+ mel_sum = 0
+ mel_square_sum = 0
+ linear_sum = 0
+ linear_square_sum = 0
+ N = 0
+ for item in tqdm(dataset_items):
+ # compute features
+ wav = ap.load_wav(item[1])
+ linear = ap.spectrogram(wav)
+ mel = ap.melspectrogram(wav)
+
+ # compute stats
+ N += mel.shape[1]
+ mel_sum += mel.sum(1)
+ linear_sum += linear.sum(1)
+ mel_square_sum += (mel ** 2).sum(axis=1)
+ linear_square_sum += (linear ** 2).sum(axis=1)
+
+ mel_mean = mel_sum / N
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
+ linear_mean = linear_sum / N
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)
+
+ output_file_path = os.path.join(args.out_path, "scale_stats.npy")
+ stats = {}
+ stats['mel_mean'] = mel_mean
+ stats['mel_std'] = mel_scale
+ stats['linear_mean'] = linear_mean
+ stats['linear_std'] = linear_scale
+
+ # set default config values for mean-var scaling
+ CONFIG.audio['stats_path'] = output_file_path
+ CONFIG.audio['signal_norm'] = True
+ # remove redundant values
+ del CONFIG.audio['max_norm']
+ del CONFIG.audio['min_level_db']
+ del CONFIG.audio['symmetric_norm']
+ del CONFIG.audio['clip_norm']
+ stats['audio_config'] = CONFIG.audio
+ np.save(output_file_path, stats, allow_pickle=True)
+
+
+if __name__ == "__main__":
+ main()
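For reference, the statistics file written by this script can be inspected directly from Python. The snippet below is a minimal sketch, not part of the patch; the `/tmp/ljspeech_stats/` output directory and the prior CLI invocation are hypothetical examples. It shows what `scale_stats.npy` contains and how `config.json`'s `stats_path` is meant to point at it (see the audio-config comments further down in this diff).

```python
# Minimal sketch, assuming compute_statistics.py was run beforehand, e.g.:
#   python compute_statistics.py --config_path config.json --out_path /tmp/ljspeech_stats/
# (the output directory is a hypothetical example path)
import numpy as np

stats = np.load("/tmp/ljspeech_stats/scale_stats.npy", allow_pickle=True).item()

# Per-bin mean/std vectors used for mean-variance scaling of the features.
print(stats["mel_mean"].shape, stats["mel_std"].shape)        # (num_mels,) each
print(stats["linear_mean"].shape, stats["linear_std"].shape)  # (num_freq,) each

# The audio settings the stats were computed with are stored alongside them.
print(stats["audio_config"]["sample_rate"])
```

Pointing `"stats_path"` in `config.json` at this file switches the audio pipeline to mean-std normalization, as noted in the normalization comments below.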
diff --git a/config.json b/config.json
index 2b2b03a5..03907fb0 100644
--- a/config.json
+++ b/config.json
@@ -1,46 +1,56 @@
{
- "model": "Tacotron2", // one of the model in models/
- "run_name": "ljspeech-stft_params",
- "run_description": "tacotron2 cosntant stf parameters",
+ "model": "Tacotron2",
+ "run_name": "ljspeech",
+ "run_description": "tacotron2",
// AUDIO PARAMETERS
"audio":{
+ // stft parameters
+    "num_freq": 513,          // number of stft frequency levels. Size of the linear spectrogram frame.
+    "win_length": 1024,       // stft window length in samples.
+    "hop_length": 256,        // stft window hop length in samples.
+    "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+    "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
+
// Audio processing parameters
- "num_mels": 80, // size of the mel spec frame.
- "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
- "win_length": 1024, // stft window length in ms.
- "hop_length": 256, // stft window hop-lengh in ms.
- "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
- "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
- "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
- "min_level_db": -100, // normalization range
+    "preemphasis": 0.0,       // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+
+ // Silence trimming
+    "do_trim_silence": true,  // enable trimming of silence from audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
+    "trim_db": 60,            // threshold for trimming silence. Set this according to your dataset.
+
+ // Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+
+ // MelSpectrogram parameters
+ "num_mels": 80, // size of the mel spec frame.
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+
// Normalization parameters
- "signal_norm": true, // normalize the spec values in range [0, 1]
+    "signal_norm": true,      // normalize spec values. Mean-var normalization if 'stats_path' is defined; otherwise range normalization defined by the other params.
+ "min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
- "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+ "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
- "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
- "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
- "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
+    "stats_path": null        // DO NOT USE WITH MULTI_SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and the other normalization params are ignored.
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
- "characters":{
- "pad": "_",
- "eos": "~",
- "bos": "^",
- "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
- "punctuations":"!'(),-.:;? ",
- "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
- },
-
+ // "characters":{
+ // "pad": "_",
+ // "eos": "~",
+ // "bos": "^",
+ // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+ // "punctuations":"!'(),-.:;? ",
+ // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+ // },
+
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
@@ -51,10 +61,11 @@
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
- "eval_batch_size":16,
- "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+ "eval_batch_size":16,
+ "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
+ "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
// VALIDATION
"run_eval": true,
@@ -63,39 +74,40 @@
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
- "grad_clip": 1.0, // upper limit for gradients for clipping.
+ "grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
- "wd": 0.000001, // Weight decay weight.
+ "wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
- "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
-
+    "seq_len_norm": false,    // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
+
// TACOTRON PRENET
- "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "memory_size": -1,        // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
"prenet_type": "original", // "original" or "bn".
- "prenet_dropout": true, // enable/disable dropout at prenet.
+ "prenet_dropout": true, // enable/disable dropout at prenet.
// ATTENTION
"attention_type": "original", // 'original' or 'graves'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
"windowing": false, // Enables attention windowing. Used only in eval mode.
- "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
+ "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
- "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "location_attn": true,    // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
// STOPNET
- "stopnet": true, // Train stopnet predicting the end of synthesis.
- "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
+ "stopnet": true, // Train stopnet predicting the end of synthesis.
+    "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It yields a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
+    "print_eval": false,      // If True, it prints loss values during evaluation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
- "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
// DATA LOADING
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
@@ -103,14 +115,14 @@
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
- "max_seq_len": 150, // DATASET-RELATED: maximum text length
+ "max_seq_len": 153, // DATASET-RELATED: maximum text length
// PATHS
- "output_path": "/data4/rw/home/Trainings/",
-
+ "output_path": "/home/erogol/Models/LJSpeech/",
+
// PHONEMES
- "phoneme_cache_path": "mozilla_us_phonemes_2_1", // phoneme computation is slow, therefore, it caches results in the given folder.
- "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
+ "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": false,    // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
@@ -123,7 +135,7 @@
[
{
"name": "ljspeech",
- "path": "/root/LJSpeech-1.1/",
+ "path": "/home/erogol/Data/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"meta_file_val": null
}
diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index d3a6f486..0d884c00 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -13,6 +13,7 @@ class MyDataset(Dataset):
def __init__(self,
outputs_per_step,
text_cleaner,
+ compute_linear_spec,
ap,
meta_data,
tp=None,
@@ -28,6 +29,7 @@ class MyDataset(Dataset):
Args:
outputs_per_step (int): number of time frames predicted per step.
text_cleaner (str): text cleaner used for the dataset.
+ compute_linear_spec (bool): compute linear spectrogram if True.
ap (TTS.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
batch_group_size (int): (0) range of batch randomization after sorting
@@ -47,6 +49,7 @@ class MyDataset(Dataset):
self.outputs_per_step = outputs_per_step
self.sample_rate = ap.sample_rate
self.cleaners = text_cleaner
+ self.compute_linear_spec = compute_linear_spec
self.min_seq_len = min_seq_len
self.max_seq_len = max_seq_len
self.ap = ap
@@ -193,7 +196,6 @@ class MyDataset(Dataset):
# compute features
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
- linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
mel_lengths = [m.shape[1] for m in mel]
@@ -208,25 +210,29 @@ class MyDataset(Dataset):
# PAD sequences with longest instance in the batch
text = prepare_data(text).astype(np.int32)
- wav = prepare_data(wav)
# PAD features with longest instance
- linear = prepare_tensor(linear, self.outputs_per_step)
mel = prepare_tensor(mel, self.outputs_per_step)
- assert mel.shape[2] == linear.shape[2]
# B x D x T --> B x T x D
- linear = linear.transpose(0, 2, 1)
mel = mel.transpose(0, 2, 1)
# convert things to pytorch
text_lenghts = torch.LongTensor(text_lenghts)
text = torch.LongTensor(text)
- linear = torch.FloatTensor(linear).contiguous()
mel = torch.FloatTensor(mel).contiguous()
mel_lengths = torch.LongTensor(mel_lengths)
stop_targets = torch.FloatTensor(stop_targets)
+ # compute linear spectrogram
+ if self.compute_linear_spec:
+ linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
+ linear = prepare_tensor(linear, self.outputs_per_step)
+ linear = linear.transpose(0, 2, 1)
+ assert mel.shape[1] == linear.shape[1]
+ linear = torch.FloatTensor(linear).contiguous()
+ else:
+ linear = None
return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
stop_targets, item_idxs
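Because the collate function now computes linear spectrograms only on demand, callers of `MyDataset` must pass the new `compute_linear_spec` flag. The sketch below is illustrative only: `CONFIG`, `ap`, and `meta_data` stand in for objects produced elsewhere (by `load_config`, `AudioProcessor`, and `load_meta_data`), and the keyword arguments shown are limited to those visible in this diff plus the phoneme options used in the notebooks.

```python
from TTS.datasets.TTSDataset import MyDataset

# Sketch only: CONFIG, ap and meta_data are assumed to exist already.
dataset = MyDataset(
    outputs_per_step=CONFIG.r,
    text_cleaner=CONFIG.text_cleaner,
    # Only Tacotron/TacotronGST train a postnet on linear spectrograms
    # (see TacotronLoss below), so Tacotron2 can skip the extra STFT per batch.
    compute_linear_spec=CONFIG.model.lower() in ["tacotron", "tacotrongst"],
    ap=ap,
    meta_data=meta_data,
    use_phonemes=CONFIG.use_phonemes,
    phoneme_cache_path=CONFIG.phoneme_cache_path,
)
```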
diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 029922d3..ce876edc 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -187,3 +187,21 @@ def libri_tts(root_path, meta_files=None):
for item in items:
assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}"
return items
+
+
+def custom_turkish(root_path, meta_file):
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "turkish-female"
+ skipped_files = []
+ with open(txt_file, 'r', encoding='utf-8') as ttf:
+ for line in ttf:
+ cols = line.split('|')
+ wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
+ if not os.path.exists(wav_file):
+ skipped_files.append(wav_file)
+ continue
+ text = cols[1].strip()
+ items.append([text, wav_file, speaker_name])
+    print(f" [!] {len(skipped_files)} files skipped. They do not exist.")
+ return items
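As a quick illustration of the metadata format this formatter assumes (pipe-separated wav id and transcript, with audio under `wavs/`), the hypothetical call below shows the item layout it returns; the dataset path and the sample line are made up for the example.

```python
from TTS.datasets.preprocess import custom_turkish

# Hypothetical dataset layout:
#   /data/turkish-female/metadata.csv with lines like "kayit_0001|Merhaba dünya."
#   /data/turkish-female/wavs/kayit_0001.wav
items = custom_turkish("/data/turkish-female/", "metadata.csv")

# Each item follows the [text, wav_path, speaker_name] convention used by the
# other formatters in this module.
print(items[0])
# ['Merhaba dünya.', '/data/turkish-female/wavs/kayit_0001.wav', 'turkish-female']
```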
diff --git a/de_sentences.txt b/de_sentences.txt
deleted file mode 100644
index 7c7651d8..00000000
--- a/de_sentences.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Herzlieb, fragte er noch einmal, ist Papa wohl?
-Eine große Ueberraschung.
-Dann gab ihm sein kleines zärtliches Herz plötzlich ein, beide Aermchen um den Hals der Mutter zu schlingen und sie wieder und wieder zu küssen und seine weiche.
-als ob sie ihn nie mehr von sich lassen wollte, und weinte bitterlich.
\ No newline at end of file
diff --git a/debug_config.json b/debug_config.json
deleted file mode 100644
index 51f08ce8..00000000
--- a/debug_config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "num_mels": 80,
- "num_freq": 1024,
- "sample_rate": 20000,
- "frame_length_ms": 50.0,
- "frame_shift_ms": 12.5,
- "preemphasis": 0.97,
- "min_level_db": -100,
- "ref_level_db": 20,
- "hidden_size": 128,
- "embedding_size": 256,
- "text_cleaner": "english_cleaners",
- "epochs": 200,
- "lr": 0.01,
- "lr_patience": 2,
- "lr_decay": 0.5,
- "batch_size": 32,
- "griffinf_lim_iters": 60,
- "power": 1.5,
- "r": 5,
- "num_loader_workers": 16,
- "save_step": 1,
- "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
- "output_path": "result",
- "log_dir": "/home/erogol/projects/TTS/logs/"
-}
diff --git a/distribute.py b/distribute.py
index a5fdb373..b0fc8b07 100644
--- a/distribute.py
+++ b/distribute.py
@@ -9,7 +9,7 @@ import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from TTS.utils.generic_utils import load_config, create_experiment_folder
+from TTS.utils.generic_utils import create_experiment_folder
class DistributedSampler(Sampler):
diff --git a/layers/common_layers.py b/layers/common_layers.py
index 592f017c..b7d02c2d 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -33,7 +33,7 @@ class LinearBN(nn.Module):
super(LinearBN, self).__init__()
self.linear_layer = torch.nn.Linear(
in_features, out_features, bias=bias)
- self.bn = nn.BatchNorm1d(out_features)
+ self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
self._init_w(init_gain)
def _init_w(self, init_gain):
@@ -45,7 +45,7 @@ class LinearBN(nn.Module):
out = self.linear_layer(x)
if len(out.shape) == 3:
out = out.permute(1, 2, 0)
- out = self.bn(out)
+ out = self.batch_normalization(out)
if len(out.shape) == 3:
out = out.permute(2, 0, 1)
return out
@@ -63,18 +63,18 @@ class Prenet(nn.Module):
self.prenet_dropout = prenet_dropout
in_features = [in_features] + out_features[:-1]
if prenet_type == "bn":
- self.layers = nn.ModuleList([
+ self.linear_layers = nn.ModuleList([
LinearBN(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
elif prenet_type == "original":
- self.layers = nn.ModuleList([
+ self.linear_layers = nn.ModuleList([
Linear(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
def forward(self, x):
- for linear in self.layers:
+ for linear in self.linear_layers:
if self.prenet_dropout:
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
else:
@@ -93,7 +93,7 @@ class LocationLayer(nn.Module):
attention_n_filters=32,
attention_kernel_size=31):
super(LocationLayer, self).__init__()
- self.location_conv = nn.Conv1d(
+ self.location_conv1d = nn.Conv1d(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
@@ -104,7 +104,7 @@ class LocationLayer(nn.Module):
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
def forward(self, attention_cat):
- processed_attention = self.location_conv(attention_cat)
+ processed_attention = self.location_conv1d(attention_cat)
processed_attention = self.location_dense(
processed_attention.transpose(1, 2))
return processed_attention
@@ -138,7 +138,7 @@ class GravesAttention(nn.Module):
def init_states(self, inputs):
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
- self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5
+ self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
@@ -164,6 +164,9 @@ class GravesAttention(nn.Module):
b_t = gbk_t[:, 1, :]
k_t = gbk_t[:, 2, :]
+ # dropout to decorrelate attention heads
+ g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
+
# attention GMM parameters
sig_t = torch.nn.functional.softplus(b_t) + self.eps
diff --git a/layers/losses.py b/layers/losses.py
index 7e5671b2..608e247d 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -125,3 +125,113 @@ class BCELossMasked(nn.Module):
x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum')
loss = loss / mask.sum()
return loss
+
+
+class GuidedAttentionLoss(torch.nn.Module):
+ def __init__(self, sigma=0.4):
+ super(GuidedAttentionLoss, self).__init__()
+ self.sigma = sigma
+
+ def _make_ga_masks(self, ilens, olens):
+ B = len(ilens)
+ max_ilen = max(ilens)
+ max_olen = max(olens)
+ ga_masks = torch.zeros((B, max_olen, max_ilen))
+ for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
+ ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma)
+ return ga_masks
+
+ def forward(self, att_ws, ilens, olens):
+ ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device)
+ seq_masks = self._make_masks(ilens, olens).to(att_ws.device)
+ losses = ga_masks * att_ws
+ loss = torch.mean(losses.masked_select(seq_masks))
+ return loss
+
+ @staticmethod
+ def _make_ga_mask(ilen, olen, sigma):
+ grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
+ grid_x, grid_y = grid_x.float(), grid_y.float()
+ return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))
+
+ @staticmethod
+ def _make_masks(ilens, olens):
+ in_masks = sequence_mask(ilens)
+ out_masks = sequence_mask(olens)
+ return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
+
+
+class TacotronLoss(torch.nn.Module):
+ def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
+ super(TacotronLoss, self).__init__()
+ self.stopnet_pos_weight = stopnet_pos_weight
+ self.ga_alpha = c.ga_alpha
+ self.config = c
+ # postnet decoder loss
+ if c.loss_masking:
+ self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
+ "Tacotron"
+ ] else MSELossMasked(c.seq_len_norm)
+ else:
+ self.criterion = nn.L1Loss() if c.model in ["Tacotron"
+ ] else nn.MSELoss()
+ # guided attention loss
+ if c.ga_alpha > 0:
+ self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
+ # stopnet loss
+ # pylint: disable=not-callable
+ self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None
+
+ def forward(self, postnet_output, decoder_output, mel_input, linear_input,
+ stopnet_output, stopnet_target, output_lens, decoder_b_output,
+ alignments, alignment_lens, input_lens):
+
+ return_dict = {}
+ # decoder and postnet losses
+ if self.config.loss_masking:
+ decoder_loss = self.criterion(decoder_output, mel_input,
+ output_lens)
+ if self.config.model in ["Tacotron", "TacotronGST"]:
+ postnet_loss = self.criterion(postnet_output, linear_input,
+ output_lens)
+ else:
+ postnet_loss = self.criterion(postnet_output, mel_input,
+ output_lens)
+ else:
+ decoder_loss = self.criterion(decoder_output, mel_input)
+ if self.config.model in ["Tacotron", "TacotronGST"]:
+ postnet_loss = self.criterion(postnet_output, linear_input)
+ else:
+ postnet_loss = self.criterion(postnet_output, mel_input)
+ loss = decoder_loss + postnet_loss
+ return_dict['decoder_loss'] = decoder_loss
+ return_dict['postnet_loss'] = postnet_loss
+
+ # stopnet loss
+ stop_loss = self.criterion_st(
+ stopnet_output, stopnet_target,
+ output_lens) if self.config.stopnet else torch.zeros(1)
+ if not self.config.separate_stopnet and self.config.stopnet:
+ loss += stop_loss
+ return_dict['stopnet_loss'] = stop_loss
+
+ # backward decoder loss (if enabled)
+ if self.config.bidirectional_decoder:
+ if self.config.loss_masking:
+ decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens)
+ else:
+ decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input)
+ decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output)
+ loss += decoder_b_loss + decoder_c_loss
+ return_dict['decoder_b_loss'] = decoder_b_loss
+ return_dict['decoder_c_loss'] = decoder_c_loss
+
+ # guided attention loss (if enabled)
+ if self.config.ga_alpha > 0:
+ ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
+ loss += ga_loss * self.ga_alpha
+ return_dict['ga_loss'] = ga_loss * self.ga_alpha
+
+ return_dict['loss'] = loss
+ return return_dict
+
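To make the guided-attention term above concrete, the standalone sketch below evaluates the same soft-diagonal mask as `_make_ga_mask` for a toy input/output length; it is illustrative only and not part of the patch.

```python
import torch

# Toy example: 4 input characters, 6 decoder frames, sigma as in the default.
ilen, olen, sigma = 4, 6, 0.4
grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
grid_x, grid_y = grid_x.float(), grid_y.float()

# W[t, n] = 1 - exp(-((n/N - t/T)^2) / (2 * sigma^2)): near zero on the
# time-aligned diagonal, approaching one far away from it.
mask = 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * sigma ** 2))
print(mask)

# GuidedAttentionLoss multiplies the attention matrix by this mask and takes
# the mean over the valid region, so off-diagonal attention is penalized; the
# result is scaled by 'ga_alpha' before being added to the total loss.
```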
diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index fa76a6b2..f11aee65 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -6,130 +6,128 @@ from .common_layers import init_attn, Prenet, Linear
class ConvBNBlock(nn.Module):
- def __init__(self, in_channels, out_channels, kernel_size, nonlinear=None):
+ def __init__(self, in_channels, out_channels, kernel_size, activation=None):
super(ConvBNBlock, self).__init__()
assert (kernel_size - 1) % 2 == 0
padding = (kernel_size - 1) // 2
- conv1d = nn.Conv1d(in_channels,
- out_channels,
- kernel_size,
- padding=padding)
- norm = nn.BatchNorm1d(out_channels)
- dropout = nn.Dropout(p=0.5)
- if nonlinear == 'relu':
- self.net = nn.Sequential(conv1d, norm, nn.ReLU(), dropout)
- elif nonlinear == 'tanh':
- self.net = nn.Sequential(conv1d, norm, nn.Tanh(), dropout)
+ self.convolution1d = nn.Conv1d(in_channels,
+ out_channels,
+ kernel_size,
+ padding=padding)
+ self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
+ self.dropout = nn.Dropout(p=0.5)
+ if activation == 'relu':
+ self.activation = nn.ReLU()
+ elif activation == 'tanh':
+ self.activation = nn.Tanh()
else:
- self.net = nn.Sequential(conv1d, norm, dropout)
+ self.activation = nn.Identity()
def forward(self, x):
- output = self.net(x)
- return output
+ o = self.convolution1d(x)
+ o = self.batch_normalization(o)
+ o = self.activation(o)
+ o = self.dropout(o)
+ return o
class Postnet(nn.Module):
- def __init__(self, mel_dim, num_convs=5):
+ def __init__(self, output_dim, num_convs=5):
super(Postnet, self).__init__()
self.convolutions = nn.ModuleList()
self.convolutions.append(
- ConvBNBlock(mel_dim, 512, kernel_size=5, nonlinear='tanh'))
+ ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
for _ in range(1, num_convs - 1):
self.convolutions.append(
- ConvBNBlock(512, 512, kernel_size=5, nonlinear='tanh'))
+ ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
self.convolutions.append(
- ConvBNBlock(512, mel_dim, kernel_size=5, nonlinear=None))
+ ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
def forward(self, x):
+ o = x
for layer in self.convolutions:
- x = layer(x)
- return x
+ o = layer(o)
+ return o
class Encoder(nn.Module):
- def __init__(self, in_features=512):
+ def __init__(self, output_input_dim=512):
super(Encoder, self).__init__()
- convolutions = []
+ self.convolutions = nn.ModuleList()
for _ in range(3):
- convolutions.append(
- ConvBNBlock(in_features, in_features, 5, 'relu'))
- self.convolutions = nn.Sequential(*convolutions)
- self.lstm = nn.LSTM(in_features,
- int(in_features / 2),
+ self.convolutions.append(
+ ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
+ self.lstm = nn.LSTM(output_input_dim,
+ int(output_input_dim / 2),
num_layers=1,
batch_first=True,
+ bias=True,
bidirectional=True)
self.rnn_state = None
def forward(self, x, input_lengths):
- x = self.convolutions(x)
- x = x.transpose(1, 2)
- x = nn.utils.rnn.pack_padded_sequence(x,
+ o = x
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ o = nn.utils.rnn.pack_padded_sequence(o,
input_lengths,
batch_first=True)
self.lstm.flatten_parameters()
- outputs, _ = self.lstm(x)
- outputs, _ = nn.utils.rnn.pad_packed_sequence(
- outputs,
- batch_first=True,
- )
- return outputs
+ o, _ = self.lstm(o)
+ o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
+ return o
def inference(self, x):
- x = self.convolutions(x)
- x = x.transpose(1, 2)
- self.lstm.flatten_parameters()
- outputs, _ = self.lstm(x)
- return outputs
-
- def inference_truncated(self, x):
- """
- Preserve encoder state for continuous inference
- """
- x = self.convolutions(x)
- x = x.transpose(1, 2)
- self.lstm.flatten_parameters()
- outputs, self.rnn_state = self.lstm(x, self.rnn_state)
- return outputs
+ o = x
+ for layer in self.convolutions:
+ o = layer(o)
+ o = o.transpose(1, 2)
+ # self.lstm.flatten_parameters()
+ o, _ = self.lstm(o)
+ return o
# adapted from https://github.com/NVIDIA/tacotron2/
class Decoder(nn.Module):
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
- def __init__(self, in_features, memory_dim, r, attn_type, attn_win, attn_norm,
+ def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn, trans_agent,
forward_attn_mask, location_attn, attn_K, separate_stopnet,
speaker_embedding_dim):
super(Decoder, self).__init__()
- self.memory_dim = memory_dim
+ self.frame_dim = frame_dim
self.r_init = r
self.r = r
- self.encoder_embedding_dim = in_features
+ self.encoder_embedding_dim = input_dim
self.separate_stopnet = separate_stopnet
+ self.max_decoder_steps = 1000
+ self.gate_threshold = 0.5
+
+ # model dimensions
self.query_dim = 1024
self.decoder_rnn_dim = 1024
self.prenet_dim = 256
- self.max_decoder_steps = 1000
- self.gate_threshold = 0.5
+ self.attn_dim = 128
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
# memory -> |Prenet| -> processed_memory
- prenet_dim = self.memory_dim
- self.prenet = Prenet(
- prenet_dim,
- prenet_type,
- prenet_dropout,
- out_features=[self.prenet_dim, self.prenet_dim],
- bias=False)
+ prenet_dim = self.frame_dim
+ self.prenet = Prenet(prenet_dim,
+ prenet_type,
+ prenet_dropout,
+ out_features=[self.prenet_dim, self.prenet_dim],
+ bias=False)
- self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
- self.query_dim)
+ self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
+ self.query_dim,
+ bias=True)
self.attention = init_attn(attn_type=attn_type,
query_dim=self.query_dim,
- embedding_dim=in_features,
+ embedding_dim=input_dim,
attention_dim=128,
location_attention=location_attn,
attention_location_n_filters=32,
@@ -141,15 +139,16 @@ class Decoder(nn.Module):
forward_attn_mask=forward_attn_mask,
attn_K=attn_K)
- self.decoder_rnn = nn.LSTMCell(self.query_dim + in_features,
- self.decoder_rnn_dim, 1)
+ self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
+ self.decoder_rnn_dim,
+ bias=True)
- self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
- self.memory_dim * self.r_init)
+ self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
+ self.frame_dim * self.r_init)
self.stopnet = nn.Sequential(
nn.Dropout(0.1),
- Linear(self.decoder_rnn_dim + self.memory_dim * self.r_init,
+ Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
1,
bias=True,
init_gain='sigmoid'))
@@ -161,7 +160,7 @@ class Decoder(nn.Module):
def get_go_frame(self, inputs):
B = inputs.size(0)
memory = torch.zeros(1, device=inputs.device).repeat(B,
- self.memory_dim * self.r)
+ self.frame_dim * self.r)
return memory
def _init_states(self, inputs, mask, keep_states=False):
@@ -187,9 +186,9 @@ class Decoder(nn.Module):
Reshape the spectrograms for given 'r'
"""
# Grouping multiple frames if necessary
- if memory.size(-1) == self.memory_dim:
+ if memory.size(-1) == self.frame_dim:
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
- # Time first (T_decoder, B, memory_dim)
+ # Time first (T_decoder, B, frame_dim)
memory = memory.transpose(0, 1)
return memory
@@ -197,22 +196,22 @@ class Decoder(nn.Module):
alignments = torch.stack(alignments).transpose(0, 1)
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
- outputs = outputs.view(outputs.size(0), -1, self.memory_dim)
+ outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
outputs = outputs.transpose(1, 2)
return outputs, stop_tokens, alignments
def _update_memory(self, memory):
if len(memory.shape) == 2:
- return memory[:, self.memory_dim * (self.r - 1):]
- return memory[:, :, self.memory_dim * (self.r - 1):]
+ return memory[:, self.frame_dim * (self.r - 1):]
+ return memory[:, :, self.frame_dim * (self.r - 1):]
def decode(self, memory):
'''
shapes:
- - memory: B x r * self.memory_dim
+ - memory: B x r * self.frame_dim
'''
# self.context: B x D_en
- # query_input: B x D_en + (r * self.memory_dim)
+ # query_input: B x D_en + (r * self.frame_dim)
query_input = torch.cat((memory, self.context), -1)
# self.query and self.attention_rnn_cell_state : B x D_attn_rnn
self.query, self.attention_rnn_cell_state = self.attention_rnn(
@@ -235,16 +234,16 @@ class Decoder(nn.Module):
# B x (D_decoder_rnn + D_en)
decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
dim=1)
- # B x (self.r * self.memory_dim)
+ # B x (self.r * self.frame_dim)
decoder_output = self.linear_projection(decoder_hidden_context)
- # B x (D_decoder_rnn + (self.r * self.memory_dim))
+ # B x (D_decoder_rnn + (self.r * self.frame_dim))
stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
if self.separate_stopnet:
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
# select outputs for the reduction rate self.r
- decoder_output = decoder_output[:, :self.r * self.memory_dim]
+ decoder_output = decoder_output[:, :self.r * self.frame_dim]
return decoder_output, self.attention.attention_weights, stop_token
def forward(self, inputs, memories, mask, speaker_embeddings=None):
diff --git a/models/tacotron2.py b/models/tacotron2.py
index d530774a..3e7adfca 100644
--- a/models/tacotron2.py
+++ b/models/tacotron2.py
@@ -29,7 +29,7 @@ class Tacotron2(nn.Module):
super(Tacotron2, self).__init__()
self.postnet_output_dim = postnet_output_dim
self.decoder_output_dim = decoder_output_dim
- self.n_frames_per_step = r
+ self.r = r
self.bidirectional_decoder = bidirectional_decoder
decoder_dim = 512 if num_speakers > 1 else 512
encoder_dim = 512 if num_speakers > 1 else 512
diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb
deleted file mode 100644
index 7d3a45cf..00000000
--- a/notebooks/Benchmark.ipynb
+++ /dev/null
@@ -1,546 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This is to test TTS models with benchmark sentences for speech synthesis.\n",
- "\n",
- "Before running this script please DON'T FORGET: \n",
- "- to set file paths.\n",
- "- to download related model files from TTS and WaveRNN.\n",
- "- to checkout right commit versions (given next to the model) of TTS and WaveRNN.\n",
- "- to set the right paths in the cell below.\n",
- "\n",
- "Repositories:\n",
- "- TTS: https://github.com/mozilla/TTS\n",
- "- WaveRNN: https://github.com/erogol/WaveRNN"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "TTS_PATH = \"/home/erogol/projects/\"\n",
- "WAVERNN_PATH =\"/home/erogol/projects/\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "import os\n",
- "import sys\n",
- "import io\n",
- "import torch \n",
- "import time\n",
- "import json\n",
- "import numpy as np\n",
- "from collections import OrderedDict\n",
- "from matplotlib import pylab as plt\n",
- "\n",
- "%pylab inline\n",
- "rcParams[\"figure.figsize\"] = (16,5)\n",
- "\n",
- "# add libraries into environment\n",
- "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
- "sys.path.append(WAVERNN_PATH) # set this if TTS is not installed globally\n",
- "\n",
- "import librosa\n",
- "import librosa.display\n",
- "\n",
- "from TTS.models.tacotron import Tacotron \n",
- "from TTS.layers import *\n",
- "from TTS.utils.data import *\n",
- "from TTS.utils.audio import AudioProcessor\n",
- "from TTS.utils.generic_utils import load_config, setup_model\n",
- "from TTS.utils.text import text_to_sequence\n",
- "from TTS.utils.synthesis import synthesis\n",
- "from TTS.utils.visual import visualize\n",
- "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
- "\n",
- "import IPython\n",
- "from IPython.display import Audio\n",
- "\n",
- "import os\n",
- "os.environ['CUDA_VISIBLE_DEVICES']='1'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
- " t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, \n",
- " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
- " use_griffin_lim=use_gl)\n",
- " if CONFIG.model == \"Tacotron\" and not use_gl:\n",
- " # coorect the normalization differences b/w TTS and the Vocoder.\n",
- " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
- " if not use_gl:\n",
- " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
- " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n",
- " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n",
- "\n",
- " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
- " if figures: \n",
- " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec) \n",
- " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
- " os.makedirs(OUT_FOLDER, exist_ok=True)\n",
- " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
- " out_path = os.path.join(OUT_FOLDER, file_name)\n",
- " ap.save_wav(waveform, out_path)\n",
- " return alignment, mel_postnet_spec, stop_tokens, waveform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Set constants\n",
- "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n",
- "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
- "CONFIG_PATH = ROOT_PATH + '/config.json'\n",
- "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
- "CONFIG = load_config(CONFIG_PATH)\n",
- "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n",
- "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n",
- "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n",
- "use_cuda = True\n",
- "\n",
- "# Set some config fields manually for testing\n",
- "# CONFIG.windowing = False\n",
- "# CONFIG.prenet_dropout = False\n",
- "# CONFIG.separate_stopnet = True\n",
- "CONFIG.use_forward_attn = True\n",
- "# CONFIG.forward_attn_mask = True\n",
- "# CONFIG.stopnet = True\n",
- "\n",
- "# Set the vocoder\n",
- "use_gl = True # use GL if True\n",
- "batched_wavernn = True # use batched wavernn inference if True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# LOAD TTS MODEL\n",
- "# multi speaker \n",
- "if CONFIG.use_speaker_embedding:\n",
- " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
- " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
- "else:\n",
- " speakers = []\n",
- " speaker_id = None\n",
- "\n",
- "# if the vocabulary was passed, replace the default\n",
- "if 'characters' in CONFIG.keys():\n",
- " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
- "\n",
- "# load the model\n",
- "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), CONFIG)\n",
- "\n",
- "# load the audio processor\n",
- "ap = AudioProcessor(**CONFIG.audio) \n",
- "\n",
- "\n",
- "# load model state\n",
- "if use_cuda:\n",
- " cp = torch.load(MODEL_PATH)\n",
- "else:\n",
- " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
- "\n",
- "# load the model\n",
- "model.load_state_dict(cp['model'])\n",
- "if use_cuda:\n",
- " model.cuda()\n",
- "model.eval()\n",
- "print(cp['step'])\n",
- "print(cp['r'])\n",
- "\n",
- "# set model stepsize\n",
- "if 'r' in cp:\n",
- " model.decoder.set_r(cp['r'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# LOAD WAVERNN - Make sure you downloaded the model and installed the module\n",
- "if use_gl == False:\n",
- " from WaveRNN.models.wavernn import Model\n",
- " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
- " bits = 10\n",
- " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n",
- " wavernn = Model(\n",
- " rnn_dims=512,\n",
- " fc_dims=512,\n",
- " mode=VOCODER_CONFIG.mode,\n",
- " mulaw=VOCODER_CONFIG.mulaw,\n",
- " pad=VOCODER_CONFIG.pad,\n",
- " upsample_factors=VOCODER_CONFIG.upsample_factors,\n",
- " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n",
- " compute_dims=128,\n",
- " res_out_dims=128,\n",
- " res_blocks=10,\n",
- " hop_length=ap_vocoder.hop_length,\n",
- " sample_rate=ap_vocoder.sample_rate,\n",
- " use_upsample_net = True,\n",
- " use_aux_net = True\n",
- " ).cuda()\n",
- "\n",
- " check = torch.load(VOCODER_MODEL_PATH)\n",
- " wavernn.load_state_dict(check['model'], strict=False)\n",
- " if use_cuda:\n",
- " wavernn.cuda()\n",
- " wavernn.eval();\n",
- " print(check['step'])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Comparision with https://mycroft.ai/blog/available-voices/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.eval()\n",
- "model.decoder.max_decoder_steps = 2000\n",
- "speaker_id = None\n",
- "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.eval()\n",
- "model.decoder.max_decoder_steps = 2000\n",
- "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"The human voice is the most perfect instrument of all.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"This cake is great. It's so delicious and moist.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Comparison with https://keithito.github.io/audio-samples/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \" He has read the whole thing.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"He reads books.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Thisss isrealy awhsome.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"This is your internet browser, Firefox.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"This is your internet browser Firefox.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Eren, how are you?\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Hard Sentences"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Encouraged, he started with a minute a day.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"If he decided to watch TV he really watched it.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# for twb dataset\n",
- "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb
index b5a88611..c747c764 100644
--- a/notebooks/ExtractTTSpectrogram.ipynb
+++ b/notebooks/ExtractTTSpectrogram.ipynb
@@ -7,15 +7,6 @@
"This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "TTS_PATH = \"/home/erogol/projects/\""
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -26,7 +17,6 @@
"%autoreload 2\n",
"import os\n",
"import sys\n",
- "sys.path.append(TTS_PATH)\n",
"import torch\n",
"import importlib\n",
"import numpy as np\n",
@@ -42,7 +32,7 @@
"%matplotlib inline\n",
"\n",
"import os\n",
- "os.environ['CUDA_VISIBLE_DEVICES']='2'"
+ "os.environ['CUDA_VISIBLE_DEVICES']='0'"
]
},
{
@@ -69,12 +59,12 @@
"metadata": {},
"outputs": [],
"source": [
- "OUT_PATH = \"/data/rw/pit/data/turkish-vocoder/\"\n",
- "DATA_PATH = \"/data/rw/home/Turkish\"\n",
+ "OUT_PATH = \"/home/erogol/Data/LJSpeech-1.1/ljspeech-March-17-2020_01+16AM-871588c/\"\n",
+ "DATA_PATH = \"/home/erogol/Data/LJSpeech-1.1/\"\n",
"DATASET = \"ljspeech\"\n",
- "METADATA_FILE = \"metadata.txt\"\n",
- "CONFIG_PATH = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/config.json\"\n",
- "MODEL_FILE = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/checkpoint_255000.pth.tar\"\n",
+ "METADATA_FILE = \"metadata.csv\"\n",
+ "CONFIG_PATH = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/config.json\"\n",
+ "MODEL_FILE = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/checkpoint_420000.pth.tar\"\n",
"BATCH_SIZE = 32\n",
"\n",
"QUANTIZED_WAV = False\n",
@@ -85,6 +75,7 @@
"print(\" > CUDA enabled: \", use_cuda)\n",
"\n",
"C = load_config(CONFIG_PATH)\n",
+ "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n",
"ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)"
]
},
@@ -94,7 +85,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# if the vocabulary was passed, replace the default\n",
+ "# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
@@ -120,7 +111,7 @@
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
- "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
+ "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
"loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)"
]
},
@@ -143,7 +134,7 @@
"metadata = []\n",
"losses = []\n",
"postnet_losses = []\n",
- "criterion = L1LossMasked()\n",
+ "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n",
"with torch.no_grad():\n",
" for data in tqdm(loader):\n",
" # setup input data\n",
@@ -232,7 +223,31 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Check model performance"
+ "### Sanity Check"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "idx = 1\n",
+ "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import soundfile as sf\n",
+ "wav, sr = sf.read(item_idx[idx])\n",
+ "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n",
+ "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
+ "mel_truth = ap.melspectrogram(wav)\n",
+ "print(mel_truth.shape)"
]
},
{
@@ -242,10 +257,8 @@
"outputs": [],
"source": [
"# plot posnet output\n",
- "idx = 1\n",
- "mel_example = postnet_outputs[idx]\n",
- "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
- "print(mel_example[:mel_lengths[1], :].shape)"
+ "plot_spectrogram(mel_postnet, ap);\n",
+ "print(mel_postnet[:mel_lengths[idx], :].shape)"
]
},
{
@@ -255,9 +268,8 @@
"outputs": [],
"source": [
"# plot decoder output\n",
- "mel_example = mel_outputs[idx].data.cpu().numpy()\n",
- "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n",
- "print(mel_example[:mel_lengths[1], :].shape)"
+ "plot_spectrogram(mel_decoder, ap);\n",
+ "print(mel_decoder.shape)"
]
},
{
@@ -267,10 +279,8 @@
"outputs": [],
"source": [
"# plot GT specgrogram\n",
- "wav = ap.load_wav(item_idx[idx])\n",
- "melt = ap.melspectrogram(wav)\n",
- "print(melt.shape)\n",
- "plot_spectrogram(melt.T, ap);"
+ "print(mel_truth.shape)\n",
+ "plot_spectrogram(mel_truth.T, ap);"
]
},
{
@@ -281,9 +291,9 @@
"source": [
"# postnet, decoder diff\n",
"from matplotlib import pylab as plt\n",
- "mel_diff = mel_outputs[idx] - postnet_outputs[idx]\n",
+ "mel_diff = mel_decoder - mel_postnet\n",
"plt.figure(figsize=(16, 10))\n",
- "plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
+ "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
"plt.colorbar()\n",
"plt.tight_layout()"
]
@@ -294,10 +304,25 @@
"metadata": {},
"outputs": [],
"source": [
+ "# PLOT GT SPECTROGRAM diff\n",
"from matplotlib import pylab as plt\n",
- "# mel = mel_poutputs[idx].detach().cpu().numpy()\n",
- "mel = postnet_outputs[idx].detach().cpu().numpy()\n",
- "mel_diff2 = melt.T - mel[:melt.shape[1]]\n",
+ "mel_diff2 = mel_truth.T - mel_decoder\n",
+ "plt.figure(figsize=(16, 10))\n",
+ "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
+ "plt.colorbar()\n",
+ "plt.tight_layout()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PLOT GT SPECTROGRAM diff\n",
+ "from matplotlib import pylab as plt\n",
+ "mel = postnet_outputs[idx]\n",
+ "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n",
"plt.figure(figsize=(16, 10))\n",
"plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
"plt.colorbar()\n",
diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb
index 9d3e5e75..92b1d6c4 100644
--- a/notebooks/TestAttention.ipynb
+++ b/notebooks/TestAttention.ipynb
@@ -2,15 +2,22 @@
"cells": [
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
- "This notebook is to test attention performance on hard sentences taken from DeepVoice paper."
+ "This notebook is to test attention performance of a TTS model on a list of sentences taken from DeepVoice paper.\n",
+ "### Features of this notebook\n",
+ "- You can see visually how your model performs on each sentence and try to dicern common problems.\n",
+ "- At the end, final attention score would be printed showing the ultimate performace of your model. You can use this value to perform model selection.\n",
+ "- You can change the list of sentences byt providing a different sentence file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
+ "Collapsed": "false",
"scrolled": true
},
"outputs": [],
@@ -31,7 +38,8 @@
"\n",
"from TTS.layers import *\n",
"from TTS.utils.audio import AudioProcessor\n",
- "from TTS.utils.generic_utils import load_config, setup_model\n",
+ "from TTS.utils.generic_utils import setup_model\n",
+ "from TTS.utils.io import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"from TTS.utils.synthesis import synthesis\n",
"from TTS.utils.visual import plot_alignment\n",
@@ -40,19 +48,12 @@
"import IPython\n",
"from IPython.display import Audio\n",
"\n",
- "os.environ['CUDA_VISIBLE_DEVICES']='2'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
+ "os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
+ "\n",
"def tts(model, text, CONFIG, use_cuda, ap):\n",
" t_1 = time.time()\n",
" # run the model\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
+ " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" # plotting\n",
@@ -66,18 +67,11 @@
" file_name = text[:200].replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
" ap.save_wav(waveform, out_path)\n",
- " return attn_score"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
+ " return attn_score\n",
+ "\n",
"# Set constants\n",
- "ROOT_PATH = '/data/rw/pit/keep/ljspeech-December-11-2019_04+32PM-ca49ae8/'\n",
- "MODEL_PATH = ROOT_PATH + '/checkpoint_410000.pth.tar'\n",
+ "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
+ "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = './hard_sentences/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
@@ -96,7 +90,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
@@ -144,34 +140,25 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"model.decoder.max_decoder_steps=3000\n",
- "model.decoder.prenet.train()\n",
"attn_scores = []\n",
"with open(SENTENCES_PATH, 'r') as f:\n",
" for text in f:\n",
- " try:\n",
- " attn_score = tts(model, text, CONFIG, use_cuda, ap)\n",
- " except ValueError:\n",
- " attn_score = 0\n",
+ " attn_score = tts(model, text, CONFIG, use_cuda, ap)\n",
" attn_scores.append(attn_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.mean(attn_scores)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"np.mean(attn_scores)"
diff --git a/requirements.txt b/requirements.txt
index 47fa1ec0..5f31db70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy>=1.14.3
+numpy>=1.16.0
torch>=0.4.1
librosa>=0.5.1
Unidecode>=0.4.20
diff --git a/requirements_tests.txt b/requirements_tests.txt
new file mode 100644
index 00000000..1e0615b2
--- /dev/null
+++ b/requirements_tests.txt
@@ -0,0 +1,15 @@
+numpy>=1.16.0
+torch>=0.4.1
+tensorflow>=2.2
+librosa>=0.5.1
+Unidecode>=0.4.20
+tensorboard
+tensorboardX
+matplotlib
+Pillow
+flask
+scipy
+tqdm
+soundfile
+phonemizer
+bokeh==1.4.0
diff --git a/server/README.md b/server/README.md
index 0563ef94..3c65c961 100644
--- a/server/README.md
+++ b/server/README.md
@@ -7,7 +7,7 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple
#### Development server:
##### Using server.py
-If you have the environment set already for TTS, then you can directly call ```setup.py```.
+If you have already set up the environment for TTS, then you can directly call ```server.py```.
##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
diff --git a/server/synthesizer.py b/server/synthesizer.py
index 10f4fb0a..392dcc4a 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -7,7 +7,8 @@ import torch
import yaml
from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import load_config, setup_model
+from TTS.utils.io import load_config
+from TTS.utils.generic_utils import setup_model
from TTS.utils.speakers import load_speaker_mapping
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
@@ -164,19 +165,25 @@ class Synthesizer(object):
sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences
return sentences
- def tts(self, text):
+ def tts(self, text, speaker_id=None):
wavs = []
sens = self.split_into_sentences(text)
print(sens)
+ speaker_id = id_to_torch(speaker_id)
+ if speaker_id is not None and self.use_cuda:
+ speaker_id = speaker_id.cuda()
+
for sen in sens:
# preprocess the given text
- inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
+ inputs = text_to_seqvec(sen, self.tts_config)
+ inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
+ inputs = inputs.unsqueeze(0)
# synthesize voice
- decoder_output, postnet_output, alignments, _ = run_model(
- self.tts_model, inputs, self.tts_config, False, None, None)
+ decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
+ self.tts_model, inputs, self.tts_config, False, speaker_id, None)
# convert outputs to numpy
- postnet_output, decoder_output, _ = parse_outputs(
- postnet_output, decoder_output, alignments)
+ postnet_output, decoder_output, _, _ = parse_outputs_torch(
+ postnet_output, decoder_output, alignments, stop_tokens)
if self.pwgan:
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
diff --git a/setup.py b/setup.py
index f92dac8a..5e89723b 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args()
# Remove our arguments from argv so that setuptools doesn't see them
sys.argv = [sys.argv[0]] + unknown_args
-version = '0.0.1'
+version = '0.0.2'
# Adapted from https://github.com/pytorch/pytorch
cwd = os.path.dirname(os.path.abspath(__file__))
@@ -93,7 +93,7 @@ setup(
install_requires=[
"scipy>=0.19.0",
"torch>=0.4.1",
- "numpy==1.15.4",
+ "numpy>=1.16.0",
"librosa==0.6.2",
"unidecode==0.4.20",
"attrdict",
diff --git a/speaker_encoder/config.json b/speaker_encoder/config.json
index 79c42bc0..0d0f8f68 100644
--- a/speaker_encoder/config.json
+++ b/speaker_encoder/config.json
@@ -34,6 +34,7 @@
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 1, // Number of steps to log traning on console.
"output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
+ "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"model": {
"input_dim": 40,
"proj_dim": 128,
diff --git a/speaker_encoder/tests.py b/speaker_encoder/tests.py
index 220ba360..039833fc 100644
--- a/speaker_encoder/tests.py
+++ b/speaker_encoder/tests.py
@@ -4,7 +4,7 @@ import torch as T
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.loss import GE2ELoss
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
diff --git a/speaker_encoder/train.py b/speaker_encoder/train.py
index 19067401..0a137360 100644
--- a/speaker_encoder/train.py
+++ b/speaker_encoder/train.py
@@ -44,7 +44,7 @@ def setup_loader(ap, is_val=False, verbose=False):
loader = DataLoader(dataset,
batch_size=c.num_speakers_in_batch,
shuffle=False,
- num_workers=0,
+ num_workers=c.num_loader_workers,
collate_fn=dataset.collate_fn)
return loader
diff --git a/synthesize.py b/synthesize.py
index 1f1ce36f..18048c2f 100644
--- a/synthesize.py
+++ b/synthesize.py
@@ -7,7 +7,8 @@ import json
import string
from TTS.utils.synthesis import synthesis
-from TTS.utils.generic_utils import load_config, setup_model
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config
from TTS.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.utils.audio import AudioProcessor
@@ -25,7 +26,7 @@ def tts(model,
figures=False):
t_1 = time.time()
use_vocoder_model = vocoder_model is not None
- waveform, alignment, _, postnet_output, stop_tokens = synthesis(
+ waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
model, text, C, use_cuda, ap, speaker_id, style_wav=False,
truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
diff --git a/test_cluster.py b/test_cluster.py
deleted file mode 100644
index daeeedc3..00000000
--- a/test_cluster.py
+++ /dev/null
@@ -1 +0,0 @@
-print("Python is running!!")
diff --git a/tests/inputs/scale_stats.npy b/tests/inputs/scale_stats.npy
new file mode 100644
index 00000000..5368ecb2
Binary files /dev/null and b/tests/inputs/scale_stats.npy differ
diff --git a/tests/test_audio.py b/tests/test_audio.py
index fc5deb48..4b8ee276 100644
--- a/tests/test_audio.py
+++ b/tests/test_audio.py
@@ -3,7 +3,7 @@ import unittest
from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
@@ -13,6 +13,7 @@ os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
+# pylint: disable=protected-access
class TestAudio(unittest.TestCase):
def __init__(self, *args, **kwargs):
super(TestAudio, self).__init__(*args, **kwargs)
@@ -33,7 +34,7 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = clip_norm
wav = self.ap.load_wav(WAV_FILE)
mel = self.ap.melspectrogram(wav)
- wav_ = self.ap.inv_mel_spectrogram(mel)
+ wav_ = self.ap.inv_melspectrogram(mel)
file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
.format(max_norm, signal_norm, symmetric_norm, clip_norm)
print(" | > Creating wav file at : ", file_name)
@@ -56,6 +57,7 @@ class TestAudio(unittest.TestCase):
"""Check normalization and denormalization for range values and consistency """
print(" > Testing normalization and denormalization.")
wav = self.ap.load_wav(WAV_FILE)
+ wav = self.ap.sound_norm(wav) # normalize the audio to get a better normalization range below.
self.ap.signal_norm = False
x = self.ap.melspectrogram(wav)
x_old = x
@@ -65,7 +67,7 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
@@ -79,7 +81,9 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+
+
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
@@ -93,7 +97,9 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+
+
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
@@ -108,7 +114,9 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+
+
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
@@ -122,7 +130,9 @@ class TestAudio(unittest.TestCase):
self.ap.symmetric_norm = False
self.ap.max_norm = 1.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+
+
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= 0, x_norm.min()
@@ -133,10 +143,33 @@ class TestAudio(unittest.TestCase):
self.ap.symmetric_norm = True
self.ap.max_norm = 1.0
x_norm = self.ap._normalize(x)
- print(x_norm.max(), " -- ", x_norm.min())
+ print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+
+
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
assert x_norm.min() < 0, x_norm.min()
x_ = self.ap._denormalize(x_norm)
assert (x - x_).sum() < 1e-3
+
+ def test_scaler(self):
+ scaler_stats_path = os.path.join(get_tests_input_path(), 'scale_stats.npy')
+ conf.audio['stats_path'] = scaler_stats_path
+ conf.audio['preemphasis'] = 0.0
+ conf.audio['do_trim_silence'] = True
+ conf.audio['signal_norm'] = True
+
+ ap = AudioProcessor(**conf.audio)
+ mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
+ ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
+
+ self.ap.signal_norm = False
+ self.ap.preemphasis = 0.0
+
+ # test scaler forward and backward transforms
+ wav = self.ap.load_wav(WAV_FILE)
+ mel_reference = self.ap.melspectrogram(wav)
+ mel_norm = ap.melspectrogram(wav)
+ mel_denorm = ap._denormalize(mel_norm)
+ assert abs(mel_reference - mel_denorm).max() < 1e-4
diff --git a/tests/test_config.json b/tests/test_config.json
index 6d63e6ab..e9cd48cf 100644
--- a/tests/test_config.json
+++ b/tests/test_config.json
@@ -2,10 +2,12 @@
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
"num_mels": 80, // size of the mel spec frame.
- "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
+ "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
- "frame_length_ms": 50, // stft window length in ms.
- "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
+ "frame_length_ms": null, // stft window length in ms.
+ "frame_shift_ms": null, // stft window hop-lengh in ms.
+ "hop_length": 256,
+ "win_length": 1024,
"preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
@@ -15,8 +17,8 @@
"symmetric_norm": true, // move normalization to range [-1, 1]
"clip_norm": true, // clip normalized values into the range.
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
- "mel_fmin": 95, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!!
+ "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false
},
diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py
index a0837686..51cbf341 100644
--- a/tests/test_demo_server.py
+++ b/tests/test_demo_server.py
@@ -6,7 +6,8 @@ import torch as T
from TTS.server.synthesizer import Synthesizer
from TTS.tests import get_tests_input_path, get_tests_output_path
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
-from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config, save_checkpoint
class DemoServerTest(unittest.TestCase):
@@ -21,7 +22,7 @@ class DemoServerTest(unittest.TestCase):
num_chars = len(phonemes) if config.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, config)
output_path = os.path.join(get_tests_output_path())
- save_checkpoint(model, None, None, None, output_path, 10, 10)
+ save_checkpoint(model, None, 10, 10, 1, output_path)
def test_in_out(self):
self._create_random_model()
diff --git a/tests/test_loader.py b/tests/test_loader.py
index d835c5d3..9edd233f 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -5,7 +5,7 @@ import torch
import numpy as np
from torch.utils.data import DataLoader
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
from TTS.datasets import TTSDataset
from TTS.datasets.preprocess import ljspeech
@@ -36,6 +36,7 @@ class TestTTSDataset(unittest.TestCase):
dataset = TTSDataset.MyDataset(
r,
c.text_cleaner,
+ compute_linear_spec=True,
ap=self.ap,
meta_data=items,
tp=c.characters if 'characters' in c.keys() else None,
@@ -142,7 +143,7 @@ class TestTTSDataset(unittest.TestCase):
# check mel-spec correctness
mel_spec = mel_input[0].cpu().numpy()
- wav = self.ap.inv_mel_spectrogram(mel_spec.T)
+ wav = self.ap.inv_melspectrogram(mel_spec.T)
self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')
@@ -200,7 +201,8 @@ class TestTTSDataset(unittest.TestCase):
# check the second itme in the batch
assert linear_input[1 - idx, -1].sum() == 0
assert mel_input[1 - idx, -1].sum() == 0
- assert stop_target[1 - idx, -1] == 1
+ assert stop_target[1, mel_lengths[1]-1] == 1
+ assert stop_target[1, mel_lengths[1]:].sum() == 0
assert len(mel_lengths.shape) == 1
# check batch zero-frame conditions (zero-frame disabled)
diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py
index aa2869eb..eb91b3cc 100644
--- a/tests/test_tacotron2_model.py
+++ b/tests/test_tacotron2_model.py
@@ -6,7 +6,7 @@ import numpy as np
from torch import optim
from torch import nn
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
from TTS.layers.losses import MSELossMasked
from TTS.models.tacotron2 import Tacotron2
diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py
new file mode 100644
index 00000000..aca363a8
--- /dev/null
+++ b/tests/test_tacotron2_tf_model.py
@@ -0,0 +1,63 @@
+import os
+import torch
+import unittest
+import numpy as np
+import tensorflow as tf
+
+from TTS.utils.io import load_config
+from TTS.tf.models.tacotron2 import Tacotron2
+
+#pylint: disable=unused-variable
+
+torch.manual_seed(1)
+use_cuda = torch.cuda.is_available()
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+file_path = os.path.dirname(os.path.realpath(__file__))
+c = load_config(os.path.join(file_path, 'test_config.json'))
+
+
+class TacotronTFTrainTest(unittest.TestCase):
+
+ @staticmethod
+ def generate_dummy_inputs():
+ chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
+ chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
+ chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
+ mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
+ mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
+ mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+ stop_targets = torch.zeros(8, 30, 1).float().to(device)
+ speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+
+ chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
+ chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
+ mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
+ return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
+ stop_targets, speaker_ids
+
+ def test_train_step(self):
+ ''' test forward pass '''
+ chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
+ stop_targets, speaker_ids = self.generate_dummy_inputs()
+
+ for idx in mel_lengths:
+ stop_targets[:, int(idx.item()):, 0] = 1.0
+
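+ # group stop targets into chunks of r frames; a chunk counts as a stop if any of its frames is flagged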
+ stop_targets = stop_targets.view(chars_seq.shape[0],
+ stop_targets.size(1) // c.r, -1)
+ stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
+
+ model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
+ # training pass
+ output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)
+
+ # check model output shapes
+ assert np.all(output[0].shape == mel_spec.shape)
+ assert np.all(output[1].shape == mel_spec.shape)
+ assert output[2].shape[2] == chars_seq.shape[1]
+ assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
+ assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)
+
+ # inference pass
+ output = model(chars_seq, training=False)
diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py
index ac6712b0..7053a580 100644
--- a/tests/test_tacotron_model.py
+++ b/tests/test_tacotron_model.py
@@ -5,7 +5,7 @@ import unittest
from torch import optim
from torch import nn
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
from TTS.layers.losses import L1LossMasked
from TTS.models.tacotron import Tacotron
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index 6c0c7058..93edabe7 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -5,7 +5,7 @@ import os
import unittest
from TTS.utils.text import *
from TTS.tests import get_tests_path
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
TESTS_PATH = get_tests_path()
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
@@ -92,4 +92,4 @@ def test_text2phone():
gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
lang = "en-us"
ph = text2phone(text, lang)
- assert gt == ph, f"\n{phonemes} \n vs \n{gt}"
\ No newline at end of file
+ assert gt == ph, f"\n{phonemes} \n vs \n{gt}"
diff --git a/tf/README.md b/tf/README.md
new file mode 100644
index 00000000..0f9d58e9
--- /dev/null
+++ b/tf/README.md
@@ -0,0 +1,20 @@
+## Utilities to Convert Models to Tensorflow 2
+Here are experimental utilities to convert trained Torch models to Tensorflow (>=2.2).
+
+Converting Torch models to TF makes the whole TF toolkit available for better deployment and device-specific optimizations.
+
+Note that we do not plan to share training scripts for Tensorflow in the near future, but any contribution in that direction would be more than welcome.
+
+To see how you can use the TF model at inference, check the notebook; a minimal sketch is also given below.
+
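+The snippet below is only a minimal sketch, assuming the constructor and call signature shown in ```tests/test_tacotron2_tf_model.py``` from this PR: it builds an untrained model with placeholder hyperparameters and runs a forward pass on dummy character ids. Loading converted weights is covered in the notebook.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+from TTS.tf.models.tacotron2 import Tacotron2
+
+# placeholder hyperparameters; in practice take num_chars, num_speakers and r from your config
+model = Tacotron2(num_chars=24, num_speakers=5, r=7)
+
+# dummy character ids with shape [batch, time]
+chars = tf.convert_to_tensor(np.random.randint(0, 24, (1, 128)))
+
+# inference pass; following the training path in this PR, the outputs are
+# (decoder frames, postnet frames, alignments, stop tokens)
+outputs = model(chars, training=False)
+print([o.shape for o in outputs])
+```
+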
+This is an experimental release. If you encounter an error, please open an issue or, better yet, send a PR; but you are mostly on your own.
+
+
+### Converting a Model
+- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments.
+
+### Known issues and limitations
+- We use a custom model save/load mechanism which lets us store model-related information together with the model weights (similar to Torch). However, it is prone to random errors.
+- The current TF model implementation is slightly slower than the Torch model. Hopefully, it will get better as TF support for eager mode and ```tf.function``` improves.
+- The TF implementation of Tacotron2 only supports the regular Tacotron2 architecture as in the paper.
+- You can only convert models trained after the TF model implementation was added, since the model layers have been updated in the Torch model.
diff --git a/tf/convert_tacotron2_torch_to_tf.py b/tf/convert_tacotron2_torch_to_tf.py
new file mode 100644
index 00000000..b1878343
--- /dev/null
+++ b/tf/convert_tacotron2_torch_to_tf.py
@@ -0,0 +1,211 @@
+# %%
+import sys
+sys.path.append('/home/erogol/Projects')
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+# %%
+import argparse
+import numpy as np
+import torch
+import tensorflow as tf
+from fuzzywuzzy import fuzz
+
+from TTS.utils.text.symbols import phonemes, symbols
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config
+from TTS.tf.models.tacotron2 import Tacotron2
+from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
+from TTS.tf.utils.generic_utils import save_checkpoint
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--torch_model_path',
+ type=str,
+ help='Path to target torch model to be converted to TF.')
+parser.add_argument('--config_path',
+ type=str,
+ help='Path to config file of torch model.')
+parser.add_argument('--output_path',
+ type=str,
+ help='path to save TF model weights.')
+args = parser.parse_args()
+
+# load model config
+config_path = args.config_path
+c = load_config(config_path)
+num_speakers = 0
+
+# init torch model
+num_chars = len(phonemes) if c.use_phonemes else len(symbols)
+model = setup_model(num_chars, num_speakers, c)
+checkpoint = torch.load(args.torch_model_path,
+ map_location=torch.device('cpu'))
+state_dict = checkpoint['model']
+model.load_state_dict(state_dict)
+
+# init tf model
+model_tf = Tacotron2(num_chars=num_chars,
+ num_speakers=num_speakers,
+ r=model.decoder.r,
+ postnet_output_dim=c.audio['num_mels'],
+ decoder_output_dim=c.audio['num_mels'],
+ attn_type=c.attention_type,
+ attn_win=c.windowing,
+ attn_norm=c.attention_norm,
+ prenet_type=c.prenet_type,
+ prenet_dropout=c.prenet_dropout,
+ forward_attn=c.use_forward_attn,
+ trans_agent=c.transition_agent,
+ forward_attn_mask=c.forward_attn_mask,
+ location_attn=c.location_attn,
+ attn_K=c.attention_heads,
+ separate_stopnet=c.separate_stopnet,
+ bidirectional_decoder=c.bidirectional_decoder)
+
+# set initial layer mapping - these are not captured by the below heuristic approach
+# TODO: set layer names so that we can remove these manual matching
+common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
+var_map = [
+ ('tacotron2/embedding/embeddings:0', 'embedding.weight'),
+ ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
+ 'encoder.lstm.weight_ih_l0'),
+ ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
+ 'encoder.lstm.weight_hh_l0'),
+ ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
+ 'encoder.lstm.weight_ih_l0_reverse'),
+ ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
+ 'encoder.lstm.weight_hh_l0_reverse'),
+ ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
+ ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
+ ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
+ ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
+ ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
+ ('decoder/linear_projection/kernel:0',
+ 'decoder.linear_projection.linear_layer.weight'),
+ ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
+]
+
+# %%
+# get tf_model graph
+input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()
+mel_pred = model_tf(input_ids, training=False)
+
+# get tf variables
+tf_vars = model_tf.weights
+
+# match variable names with fuzzy logic
+torch_var_names = list(state_dict.keys())
+tf_var_names = [we.name for we in model_tf.weights]
+for tf_name in tf_var_names:
+ # skip re-mapped layer names
+ if tf_name in [name[0] for name in var_map]:
+ continue
+ tf_name_edited = convert_tf_name(tf_name)
+ ratios = [
+ fuzz.ratio(torch_name, tf_name_edited)
+ for torch_name in torch_var_names
+ ]
+ max_idx = np.argmax(ratios)
+ matching_name = torch_var_names[max_idx]
+ del torch_var_names[max_idx]
+ var_map.append((tf_name, matching_name))
+
+# %%
+# print variable match
+from pprint import pprint
+pprint(var_map)
+pprint(torch_var_names)
+
+# pass weights
+tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
+
+# Compare TF and TORCH models
+# %%
+# check embedding outputs
+model.eval()
+input_ids = torch.randint(0, 24, (1, 128)).long()
+
+o_t = model.embedding(input_ids)
+o_tf = model_tf.embedding(input_ids.detach().numpy())
+assert abs(o_t.detach().numpy() -
+ o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
+ o_tf.numpy()).sum()
+
+# compare encoder outputs
+oo_en = model.encoder.inference(o_t.transpose(1, 2))
+ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
+assert compare_torch_tf(oo_en, ooo_en) < 1e-5
+
+#pylint: disable=redefined-builtin
+# compare decoder.attention_rnn
+inp = torch.rand([1, 768])
+inp_tf = inp.numpy()
+model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
+output, cell_state = model.decoder.attention_rnn(inp)
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
+output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
+ states[2],
+ training=False)
+assert compare_torch_tf(output, output_tf).mean() < 1e-5
+
+query = output
+inputs = torch.rand([1, 128, 512])
+query_tf = query.detach().numpy()
+inputs_tf = inputs.numpy()
+
+# compare decoder.attention
+model.decoder.attention.init_states(inputs)
+processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
+loc_attn, proc_query = model.decoder.attention.get_location_attention(
+ query, processes_inputs)
+context = model.decoder.attention(query, inputs, processes_inputs, None)
+
+attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1]
+model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))
+loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states)
+context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False)
+
+assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5
+assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5
+assert compare_torch_tf(context, context_tf) < 1e-5
+
+# compare decoder.decoder_rnn
+input = torch.rand([1, 1536])
+input_tf = input.numpy()
+model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
+output, cell_state = model.decoder.decoder_rnn(
+ input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
+output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
+ states[3],
+ training=False)
+assert abs(input - input_tf).mean() < 1e-5
+assert compare_torch_tf(output, output_tf).mean() < 1e-5
+
+# compare decoder.linear_projection
+input = torch.rand([1, 1536])
+input_tf = input.numpy()
+output = model.decoder.linear_projection(input)
+output_tf = model_tf.decoder.linear_projection(input_tf, training=False)
+assert compare_torch_tf(output, output_tf) < 1e-5
+
+# compare decoder outputs
+model.decoder.max_decoder_steps = 100
+model_tf.decoder.set_max_decoder_steps(100)
+output, align, stop = model.decoder.inference(oo_en)
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
+output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
+assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4
+
+# compare the whole model output
+outputs_torch = model.inference(input_ids)
+outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
+print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
+assert compare_torch_tf(outputs_torch[2][:, 50, :],
+ outputs_tf[2][:, 50, :]) < 1e-5
+assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
+
+# %%
+# save tf model
+save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
+ checkpoint['r'], args.output_path)
+print(' > Model conversion is successfully completed :).')
diff --git a/tf/layers/common_layers.py b/tf/layers/common_layers.py
new file mode 100644
index 00000000..995b5490
--- /dev/null
+++ b/tf/layers/common_layers.py
@@ -0,0 +1,256 @@
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.python.ops import math_ops
+# from tensorflow_addons.seq2seq import BahdanauAttention
+
+
+class Linear(keras.layers.Layer):
+ def __init__(self, units, use_bias, **kwargs):
+ super(Linear, self).__init__(**kwargs)
+ self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
+ self.activation = keras.layers.ReLU()
+
+ def call(self, x):
+ """
+ shapes:
+ x: B x T x C
+ """
+ return self.activation(self.linear_layer(x))
+
+
+class LinearBN(keras.layers.Layer):
+ def __init__(self, units, use_bias, **kwargs):
+ super(LinearBN, self).__init__(**kwargs)
+ self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
+ self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization')
+ self.activation = keras.layers.ReLU()
+
+ def call(self, x, training=None):
+ """
+ shapes:
+ x: B x T x C
+ """
+ out = self.linear_layer(x)
+ out = self.batch_normalization(out, training=training)
+ return self.activation(out)
+
+
+class Prenet(keras.layers.Layer):
+ def __init__(self,
+ prenet_type,
+ prenet_dropout,
+ units,
+ bias,
+ **kwargs):
+ super(Prenet, self).__init__(**kwargs)
+ self.prenet_type = prenet_type
+ self.prenet_dropout = prenet_dropout
+ self.linear_layers = []
+ if prenet_type == "bn":
+ self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
+ elif prenet_type == "original":
+ self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
+ else:
+ raise RuntimeError(' [!] Unknown prenet type.')
+ if prenet_dropout:
+ self.dropout = keras.layers.Dropout(rate=0.5)
+
+ def call(self, x, training=None):
+ """
+ shapes:
+ x: B x T x C
+ """
+ for linear in self.linear_layers:
+ if self.prenet_dropout:
+ x = self.dropout(linear(x), training=training)
+ else:
+ x = linear(x)
+ return x
+
+
+def _sigmoid_norm(score):
+ attn_weights = tf.nn.sigmoid(score)
+ attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True)
+ return attn_weights
+
+
+class Attention(keras.layers.Layer):
+ """TODO: implement forward_attention
+ TODO: location sensitive attention
+ TODO: implement attention windowing """
+ def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters,
+ loc_attn_kernel_size, use_windowing, norm, use_forward_attn,
+ use_trans_agent, use_forward_attn_mask, **kwargs):
+ super(Attention, self).__init__(**kwargs)
+ self.use_loc_attn = use_loc_attn
+ self.loc_attn_n_filters = loc_attn_n_filters
+ self.loc_attn_kernel_size = loc_attn_kernel_size
+ self.use_windowing = use_windowing
+ self.norm = norm
+ self.use_forward_attn = use_forward_attn
+ self.use_trans_agent = use_trans_agent
+ self.use_forward_attn_mask = use_forward_attn_mask
+ self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer')
+ self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer')
+ self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer')
+ if use_loc_attn:
+ self.location_conv1d = keras.layers.Conv1D(
+ filters=loc_attn_n_filters,
+ kernel_size=loc_attn_kernel_size,
+ padding='same',
+ use_bias=False,
+ name='location_layer/location_conv1d')
+ self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense')
+ if norm == 'softmax':
+ self.norm_func = tf.nn.softmax
+ elif norm == 'sigmoid':
+ self.norm_func = _sigmoid_norm
+ else:
+ raise ValueError("Unknown value for attention norm type")
+
+ def init_states(self, batch_size, value_length):
+ states = ()
+ if self.use_loc_attn:
+ attention_cum = tf.zeros([batch_size, value_length])
+ attention_old = tf.zeros([batch_size, value_length])
+ states = (attention_cum, attention_old)
+ return states
+
+ def process_values(self, values):
+ """ cache values for decoder iterations """
+ #pylint: disable=attribute-defined-outside-init
+ self.processed_values = self.inputs_layer(values)
+ self.values = values
+
+ def get_loc_attn(self, query, states):
+ """ compute location attention, query layer and
+ unnorm. attention weights"""
+ attention_cum, attention_old = states
+ attn_cat = tf.stack([attention_old, attention_cum], axis=2)
+
+ processed_query = self.query_layer(tf.expand_dims(query, 1))
+ processed_attn = self.location_dense(self.location_conv1d(attn_cat))
+ score = self.v(
+ tf.nn.tanh(self.processed_values + processed_query +
+ processed_attn))
+ score = tf.squeeze(score, axis=2)
+ return score, processed_query
+
+ def get_attn(self, query):
+ """ compute query layer and unnormalized attention weights """
+ processed_query = self.query_layer(tf.expand_dims(query, 1))
+ score = self.v(tf.nn.tanh(self.processed_values + processed_query))
+ score = tf.squeeze(score, axis=2)
+ return score, processed_query
+
+ def apply_score_masking(self, score, mask): #pylint: disable=no-self-use
+ """ ignore sequence paddings """
+ padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
+ # Bias so padding positions do not contribute to attention distribution.
+ score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
+ return score
+
+ def call(self, query, states):
+ """
+ shapes:
+ query: B x D
+ """
+ if self.use_loc_attn:
+ score, _ = self.get_loc_attn(query, states)
+ else:
+ score, _ = self.get_attn(query)
+
+ # TODO: masking
+ # if mask is not None:
+ # self.apply_score_masking(score, mask)
+ # attn_weights shape == (batch_size, max_length, 1)
+
+ attn_weights = self.norm_func(score)
+
+ # update attention states
+ if self.use_loc_attn:
+ states = (states[0] + attn_weights, attn_weights)
+ else:
+ states = ()
+
+ # context_vector shape after sum == (batch_size, hidden_size)
+ context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False)
+ context_vector = tf.squeeze(context_vector, axis=1)
+ return context_vector, attn_weights, states
+
+
+# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b):
+# dtype = processed_query.dtype
+# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1]
+# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2])
+
+
+# class LocationSensitiveAttention(BahdanauAttention):
+# def __init__(self,
+# units,
+# memory=None,
+# memory_sequence_length=None,
+# normalize=False,
+# probability_fn="softmax",
+# kernel_initializer="glorot_uniform",
+# dtype=None,
+# name="LocationSensitiveAttention",
+# location_attention_filters=32,
+# location_attention_kernel_size=31):
+
+# super(LocationSensitiveAttention,
+# self).__init__(units=units,
+# memory=memory,
+# memory_sequence_length=memory_sequence_length,
+# normalize=normalize,
+# probability_fn='softmax', ## parent module default
+# kernel_initializer=kernel_initializer,
+# dtype=dtype,
+# name=name)
+# if probability_fn == 'sigmoid':
+# self.probability_fn = lambda score, _: self._sigmoid_normalization(score)
+# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False)
+# self.location_dense = keras.layers.Dense(units, use_bias=False)
+# # self.v = keras.layers.Dense(1, use_bias=True)
+
+# def _location_sensitive_score(self, processed_query, keys, processed_loc):
+# processed_query = tf.expand_dims(processed_query, 1)
+# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2])
+
+# def _location_sensitive(self, alignment_cum, alignment_old):
+# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2)
+# return self.location_dense(self.location_conv(alignment_cat))
+
+# def _sigmoid_normalization(self, score):
+# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True)
+
+# # def _apply_masking(self, score, mask):
+# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
+# # # Bias so padding positions do not contribute to attention distribution.
+# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
+# # return score
+
+# def _calculate_attention(self, query, state):
+# alignment_cum, alignment_old = state[:2]
+# processed_query = self.query_layer(
+# query) if self.query_layer else query
+# processed_loc = self._location_sensitive(alignment_cum, alignment_old)
+# score = self._location_sensitive_score(
+# processed_query,
+# self.keys,
+# processed_loc)
+# alignment = self.probability_fn(score, state)
+# alignment_cum = alignment_cum + alignment
+# state[0] = alignment_cum
+# state[1] = alignment
+# return alignment, state
+
+# def compute_context(self, alignments):
+# expanded_alignments = tf.expand_dims(alignments, 1)
+# context = tf.matmul(expanded_alignments, self.values)
+# context = tf.squeeze(context, [1])
+# return context
+
+# # def call(self, query, state):
+# # alignment, next_state = self._calculate_attention(query, state)
+# # return alignment, next_state
diff --git a/tf/layers/tacotron2.py b/tf/layers/tacotron2.py
new file mode 100644
index 00000000..c6f1a2cd
--- /dev/null
+++ b/tf/layers/tacotron2.py
@@ -0,0 +1,232 @@
+
+import tensorflow as tf
+from tensorflow import keras
+from TTS.tf.utils.tf_utils import shape_list
+from TTS.tf.layers.common_layers import Prenet, Attention
+# from tensorflow_addons.seq2seq import AttentionWrapper
+
+
+class ConvBNBlock(keras.layers.Layer):
+ def __init__(self, filters, kernel_size, activation, **kwargs):
+ super(ConvBNBlock, self).__init__(**kwargs)
+ self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d')
+ self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization')
+ self.dropout = keras.layers.Dropout(rate=0.5, name='dropout')
+ self.activation = keras.layers.Activation(activation, name='activation')
+
+ def call(self, x, training=None):
+ o = self.convolution1d(x)
+ o = self.batch_normalization(o, training=training)
+ o = self.activation(o)
+ o = self.dropout(o, training=training)
+ return o
+
+
+class Postnet(keras.layers.Layer):
+ def __init__(self, output_filters, num_convs, **kwargs):
+ super(Postnet, self).__init__(**kwargs)
+ self.convolutions = []
+ self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0'))
+ for idx in range(1, num_convs - 1):
+ self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}'))
+ self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}'))
+
+ def call(self, x, training=None):
+ o = x
+ for layer in self.convolutions:
+ o = layer(o, training=training)
+ return o
+
+
+class Encoder(keras.layers.Layer):
+ def __init__(self, output_input_dim, **kwargs):
+ super(Encoder, self).__init__(**kwargs)
+ self.convolutions = []
+ for idx in range(3):
+ self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}'))
+ self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm')
+
+ def call(self, x, training=None):
+ o = x
+ for layer in self.convolutions:
+ o = layer(o, training=training)
+ o = self.lstm(o)
+ return o
+
+
+class Decoder(keras.layers.Layer):
+ #pylint: disable=unused-argument
+ def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type,
+ prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask,
+ use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, **kwargs):
+ super(Decoder, self).__init__(**kwargs)
+ self.frame_dim = frame_dim
+ self.r_init = tf.constant(r, dtype=tf.int32)
+ self.r = tf.constant(r, dtype=tf.int32)
+ self.separate_stopnet = separate_stopnet
+ self.max_decoder_steps = tf.constant(1000, dtype=tf.int32)
+ self.stop_thresh = tf.constant(0.5, dtype=tf.float32)
+
+ # model dimensions
+ self.query_dim = 1024
+ self.decoder_rnn_dim = 1024
+ self.prenet_dim = 256
+ self.attn_dim = 128
+ self.p_attention_dropout = 0.1
+ self.p_decoder_dropout = 0.1
+
+ self.prenet = Prenet(prenet_type,
+ prenet_dropout,
+ [self.prenet_dim, self.prenet_dim],
+ bias=False,
+ name='prenet')
+ self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name=f'{self.name}/attention_rnn', )
+ self.attention_rnn_dropout = keras.layers.Dropout(0.5)
+
+ # TODO: implement other attn options
+ self.attention = Attention(attn_dim=self.attn_dim,
+ use_loc_attn=True,
+ loc_attn_n_filters=32,
+ loc_attn_kernel_size=31,
+ use_windowing=False,
+ norm=attn_norm,
+ use_forward_attn=use_forward_attn,
+ use_trans_agent=use_trans_agent,
+ use_forward_attn_mask=use_forward_attn_mask,
+ name='attention')
+ self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name=f'{self.name}/decoder_rnn')
+ self.decoder_rnn_dropout = keras.layers.Dropout(0.5)
+ self.linear_projection = keras.layers.Dense(self.frame_dim * r, name=f'{self.name}/linear_projection/linear_layer')
+ self.stopnet = keras.layers.Dense(1, name=f'{self.name}/stopnet/linear_layer')
+
+
+ def set_max_decoder_steps(self, new_max_steps):
+ self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32)
+
+ def set_r(self, new_r):
+ self.r = tf.constant(new_r, dtype=tf.int32)
+
+ def build_decoder_initial_states(self, batch_size, memory_dim, memory_length):
+ zero_frame = tf.zeros([batch_size, self.frame_dim])
+ zero_context = tf.zeros([batch_size, memory_dim])
+ attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
+ decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
+ attention_states = self.attention.init_states(batch_size, memory_length)
+ return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states
+
+ def step(self, prenet_next, states,
+ memory_seq_length=None, training=None):
+ _, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states
+ attention_rnn_input = tf.concat([prenet_next, context_next], -1)
+ attention_rnn_output, attention_rnn_state = \
+ self.attention_rnn(attention_rnn_input,
+ attention_rnn_state, training=training)
+ attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training)
+ context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training)
+ decoder_rnn_input = tf.concat([attention_rnn_output, context], -1)
+ decoder_rnn_output, decoder_rnn_state = \
+ self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training)
+ decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training)
+ linear_projection_input = tf.concat([decoder_rnn_output, context], -1)
+ output_frame = self.linear_projection(linear_projection_input, training=training)
+ stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1)
+ stopnet_output = self.stopnet(stopnet_input, training=training)
+ output_frame = output_frame[:, :self.r * self.frame_dim]
+ states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states)
+ return output_frame, stopnet_output, states, attention
+
+ def decode(self, memory, states, frames, memory_seq_length=None):
+ B, _, _ = shape_list(memory)
+ num_iter = shape_list(frames)[1] // self.r
+ # init states
+ frame_zero = tf.expand_dims(states[0], 1)
+ frames = tf.concat([frame_zero, frames], axis=1)
+ outputs = tf.TensorArray(dtype=tf.float32, size=num_iter)
+ attentions = tf.TensorArray(dtype=tf.float32, size=num_iter)
+ stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter)
+ # pre-computes
+ self.attention.process_values(memory)
+ prenet_output = self.prenet(frames, training=True)
+ step_count = tf.constant(0, dtype=tf.int32)
+
+ def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions):
+ prenet_next = prenet_output[:, step]
+ output, stop_token, states, attention = self.step(prenet_next,
+ states,
+ memory_seq_length)
+ outputs = outputs.write(step, output)
+ attentions = attentions.write(step, attention)
+ stop_tokens = stop_tokens.write(step, stop_token)
+ return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions
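+ # the loop condition below is always True; termination is handled by maximum_iterations=num_iter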
+ _, memory, _, states, outputs, stop_tokens, attentions = \
+ tf.while_loop(lambda *arg: True,
+ _body,
+ loop_vars=(step_count, memory, prenet_output,
+ states, outputs, stop_tokens, attentions),
+ parallel_iterations=32,
+ swap_memory=True,
+ maximum_iterations=num_iter)
+
+ outputs = outputs.stack()
+ attentions = attentions.stack()
+ stop_tokens = stop_tokens.stack()
+ outputs = tf.transpose(outputs, [1, 0, 2])
+ attentions = tf.transpose(attentions, [1, 0, 2])
+ stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
+ stop_tokens = tf.squeeze(stop_tokens, axis=2)
+ outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
+ return outputs, stop_tokens, attentions
+
+ def decode_inference(self, memory, states):
+ B, _, _ = shape_list(memory)
+ # init states
+ outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
+ attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
+ stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
+ # pre-computes
+ self.attention.process_values(memory)
+
+ # iter vars
+ stop_flag = tf.constant(False, dtype=tf.bool)
+ step_count = tf.constant(0, dtype=tf.int32)
+
+ def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag):
+ frame_next = states[0]
+ prenet_next = self.prenet(frame_next, training=False)
+ output, stop_token, states, attention = self.step(prenet_next,
+ states,
+ None,
+ training=False)
+ stop_token = tf.math.sigmoid(stop_token)
+ outputs = outputs.write(step, output)
+ attentions = attentions.write(step, attention)
+ stop_tokens = stop_tokens.write(step, stop_token)
+ stop_flag = tf.greater(stop_token, self.stop_thresh)
+ stop_flag = tf.reduce_all(stop_flag)
+ return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag
+
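+ # decode until every batch item's stop token exceeds stop_thresh, or max_decoder_steps is reached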
+ cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
+ _, memory, states, outputs, stop_tokens, attentions, stop_flag = \
+ tf.while_loop(cond,
+ _body,
+ loop_vars=(step_count, memory, states, outputs,
+ stop_tokens, attentions, stop_flag),
+ parallel_iterations=32,
+ swap_memory=True,
+ maximum_iterations=self.max_decoder_steps)
+
+ outputs = outputs.stack()
+ attentions = attentions.stack()
+ stop_tokens = stop_tokens.stack()
+
+ outputs = tf.transpose(outputs, [1, 0, 2])
+ attentions = tf.transpose(attentions, [1, 0, 2])
+ stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
+ stop_tokens = tf.squeeze(stop_tokens, axis=2)
+ outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
+ return outputs, stop_tokens, attentions
+
+ def call(self, memory, states, frames=None, memory_seq_length=None, training=False):
+ if training:
+ return self.decode(memory, states, frames, memory_seq_length)
+ return self.decode_inference(memory, states)
diff --git a/tf/models/tacotron2.py b/tf/models/tacotron2.py
new file mode 100644
index 00000000..101291cf
--- /dev/null
+++ b/tf/models/tacotron2.py
@@ -0,0 +1,81 @@
+from tensorflow import keras
+
+from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
+from TTS.tf.utils.tf_utils import shape_list
+
+
+#pylint: disable=too-many-ancestors
+class Tacotron2(keras.models.Model):
+ def __init__(self,
+ num_chars,
+ num_speakers,
+ r,
+ postnet_output_dim=80,
+ decoder_output_dim=80,
+ attn_type='original',
+ attn_win=False,
+ attn_norm="softmax",
+ attn_K=4,
+ prenet_type="original",
+ prenet_dropout=True,
+ forward_attn=False,
+ trans_agent=False,
+ forward_attn_mask=False,
+ location_attn=True,
+ separate_stopnet=True,
+ bidirectional_decoder=False):
+ super(Tacotron2, self).__init__()
+ self.r = r
+ self.decoder_output_dim = decoder_output_dim
+ self.postnet_output_dim = postnet_output_dim
+ self.bidirectional_decoder = bidirectional_decoder
+ self.num_speakers = num_speakers
+ self.speaker_embed_dim = 256
+
+ self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
+ self.encoder = Encoder(512, name='encoder')
+ # TODO: most of the decoder args have no use at the moment
+ self.decoder = Decoder(decoder_output_dim,
+ r,
+ attn_type=attn_type,
+ use_attn_win=attn_win,
+ attn_norm=attn_norm,
+ prenet_type=prenet_type,
+ prenet_dropout=prenet_dropout,
+ use_forward_attn=forward_attn,
+ use_trans_agent=trans_agent,
+ use_forward_attn_mask=forward_attn_mask,
+ use_location_attn=location_attn,
+ attn_K=attn_K,
+ separate_stopnet=separate_stopnet,
+ speaker_emb_dim=self.speaker_embed_dim)
+ self.postnet = Postnet(postnet_output_dim, 5, name='postnet')
+
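+ # training=True runs teacher-forced decoding with ground-truth frames; training=False runs autoregressive inference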
+ def call(self, characters, text_lengths=None, frames=None, training=None):
+ if training is None:
+ raise RuntimeError(' [!] Set model training mode True or False')
+ if training:
+ return self.training(characters, text_lengths, frames)
+ return self.inference(characters)
+
+ def training(self, characters, text_lengths, frames):
+ B, T = shape_list(characters)
+ embedding_vectors = self.embedding(characters, training=True)
+ encoder_output = self.encoder(embedding_vectors, training=True)
+ decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
+ decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True)
+ postnet_frames = self.postnet(decoder_frames, training=True)
+ output_frames = decoder_frames + postnet_frames
+ return decoder_frames, output_frames, attentions, stop_tokens
+
+ def inference(self, characters):
+ B, T = shape_list(characters)
+ embedding_vectors = self.embedding(characters, training=False)
+ encoder_output = self.encoder(embedding_vectors, training=False)
+ decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
+ decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
+ postnet_frames = self.postnet(decoder_frames, training=False)
+ output_frames = decoder_frames + postnet_frames
+ return decoder_frames, output_frames, attentions, stop_tokens
+
diff --git a/notebooks/Benchmark-PWGAN.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb
similarity index 64%
rename from notebooks/Benchmark-PWGAN.ipynb
rename to tf/notebooks/Benchmark-TTS_tf.ipynb
index 082ffa60..4a21ae17 100644
--- a/notebooks/Benchmark-PWGAN.ipynb
+++ b/tf/notebooks/Benchmark-TTS_tf.ipynb
@@ -2,27 +2,35 @@
"cells": [
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
- "This is to test TTS models with benchmark sentences for speech synthesis.\n",
+ "This is to test TTS tensorflow models with benchmark sentences.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
- "- to download related model files from TTS and PWGAN.\n",
+ "- to download related models.\n",
+ " - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
- "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n",
- "- to set the right paths in the cell below.\n",
+ "- to checkout right commit versions (given next to the model in the models page).\n",
+ "- to set the file paths below.\n",
"\n",
"Repositories:\n",
"- TTS: https://github.com/mozilla/TTS\n",
- "- PWGAN: https://github.com/erogol/ParallelWaveGAN"
+ "- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n",
+ "\n",
+ "Known Issues:\n",
+ "- To load the model second time you need to restart the notebook kernel. \n",
+ "- Some of the advance methods are not yet implemented for Tensorflow."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
+ "Collapsed": "false",
"scrolled": true
},
"outputs": [],
@@ -30,9 +38,16 @@
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
+ "\n",
+ "# you may need to change this depending on your system\n",
+ "os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
+ "\n",
"import sys\n",
"import io\n",
"import torch \n",
+ "import tensorflow as tf\n",
+ "print(tf.config.list_physical_devices('GPU'))\n",
+ "\n",
"import time\n",
"import json\n",
"import yaml\n",
@@ -44,54 +59,49 @@
"import librosa\n",
"import librosa.display\n",
"\n",
- "from TTS.models.tacotron import Tacotron \n",
- "from TTS.layers import *\n",
- "from TTS.utils.data import *\n",
+ "from TTS.tf.models.tacotron2 import Tacotron2\n",
+ "from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n",
"from TTS.utils.audio import AudioProcessor\n",
- "from TTS.utils.generic_utils import load_config, setup_model\n",
- "from TTS.utils.text import text_to_sequence\n",
+ "from TTS.utils.io import load_config\n",
"from TTS.utils.synthesis import synthesis\n",
"from TTS.utils.visual import visualize\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"\n",
- "import os\n",
- "\n",
- "# you may need to change this depending on your system\n",
- "os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
+ " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
- " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
- "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n",
" print(mel_postnet_spec.shape)\n",
" print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n",
" if not use_gl:\n",
- " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n",
- "# waveform = waveform / abs(waveform).max() * 0.9\n",
- " if use_cuda:\n",
+ " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
+ " mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n",
+ " if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
- " waveform = waveform.numpy()\n",
+ " waveform = waveform.numpy()\n",
+ " waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" if figures: \n",
- " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n",
- " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n",
+ " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n",
+ " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=True)) \n",
" os.makedirs(OUT_FOLDER, exist_ok=True)\n",
" file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
@@ -102,100 +112,108 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"# Set constants\n",
- "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n",
- "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n",
+ "ROOT_PATH = '../torch_model/'\n",
+ "MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
- "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n",
- "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n",
- "\n",
- "# load PWGAN config\n",
- "with open(VOCODER_CONFIG_PATH) as f:\n",
- " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n",
- " \n",
"# Run FLAGs\n",
- "use_cuda = False\n",
- "# Set some config fields manually for testing\n",
- "CONFIG.windowing = True\n",
- "CONFIG.use_forward_attn = True \n",
+ "use_cuda = True # use the available GPU (only for torch)\n",
"# Set the vocoder\n",
- "use_gl = False # use GL if True\n",
- "batched_wavernn = True # use batched wavernn inference if True"
+ "use_gl = True # use GL if True\n",
+ "BACKEND = 'tf' # set the backend for inference "
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false",
+ "scrolled": true
+ },
"outputs": [],
"source": [
- "# LOAD TTS MODEL\n",
- "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
+ "from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n",
+ "from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n",
+ "c = CONFIG\n",
+ "num_speakers = 0\n",
+ "r = 1\n",
+ "num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n",
+ "model = setup_model(num_chars, num_speakers, c)\n",
"\n",
- "# multi speaker \n",
- "if CONFIG.use_speaker_embedding:\n",
- " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
- " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
- "else:\n",
- " speakers = []\n",
- " speaker_id = None\n",
- "\n",
- "# if the vocabulary was passed, replace the default\n",
- "if 'characters' in CONFIG.keys():\n",
- " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
- "\n",
- "# load the model\n",
- "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), CONFIG)\n",
- "\n",
- "# load the audio processor\n",
- "ap = AudioProcessor(**CONFIG.audio) \n",
- "\n",
- "\n",
- "# load model state\n",
- "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
- "\n",
- "# load the model\n",
- "model.load_state_dict(cp['model'])\n",
- "if use_cuda:\n",
- " model.cuda()\n",
- "model.eval()\n",
- "print(cp['step'])\n",
- "print(cp['r'])\n",
- "\n",
- "# set model stepsize\n",
- "if 'r' in cp:\n",
- " model.decoder.set_r(cp['r'])"
+ "# before loading weights you need to run the model once to generate the variables\n",
+ "input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n",
+ "mel_pred = model(input_ids, training=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false",
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "model = load_checkpoint(model, MODEL_PATH)\n",
+ "# model = tf.function(model, experimental_relax_shapes=True)\n",
+ "ap = AudioProcessor(**CONFIG.audio) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "Collapsed": "false"
+ },
+ "outputs": [],
+ "source": [
+ "# wrapper class to use tf.function\n",
+ "class ModelInference(tf.keras.Model):\n",
+ " def __init__(self, model):\n",
+ " super(ModelInference, self).__init__()\n",
+ " self.model = model\n",
+ " \n",
+ " @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n",
+ " def call(self, characters):\n",
+ " return self.model(characters, training=False)\n",
+ " \n",
+ "model = ModelInference(model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"# LOAD WAVERNN\n",
"if use_gl == False:\n",
- " from parallel_wavegan.models import ParallelWaveGANGenerator\n",
- " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
+ " from parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n",
" \n",
- " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n",
+ " vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n",
" vocoder_model.remove_weight_norm()\n",
- " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n",
+ " ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
" if use_cuda:\n",
" vocoder_model.cuda()\n",
- " vocoder_model.eval();"
+ " vocoder_model.eval();\n",
+ " print(count_parameters(vocoder_model))"
]
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### Comparision with https://mycroft.ai/blog/available-voices/"
]
@@ -203,21 +221,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.eval()\n",
- "model.decoder.max_decoder_steps = 2000\n",
- "model.decoder.prenet.eval()\n",
- "speaker_id = None\n",
- "sentence = '''A breeding jennet, lusty, young, and proud,'''\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
@@ -226,7 +232,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### https://espnet.github.io/icassp2020-tts/"
]
@@ -234,7 +242,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"The Commission also recommends\"\n",
@@ -244,7 +254,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n",
@@ -254,7 +266,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n",
@@ -264,7 +278,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n",
@@ -274,7 +290,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n",
@@ -283,7 +301,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### Other examples"
]
@@ -291,7 +311,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
@@ -301,7 +323,21 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
+ "outputs": [],
+ "source": [
+ "sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n",
+ "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
@@ -311,7 +347,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
@@ -321,7 +359,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
@@ -330,7 +370,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### Comparison with https://keithito.github.io/audio-samples/"
]
@@ -338,7 +380,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
@@ -348,7 +392,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
@@ -358,7 +404,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
@@ -368,7 +416,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
@@ -378,7 +428,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
@@ -387,7 +439,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
]
@@ -395,7 +449,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
@@ -405,7 +461,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
@@ -415,7 +473,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \" He has read the whole thing.\"\n",
@@ -425,7 +485,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"He reads books.\"\n",
@@ -435,7 +497,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Thisss isrealy awhsome.\"\n",
@@ -445,7 +509,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"This is your internet browser, Firefox.\"\n",
@@ -455,7 +521,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"This is your internet browser Firefox.\"\n",
@@ -465,7 +533,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
@@ -475,7 +545,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
@@ -485,7 +557,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Eren, how are you?\"\n",
@@ -494,7 +568,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"source": [
"### Hard Sentences"
]
@@ -502,7 +578,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Encouraged, he started with a minute a day.\"\n",
@@ -512,7 +590,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
@@ -522,7 +602,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
@@ -532,7 +614,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"If he decided to watch TV he really watched it.\"\n",
@@ -542,7 +626,9 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
@@ -552,13 +638,56 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "Collapsed": "false"
+ },
"outputs": [],
"source": [
"# for twb dataset\n",
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "Collapsed": "false"
+ },
+ "outputs": [],
+ "source": [
+ "wavs = []\n",
+ "model.eval()\n",
+ "model.decoder.prenet.eval()\n",
+ "model.decoder.max_decoder_steps = 2000\n",
+ "# model.decoder.prenet.train()\n",
+ "speaker_id = None\n",
+ "sentence = '''This is App Store Optimization report.\n",
+ "The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and what’s new section.\n",
+ "In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n",
+ "Product name is the actual app name on the AppStore or Google Play Store.\n",
+ "Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n",
+ "Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n",
+ "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
+ "If you scroll down in the widget, you can see the older app versions for the same apps. Or you can filter Datetime to see a specific timeframe and the apps’ Store presence back then.\n",
+ "You can also filter for a specific app using Product Name.\n",
+ "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
+ "'''\n",
+ "\n",
+ "for s in sentence.split('\\n'):\n",
+ " print(s)\n",
+ " align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n",
+ " wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "Collapsed": "false"
+ },
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/tf/utils/convert_torch_to_tf_utils.py b/tf/utils/convert_torch_to_tf_utils.py
new file mode 100644
index 00000000..e9e1e8a3
--- /dev/null
+++ b/tf/utils/convert_torch_to_tf_utils.py
@@ -0,0 +1,81 @@
+import numpy as np
+import tensorflow as tf
+
+
+def tf_create_dummy_inputs():
+ """ Create dummy inputs for TF Tacotron2 model """
+ batch_size = 4
+ max_input_length = 32
+ max_mel_length = 128
+ pad = 1
+ n_chars = 24
+ input_ids = tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32)
+ input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size])
+ input_lengths[-1] = max_input_length
+ input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
+ mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
+ mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
+ mel_lengths[-1] = max_mel_length
+ mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
+ return input_ids, input_lengths, mel_outputs, mel_lengths
+
+
+def compare_torch_tf(torch_tensor, tf_tensor):
+ """ Compute the average absolute difference b/w torch and tf tensors """
+ return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
+
+
+def convert_tf_name(tf_name):
+ """ Convert certain patterns in TF layer names to Torch patterns """
+ tf_name_tmp = tf_name
+ tf_name_tmp = tf_name_tmp.replace(':0', '')
+ tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0')
+ tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1')
+ tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh')
+ tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight')
+ tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight')
+ tf_name_tmp = tf_name_tmp.replace('/beta', '/bias')
+ tf_name_tmp = tf_name_tmp.replace('/', '.')
+ return tf_name_tmp
+
+
+def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
+ """ Transfer weigths from torch state_dict to TF variables """
+ print(" > Passing weights from Torch to TF ...")
+ for tf_var in tf_vars:
+ torch_var_name = var_map_dict[tf_var.name]
+ print(f' | > {tf_var.name} <-- {torch_var_name}')
+ # if tuple, it is a bias variable
+ if not isinstance(torch_var_name, tuple):
+ torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
+ torch_weight = state_dict[torch_var_name]
+ if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:
+ # out_dim, in_dim, filter -> filter, in_dim, out_dim
+ numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy()
+ elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name:
+ numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
+ # if the variable belongs to the bidirectional lstm and is a bias vector,
+ # there need to be two matching pre-defined torch bias vectors (summed below)
+ elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name:
+ bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name]
+ assert len(bias_vectors) == 2
+ numpy_weight = bias_vectors[0] + bias_vectors[1]
+ elif 'rnn' in tf_var.name and 'kernel' in tf_var.name:
+ numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
+ elif 'rnn' in tf_var.name and 'bias' in tf_var.name:
+ bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key]
+ assert len(bias_vectors) == 2
+ numpy_weight = bias_vectors[0] + bias_vectors[1]
+ elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name:
+ numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
+ else:
+ numpy_weight = torch_weight.detach().cpu().numpy()
+ assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes do not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}"
+ tf.keras.backend.set_value(tf_var, numpy_weight)
+ return tf_vars
+
+
+def load_tf_vars(model_tf, tf_vars):
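+ """ Set the weights of matching model layers from a list of TF variables """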
+ for tf_var in tf_vars:
+ model_tf.get_layer(tf_var.name).set_weights(tf_var)
+ return model_tf
diff --git a/tf/utils/generic_utils.py b/tf/utils/generic_utils.py
new file mode 100644
index 00000000..6368658d
--- /dev/null
+++ b/tf/utils/generic_utils.py
@@ -0,0 +1,99 @@
+import os
+import datetime
+import importlib
+import pickle
+import numpy as np
+import tensorflow as tf
+
+
+def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
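+ """ Pickle model weights, optimizer and training state under ``output_folder`` """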
+ checkpoint_path = 'tts_tf_checkpoint_{}.pkl'.format(current_step)
+ checkpoint_path = os.path.join(output_folder, checkpoint_path)
+ state = {
+ 'model': model.weights,
+ 'optimizer': optimizer,
+ 'step': current_step,
+ 'epoch': epoch,
+ 'date': datetime.date.today().strftime("%B %d, %Y"),
+ 'r': r
+ }
+ state.update(kwargs)
+ pickle.dump(state, open(checkpoint_path, 'wb'))
+
+
+def load_checkpoint(model, checkpoint_path):
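+ """ Load variable values from a pickled checkpoint into ``model`` and restore the reduction factor if available """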
+ checkpoint = pickle.load(open(checkpoint_path, 'rb'))
+ chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
+ tf_vars = model.weights
+ for tf_var in tf_vars:
+ layer_name = tf_var.name
+ chkp_var_value = chkp_var_dict[layer_name]
+ tf.keras.backend.set_value(tf_var, chkp_var_value)
+ if 'r' in checkpoint.keys():
+ model.decoder.set_r(checkpoint['r'])
+ return model
+
+
+def sequence_mask(sequence_length, max_len=None):
+ """ Build a [B, T_max] boolean mask from a batch of sequence lengths """
+ if max_len is None:
+ max_len = tf.reduce_max(sequence_length)
+ # B x T_max
+ return tf.sequence_mask(sequence_length, maxlen=max_len, dtype=tf.bool)
+
+
+# @tf.custom_gradient
+def check_gradient(x, grad_clip):
+ """ Clip ``x`` by norm and return both the clipped tensor and its norm before clipping """
+ grad_norm = tf.norm(x)
+ x_normed = tf.clip_by_norm(x, grad_clip)
+ return x_normed, grad_norm
+
+
+def count_parameters(model, c):
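+ """ Count model parameters, building the model with dummy inputs first if it is not built yet """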
+ try:
+ return model.count_params()
+ except RuntimeError:
+ input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
+ input_lengths = np.random.randint(100, 129, (8, ))
+ input_lengths[-1] = 128
+ input_lengths = tf.convert_to_tensor(input_lengths.astype('int32'))
+ mel_spec = np.random.rand(8, 2 * c.r,
+ c.audio['num_mels']).astype('float32')
+ mel_spec = tf.convert_to_tensor(mel_spec)
+ # the TF Tacotron2 does not take speaker ids yet; one forward pass builds the variables
+ _ = model(input_dummy, input_lengths, mel_spec, training=False)
+ return model.count_params()
+
+
+def setup_model(num_chars, num_speakers, c):
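+ """ Instantiate the TF model class given by the config (only Tacotron2 is implemented so far) """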
+ print(" > Using model: {}".format(c.model))
+ MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
+ MyModel = getattr(MyModel, c.model)
+ if c.model.lower() == "tacotron":
+ raise NotImplementedError(' [!] Tacotron model is not ready.')
+ # tacotron2
+ model = MyModel(num_chars=num_chars,
+ num_speakers=num_speakers,
+ r=c.r,
+ postnet_output_dim=c.audio['num_mels'],
+ decoder_output_dim=c.audio['num_mels'],
+ attn_type=c.attention_type,
+ attn_win=c.windowing,
+ attn_norm=c.attention_norm,
+ prenet_type=c.prenet_type,
+ prenet_dropout=c.prenet_dropout,
+ forward_attn=c.use_forward_attn,
+ trans_agent=c.transition_agent,
+ forward_attn_mask=c.forward_attn_mask,
+ location_attn=c.location_attn,
+ attn_K=c.attention_heads,
+ separate_stopnet=c.separate_stopnet,
+ bidirectional_decoder=c.bidirectional_decoder)
+ return model
diff --git a/tf/utils/tf_utils.py b/tf/utils/tf_utils.py
new file mode 100644
index 00000000..558936d5
--- /dev/null
+++ b/tf/utils/tf_utils.py
@@ -0,0 +1,8 @@
+import tensorflow as tf
+
+
+def shape_list(x):
+ """Deal with dynamic shape in tensorflow cleanly."""
+ static = x.shape.as_list()
+ dynamic = tf.shape(x)
+ return [dynamic[i] if s is None else s for i, s in enumerate(static)]
diff --git a/train.py b/train.py
index 0aa3f748..e4963ee7 100644
--- a/train.py
+++ b/train.py
@@ -7,21 +7,22 @@ import traceback
import numpy as np
import torch
-import torch.nn as nn
from torch.utils.data import DataLoader
from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor)
-from TTS.layers.losses import L1LossMasked, MSELossMasked, BCELossMasked
+from TTS.layers.losses import TacotronLoss
from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import (
- NoamLR, check_update, count_parameters, create_experiment_folder,
- get_git_branch, load_config, remove_experiment_folder, save_best_model,
- save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
- setup_model, gradual_training_scheduler, KeepAverage,
- set_weight_decay, check_config)
-from TTS.utils.logger import Logger
+from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder,
+ get_git_branch, set_init_dict,
+ setup_model, KeepAverage, check_config)
+from TTS.utils.io import (save_best_model, save_checkpoint,
+ load_config, copy_config_file)
+from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
+ gradual_training_scheduler, set_weight_decay)
+from TTS.utils.tensorboard_logger import TensorboardLogger
+from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
get_speakers
from TTS.utils.synthesis import synthesis
@@ -47,6 +48,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
dataset = MyDataset(
r,
c.text_cleaner,
+ compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
@@ -115,7 +117,7 @@ def format_data(data):
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
-def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
+def train(model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
verbose=(epoch == 0))
@@ -124,24 +126,25 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
train_values = {
'avg_postnet_loss': 0,
'avg_decoder_loss': 0,
- 'avg_stop_loss': 0,
- 'avg_align_score': 0,
+ 'avg_stopnet_loss': 0,
+ 'avg_align_error': 0,
'avg_step_time': 0,
- 'avg_loader_time': 0,
- 'avg_alignment_score': 0
+ 'avg_loader_time': 0
}
if c.bidirectional_decoder:
train_values['avg_decoder_b_loss'] = 0 # decoder backward loss
train_values['avg_decoder_c_loss'] = 0 # decoder consistency loss
+ if c.ga_alpha > 0:
+ train_values['avg_ga_loss'] = 0 # guided attention loss
keep_avg = KeepAverage()
keep_avg.add_values(train_values)
- print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
+ c_logger.print_train_start()
for num_iter, data in enumerate(data_loader):
start_time = time.time()
@@ -165,50 +168,39 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
+ decoder_backward_output = None
- # loss computation
- stop_loss = criterion_st(stop_tokens,
- stop_targets, mel_lengths) if c.stopnet else torch.zeros(1)
- if c.loss_masking:
- decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
- if c.model in ["Tacotron", "TacotronGST"]:
- postnet_loss = criterion(postnet_output, linear_input,
- mel_lengths)
- else:
- postnet_loss = criterion(postnet_output, mel_input,
- mel_lengths)
+ # set the alignment lengths wrt reduction factor for guided attention
+ if mel_lengths.max() % model.decoder.r != 0:
+ alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
- decoder_loss = criterion(decoder_output, mel_input)
- if c.model in ["Tacotron", "TacotronGST"]:
- postnet_loss = criterion(postnet_output, linear_input)
- else:
- postnet_loss = criterion(postnet_output, mel_input)
- loss = decoder_loss + postnet_loss
- if not c.separate_stopnet and c.stopnet:
- loss += stop_loss
+ alignment_lengths = mel_lengths // model.decoder.r
- # backward decoder
+ # compute loss
+ loss_dict = criterion(postnet_output, decoder_output, mel_input,
+ linear_input, stop_tokens, stop_targets,
+ mel_lengths, decoder_backward_output,
+ alignments, alignment_lengths, text_lengths)
if c.bidirectional_decoder:
- if c.loss_masking:
- decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths)
- else:
- decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input)
- decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output)
- loss += decoder_backward_loss + decoder_c_loss
- keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()})
+ keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(),
+ 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()})
+ if c.ga_alpha > 0:
+ keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()})
- loss.backward()
+ # backward pass
+ loss_dict['loss'].backward()
optimizer, current_lr = adam_weight_decay(optimizer)
- grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True)
+ grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
optimizer.step()
- # compute alignment score
- align_score = alignment_diagonal_score(alignments)
- keep_avg.update_value('avg_align_score', align_score)
+ # compute alignment error (the lower the better )
+ align_error = 1 - alignment_diagonal_score(alignments)
+ keep_avg.update_value('avg_align_error', align_error)
+ loss_dict['align_error'] = align_error
# backpass and check the grad norm for stop loss
if c.separate_stopnet:
- stop_loss.backward()
+ loss_dict['stopnet_loss'].backward()
optimizer_st, _ = adam_weight_decay(optimizer_st)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step()
@@ -218,48 +210,37 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
step_time = time.time() - start_time
epoch_time += step_time
+ # update avg stats
+ update_train_values = {
+ 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
+ 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
+ 'avg_stopnet_loss': loss_dict['stopnet_loss'] \
+ if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()),
+ 'avg_step_time': step_time,
+ 'avg_loader_time': loader_time
+ }
+ keep_avg.update_values(update_train_values)
+
if global_step % c.print_step == 0:
- print(
- " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} "
- "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} "
- "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
- "LoaderTime:{:.2f} LR:{:.6f}".format(
- num_iter, batch_n_iter, global_step, postnet_loss.item(),
- decoder_loss.item(), stop_loss.item(), align_score,
- grad_norm, grad_norm_st, avg_text_length, avg_spec_length,
- step_time, loader_time, current_lr),
- flush=True)
+ c_logger.print_train_step(batch_n_iter, num_iter, global_step,
+ avg_spec_length, avg_text_length,
+ step_time, loader_time, current_lr,
+ loss_dict, keep_avg.avg_values)
# aggregate losses from processes
if num_gpus > 1:
- postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
- decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
- loss = reduce_tensor(loss.data, num_gpus)
- stop_loss = reduce_tensor(stop_loss.data,
- num_gpus) if c.stopnet else stop_loss
+ loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
+ loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
+ loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
+ loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']
if args.rank == 0:
- update_train_values = {
- 'avg_postnet_loss':
- float(postnet_loss.item()),
- 'avg_decoder_loss':
- float(decoder_loss.item()),
- 'avg_stop_loss':
- stop_loss
- if isinstance(stop_loss, float) else float(stop_loss.item()),
- 'avg_step_time':
- step_time,
- 'avg_loader_time':
- loader_time
- }
- keep_avg.update_values(update_train_values)
-
# Plot Training Iter Stats
# reduce TB load
if global_step % 10 == 0:
iter_stats = {
- "loss_posnet": postnet_loss.item(),
- "loss_decoder": decoder_loss.item(),
+ "loss_posnet": loss_dict['postnet_loss'].item(),
+ "loss_decoder": loss_dict['decoder_loss'].item(),
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
@@ -270,9 +251,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
- save_checkpoint(model, optimizer, optimizer_st,
- postnet_loss.item(), OUT_PATH, global_step,
- epoch)
+ save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
+ optimizer_st=optimizer_st,
+ model_loss=loss_dict['postnet_loss'].item())
# Diagnostic visualizations
const_spec = postnet_output[0].data.cpu().numpy()
@@ -296,58 +277,53 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
if c.model in ["Tacotron", "TacotronGST"]:
train_audio = ap.inv_spectrogram(const_spec.T)
else:
- train_audio = ap.inv_mel_spectrogram(const_spec.T)
+ train_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
- print(" | > EPOCH END -- GlobalStep:{} "
- "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
- "AvgStopLoss:{:.5f} AvgAlignScore:{:3f} EpochTime:{:.2f} "
- "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(
- global_step, keep_avg['avg_postnet_loss'],
- keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'],
- keep_avg['avg_align_score'], epoch_time,
- keep_avg['avg_step_time'], keep_avg['avg_loader_time']),
- flush=True)
+ c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
+
# Plot Epoch Stats
if args.rank == 0:
# Plot Training Epoch Stats
epoch_stats = {
"loss_postnet": keep_avg['avg_postnet_loss'],
"loss_decoder": keep_avg['avg_decoder_loss'],
- "stop_loss": keep_avg['avg_stop_loss'],
- "alignment_score": keep_avg['avg_align_score'],
+ "stopnet_loss": keep_avg['avg_stopnet_loss'],
+ "alignment_score": keep_avg['avg_align_error'],
"epoch_time": epoch_time
}
+ if c.ga_alpha > 0:
+ epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss']
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
- return keep_avg['avg_postnet_loss'], global_step
+ return keep_avg.avg_values, global_step
@torch.no_grad()
-def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
+def evaluate(model, criterion, ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=True)
- if c.use_speaker_embedding:
- speaker_mapping = load_speaker_mapping(OUT_PATH)
model.eval()
epoch_time = 0
eval_values_dict = {
'avg_postnet_loss': 0,
'avg_decoder_loss': 0,
- 'avg_stop_loss': 0,
- 'avg_align_score': 0
+ 'avg_stopnet_loss': 0,
+ 'avg_align_error': 0
}
if c.bidirectional_decoder:
eval_values_dict['avg_decoder_b_loss'] = 0 # decoder backward loss
eval_values_dict['avg_decoder_c_loss'] = 0 # decoder consistency loss
+ if c.ga_alpha > 0:
+ eval_values_dict['avg_ga_loss'] = 0 # guided attention loss
keep_avg = KeepAverage()
keep_avg.add_values(eval_values_dict)
- print("\n > Validation")
+ c_logger.print_eval_start()
if data_loader is not None:
for num_iter, data in enumerate(data_loader):
start_time = time.time()
@@ -363,71 +339,51 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
+ decoder_backward_output = None
- # loss computation
- stop_loss = criterion_st(
- stop_tokens, stop_targets, mel_lengths) if c.stopnet else torch.zeros(1)
- if c.loss_masking:
- decoder_loss = criterion(decoder_output, mel_input,
- mel_lengths)
- if c.model in ["Tacotron", "TacotronGST"]:
- postnet_loss = criterion(postnet_output, linear_input,
- mel_lengths)
- else:
- postnet_loss = criterion(postnet_output, mel_input,
- mel_lengths)
+ # set the alignment lengths wrt reduction factor for guided attention
+ if mel_lengths.max() % model.decoder.r != 0:
+ alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
- decoder_loss = criterion(decoder_output, mel_input)
- if c.model in ["Tacotron", "TacotronGST"]:
- postnet_loss = criterion(postnet_output, linear_input)
- else:
- postnet_loss = criterion(postnet_output, mel_input)
- loss = decoder_loss + postnet_loss + stop_loss
+ alignment_lengths = mel_lengths // model.decoder.r
- # backward decoder loss
+ # compute loss
+ loss_dict = criterion(postnet_output, decoder_output, mel_input,
+ linear_input, stop_tokens, stop_targets,
+ mel_lengths, decoder_backward_output,
+ alignments, alignment_lengths, text_lengths)
if c.bidirectional_decoder:
- if c.loss_masking:
- decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths)
- else:
- decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input)
- decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output)
- loss += decoder_backward_loss + decoder_c_loss
- keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()})
+ keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_b_loss'].item(),
+ 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()})
+ if c.ga_alpha > 0:
+ keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()})
+ # step time
step_time = time.time() - start_time
epoch_time += step_time
# compute alignment score
- align_score = alignment_diagonal_score(alignments)
- keep_avg.update_value('avg_align_score', align_score)
+ align_error = 1 - alignment_diagonal_score(alignments)
+ keep_avg.update_value('avg_align_error', align_error)
# aggregate losses from processes
if num_gpus > 1:
- postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
- decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
+ loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
+ loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
if c.stopnet:
- stop_loss = reduce_tensor(stop_loss.data, num_gpus)
+ loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
keep_avg.update_values({
'avg_postnet_loss':
- float(postnet_loss.item()),
+ float(loss_dict['postnet_loss'].item()),
'avg_decoder_loss':
- float(decoder_loss.item()),
- 'avg_stop_loss':
- float(stop_loss.item()),
+ float(loss_dict['decoder_loss'].item()),
+ 'avg_stopnet_loss':
+ float(loss_dict['stopnet_loss'].item()),
})
- if num_iter % c.print_step == 0:
- print(
- " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
- "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
- .format(loss.item(), postnet_loss.item(),
- keep_avg['avg_postnet_loss'],
- decoder_loss.item(),
- keep_avg['avg_decoder_loss'], stop_loss.item(),
- keep_avg['avg_stop_loss'], align_score,
- keep_avg['avg_align_score']),
- flush=True)
+ if c.print_eval:
+ c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Diagnostic visualizations
@@ -448,7 +404,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
if c.model in ["Tacotron", "TacotronGST"]:
eval_audio = ap.inv_spectrogram(const_spec.T)
else:
- eval_audio = ap.inv_mel_spectrogram(const_spec.T)
+ eval_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
@@ -456,14 +412,16 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
epoch_stats = {
"loss_postnet": keep_avg['avg_postnet_loss'],
"loss_decoder": keep_avg['avg_decoder_loss'],
- "stop_loss": keep_avg['avg_stop_loss'],
- "alignment_score": keep_avg['avg_align_score']
+ "stopnet_loss": keep_avg['avg_stopnet_loss'],
+ "alignment_score": keep_avg['avg_align_error'],
}
if c.bidirectional_decoder:
epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss']
align_b_img = alignments_backward[idx].data.cpu().numpy()
eval_figures['alignment_backward'] = plot_alignment(align_b_img)
+ if c.ga_alpha > 0:
+ epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss']
tb_logger.tb_eval_stats(global_step, epoch_stats)
tb_logger.tb_eval_figures(global_step, eval_figures)
@@ -487,7 +445,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
- wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
+ wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
model,
test_sentence,
c,
@@ -516,7 +474,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
- return keep_avg['avg_postnet_loss']
+ return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
@@ -569,14 +527,8 @@ def main(args): # pylint: disable=redefined-outer-name
else:
optimizer_st = None
- if c.loss_masking:
- criterion = L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron", "TacotronGST"
- ] else MSELossMasked(c.seq_len_norm)
- else:
- criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"
- ] else nn.MSELoss()
- criterion_st = BCELossMasked(
- pos_weight=torch.tensor(10)) if c.stopnet else None
+ # setup criterion
+ criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
@@ -604,8 +556,6 @@ def main(args): # pylint: disable=redefined-outer-name
if use_cuda:
model.cuda()
criterion.cuda()
- if criterion_st:
- criterion_st.cuda()
# DISTRUBUTED
if num_gpus > 1:
@@ -626,6 +576,7 @@ def main(args): # pylint: disable=redefined-outer-name
global_step = args.restore_step
for epoch in range(0, c.epochs):
+ c_logger.print_epoch_start(epoch, c.epochs)
# set gradual training
if c.gradual_training is not None:
r, c.batch_size = gradual_training_scheduler(global_step, c)
@@ -633,21 +584,18 @@ def main(args): # pylint: disable=redefined-outer-name
model.decoder.set_r(r)
if c.bidirectional_decoder:
model.decoder_backward.set_r(r)
- print(" > Number of outputs per iteration:", model.decoder.r)
+ print("\n > Number of output frames:", model.decoder.r)
- train_loss, global_step = train(model, criterion, criterion_st,
- optimizer, optimizer_st, scheduler, ap,
- global_step, epoch)
- val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
- epoch)
- print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
- train_loss, val_loss),
- flush=True)
- target_loss = train_loss
+ train_avg_loss_dict, global_step = train(model, criterion, optimizer,
+ optimizer_st, scheduler, ap,
+ global_step, epoch)
+ eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
+ c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
+ target_loss = train_avg_loss_dict['avg_postnet_loss']
if c.run_eval:
- target_loss = val_loss
- best_loss = save_best_model(model, optimizer, target_loss, best_loss,
- OUT_PATH, global_step, epoch)
+ target_loss = eval_avg_loss_dict['avg_postnet_loss']
+ best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
+ OUT_PATH)
if __name__ == '__main__':
@@ -671,7 +619,7 @@ if __name__ == '__main__':
)
parser.add_argument('--debug',
type=bool,
- default=True,
+ default=False,
help='Do not verify commit integrity to run training.')
# DISTRUBUTED
@@ -705,6 +653,8 @@ if __name__ == '__main__':
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
+ c_logger = ConsoleLogger()
+
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
@@ -716,9 +666,11 @@ if __name__ == '__main__':
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
- if args.rank == 0:
LOG_DIR = OUT_PATH
- tb_logger = Logger(LOG_DIR)
+ tb_logger = TensorboardLogger(LOG_DIR)
+
+ # write model desc to tensorboard
+ tb_logger.tb_add_text('model-description', c['run_description'], 0)
try:
main(args)
diff --git a/utils/audio.py b/utils/audio.py
index 771e6a43..13eab3d6 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -4,6 +4,8 @@ import numpy as np
import scipy.io
import scipy.signal
+from TTS.utils.data import StandardScaler
+
class AudioProcessor(object):
def __init__(self,
@@ -27,11 +29,12 @@ class AudioProcessor(object):
griffin_lim_iters=None,
do_trim_silence=False,
trim_db=60,
- sound_norm=False,
+ do_sound_norm=False,
+ stats_path=None,
**_):
print(" > Setting up Audio Processor...")
-
+ # setup class attributes
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db or 0
@@ -50,7 +53,9 @@ class AudioProcessor(object):
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.trim_db = trim_db
- self.sound_norm = sound_norm
+ self.do_sound_norm = do_sound_norm
+ self.stats_path = stats_path
+ # setup stft parameters
if hop_length is None:
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
else:
@@ -61,19 +66,19 @@ class AudioProcessor(object):
members = vars(self)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
+ # create spectrogram utils
+ self.mel_basis = self._build_mel_basis()
+ self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
+ # setup scaler
+ if stats_path:
+ mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
+ self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
+ self.signal_norm = True
+ self.max_norm = None
+ self.clip_norm = None
+ self.symmetric_norm = None
- def save_wav(self, wav, path):
- wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
- scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
-
- def _linear_to_mel(self, spectrogram):
- _mel_basis = self._build_mel_basis()
- return np.dot(_mel_basis, spectrogram)
-
- def _mel_to_linear(self, mel_spec):
- inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
- return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec))
-
+ ### setting up the parameters ###
def _build_mel_basis(self, ):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
@@ -84,11 +89,32 @@ class AudioProcessor(object):
fmin=self.mel_fmin,
fmax=self.mel_fmax)
+ def _stft_parameters(self, ):
+ """Compute necessary stft parameters with given time values"""
+ n_fft = (self.num_freq - 1) * 2
+ factor = self.frame_length_ms / self.frame_shift_ms
+ assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
+ hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
+ win_length = int(hop_length * factor)
+ return n_fft, hop_length, win_length
+
+ ### normalization ###
def _normalize(self, S):
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
#pylint: disable=no-else-return
+ S = S.copy()
if self.signal_norm:
- S_norm = ((S - self.min_level_db) / - self.min_level_db)
+ # mean-var scaling
+ if hasattr(self, 'mel_scaler'):
+ if S.shape[0] == self.num_mels:
+ return self.mel_scaler.transform(S.T).T
+ elif S.shape[0] == self.n_fft // 2 + 1:
+ return self.linear_scaler.transform(S.T).T
+ else:
+ raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
+ # range normalization
+ S -= self.ref_level_db # subtract the reference level (dB), assumed to be background noise
+ S_norm = ((S - self.min_level_db) / (-self.min_level_db))
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
@@ -105,39 +131,64 @@ class AudioProcessor(object):
def _denormalize(self, S):
"""denormalize values"""
#pylint: disable=no-else-return
- S_denorm = S
+ S_denorm = S.copy()
if self.signal_norm:
+ # mean-var scaling
+ if hasattr(self, 'mel_scaler'):
+ if S_denorm.shape[0] == self.num_mels:
+ return self.mel_scaler.inverse_transform(S_denorm.T).T
+ elif S_denorm.shape[0] == self.n_fft // 2 + 1:
+ return self.linear_scaler.inverse_transform(S_denorm.T).T
+ else:
+ raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
- return S_denorm
+ return S_denorm + self.ref_level_db
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = (S_denorm * -self.min_level_db /
self.max_norm) + self.min_level_db
- return S_denorm
+ return S_denorm + self.ref_level_db
else:
- return S
+ return S_denorm
- def _stft_parameters(self, ):
- """Compute necessary stft parameters with given time values"""
- n_fft = (self.num_freq - 1) * 2
- factor = self.frame_length_ms / self.frame_shift_ms
- assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
- hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
- win_length = int(hop_length * factor)
- return n_fft, hop_length, win_length
+ ### Mean-STD scaling ###
+ def load_stats(self, stats_path):
+ stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg
+ mel_mean = stats['mel_mean']
+ mel_std = stats['mel_std']
+ linear_mean = stats['linear_mean']
+ linear_std = stats['linear_std']
+ stats_config = stats['audio_config']
+ # check all audio parameters used for computing stats
+ skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power']
+ for key in stats_config.keys():
+ if key in skip_parameters:
+ continue
+ assert stats_config[key] == self.__dict__[key],\
+ f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
+ return mel_mean, mel_std, linear_mean, linear_std, stats_config
+ # pylint: disable=attribute-defined-outside-init
+ def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std):
+ self.mel_scaler = StandardScaler()
+ self.mel_scaler.set_stats(mel_mean, mel_std)
+ self.linear_scaler = StandardScaler()
+ self.linear_scaler.set_stats(linear_mean, linear_std)
+
+ ### DB and AMP conversion ###
+ # pylint: disable=no-self-use
def _amp_to_db(self, x):
- min_level = np.exp(self.min_level_db / 20 * np.log(10))
- return 20 * np.log10(np.maximum(min_level, x))
+ return 20 * np.log10(np.maximum(1e-5, x))
- @staticmethod
- def _db_to_amp(x):
+ # pylint: disable=no-self-use
+ def _db_to_amp(self, x):
return np.power(10.0, x * 0.05)
+ ### Preemphasis ###
def apply_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" [!] Preemphasis is set 0.0.")
@@ -148,12 +199,19 @@ class AudioProcessor(object):
raise RuntimeError(" [!] Preemphasis is set 0.0.")
return scipy.signal.lfilter([1], [1, -self.preemphasis], x)
+ ### SPECTROGRAMs ###
+ def _linear_to_mel(self, spectrogram):
+ return np.dot(self.mel_basis, spectrogram)
+
+ def _mel_to_linear(self, mel_spec):
+ return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))
+
def spectrogram(self, y):
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
- S = self._amp_to_db(np.abs(D)) - self.ref_level_db
+ S = self._amp_to_db(np.abs(D))
return self._normalize(S)
def melspectrogram(self, y):
@@ -161,22 +219,22 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
- S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
+ S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
"""Converts spectrogram to waveform using librosa"""
S = self._denormalize(spectrogram)
- S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear
+ S = self._db_to_amp(S)
# Reconstruct phase
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
- def inv_mel_spectrogram(self, mel_spectrogram):
- '''Converts mel spectrogram to waveform using librosa'''
+ def inv_melspectrogram(self, mel_spectrogram):
+ '''Converts melspectrogram to waveform using librosa'''
D = self._denormalize(mel_spectrogram)
- S = self._db_to_amp(D + self.ref_level_db)
+ S = self._db_to_amp(D)
S = self._mel_to_linear(S) # Convert back to linear
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
@@ -184,21 +242,13 @@ class AudioProcessor(object):
def out_linear_to_mel(self, linear_spec):
S = self._denormalize(linear_spec)
- S = self._db_to_amp(S + self.ref_level_db)
+ S = self._db_to_amp(S)
S = self._linear_to_mel(np.abs(S))
- S = self._amp_to_db(S) - self.ref_level_db
+ S = self._amp_to_db(S)
mel = self._normalize(S)
return mel
- def _griffin_lim(self, S):
- angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
- S_complex = np.abs(S).astype(np.complex)
- y = self._istft(S_complex * angles)
- for _ in range(self.griffin_lim_iters):
- angles = np.exp(1j * np.angle(self._stft(y)))
- y = self._istft(S_complex * angles)
- return y
-
+ ### STFT and ISTFT ###
def _stft(self, y):
return librosa.stft(
y=y,
@@ -212,6 +262,25 @@ class AudioProcessor(object):
return librosa.istft(
y, hop_length=self.hop_length, win_length=self.win_length)
+ def _griffin_lim(self, S):
+ angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
+ S_complex = np.abs(S).astype(np.complex)
+ y = self._istft(S_complex * angles)
+ for _ in range(self.griffin_lim_iters):
+ angles = np.exp(1j * np.angle(self._stft(y)))
+ y = self._istft(S_complex * angles)
+ return y
+
+ def compute_stft_paddings(self, x, pad_sides=1):
+ '''compute right padding (final frame) or both sides padding (first and final frames)
+ '''
+ assert pad_sides in (1, 2)
+ pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
+ if pad_sides == 1:
+ return 0, pad
+ return pad // 2, pad // 2 + pad % 2
+
+ ### Audio Processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
@@ -228,6 +297,30 @@ class AudioProcessor(object):
return librosa.effects.trim(
wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
+ @staticmethod
+ def sound_norm(x):
+ return x / abs(x).max() * 0.9
+
+ ### save and load ###
+ def load_wav(self, filename, sr=None):
+ if sr is None:
+ x, sr = sf.read(filename)
+ else:
+ x, sr = librosa.load(filename, sr=sr)
+ if self.do_trim_silence:
+ try:
+ x = self.trim_silence(x)
+ except ValueError:
+ print(f' [!] File cannot be trimmed for silence - {filename}')
+ assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
+ if self.do_sound_norm:
+ x = self.sound_norm(x)
+ return x
+
+ def save_wav(self, wav, path):
+ wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
+ scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
+
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
@@ -244,20 +337,6 @@ class AudioProcessor(object):
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
return x
- def load_wav(self, filename, sr=None):
- if sr is None:
- x, sr = sf.read(filename)
- else:
- x, sr = librosa.load(filename, sr=sr)
- if self.do_trim_silence:
- try:
- x = self.trim_silence(x)
- except ValueError:
- print(f' [!] File cannot be trimmed for silence - {filename}')
- assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
- if self.sound_norm:
- x = x / abs(x).max() * 0.9
- return x
@staticmethod
def encode_16bits(x):
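
A minimal usage sketch of the new `stats_path` option, for reviewers. The audio parameter values below are placeholders, and `scale_stats.npy` is a hypothetical stats file assumed to have been written by `compute_statistics.py` with matching settings; passing `stats_path` switches `AudioProcessor` from range normalization to mean-variance scaling.

```python
# Sketch only; parameter values are placeholders, scale_stats.npy is hypothetical.
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor(
    sample_rate=22050, num_mels=80, num_freq=1025,
    frame_length_ms=50, frame_shift_ms=12.5,
    min_level_db=-100, ref_level_db=20, power=1.5,
    preemphasis=0.0, griffin_lim_iters=60,
    signal_norm=True, symmetric_norm=True, max_norm=4.0, clip_norm=True,
    mel_fmin=0.0, mel_fmax=8000.0,
    do_trim_silence=True, do_sound_norm=False,
    stats_path="scale_stats.npy",       # enables StandardScaler-based normalization
)
wav = ap.load_wav("sample.wav")
mel = ap.melspectrogram(wav)            # normalized with the loaded mel mean/std
wav_hat = ap.inv_melspectrogram(mel)    # Griffin-Lim reconstruction
```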
diff --git a/utils/console_logger.py b/utils/console_logger.py
new file mode 100644
index 00000000..5c6ec75f
--- /dev/null
+++ b/utils/console_logger.py
@@ -0,0 +1,95 @@
+import datetime
+from TTS.utils.io import AttrDict
+
+
+tcolors = AttrDict({
+ 'OKBLUE': '\033[94m',
+ 'HEADER': '\033[95m',
+ 'OKGREEN': '\033[92m',
+ 'WARNING': '\033[93m',
+ 'FAIL': '\033[91m',
+ 'ENDC': '\033[0m',
+ 'BOLD': '\033[1m',
+ 'UNDERLINE': '\033[4m'
+})
+
+
+class ConsoleLogger():
+ def __init__(self):
+ # TODO: color code for value changes
+ # use these to compare values between iterations
+ self.old_train_loss_dict = None
+ self.old_epoch_loss_dict = None
+ self.old_eval_loss_dict = None
+
+ # pylint: disable=no-self-use
+ def get_time(self):
+ now = datetime.datetime.now()
+ return now.strftime("%Y-%m-%d %H:%M:%S")
+
+ def print_epoch_start(self, epoch, max_epoch):
+ print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
+ epoch, max_epoch, tcolors.ENDC),
+ flush=True)
+
+ def print_train_start(self):
+ print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
+
+ def print_train_step(self, batch_steps, step, global_step, avg_spec_length,
+ avg_text_length, step_time, loader_time, lr,
+ loss_dict, avg_loss_dict):
+ indent = " | > "
+ print()
+ log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
+ tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
+ for key, value in loss_dict.items():
+ # print the avg value if given
+ if f'avg_{key}' in avg_loss_dict.keys():
+ log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
+ else:
+ log_text += "{}{}: {:.5f} \n".format(indent, key, value)
+ log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\
+ f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
+ print(log_text, flush=True)
+
+ # pylint: disable=unused-argument
+ def print_train_epoch_end(self, global_step, epoch, epoch_time,
+ print_dict):
+ indent = " | > "
+ log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMANCE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
+ for key, value in print_dict.items():
+ log_text += "{}{}: {:.5f}\n".format(indent, key, value)
+ print(log_text, flush=True)
+
+ def print_eval_start(self):
+ print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")
+
+ def print_eval_step(self, step, loss_dict, avg_loss_dict):
+ indent = " | > "
+ print()
+ log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
+ for key, value in loss_dict.items():
+ # print the avg value if given
+ if f'avg_{key}' in avg_loss_dict.keys():
+ log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
+ else:
+ log_text += "{}{}: {:.5f} \n".format(indent, key, value)
+ print(log_text, flush=True)
+
+ def print_epoch_end(self, epoch, avg_loss_dict):
+ indent = " | > "
+ log_text = " {}--> EVAL PERFORMANCE{}\n".format(
+ tcolors.BOLD, tcolors.ENDC)
+ for key, value in avg_loss_dict.items():
+ # print the avg value if given
+ color = tcolors.FAIL
+ sign = '+'
+ diff = 0
+ if self.old_eval_loss_dict is not None:
+ diff = value - self.old_eval_loss_dict[key]
+ if diff < 0:
+ color = tcolors.OKGREEN
+ sign = ''
+ log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
+ self.old_eval_loss_dict = avg_loss_dict
+ print(log_text, flush=True)
\ No newline at end of file
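
A rough usage sketch of the new console logger, driven the way the training loop above drives it; the loss dictionaries are illustrative placeholders.

```python
# Sketch only; the loss values below are made-up placeholders.
from TTS.utils.console_logger import ConsoleLogger

c_logger = ConsoleLogger()
c_logger.print_epoch_start(epoch=0, max_epoch=1000)
c_logger.print_train_start()
c_logger.print_train_step(
    batch_steps=500, step=10, global_step=10,
    avg_spec_length=320, avg_text_length=85,
    step_time=0.71, loader_time=0.02, lr=1e-4,
    loss_dict={'postnet_loss': 0.41, 'decoder_loss': 0.52},
    avg_loss_dict={'avg_postnet_loss': 0.45})
c_logger.print_eval_start()
c_logger.print_eval_step(step=1,
                         loss_dict={'postnet_loss': 0.44},
                         avg_loss_dict={'avg_postnet_loss': 0.44})
c_logger.print_epoch_end(epoch=0,
                         avg_loss_dict={'avg_postnet_loss': 0.44})
```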
diff --git a/utils/data.py b/utils/data.py
index f2d7538a..a83325cb 100644
--- a/utils/data.py
+++ b/utils/data.py
@@ -50,3 +50,28 @@ def pad_per_step(inputs, pad_len):
inputs, [[0, 0], [0, 0], [0, pad_len]],
mode='constant',
constant_values=0.0)
+
+
+# pylint: disable=attribute-defined-outside-init
+class StandardScaler():
+
+ def set_stats(self, mean, scale):
+ self.mean_ = mean
+ self.scale_ = scale
+
+ def reset_stats(self):
+ delattr(self, 'mean_')
+ delattr(self, 'scale_')
+
+ def transform(self, X):
+ X = np.asarray(X)
+ X -= self.mean_
+ X /= self.scale_
+ return X
+
+ def inverse_transform(self, X):
+ X = np.asarray(X)
+ X *= self.scale_
+ X += self.mean_
+ return X
+
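
A small round-trip sketch of the new `StandardScaler`; in real use the mean/std come from the stats file written by `compute_statistics.py` rather than from the data itself.

```python
# Sketch only; mean_/scale_ would normally come from the precomputed stats file.
import numpy as np
from TTS.utils.data import StandardScaler

mel = np.random.rand(123, 80)                  # (frames, num_mels) placeholder
scaler = StandardScaler()
scaler.set_stats(mel.mean(axis=0), mel.std(axis=0))
mel_norm = scaler.transform(mel.copy())        # zero-mean, unit-variance frames
mel_back = scaler.inverse_transform(mel_norm.copy())  # ~= the original mel
```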
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index f6c38530..1c7dd5e4 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -1,31 +1,12 @@
import os
-import re
import glob
+import torch
import shutil
import datetime
-import json
-import torch
import subprocess
import importlib
import numpy as np
-from collections import OrderedDict, Counter
-
-
-class AttrDict(dict):
- def __init__(self, *args, **kwargs):
- super(AttrDict, self).__init__(*args, **kwargs)
- self.__dict__ = self
-
-
-def load_config(config_path):
- config = AttrDict()
- with open(config_path, "r") as f:
- input_str = f.read()
- input_str = re.sub(r'\\\n', '', input_str)
- input_str = re.sub(r'//.*\n', '\n', input_str)
- data = json.loads(input_str)
- config.update(data)
- return config
+from collections import Counter
def get_git_branch():
@@ -60,10 +41,10 @@ def get_commit_hash():
def create_experiment_folder(root_path, model_name, debug):
""" Create a folder with the current date and time """
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
- # if debug:
- # commit_hash = 'debug'
- # else:
- commit_hash = get_commit_hash()
+ if debug:
+ commit_hash = 'debug'
+ else:
+ commit_hash = get_commit_hash()
output_folder = os.path.join(
root_path, model_name + '-' + date_str + '-' + commit_hash)
os.makedirs(output_folder, exist_ok=True)
@@ -77,161 +58,39 @@ def remove_experiment_folder(experiment_path):
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
if not checkpoint_files:
if os.path.exists(experiment_path):
- shutil.rmtree(experiment_path)
+ shutil.rmtree(experiment_path, ignore_errors=True)
print(" ! Run is removed from {}".format(experiment_path))
else:
print(" ! Run is kept in {}".format(experiment_path))
-def copy_config_file(config_file, out_path, new_fields):
- config_lines = open(config_file, "r").readlines()
- # add extra information fields
- for key, value in new_fields.items():
- if type(value) == str:
- new_line = '"{}":"{}",\n'.format(key, value)
- else:
- new_line = '"{}":{},\n'.format(key, value)
- config_lines.insert(1, new_line)
- config_out_file = open(out_path, "w")
- config_out_file.writelines(config_lines)
- config_out_file.close()
-
-
-def _trim_model_state_dict(state_dict):
- r"""Remove 'module.' prefix from state dictionary. It is necessary as it
- is loded for the next time by model.load_state(). Otherwise, it complains
- about the torch.DataParallel()"""
-
- new_state_dict = OrderedDict()
- for k, v in state_dict.items():
- name = k[7:] # remove `module.`
- new_state_dict[name] = v
- return new_state_dict
-
-
-def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path,
- current_step, epoch):
- checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
- checkpoint_path = os.path.join(out_path, checkpoint_path)
- print(" | | > Checkpoint saving : {}".format(checkpoint_path))
-
- new_state_dict = model.state_dict()
- state = {
- 'model': new_state_dict,
- 'optimizer': optimizer.state_dict() if optimizer is not None else None,
- 'step': current_step,
- 'epoch': epoch,
- 'linear_loss': model_loss,
- 'date': datetime.date.today().strftime("%B %d, %Y"),
- 'r': model.decoder.r
- }
- torch.save(state, checkpoint_path)
-
-
-def save_best_model(model, optimizer, model_loss, best_loss, out_path,
- current_step, epoch):
- if model_loss < best_loss:
- new_state_dict = model.state_dict()
- state = {
- 'model': new_state_dict,
- 'optimizer': optimizer.state_dict(),
- 'step': current_step,
- 'epoch': epoch,
- 'linear_loss': model_loss,
- 'date': datetime.date.today().strftime("%B %d, %Y"),
- 'r': model.decoder.r
- }
- best_loss = model_loss
- bestmodel_path = 'best_model.pth.tar'
- bestmodel_path = os.path.join(out_path, bestmodel_path)
- print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
- model_loss, bestmodel_path))
- torch.save(state, bestmodel_path)
- return best_loss
-
-
-def check_update(model, grad_clip, ignore_stopnet=False):
- r'''Check model gradient against unexpected jumps and failures'''
- skip_flag = False
- if ignore_stopnet:
- grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
- else:
- grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
- if np.isinf(grad_norm):
- print(" | > Gradient is INF !!")
- skip_flag = True
- return grad_norm, skip_flag
-
-
-def lr_decay(init_lr, global_step, warmup_steps):
- r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py'''
- warmup_steps = float(warmup_steps)
- step = global_step + 1.
- lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5,
- step**-0.5)
- return lr
-
-
-def adam_weight_decay(optimizer):
- """
- Custom weight decay operation, not effecting grad values.
- """
- for group in optimizer.param_groups:
- for param in group['params']:
- current_lr = group['lr']
- weight_decay = group['weight_decay']
- param.data = param.data.add(-weight_decay * group['lr'],
- param.data)
- return optimizer, current_lr
-
-# pylint: disable=dangerous-default-value
-def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
- """
- Skip biases, BatchNorm parameters, rnns.
- and attention projection layer v
- """
- decay = []
- no_decay = []
- for name, param in model.named_parameters():
- if not param.requires_grad:
- continue
-
- if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
- no_decay.append(param)
- else:
- decay.append(param)
- return [{
- 'params': no_decay,
- 'weight_decay': 0.
- }, {
- 'params': decay,
- 'weight_decay': weight_decay
- }]
-
-
-class NoamLR(torch.optim.lr_scheduler._LRScheduler):
- def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
- self.warmup_steps = float(warmup_steps)
- super(NoamLR, self).__init__(optimizer, last_epoch)
-
- def get_lr(self):
- step = max(self.last_epoch, 1)
- return [
- base_lr * self.warmup_steps**0.5 *
- min(step * self.warmup_steps**-1.5, step**-0.5)
- for base_lr in self.base_lrs
- ]
-
-
-def mk_decay(init_mk, max_epoch, n_epoch):
- return init_mk * ((max_epoch - n_epoch) / max_epoch)
-
-
def count_parameters(model):
r"""Count number of trainable parameters in a network"""
return sum(p.numel() for p in model.parameters() if p.requires_grad)
+def split_dataset(items):
+ is_multi_speaker = False
+ speakers = [item[-1] for item in items]
+ is_multi_speaker = len(set(speakers)) > 1
+ eval_split_size = 500 if len(items) * 0.01 > 500 else int(
+ len(items) * 0.01)
+ np.random.seed(0)
+ np.random.shuffle(items)
+ if is_multi_speaker:
+ items_eval = []
+ # TODO: inefficient; keeps at least one sample per speaker in the training split
+ while len(items_eval) < eval_split_size:
+ speakers = [item[-1] for item in items]
+ speaker_counter = Counter(speakers)
+ item_idx = np.random.randint(0, len(items))
+ if speaker_counter[items[item_idx][-1]] > 1:
+ items_eval.append(items[item_idx])
+ del items[item_idx]
+ return items_eval, items
+ return items[:eval_split_size], items[eval_split_size:]
+
+
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
if max_len is None:
@@ -240,7 +99,7 @@ def sequence_mask(sequence_length, max_len=None):
seq_range = torch.arange(0, max_len).long()
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
if sequence_length.is_cuda:
- seq_range_expand = seq_range_expand.cuda()
+ seq_range_expand = seq_range_expand.to(sequence_length.device)
seq_length_expand = (
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
# B x T_max
@@ -322,44 +181,6 @@ def setup_model(num_chars, num_speakers, c):
bidirectional_decoder=c.bidirectional_decoder)
return model
-
-def split_dataset(items):
- is_multi_speaker = False
- speakers = [item[-1] for item in items]
- is_multi_speaker = len(set(speakers)) > 1
- eval_split_size = 500 if len(items) * 0.01 > 500 else int(
- len(items) * 0.01)
- np.random.seed(0)
- np.random.shuffle(items)
- if is_multi_speaker:
- items_eval = []
- # most stupid code ever -- Fix it !
- while len(items_eval) < eval_split_size:
- speakers = [item[-1] for item in items]
- speaker_counter = Counter(speakers)
- item_idx = np.random.randint(0, len(items))
- if speaker_counter[items[item_idx][-1]] > 1:
- items_eval.append(items[item_idx])
- del items[item_idx]
- return items_eval, items
- else:
- return items[:eval_split_size], items[eval_split_size:]
-
-
-def gradual_training_scheduler(global_step, config):
- """Setup the gradual training schedule wrt number
- of active GPUs"""
- num_gpus = torch.cuda.device_count()
- if num_gpus == 0:
- num_gpus = 1
- new_values = None
- # we set the scheduling wrt num_gpus
- for values in config.gradual_training:
- if global_step * num_gpus >= values[0]:
- new_values = values
- return new_values[1], new_values[2]
-
-
class KeepAverage():
def __init__(self):
self.avg_values = {}
@@ -368,6 +189,9 @@ class KeepAverage():
def __getitem__(self, key):
return self.avg_values[key]
+ def items(self):
+ return self.avg_values.items()
+
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
@@ -496,7 +320,9 @@ def check_config(c):
_check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
# dataloading
- _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners'])
+ # pylint: disable=import-outside-toplevel
+ from TTS.utils.text import cleaners
+ _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
_check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
_check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
_check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
@@ -518,4 +344,4 @@ def check_config(c):
_check_argument('name', dataset_entry, restricted=True, val_type=str)
_check_argument('path', dataset_entry, restricted=True, val_type=str)
_check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
- _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
\ No newline at end of file
+ _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
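
A short sketch of the relocated `split_dataset` helper; the metadata entries are placeholders assumed to follow the `(text, wav_path, speaker_name)` convention used by the dataset preprocessors.

```python
# Sketch only; metadata entries are placeholders.
from TTS.utils.generic_utils import split_dataset

items = [("hello world", "wavs/%04d.wav" % i, "ljspeech") for i in range(1000)]
eval_items, train_items = split_dataset(items)
print(len(eval_items), len(train_items))  # 10 990 -- 1% of a single-speaker set
```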
diff --git a/utils/io.py b/utils/io.py
new file mode 100644
index 00000000..faf00195
--- /dev/null
+++ b/utils/io.py
@@ -0,0 +1,78 @@
+import os
+import json
+import re
+import torch
+import datetime
+
+
+class AttrDict(dict):
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+def load_config(config_path):
+ config = AttrDict()
+ with open(config_path, "r") as f:
+ input_str = f.read()
+ input_str = re.sub(r'\\\n', '', input_str)
+ input_str = re.sub(r'//.*\n', '\n', input_str)
+ data = json.loads(input_str)
+ config.update(data)
+ return config
+
+
+def copy_config_file(config_file, out_path, new_fields):
+ config_lines = open(config_file, "r").readlines()
+ # add extra information fields
+ for key, value in new_fields.items():
+ if isinstance(value, str):
+ new_line = '"{}":"{}",\n'.format(key, value)
+ else:
+ new_line = '"{}":{},\n'.format(key, value)
+ config_lines.insert(1, new_line)
+ config_out_file = open(out_path, "w")
+ config_out_file.writelines(config_lines)
+ config_out_file.close()
+
+
+def load_checkpoint(model, checkpoint_path, use_cuda=False):
+ state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+ model.load_state_dict(state['model'])
+ if use_cuda:
+ model.cuda()
+ # set model stepsize
+ if 'r' in state.keys():
+ model.decoder.set_r(state['r'])
+ return model, state
+
+
+def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
+ new_state_dict = model.state_dict()
+ state = {
+ 'model': new_state_dict,
+ 'optimizer': optimizer.state_dict() if optimizer is not None else None,
+ 'step': current_step,
+ 'epoch': epoch,
+ 'date': datetime.date.today().strftime("%B %d, %Y"),
+ 'r': r
+ }
+ state.update(kwargs)
+ torch.save(state, output_path)
+
+
+def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
+ file_name = 'checkpoint_{}.pth.tar'.format(current_step)
+ checkpoint_path = os.path.join(output_folder, file_name)
+ print(" > CHECKPOINT : {}".format(checkpoint_path))
+ save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs)
+
+
+def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
+ if target_loss < best_loss:
+ file_name = 'best_model.pth.tar'
+ checkpoint_path = os.path.join(output_folder, file_name)
+ print(" > BEST MODEL : {}".format(checkpoint_path))
+ save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
+ best_loss = target_loss
+ return best_loss
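
A sketch of how the new checkpointing helpers fit together; `model` and `optimizer` are assumed to be a Tacotron-style model (exposing `model.decoder.set_r()`) and its optimizer.

```python
# Sketch only; `model` and `optimizer` are assumed to already exist.
from TTS.utils.io import save_checkpoint, save_best_model, load_checkpoint

save_checkpoint(model, optimizer, current_step=1000, epoch=3, r=2,
                output_folder="/tmp/run")  # -> /tmp/run/checkpoint_1000.pth.tar
best_loss = save_best_model(target_loss=0.42, best_loss=float('inf'),
                            model=model, optimizer=optimizer,
                            current_step=1000, epoch=3, r=2,
                            output_folder="/tmp/run")
model, state = load_checkpoint(model, "/tmp/run/best_model.pth.tar")
print(state['step'], state['r'], state['model_loss'])
```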
diff --git a/utils/radam.py b/utils/radam.py
index 62ecc695..4724b705 100644
--- a/utils/radam.py
+++ b/utils/radam.py
@@ -1,17 +1,31 @@
+# from https://github.com/LiyuanLucasLiu/RAdam
+
import math
import torch
-from torch.optim.optimizer import Optimizer
+from torch.optim.optimizer import Optimizer, required
-# adapted from https://github.com/LiyuanLucasLiu/RAdam
class RAdam(Optimizer):
- def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
- defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
- self.buffer = [[None, None, None] for ind in range(10)]
+ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
+ if lr < 0.0:
+ raise ValueError("Invalid learning rate: {}".format(lr))
+ if eps < 0.0:
+ raise ValueError("Invalid epsilon value: {}".format(eps))
+ if not 0.0 <= betas[0] < 1.0:
+ raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ if not 0.0 <= betas[1] < 1.0:
+ raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+
+ self.degenerated_to_sgd = degenerated_to_sgd
+ if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
+ for param in params:
+ if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
+ param['buffer'] = [[None, None, None] for _ in range(10)]
+ defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
super(RAdam, self).__init__(params, defaults)
- def __setstate__(self, state): # pylint: disable= useless-super-delegation
+ def __setstate__(self, state):
super(RAdam, self).__setstate__(state)
def step(self, closure=None):
@@ -27,128 +41,57 @@ class RAdam(Optimizer):
continue
grad = p.grad.data.float()
if grad.is_sparse:
- raise RuntimeError(
- 'RAdam does not support sparse gradients')
+ raise RuntimeError('RAdam does not support sparse gradients')
p_data_fp32 = p.data.float()
state = self.state[p]
- if not state:
+ if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
- state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
- p_data_fp32)
+ state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
- exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
- exp_avg.mul_(beta1).add_(1 - beta1, grad)
+ exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+ exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
state['step'] += 1
- buffered = self.buffer[int(state['step'] % 10)]
+ buffered = group['buffer'][int(state['step'] % 10)]
if state['step'] == buffered[0]:
N_sma, step_size = buffered[1], buffered[2]
else:
buffered[0] = state['step']
beta2_t = beta2 ** state['step']
N_sma_max = 2 / (1 - beta2) - 1
- N_sma = N_sma_max - 2 * \
- state['step'] * beta2_t / (1 - beta2_t)
+ N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
buffered[1] = N_sma
# more conservative since it's an approximated value
if N_sma >= 5:
- step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
- N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
+ step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
+ elif self.degenerated_to_sgd:
+ step_size = 1.0 / (1 - beta1 ** state['step'])
else:
- step_size = group['lr'] / (1 - beta1 ** state['step'])
+ step_size = -1
buffered[2] = step_size
- if group['weight_decay'] != 0:
- p_data_fp32.add_(-group['weight_decay']
- * group['lr'], p_data_fp32)
-
# more conservative since it's an approximated value
if N_sma >= 5:
+ if group['weight_decay'] != 0:
+ p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
denom = exp_avg_sq.sqrt().add_(group['eps'])
- p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
- else:
- p_data_fp32.add_(-step_size, exp_avg)
-
- p.data.copy_(p_data_fp32)
-
- return loss
-
-
-class PlainRAdam(Optimizer):
-
- def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
- defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-
- super(PlainRAdam, self).__init__(params, defaults)
-
- def __setstate__(self, state): # pylint: disable= useless-super-delegation
- super(PlainRAdam, self).__setstate__(state)
-
- def step(self, closure=None):
-
- loss = None
- if closure is not None:
- loss = closure()
-
- for group in self.param_groups:
-
- for p in group['params']:
- if p.grad is None:
- continue
- grad = p.grad.data.float()
- if grad.is_sparse:
- raise RuntimeError(
- 'RAdam does not support sparse gradients')
-
- p_data_fp32 = p.data.float()
-
- state = self.state[p]
-
- if not state:
- state['step'] = 0
- state['exp_avg'] = torch.zeros_like(p_data_fp32)
- state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
- else:
- state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
- state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
- p_data_fp32)
-
- exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
- beta1, beta2 = group['betas']
-
- exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
- exp_avg.mul_(beta1).add_(1 - beta1, grad)
-
- state['step'] += 1
- beta2_t = beta2 ** state['step']
- N_sma_max = 2 / (1 - beta2) - 1
- N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
-
- if group['weight_decay'] != 0:
- p_data_fp32.add_(-group['weight_decay']
- * group['lr'], p_data_fp32)
-
- # more conservative since it's an approximated value
- if N_sma >= 5:
- step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
- N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
- denom = exp_avg_sq.sqrt().add_(group['eps'])
- p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
- else:
- step_size = group['lr'] / (1 - beta1 ** state['step'])
- p_data_fp32.add_(-step_size, exp_avg)
-
- p.data.copy_(p_data_fp32)
+ p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
+ p.data.copy_(p_data_fp32)
+ elif step_size > 0:
+ if group['weight_decay'] != 0:
+ p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+ p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
+ p.data.copy_(p_data_fp32)
return loss
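
The updated RAdam stays a drop-in replacement for `torch.optim.Adam`; a minimal sanity-check sketch:

```python
# Sketch only: one optimization step with the updated RAdam.
import torch
from TTS.utils.radam import RAdam

model = torch.nn.Linear(10, 1)
optimizer = RAdam(model.parameters(), lr=1e-3, weight_decay=1e-6)
loss = torch.nn.functional.mse_loss(model(torch.randn(8, 10)), torch.randn(8, 1))
optimizer.zero_grad()
loss.backward()
optimizer.step()
```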
diff --git a/utils/synthesis.py b/utils/synthesis.py
index 1047c16b..a53c12dc 100644
--- a/utils/synthesis.py
+++ b/utils/synthesis.py
@@ -1,9 +1,13 @@
+import pkg_resources
+installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
+if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
+ import tensorflow as tf
import torch
import numpy as np
from .text import text_to_sequence, phoneme_to_sequence
-def text_to_seqvec(text, CONFIG, use_cuda):
+def text_to_seqvec(text, CONFIG):
text_cleaner = [CONFIG.text_cleaner]
# text ot phonemes to sequence vector
if CONFIG.use_phonemes:
@@ -14,23 +18,32 @@ def text_to_seqvec(text, CONFIG, use_cuda):
dtype=np.int32)
else:
seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
- # torch tensor
- chars_var = torch.from_numpy(seq).unsqueeze(0)
- if use_cuda:
- chars_var = chars_var.cuda()
- return chars_var.long()
+ return seq
-def compute_style_mel(style_wav, ap, use_cuda):
- print(style_wav)
- style_mel = torch.FloatTensor(ap.melspectrogram(
- ap.load_wav(style_wav))).unsqueeze(0)
- if use_cuda:
- return style_mel.cuda()
+def numpy_to_torch(np_array, dtype, cuda=False):
+ if np_array is None:
+ return None
+ tensor = torch.as_tensor(np_array, dtype=dtype)
+ if cuda:
+ return tensor.cuda()
+ return tensor
+
+
+def numpy_to_tf(np_array, dtype):
+ if np_array is None:
+ return None
+ tensor = tf.convert_to_tensor(np_array, dtype=dtype)
+ return tensor
+
+
+def compute_style_mel(style_wav, ap):
+ style_mel = np.expand_dims(ap.melspectrogram(ap.load_wav(style_wav)), 0)
return style_mel
-def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
+def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
if CONFIG.use_gst:
decoder_output, postnet_output, alignments, stop_tokens = model.inference(
inputs, style_mel=style_mel, speaker_ids=speaker_id)
@@ -44,11 +57,33 @@ def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None)
return decoder_output, postnet_output, alignments, stop_tokens
-def parse_outputs(postnet_output, decoder_output, alignments):
+def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
+ if CONFIG.use_gst and style_mel is not None:
+ raise NotImplementedError(' [!] GST inference not implemented for TF')
+ if truncated:
+ raise NotImplementedError(' [!] Truncated inference not implemented for TF')
+ if speaker_id is not None:
+ raise NotImplementedError(' [!] Multi-Speaker not implemented for TF')
+ # TODO: handle multispeaker case
+ decoder_output, postnet_output, alignments, stop_tokens = model(
+ inputs, training=False)
+ return decoder_output, postnet_output, alignments, stop_tokens
+
+
+def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens):
postnet_output = postnet_output[0].data.cpu().numpy()
decoder_output = decoder_output[0].data.cpu().numpy()
alignment = alignments[0].cpu().data.numpy()
- return postnet_output, decoder_output, alignment
+ stop_tokens = stop_tokens[0].cpu().numpy()
+ return postnet_output, decoder_output, alignment, stop_tokens
+
+
+def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens):
+ postnet_output = postnet_output[0].numpy()
+ decoder_output = decoder_output[0].numpy()
+ alignment = alignments[0].numpy()
+ stop_tokens = stop_tokens[0].numpy()
+ return postnet_output, decoder_output, alignment, stop_tokens
def trim_silence(wav, ap):
@@ -59,7 +94,7 @@ def inv_spectrogram(postnet_output, ap, CONFIG):
if CONFIG.model in ["Tacotron", "TacotronGST"]:
wav = ap.inv_spectrogram(postnet_output.T)
else:
- wav = ap.inv_mel_spectrogram(postnet_output.T)
+ wav = ap.inv_melspectrogram(postnet_output.T)
return wav
@@ -98,7 +133,8 @@ def synthesis(model,
truncated=False,
enable_eos_bos_chars=False, #pylint: disable=unused-argument
use_griffin_lim=False,
- do_trim_silence=False):
+ do_trim_silence=False,
+ backend='torch'):
"""Synthesize voice for the given text.
Args:
@@ -114,22 +150,37 @@ def synthesis(model,
for continuous inference at long texts.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
do_trim_silence (bool): trim silence after synthesis.
+ backend (str): tf or torch
"""
# GST processing
style_mel = None
if CONFIG.model == "TacotronGST" and style_wav is not None:
- style_mel = compute_style_mel(style_wav, ap, use_cuda)
+ style_mel = compute_style_mel(style_wav, ap)
# preprocess the given text
- inputs = text_to_seqvec(text, CONFIG, use_cuda)
- speaker_id = id_to_torch(speaker_id)
- if speaker_id is not None and use_cuda:
- speaker_id = speaker_id.cuda()
+ inputs = text_to_seqvec(text, CONFIG)
+ # pass tensors to backend
+ if backend == 'torch':
+ speaker_id = id_to_torch(speaker_id)
+ style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
+ inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
+ inputs = inputs.unsqueeze(0)
+ else:
+ # TODO: handle speaker id for tf model
+ style_mel = numpy_to_tf(style_mel, tf.float32)
+ inputs = numpy_to_tf(inputs, tf.int32)
+ inputs = tf.expand_dims(inputs, 0)
# synthesize voice
- decoder_output, postnet_output, alignments, stop_tokens = run_model(
- model, inputs, CONFIG, truncated, speaker_id, style_mel)
+ if backend == 'torch':
+ decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
+ model, inputs, CONFIG, truncated, speaker_id, style_mel)
+ postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
+ postnet_output, decoder_output, alignments, stop_tokens)
+ else:
+ decoder_output, postnet_output, alignments, stop_tokens = run_model_tf(
+ model, inputs, CONFIG, truncated, speaker_id, style_mel)
+ postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf(
+ postnet_output, decoder_output, alignments, stop_tokens)
# convert outputs to numpy
- postnet_output, decoder_output, alignment = parse_outputs(
- postnet_output, decoder_output, alignments)
# plot results
wav = None
if use_griffin_lim:
@@ -137,4 +188,4 @@ def synthesis(model,
# trim silence
if do_trim_silence:
wav = trim_silence(wav, ap)
- return wav, alignment, decoder_output, postnet_output, stop_tokens
+ return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs
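
A hedged sketch of the backend-aware `synthesis()` entry point; `model`, `CONFIG` and `ap` are assumed to be a loaded Tacotron model, its config and its `AudioProcessor`, and the positional order `(model, text, CONFIG, use_cuda, ap)` is assumed from the (partially shown) signature above.

```python
# Sketch only; model, CONFIG and ap are assumed to be already loaded.
from TTS.utils.synthesis import synthesis

wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
    model, "Hello world.", CONFIG, False, ap,
    use_griffin_lim=True, do_trim_silence=True, backend='torch')
ap.save_wav(wav, "hello.wav")
```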
diff --git a/utils/logger.py b/utils/tensorboard_logger.py
similarity index 95%
rename from utils/logger.py
rename to utils/tensorboard_logger.py
index 51a10422..15fe04e4 100644
--- a/utils/logger.py
+++ b/utils/tensorboard_logger.py
@@ -2,7 +2,7 @@ import traceback
from tensorboardX import SummaryWriter
-class Logger(object):
+class TensorboardLogger(object):
def __init__(self, log_dir):
self.writer = SummaryWriter(log_dir)
self.train_stats = {}
@@ -75,3 +75,6 @@ class Logger(object):
def tb_test_figures(self, step, figures):
self.dict_to_tb_figure("TestFigures", figures, step)
+
+ def tb_add_text(self, title, text, step):
+ self.writer.add_text(title, text, step)
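
The renamed logger gains a plain-text writer, used in train.py above to log the run description; a minimal sketch:

```python
# Sketch only; writes the run description alongside the usual scalar logs.
from TTS.utils.tensorboard_logger import TensorboardLogger

tb_logger = TensorboardLogger("/tmp/run/logs")
tb_logger.tb_add_text('model-description', 'placeholder run description', 0)
```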
diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py
index e6b611b4..35da8aef 100644
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@@ -91,6 +91,15 @@ def transliteration_cleaners(text):
return text
+# TODO: elaborate it
+def basic_turkish_cleaners(text):
+ '''Pipeline for Turkish text'''
+ text = text.replace("I", "ı")
+ text = lowercase(text)
+ text = collapse_whitespace(text)
+ return text
+
+
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
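
The new Turkish cleaner only maps dotless "I" before the generic lowercase/whitespace steps; for example:

```python
# Sketch only: dotless "I" is mapped before lowercasing, so casing survives.
from TTS.utils.text.cleaners import basic_turkish_cleaners

print(basic_turkish_cleaners("Işıl  IŞIK"))  # -> "ışıl ışık"
```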
diff --git a/utils/training.py b/utils/training.py
new file mode 100644
index 00000000..ebf8fd13
--- /dev/null
+++ b/utils/training.py
@@ -0,0 +1,91 @@
+import torch
+import numpy as np
+
+
+def check_update(model, grad_clip, ignore_stopnet=False):
+ r'''Check model gradient against unexpected jumps and failures'''
+ skip_flag = False
+ if ignore_stopnet:
+ grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
+ else:
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+ if torch.isinf(grad_norm):
+ print(" | > Gradient is INF !!")
+ skip_flag = True
+ return grad_norm, skip_flag
+
+
+def lr_decay(init_lr, global_step, warmup_steps):
+ r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py'''
+ warmup_steps = float(warmup_steps)
+ step = global_step + 1.
+ lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5,
+ step**-0.5)
+ return lr
+
+
+def adam_weight_decay(optimizer):
+ """
+ Custom weight decay operation, not affecting gradient values.
+ """
+ for group in optimizer.param_groups:
+ for param in group['params']:
+ current_lr = group['lr']
+ weight_decay = group['weight_decay']
+ factor = -weight_decay * group['lr']
+ param.data = param.data.add(param.data,
+ alpha=factor)
+ return optimizer, current_lr
+
+# pylint: disable=dangerous-default-value
+def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}):
+ """
+ Skip weight decay for biases, BatchNorm parameters, RNNs,
+ and the attention projection layer v.
+ """
+ decay = []
+ no_decay = []
+ for name, param in model.named_parameters():
+ if not param.requires_grad:
+ continue
+
+ if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]):
+ no_decay.append(param)
+ else:
+ decay.append(param)
+ return [{
+ 'params': no_decay,
+ 'weight_decay': 0.
+ }, {
+ 'params': decay,
+ 'weight_decay': weight_decay
+ }]
+
+
+# pylint: disable=protected-access
+class NoamLR(torch.optim.lr_scheduler._LRScheduler):
+ def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
+ self.warmup_steps = float(warmup_steps)
+ super(NoamLR, self).__init__(optimizer, last_epoch)
+
+ def get_lr(self):
+ step = max(self.last_epoch, 1)
+ return [
+ base_lr * self.warmup_steps**0.5 *
+ min(step * self.warmup_steps**-1.5, step**-0.5)
+ for base_lr in self.base_lrs
+ ]
+
+
+def gradual_training_scheduler(global_step, config):
+ """Setup the gradual training schedule wrt number
+ of active GPUs"""
+ num_gpus = torch.cuda.device_count()
+ if num_gpus == 0:
+ num_gpus = 1
+ new_values = None
+ # we set the scheduling wrt num_gpus
+ for values in config.gradual_training:
+ if global_step * num_gpus >= values[0]:
+ new_values = values
+ return new_values[1], new_values[2]
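
The optimizer utilities moved here keep their previous behaviour; a short sketch of the gradient-clipping / Noam-warmup pattern used in train.py:

```python
# Sketch only: clip gradients, skip the update on INF norms, warm up the LR.
import torch
from TTS.utils.training import check_update, NoamLR

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = NoamLR(optimizer, warmup_steps=4000)

loss = model(torch.randn(4, 10)).sum()
loss.backward()
grad_norm, skip = check_update(model, grad_clip=1.0)
if not skip:
    optimizer.step()
scheduler.step()
```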
diff --git a/utils/visual.py b/utils/visual.py
index 1cb9ac5d..87fbc8e4 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -32,51 +32,55 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
linear_output_ = linear_output.detach().cpu().numpy().squeeze()
else:
linear_output_ = linear_output
- spectrogram = audio._denormalize(linear_output_) # pylint: disable=protected-access
+ spectrogram = audio._denormalize(linear_output_.T) # pylint: disable=protected-access
fig = plt.figure(figsize=fig_size)
- plt.imshow(spectrogram.T, aspect="auto", origin="lower")
+ plt.imshow(spectrogram, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
return fig
-def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None):
- if spectrogram is not None:
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
+ if decoder_output is not None:
num_plot = 4
else:
num_plot = 3
label_fontsize = 16
- fig = plt.figure(figsize=(8, 24))
+ fig = plt.figure(figsize=figsize)
plt.subplot(num_plot, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+ # compute phoneme representation and back
if CONFIG.use_phonemes:
seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
print(text)
-
plt.yticks(range(len(text)), list(text))
plt.colorbar()
-
- stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
+ # plot stopnet predictions
plt.subplot(num_plot, 1, 2)
plt.plot(range(len(stop_tokens)), list(stop_tokens))
-
+ # plot postnet spectrogram
plt.subplot(num_plot, 1, 3)
- librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
- hop_length=hop_length, x_axis="time", y_axis="linear")
+ librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
+ hop_length=hop_length, x_axis="time", y_axis="linear",
+ fmin=CONFIG.audio['mel_fmin'],
+ fmax=CONFIG.audio['mel_fmax'])
+
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
- if spectrogram is not None:
+ if decoder_output is not None:
plt.subplot(num_plot, 1, 4)
- librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
- hop_length=hop_length, x_axis="time", y_axis="linear")
+ librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
+ hop_length=hop_length, x_axis="time", y_axis="linear",
+ fmin=CONFIG.audio['mel_fmin'],
+ fmax=CONFIG.audio['mel_fmax'])
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()