Mass refactoring

This commit is contained in:
erogol 2020-07-16 15:05:36 +02:00
parent 3bc38517aa
commit 82dd465365
148 changed files with 3699 additions and 13888 deletions

View File

@@ -17,5 +17,9 @@ matrix:
python: "3.6"
install: pip install --quiet -r requirements_tests.txt
env: TEST_SUITE="unittest"
- name: "Unit tests"
python: "3.6"
install: pip install --quiet -r requirements_tests.txt
env: TEST_SUITE="testscripts"
script: ./.travis/script

View File

@@ -14,9 +14,15 @@ if [[ "$TEST_SUITE" == "unittest" ]]; then
pushd tts_namespace
nosetests TTS.speaker_encoder.tests --nocapture
nosetests TTS.vocoder.tests --nocapture
nosetests TTS.tests --nocapture
nosetests TTS.tf.tests --nocapture
nosetests TTS.tts.tests --nocapture
nosetests TTS.tts.tf.tests --nocapture
popd
# Test server package
./tests/test_server_package.sh
fi
if [[ "$TEST_SUITE" == "testscripts" ]]; then
# Test server package
./tts/tests/test_server_package.sh
# test model training scripts
./tts/tests/test_tts_train.sh
./vocoder/tests/test_vocoder_train.sh
fi

View File

@@ -1,85 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import argparse
import numpy as np
from tqdm import tqdm
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(
description="Compute mean and variance of spectrogtram features.")
parser.add_argument("--config_path", type=str, required=True,
help="TTS config file path to define audio processin parameters.")
parser.add_argument("--out_path", default=None, type=str,
help="directory to save the output file.")
args = parser.parse_args()
# load config
CONFIG = load_config(args.config_path)
CONFIG.audio['signal_norm'] = False # do not apply earlier normalization
CONFIG.audio['stats_path'] = None # discard pre-defined stats
# load audio processor
ap = AudioProcessor(**CONFIG.audio)
# load the meta data of target dataset
dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data
print(f" > There are {len(dataset_items)} files.")
mel_sum = 0
mel_square_sum = 0
linear_sum = 0
linear_square_sum = 0
N = 0
for item in tqdm(dataset_items):
# compute features
wav = ap.load_wav(item[1])
linear = ap.spectrogram(wav)
mel = ap.melspectrogram(wav)
# compute stats
N += mel.shape[1]
mel_sum += mel.sum(1)
linear_sum += linear.sum(1)
mel_square_sum += (mel ** 2).sum(axis=1)
linear_square_sum += (linear ** 2).sum(axis=1)
mel_mean = mel_sum / N
mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
linear_mean = linear_sum / N
linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)
output_file_path = os.path.join(args.out_path, "scale_stats.npy")
stats = {}
stats['mel_mean'] = mel_mean
stats['mel_std'] = mel_scale
stats['linear_mean'] = linear_mean
stats['linear_std'] = linear_scale
print(f' > Avg mel spec mean: {mel_mean.mean()}')
print(f' > Avg mel spec scale: {mel_scale.mean()}')
print(f' > Avg linear spec mean: {linear_mean.mean()}')
print(f' > Avg linear spec scale: {linear_scale.mean()}')
# set default config values for mean-var scaling
CONFIG.audio['stats_path'] = output_file_path
CONFIG.audio['signal_norm'] = True
# remove redundant values
del CONFIG.audio['max_norm']
del CONFIG.audio['min_level_db']
del CONFIG.audio['symmetric_norm']
del CONFIG.audio['clip_norm']
stats['audio_config'] = CONFIG.audio
np.save(output_file_path, stats, allow_pickle=True)
print(f' > scale_stats.npy is saved to {output_file_path}')
if __name__ == "__main__":
main()
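
Illustrative sketch, not from this commit: once scale_stats.npy is written, the saved statistics can be used for mean-variance normalization of mel spectrograms computed with the same AudioProcessor settings. The helper names below are assumptions.

import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel_mean, mel_std = stats['mel_mean'], stats['mel_std']

def normalize_mel(mel):
    # zero-mean, unit-variance scaling per mel band (mel: [num_mel, T])
    return (mel - mel_mean[:, None]) / mel_std[:, None]

def denormalize_mel(mel_norm):
    # inverse transform back to the raw mel scale
    return mel_norm * mel_std[:, None] + mel_mean[:, None]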

View File

@@ -1,240 +0,0 @@
import os
import numpy as np
import collections
import torch
import random
from torch.utils.data import Dataset
from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target
class MyDataset(Dataset):
def __init__(self,
outputs_per_step,
text_cleaner,
compute_linear_spec,
ap,
meta_data,
tp=None,
batch_group_size=0,
min_seq_len=0,
max_seq_len=float("inf"),
use_phonemes=True,
phoneme_cache_path=None,
phoneme_language="en-us",
enable_eos_bos=False,
verbose=False):
"""
Args:
outputs_per_step (int): number of time frames predicted per step.
text_cleaner (str): text cleaner used for the dataset.
compute_linear_spec (bool): compute linear spectrogram if True.
ap (TTS.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
batch_group_size (int): (0) range of batch randomization after sorting
sequences by length.
min_seq_len (int): (0) minimum sequence length to be processed
by the loader.
max_seq_len (int): (float("inf")) maximum sequence length.
use_phonemes (bool): (true) if true, text converted to phonemes.
phoneme_cache_path (str): path to cache phoneme features.
phoneme_language (str): one of the languages from
https://github.com/bootphon/phonemizer#languages
enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
verbose (bool): print diagnostic information.
"""
self.batch_group_size = batch_group_size
self.items = meta_data
self.outputs_per_step = outputs_per_step
self.sample_rate = ap.sample_rate
self.cleaners = text_cleaner
self.compute_linear_spec = compute_linear_spec
self.min_seq_len = min_seq_len
self.max_seq_len = max_seq_len
self.ap = ap
self.tp = tp
self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos
self.verbose = verbose
if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path, exist_ok=True)
if self.verbose:
print("\n > DataLoader initialization")
print(" | > Use phonemes: {}".format(self.use_phonemes))
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Number of instances : {}".format(len(self.items)))
self.sort_items()
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
return audio
@staticmethod
def load_np(filename):
data = np.load(filename).astype('float32')
return data
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
"""generate a phoneme sequence from text.
since the usage is for subsequent caching, we never add bos and
eos chars here. Instead we add those dynamically later; based on the
config option."""
phonemes = phoneme_to_sequence(text, [self.cleaners],
language=self.phoneme_language,
enable_eos_bos=False,
tp=self.tp)
phonemes = np.asarray(phonemes, dtype=np.int32)
np.save(cache_path, phonemes)
return phonemes
def _load_or_generate_phoneme_sequence(self, wav_file, text):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
cache_path = os.path.join(self.phoneme_cache_path,
file_name + '_phoneme.npy')
try:
phonemes = np.load(cache_path)
except FileNotFoundError:
phonemes = self._generate_and_cache_phoneme_sequence(text,
cache_path)
except (ValueError, IOError):
print(" > ERROR: failed loading phonemes for {}. "
"Recomputing.".format(wav_file))
phonemes = self._generate_and_cache_phoneme_sequence(text,
cache_path)
if self.enable_eos_bos:
phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
phonemes = np.asarray(phonemes, dtype=np.int32)
return phonemes
def load_data(self, idx):
text, wav_file, speaker_name = self.items[idx]
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text)
else:
text = np.asarray(
text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32)
assert text.size > 0, self.items[idx][1]
assert wav.size > 0, self.items[idx][1]
sample = {
'text': text,
'wav': wav,
'item_idx': self.items[idx][1],
'speaker_name': speaker_name
}
return sample
def sort_items(self):
r"""Sort instances based on text length in ascending order"""
lengths = np.array([len(ins[0]) for ins in self.items])
idxs = np.argsort(lengths)
new_items = []
ignored = []
for i, idx in enumerate(idxs):
length = lengths[idx]
if length < self.min_seq_len or length > self.max_seq_len:
ignored.append(idx)
else:
new_items.append(self.items[idx])
# shuffle batch groups
if self.batch_group_size > 0:
for i in range(len(new_items) // self.batch_group_size):
offset = i * self.batch_group_size
end_offset = offset + self.batch_group_size
temp_items = new_items[offset:end_offset]
random.shuffle(temp_items)
new_items[offset:end_offset] = temp_items
self.items = new_items
if self.verbose:
print(" | > Max length sequence: {}".format(np.max(lengths)))
print(" | > Min length sequence: {}".format(np.min(lengths)))
print(" | > Avg length sequence: {}".format(np.mean(lengths)))
print(" | > Num. instances discarded by max-min (max={}, min={}) seq limits: {}".format(
self.max_seq_len, self.min_seq_len, len(ignored)))
print(" | > Batch group size: {}.".format(self.batch_group_size))
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
return self.load_data(idx)
def collate_fn(self, batch):
r"""
Perform preprocessing and create a final data batch:
1. Sort batch instances by text-length
2. Convert Audio signal to Spectrograms.
3. PAD sequences wrt r.
4. Load to Torch.
"""
# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.Mapping):
text_lenghts = np.array([len(d["text"]) for d in batch])
# sort items with text input length for RNN efficiency
text_lenghts, ids_sorted_decreasing = torch.sort(
torch.LongTensor(text_lenghts), dim=0, descending=True)
wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing]
item_idxs = [
batch[idx]['item_idx'] for idx in ids_sorted_decreasing
]
text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
speaker_name = [batch[idx]['speaker_name']
for idx in ids_sorted_decreasing]
# compute features
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
mel_lengths = [m.shape[1] for m in mel]
# compute 'stop token' targets
stop_targets = [
np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths
]
# PAD stop targets
stop_targets = prepare_stop_target(stop_targets,
self.outputs_per_step)
# PAD sequences with longest instance in the batch
text = prepare_data(text).astype(np.int32)
# PAD features with longest instance
mel = prepare_tensor(mel, self.outputs_per_step)
# B x D x T --> B x T x D
mel = mel.transpose(0, 2, 1)
# convert things to pytorch
text_lenghts = torch.LongTensor(text_lenghts)
text = torch.LongTensor(text)
mel = torch.FloatTensor(mel).contiguous()
mel_lengths = torch.LongTensor(mel_lengths)
stop_targets = torch.FloatTensor(stop_targets)
# compute linear spectrogram
if self.compute_linear_spec:
linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
linear = prepare_tensor(linear, self.outputs_per_step)
linear = linear.transpose(0, 2, 1)
assert mel.shape[1] == linear.shape[1]
linear = torch.FloatTensor(linear).contiguous()
else:
linear = None
return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
stop_targets, item_idxs
raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
found {}".format(type(batch[0]))))

View File

@@ -1,207 +0,0 @@
import os
from glob import glob
import re
import sys
from TTS.utils.generic_utils import split_dataset
def load_meta_data(datasets):
meta_data_train_all = []
meta_data_eval_all = []
for dataset in datasets:
name = dataset['name']
root_path = dataset['path']
meta_file_train = dataset['meta_file_train']
meta_file_val = dataset['meta_file_val']
preprocessor = get_preprocessor_by_name(name)
meta_data_train = preprocessor(root_path, meta_file_train)
if meta_file_val is None:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
else:
meta_data_eval = preprocessor(root_path, meta_file_val)
meta_data_train_all += meta_data_train
meta_data_eval_all += meta_data_eval
return meta_data_train_all, meta_data_eval_all
def get_preprocessor_by_name(name):
"""Returns the respective preprocessing function."""
thismodule = sys.modules[__name__]
return getattr(thismodule, name.lower())
def tweb(root_path, meta_file):
"""Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "tweb"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('\t')
wav_file = os.path.join(root_path, cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
return items
# def kusal(root_path, meta_file):
# txt_file = os.path.join(root_path, meta_file)
# texts = []
# wavs = []
# with open(txt_file, "r", encoding="utf8") as f:
# frames = [
# line.split('\t') for line in f
# if line.split('\t')[0] in self.wav_files_dict.keys()
# ]
# # TODO: code the rest
# return {'text': texts, 'wavs': wavs}
def mozilla(root_path, meta_file):
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = cols[1].strip()
text = cols[0].strip()
wav_file = os.path.join(root_path, "wavs", wav_file)
items.append([text, wav_file, speaker_name])
return items
def mozilla_de(root_path, meta_file):
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, 'r', encoding="ISO 8859-1") as ttf:
for line in ttf:
cols = line.strip().split('|')
wav_file = cols[0].strip()
text = cols[1].strip()
folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
wav_file = os.path.join(root_path, folder_name, wav_file)
items.append([text, wav_file, speaker_name])
return items
def mailabs(root_path, meta_files=None):
"""Normalizes M-AI-Labs meta data files to TTS format"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
if meta_files is None:
csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
else:
csv_files = meta_files
# meta_files = [f.strip() for f in meta_files.split(",")]
items = []
for csv_file in csv_files:
txt_file = os.path.join(root_path, csv_file)
folder = os.path.dirname(txt_file)
# determine speaker based on folder structure...
speaker_name_match = speaker_regex.search(txt_file)
if speaker_name_match is None:
continue
speaker_name = speaker_name_match.group("speaker_name")
print(" | > {}".format(csv_file))
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
if meta_files is None:
wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
else:
wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
if os.path.isfile(wav_file):
text = cols[1].strip()
items.append([text, wav_file, speaker_name])
else:
raise RuntimeError("> File %s does not exist!"%(wav_file))
return items
def ljspeech(root_path, meta_file):
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ljspeech"
with open(txt_file, 'r') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
return items
def nancy(root_path, meta_file):
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "nancy"
with open(txt_file, 'r') as ttf:
for line in ttf:
utt_id = line.split()[1]
text = line[line.find('"') + 1:line.rfind('"') - 1]
wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
items.append([text, wav_file, speaker_name])
return items
def common_voice(root_path, meta_file):
"""Normalize the common voice meta data file to TTS format."""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, 'r') as ttf:
for line in ttf:
if line.startswith("client_id"):
continue
cols = line.split("\t")
text = cols[2]
speaker_name = cols[0]
wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
items.append([text, wav_file, speaker_name])
return items
def libri_tts(root_path, meta_files=None):
"""https://ai.google/tools/datasets/libri-tts/"""
items = []
if meta_files is None:
meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
for meta_file in meta_files:
_meta_file = os.path.basename(meta_file).split('.')[0]
speaker_name = _meta_file.split('_')[0]
chapter_id = _meta_file.split('_')[1]
_root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
with open(meta_file, 'r') as ttf:
for line in ttf:
cols = line.split('\t')
wav_file = os.path.join(_root_path, cols[0] + '.wav')
text = cols[1]
items.append([text, wav_file, speaker_name])
for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
return items
def custom_turkish(root_path, meta_file):
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "turkish-female"
skipped_files = []
with open(txt_file, 'r', encoding='utf-8') as ttf:
for line in ttf:
cols = line.split('|')
wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
if not os.path.exists(wav_file):
skipped_files.append(wav_file)
continue
text = cols[1].strip()
items.append([text, wav_file, speaker_name])
print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
return items
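
Illustrative sketch, not from this commit: because load_meta_data dispatches through get_preprocessor_by_name (a getattr lookup on this module), supporting a new dataset only requires another formatter that returns [text, wav_path, speaker_name] items. The function name, column order and folder layout below are hypothetical.

def my_dataset(root_path, meta_file):
    """Hypothetical formatter: one 'file_id|transcript' pair per line."""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "my_dataset"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            cols = line.strip().split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items

Setting "name": "my_dataset" in a dataset config entry would then resolve to this function via get_preprocessor_by_name.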

View File

@@ -1,178 +0,0 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from TTS.utils.generic_utils import create_experiment_folder
class DistributedSampler(Sampler):
"""
Non shuffling Distributed Sampler
"""
def __init__(self, dataset, num_replicas=None, rank=None):
super(DistributedSampler, self).__init__(dataset)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
def reduce_tensor(tensor, num_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= num_gpus
return rt
def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
# Set cuda device so everything is done on the right GPU.
torch.cuda.set_device(rank % torch.cuda.device_count())
# Initialize distributed communication
dist.init_process_group(
dist_backend,
init_method=dist_url,
world_size=num_gpus,
rank=rank,
group_name=group_name)
def apply_gradient_allreduce(module):
# sync model parameters
for p in module.state_dict().values():
if not torch.is_tensor(p):
continue
dist.broadcast(p, 0)
def allreduce_params():
if module.needs_reduction:
module.needs_reduction = False
# bucketing params based on value types
buckets = {}
for param in module.parameters():
if param.requires_grad and param.grad is not None:
tp = type(param.data)
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced, op=dist.reduce_op.SUM)
coalesced /= dist.get_world_size()
for buf, synced in zip(
grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(module.parameters()):
def allreduce_hook(*_):
Variable._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def set_needs_reduction(self, *_):
self.needs_reduction = True
module.register_forward_hook(set_needs_reduction)
return module
def main():
"""
Call train.py as a new process and pass command arguments
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder to continue a previous training run. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
args = parser.parse_args()
# OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
# True)
# stdout_path = os.path.join(OUT_PATH, "process_stdout/")
num_gpus = torch.cuda.device_count()
group_id = time.strftime("%Y_%m_%d-%H%M%S")
# set arguments for train.py
command = ['train.py']
command.append('--continue_path={}'.format(args.continue_path))
command.append('--restore_path={}'.format(args.restore_path))
command.append('--config_path={}'.format(args.config_path))
command.append('--group_id=group_{}'.format(group_id))
command.append('')
# run processes
processes = []
for i in range(num_gpus):
my_env = os.environ.copy()
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = '--rank={}'.format(i)
stdout = None if i == 0 else open(os.devnull, 'w')
p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
processes.append(p)
print(command)
for p in processes:
p.wait()
if __name__ == '__main__':
main()
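
Illustrative sketch, not from this commit: reduce_tensor is typically used inside the per-step training loop to average the scalar loss across workers for logging, assuming init_distributed has already been called on each rank and loss is the current step's loss tensor.

num_gpus = torch.cuda.device_count()
if num_gpus > 1:
    # average the scalar loss over all workers so every rank logs the same value
    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
else:
    reduced_loss = loss.item()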

View File

@@ -1,389 +0,0 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
class Linear(nn.Module):
def __init__(self,
in_features,
out_features,
bias=True,
init_gain='linear'):
super(Linear, self).__init__()
self.linear_layer = torch.nn.Linear(
in_features, out_features, bias=bias)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
return self.linear_layer(x)
class LinearBN(nn.Module):
def __init__(self,
in_features,
out_features,
bias=True,
init_gain='linear'):
super(LinearBN, self).__init__()
self.linear_layer = torch.nn.Linear(
in_features, out_features, bias=bias)
self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
out = self.linear_layer(x)
if len(out.shape) == 3:
out = out.permute(1, 2, 0)
out = self.batch_normalization(out)
if len(out.shape) == 3:
out = out.permute(2, 0, 1)
return out
class Prenet(nn.Module):
def __init__(self,
in_features,
prenet_type="original",
prenet_dropout=True,
out_features=[256, 256],
bias=True):
super(Prenet, self).__init__()
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
in_features = [in_features] + out_features[:-1]
if prenet_type == "bn":
self.linear_layers = nn.ModuleList([
LinearBN(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
elif prenet_type == "original":
self.linear_layers = nn.ModuleList([
Linear(in_size, out_size, bias=bias)
for (in_size, out_size) in zip(in_features, out_features)
])
def forward(self, x):
for linear in self.linear_layers:
if self.prenet_dropout:
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
else:
x = F.relu(linear(x))
return x
####################
# ATTENTION MODULES
####################
class LocationLayer(nn.Module):
def __init__(self,
attention_dim,
attention_n_filters=32,
attention_kernel_size=31):
super(LocationLayer, self).__init__()
self.location_conv1d = nn.Conv1d(
in_channels=2,
out_channels=attention_n_filters,
kernel_size=attention_kernel_size,
stride=1,
padding=(attention_kernel_size - 1) // 2,
bias=False)
self.location_dense = Linear(
attention_n_filters, attention_dim, bias=False, init_gain='tanh')
def forward(self, attention_cat):
processed_attention = self.location_conv1d(attention_cat)
processed_attention = self.location_dense(
processed_attention.transpose(1, 2))
return processed_attention
class GravesAttention(nn.Module):
""" Discretized Graves attention:
- https://arxiv.org/abs/1910.10288
- https://arxiv.org/pdf/1906.01083.pdf
"""
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super(GravesAttention, self).__init__()
self._mask_value = 1e-8
self.K = K
# self.attention_alignment = 0.05
self.eps = 1e-5
self.J = None
self.N_a = nn.Sequential(
nn.Linear(query_dim, query_dim, bias=True),
nn.ReLU(),
nn.Linear(query_dim, 3*K, bias=True))
self.attention_weights = None
self.mu_prev = None
self.init_layers()
def init_layers(self):
torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) # bias mean
torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std
def init_states(self, inputs):
if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)
# pylint: disable=R0201
# pylint: disable=unused-argument
def preprocess_inputs(self, inputs):
return None
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attention_rnn
inputs: B x T_in x D_encoder
processed_inputs: place_holder
mask: B x T_in
"""
gbk_t = self.N_a(query)
gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)
# attention model parameters
# each B x K
g_t = gbk_t[:, 0, :]
b_t = gbk_t[:, 1, :]
k_t = gbk_t[:, 2, :]
# dropout to decorrelate attention heads
g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
# attention GMM parameters
sig_t = torch.nn.functional.softplus(b_t) + self.eps
mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
g_t = torch.softmax(g_t, dim=-1) + self.eps
j = self.J[:inputs.size(1)+1]
# attention weights
phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
# discretize attention weights
alpha_t = torch.sum(phi_t, 1)
alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
alpha_t[alpha_t == 0] = 1e-8
# apply masking
if mask is not None:
alpha_t.data.masked_fill_(~mask, self._mask_value)
context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
self.attention_weights = alpha_t
self.mu_prev = mu_t
return context
class OriginalAttention(nn.Module):
"""Following the methods proposed here:
- https://arxiv.org/abs/1712.05884
- https://arxiv.org/abs/1807.06736 + state masking at inference
- Using sigmoid instead of softmax normalization
- Attention windowing at inference time
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask):
super(OriginalAttention, self).__init__()
self.query_layer = Linear(
query_dim, attention_dim, bias=False, init_gain='tanh')
self.inputs_layer = Linear(
embedding_dim, attention_dim, bias=False, init_gain='tanh')
self.v = Linear(attention_dim, 1, bias=True)
if trans_agent:
self.ta = nn.Linear(
query_dim + embedding_dim, 1, bias=True)
if location_attention:
self.location_layer = LocationLayer(
attention_dim,
attention_location_n_filters,
attention_location_kernel_size,
)
self._mask_value = -float("inf")
self.windowing = windowing
self.win_idx = None
self.norm = norm
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attention = location_attention
def init_win_idx(self):
self.win_idx = -1
self.win_back = 2
self.win_front = 6
def init_forward_attn(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.alpha = torch.cat(
[torch.ones([B, 1]),
torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
def init_location_attention(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_())
def init_states(self, inputs):
B = inputs.shape[0]
T = inputs.shape[1]
self.attention_weights = Variable(inputs.data.new(B, T).zero_())
if self.location_attention:
self.init_location_attention(inputs)
if self.forward_attn:
self.init_forward_attn(inputs)
if self.windowing:
self.init_win_idx()
def preprocess_inputs(self, inputs):
return self.inputs_layer(inputs)
def update_location_attention(self, alignments):
self.attention_weights_cum += alignments
def get_location_attention(self, query, processed_inputs):
attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)),
dim=1)
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_cat)
energies = self.v(
torch.tanh(processed_query + processed_attention_weights +
processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def get_attention(self, query, processed_inputs):
processed_query = self.query_layer(query.unsqueeze(1))
energies = self.v(torch.tanh(processed_query + processed_inputs))
energies = energies.squeeze(-1)
return energies, processed_query
def apply_windowing(self, attention, inputs):
back_win = self.win_idx - self.win_back
front_win = self.win_idx + self.win_front
if back_win > 0:
attention[:, :back_win] = -float("inf")
if front_win < inputs.shape[1]:
attention[:, front_win:] = -float("inf")
# trick for the first step (win_idx == -1): bias the attention
# toward the first input position; it does not hurt otherwise.
if self.win_idx == -1:
attention[:, 0] = attention.max()
# Update the window
self.win_idx = torch.argmax(attention, 1).long()[0].item()
return attention
def apply_forward_attention(self, alignment):
# forward attention
fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
(1, 0, 0, 0))
# compute transition potentials
alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha
+ 1e-8) * alignment
# force incremental alignment
if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1)
val, n2 = alpha.max(1)
for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0
alpha[b, :(
n[b] - 1
)] = 0 # ignore all previous states to prevent repetition.
alpha[b,
(n[b] - 2
)] = 0.01 * val[b] # smoothing factor for the prev step
# renormalize attention weights
alpha = alpha / alpha.sum(dim=1, keepdim=True)
return alpha
def forward(self, query, inputs, processed_inputs, mask):
"""
shapes:
query: B x D_attn_rnn
inputs: B x T_en x D_en
processed_inputs:: B x T_en x D_attn
mask: B x T_en
"""
if self.location_attention:
attention, _ = self.get_location_attention(
query, processed_inputs)
else:
attention, _ = self.get_attention(
query, processed_inputs)
# apply masking
if mask is not None:
attention.data.masked_fill_(~mask, self._mask_value)
# apply windowing - only in eval mode
if not self.training and self.windowing:
attention = self.apply_windowing(attention, inputs)
# normalize attention values
if self.norm == "softmax":
alignment = torch.softmax(attention, dim=-1)
elif self.norm == "sigmoid":
alignment = torch.sigmoid(attention) / torch.sigmoid(
attention).sum(
dim=1, keepdim=True)
else:
raise ValueError("Unknown value for attention norm type")
if self.location_attention:
self.update_location_attention(alignment)
# apply forward attention if enabled
if self.forward_attn:
alignment = self.apply_forward_attention(alignment)
self.alpha = alignment
context = torch.bmm(alignment.unsqueeze(1), inputs)
context = context.squeeze(1)
self.attention_weights = alignment
# compute transition agent
if self.forward_attn and self.trans_agent:
ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
self.u = torch.sigmoid(self.ta(ta_input))
return context
def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
location_attention, attention_location_n_filters,
attention_location_kernel_size, windowing, norm, forward_attn,
trans_agent, forward_attn_mask, attn_K):
if attn_type == "original":
return OriginalAttention(query_dim, embedding_dim, attention_dim,
location_attention,
attention_location_n_filters,
attention_location_kernel_size, windowing,
norm, forward_attn, trans_agent,
forward_attn_mask)
if attn_type == "graves":
return GravesAttention(query_dim, attn_K)
raise RuntimeError(
f" [!] Given attention type '{attn_type}' does not exist.")

View File

@@ -1,169 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class GST(nn.Module):
"""Global Style Token Module for factorizing prosody in speech.
See https://arxiv.org/pdf/1803.09017"""
def __init__(self, num_mel, num_heads, num_style_tokens, embedding_dim):
super().__init__()
self.encoder = ReferenceEncoder(num_mel, embedding_dim)
self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens,
embedding_dim)
def forward(self, inputs):
enc_out = self.encoder(inputs)
style_embed = self.style_token_layer(enc_out)
return style_embed
class ReferenceEncoder(nn.Module):
"""NN module creating a fixed size prosody embedding from a spectrogram.
inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
outputs: [batch_size, embedding_dim]
"""
def __init__(self, num_mel, embedding_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
num_layers = len(filters) - 1
convs = [
nn.Conv2d(
in_channels=filters[i],
out_channels=filters[i + 1],
kernel_size=(3, 3),
stride=(2, 2),
padding=(1, 1)) for i in range(num_layers)
]
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList([
nn.BatchNorm2d(num_features=filter_size)
for filter_size in filters[1:]
])
post_conv_height = self.calculate_post_conv_height(
num_mel, 3, 2, 1, num_layers)
self.recurrence = nn.GRU(
input_size=filters[-1] * post_conv_height,
hidden_size=embedding_dim // 2,
batch_first=True)
def forward(self, inputs):
batch_size = inputs.size(0)
x = inputs.view(batch_size, 1, -1, self.num_mel)
# x: 4D tensor [batch_size, num_channels==1, num_frames, num_mel]
for conv, bn in zip(self.convs, self.bns):
x = conv(x)
x = bn(x)
x = F.relu(x)
x = x.transpose(1, 2)
# x: 4D tensor [batch_size, post_conv_width,
# num_channels==128, post_conv_height]
post_conv_width = x.size(1)
x = x.contiguous().view(batch_size, post_conv_width, -1)
# x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height]
self.recurrence.flatten_parameters()
memory, out = self.recurrence(x)
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
return out.squeeze(0)
@staticmethod
def calculate_post_conv_height(height, kernel_size, stride, pad,
n_convs):
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
for _ in range(n_convs):
height = (height - kernel_size + 2 * pad) // stride + 1
return height
class StyleTokenLayer(nn.Module):
"""NN Module attending to style tokens based on prosody encodings."""
def __init__(self, num_heads, num_style_tokens,
embedding_dim):
super().__init__()
self.query_dim = embedding_dim // 2
self.key_dim = embedding_dim // num_heads
self.style_tokens = nn.Parameter(
torch.FloatTensor(num_style_tokens, self.key_dim))
nn.init.orthogonal_(self.style_tokens)
self.attention = MultiHeadAttention(
query_dim=self.query_dim,
key_dim=self.key_dim,
num_units=embedding_dim,
num_heads=num_heads)
def forward(self, inputs):
batch_size = inputs.size(0)
prosody_encoding = inputs.unsqueeze(1)
# prosody_encoding: 3D tensor [batch_size, 1, encoding_size==128]
tokens = torch.tanh(self.style_tokens) \
.unsqueeze(0) \
.expand(batch_size, -1, -1)
# tokens: 3D tensor [batch_size, num tokens, token embedding size]
style_embed = self.attention(prosody_encoding, tokens)
return style_embed
class MultiHeadAttention(nn.Module):
'''
input:
query --- [N, T_q, query_dim]
key --- [N, T_k, key_dim]
output:
out --- [N, T_q, num_units]
'''
def __init__(self, query_dim, key_dim, num_units, num_heads):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(
in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(
in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(
in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query, key):
queries = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key) # [N, T_k, num_units]
values = self.W_value(key)
split_size = self.num_units // self.num_heads
queries = torch.stack(
torch.split(queries, split_size, dim=2),
dim=0) # [h, N, T_q, num_units/h]
keys = torch.stack(
torch.split(keys, split_size, dim=2),
dim=0) # [h, N, T_k, num_units/h]
values = torch.stack(
torch.split(values, split_size, dim=2),
dim=0) # [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores = scores / (self.key_dim**0.5)
scores = F.softmax(scores, dim=3)
# out = score * V
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
out = torch.cat(
torch.split(out, 1, dim=0),
dim=3).squeeze(0) # [N, T_q, num_units]
return out
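
Illustrative sketch, not from this commit: a quick shape check of the GST module with random mel frames; the batch and frame counts are arbitrary.

import torch

gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=256)
mels = torch.rand(8, 120, 80)   # batch_size x num_spec_frames x num_mel
style_embed = gst(mels)         # [8, 1, 256]: one style embedding per utterance
print(style_embed.shape)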

View File

@@ -1,246 +0,0 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional
from TTS.utils.generic_utils import sequence_mask
class L1LossMasked(nn.Module):
def __init__(self, seq_len_norm):
super(L1LossMasked, self).__init__()
self.seq_len_norm = seq_len_norm
def forward(self, x, target, length):
"""
Args:
x: A Variable containing a FloatTensor of size
(batch, max_len, dim) which contains the
unnormalized probability for each class.
target: A Variable containing a LongTensor of size
(batch, max_len, dim) which contains the index of the true
class for each corresponding step.
length: A Variable containing a LongTensor of size (batch,)
which contains the length of each data in a batch.
Returns:
loss: An average loss value in range [0, 1] masked by the length.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
if self.seq_len_norm:
norm_w = mask / mask.sum(dim=1, keepdim=True)
out_weights = norm_w.div(target.shape[0] * target.shape[2])
mask = mask.expand_as(x)
loss = functional.l1_loss(
x * mask, target * mask, reduction='none')
loss = loss.mul(out_weights.to(loss.device)).sum()
else:
mask = mask.expand_as(x)
loss = functional.l1_loss(
x * mask, target * mask, reduction='sum')
loss = loss / mask.sum()
return loss
class MSELossMasked(nn.Module):
def __init__(self, seq_len_norm):
super(MSELossMasked, self).__init__()
self.seq_len_norm = seq_len_norm
def forward(self, x, target, length):
"""
Args:
x: A Variable containing a FloatTensor of size
(batch, max_len, dim) which contains the
unnormalized probability for each class.
target: A Variable containing a LongTensor of size
(batch, max_len, dim) which contains the index of the true
class for each corresponding step.
length: A Variable containing a LongTensor of size (batch,)
which contains the length of each data in a batch.
Returns:
loss: An average loss value in range [0, 1] masked by the length.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(
sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
if self.seq_len_norm:
norm_w = mask / mask.sum(dim=1, keepdim=True)
out_weights = norm_w.div(target.shape[0] * target.shape[2])
mask = mask.expand_as(x)
loss = functional.mse_loss(
x * mask, target * mask, reduction='none')
loss = loss.mul(out_weights.to(loss.device)).sum()
else:
mask = mask.expand_as(x)
loss = functional.mse_loss(
x * mask, target * mask, reduction='sum')
loss = loss / mask.sum()
return loss
class AttentionEntropyLoss(nn.Module):
# pylint: disable=R0201
def forward(self, align):
"""
Forces attention to be more decisive by penalizing
soft attention weights
TODO: arguments
TODO: unit_test
"""
entropy = torch.distributions.Categorical(probs=align).entropy()
loss = (entropy / np.log(align.shape[1])).mean()
return loss
class BCELossMasked(nn.Module):
def __init__(self, pos_weight):
super(BCELossMasked, self).__init__()
self.pos_weight = pos_weight
def forward(self, x, target, length):
"""
Args:
x: A Variable containing a FloatTensor of size
(batch, max_len) which contains the
unnormalized probability for each class.
target: A Variable containing a LongTensor of size
(batch, max_len) which contains the index of the true
class for each corresponding step.
length: A Variable containing a LongTensor of size (batch,)
which contains the length of each data in a batch.
Returns:
loss: An average loss value in range [0, 1] masked by the length.
"""
# mask: (batch, max_len, 1)
target.requires_grad = False
mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float()
loss = functional.binary_cross_entropy_with_logits(
x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum')
loss = loss / mask.sum()
return loss
class GuidedAttentionLoss(torch.nn.Module):
def __init__(self, sigma=0.4):
super(GuidedAttentionLoss, self).__init__()
self.sigma = sigma
def _make_ga_masks(self, ilens, olens):
B = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
ga_masks = torch.zeros((B, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma)
return ga_masks
def forward(self, att_ws, ilens, olens):
ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device)
seq_masks = self._make_masks(ilens, olens).to(att_ws.device)
losses = ga_masks * att_ws
loss = torch.mean(losses.masked_select(seq_masks))
return loss
@staticmethod
def _make_ga_mask(ilen, olen, sigma):
grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
grid_x, grid_y = grid_x.float(), grid_y.float()
return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))
@staticmethod
def _make_masks(ilens, olens):
in_masks = sequence_mask(ilens)
out_masks = sequence_mask(olens)
return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
class TacotronLoss(torch.nn.Module):
def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
super(TacotronLoss, self).__init__()
self.stopnet_pos_weight = stopnet_pos_weight
self.ga_alpha = c.ga_alpha
self.config = c
# postnet decoder loss
if c.loss_masking:
self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
"Tacotron"
] else MSELossMasked(c.seq_len_norm)
else:
self.criterion = nn.L1Loss() if c.model in ["Tacotron"
] else nn.MSELoss()
# guided attention loss
if c.ga_alpha > 0:
self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
# stopnet loss
# pylint: disable=not-callable
self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None
def forward(self, postnet_output, decoder_output, mel_input, linear_input,
stopnet_output, stopnet_target, output_lens, decoder_b_output,
alignments, alignment_lens, alignments_backwards, input_lens):
return_dict = {}
# decoder and postnet losses
if self.config.loss_masking:
decoder_loss = self.criterion(decoder_output, mel_input,
output_lens)
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input,
output_lens)
else:
postnet_loss = self.criterion(postnet_output, mel_input,
output_lens)
else:
decoder_loss = self.criterion(decoder_output, mel_input)
if self.config.model in ["Tacotron", "TacotronGST"]:
postnet_loss = self.criterion(postnet_output, linear_input)
else:
postnet_loss = self.criterion(postnet_output, mel_input)
loss = decoder_loss + postnet_loss
return_dict['decoder_loss'] = decoder_loss
return_dict['postnet_loss'] = postnet_loss
# stopnet loss
stop_loss = self.criterion_st(
stopnet_output, stopnet_target,
output_lens) if self.config.stopnet else torch.zeros(1)
if not self.config.separate_stopnet and self.config.stopnet:
loss += stop_loss
return_dict['stopnet_loss'] = stop_loss
# backward decoder loss (if enabled)
if self.config.bidirectional_decoder:
if self.config.loss_masking:
decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens)
else:
decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input)
decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output)
loss += decoder_b_loss + decoder_c_loss
return_dict['decoder_b_loss'] = decoder_b_loss
return_dict['decoder_c_loss'] = decoder_c_loss
# double decoder consistency loss (if enabled)
if self.config.double_decoder_consistency:
decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens)
# decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output)
attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards)
loss += decoder_b_loss + attention_c_loss
return_dict['decoder_coarse_loss'] = decoder_b_loss
return_dict['decoder_ddc_loss'] = attention_c_loss
# guided attention loss (if enabled)
if self.config.ga_alpha > 0:
ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
loss += ga_loss * self.ga_alpha
return_dict['ga_loss'] = ga_loss * self.ga_alpha
return_dict['loss'] = loss
return return_dict
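
Illustrative sketch, not from this commit: the masked L1 criterion on a padded batch, assuming the TTS package (and its sequence_mask helper) is importable; shapes are assumptions. Frames beyond each sequence length do not contribute to the loss.

import torch

criterion = L1LossMasked(seq_len_norm=False)
x = torch.rand(4, 100, 80)            # B x max_len x num_mel (predictions)
target = torch.rand(4, 100, 80)       # padded ground-truth frames
lengths = torch.tensor([100, 90, 75, 60])
loss = criterion(x, target, lengths)  # averaged only over unmasked positions
print(loss.item())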

View File

@@ -1,496 +0,0 @@
# coding: utf-8
import torch
from torch import nn
from .common_layers import Prenet, init_attn, Linear
class BatchNormConv1d(nn.Module):
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
function between Conv and BatchNorm layers. BatchNorm layer
is initialized with the TF default values for momentum and eps.
Args:
in_channels: size of each input sample
out_channels: size of each output samples
kernel_size: kernel size of conv filters
stride: stride of conv filters
padding: padding of conv filters
activation: activation function applied after the BatchNorm layer
Shapes:
- input: batch x dims
- output: batch x dims
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
activation=None):
super(BatchNormConv1d, self).__init__()
self.padding = padding
self.padder = nn.ConstantPad1d(padding, 0)
self.conv1d = nn.Conv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=0,
bias=False)
# Following tensorflow's default parameters
self.bn = nn.BatchNorm1d(out_channels, momentum=0.99, eps=1e-3)
self.activation = activation
# self.init_layers()
def init_layers(self):
if type(self.activation) == torch.nn.ReLU:
w_gain = 'relu'
elif type(self.activation) == torch.nn.Tanh:
w_gain = 'tanh'
elif self.activation is None:
w_gain = 'linear'
else:
raise RuntimeError('Unknown activation function')
torch.nn.init.xavier_uniform_(
self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_gain))
def forward(self, x):
x = self.padder(x)
x = self.conv1d(x)
x = self.bn(x)
if self.activation is not None:
x = self.activation(x)
return x
class Highway(nn.Module):
# TODO: Try GLU layer
def __init__(self, in_size, out_size):
super(Highway, self).__init__()
self.H = nn.Linear(in_size, out_size)
self.H.bias.data.zero_()
self.T = nn.Linear(in_size, out_size)
self.T.bias.data.fill_(-1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
# self.init_layers()
def init_layers(self):
torch.nn.init.xavier_uniform_(
self.H.weight, gain=torch.nn.init.calculate_gain('relu'))
torch.nn.init.xavier_uniform_(
self.T.weight, gain=torch.nn.init.calculate_gain('sigmoid'))
def forward(self, inputs):
H = self.relu(self.H(inputs))
T = self.sigmoid(self.T(inputs))
return H * T + inputs * (1.0 - T)
class CBHG(nn.Module):
"""CBHG module: a recurrent neural network composed of:
- 1-d convolution banks
- Highway networks + residual connections
- Bidirectional gated recurrent units
Args:
in_features (int): sample size
K (int): max filter size in conv bank
projections (list): conv channel sizes for conv projections
num_highways (int): number of highways layers
Shapes:
- input: B x D x T_in
- output: B x T_in x D*2
"""
def __init__(self,
in_features,
K=16,
conv_bank_features=128,
conv_projections=[128, 128],
highway_features=128,
gru_features=128,
num_highways=4):
super(CBHG, self).__init__()
self.in_features = in_features
self.conv_bank_features = conv_bank_features
self.highway_features = highway_features
self.gru_features = gru_features
self.conv_projections = conv_projections
self.relu = nn.ReLU()
# list of conv1d bank with filter size k=1...K
# TODO: try dilational layers instead
self.conv1d_banks = nn.ModuleList([
BatchNormConv1d(in_features,
conv_bank_features,
kernel_size=k,
stride=1,
padding=[(k - 1) // 2, k // 2],
activation=self.relu) for k in range(1, K + 1)
])
# max pooling of conv bank, with padding
# TODO: try average pooling OR larger kernel size
out_features = [K * conv_bank_features] + conv_projections[:-1]
activations = [self.relu] * (len(conv_projections) - 1)
activations += [None]
# setup conv1d projection layers
layer_set = []
for (in_size, out_size, ac) in zip(out_features, conv_projections,
activations):
layer = BatchNormConv1d(in_size,
out_size,
kernel_size=3,
stride=1,
padding=[1, 1],
activation=ac)
layer_set.append(layer)
self.conv1d_projections = nn.ModuleList(layer_set)
# setup Highway layers
if self.highway_features != conv_projections[-1]:
self.pre_highway = nn.Linear(conv_projections[-1],
highway_features,
bias=False)
self.highways = nn.ModuleList([
Highway(highway_features, highway_features)
for _ in range(num_highways)
])
# bi-directional GRU layer
self.gru = nn.GRU(gru_features,
gru_features,
1,
batch_first=True,
bidirectional=True)
def forward(self, inputs):
# (B, in_features, T_in)
x = inputs
# (B, hid_features*K, T_in)
# Concat conv1d bank outputs
outs = []
for conv1d in self.conv1d_banks:
out = conv1d(x)
outs.append(out)
x = torch.cat(outs, dim=1)
assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
for conv1d in self.conv1d_projections:
x = conv1d(x)
x += inputs
x = x.transpose(1, 2)
if self.highway_features != self.conv_projections[-1]:
x = self.pre_highway(x)
# Residual connection
# TODO: try residual scaling as in Deep Voice 3
# TODO: try plain residual layers
for highway in self.highways:
x = highway(x)
# (B, T_in, hid_features*2)
# TODO: replace GRU with convolution as in Deep Voice 3
self.gru.flatten_parameters()
outputs, _ = self.gru(x)
return outputs
class EncoderCBHG(nn.Module):
def __init__(self):
super(EncoderCBHG, self).__init__()
self.cbhg = CBHG(
128,
K=16,
conv_bank_features=128,
conv_projections=[128, 128],
highway_features=128,
gru_features=128,
num_highways=4)
def forward(self, x):
return self.cbhg(x)
class Encoder(nn.Module):
r"""Encapsulate Prenet and CBHG modules for encoder"""
def __init__(self, in_features):
super(Encoder, self).__init__()
self.prenet = Prenet(in_features, out_features=[256, 128])
self.cbhg = EncoderCBHG()
def forward(self, inputs):
r"""
Args:
inputs (FloatTensor): embedding features
Shapes:
- inputs: batch x time x in_features
- outputs: batch x time x 128*2
"""
# B x T x prenet_dim
outputs = self.prenet(inputs)
outputs = self.cbhg(outputs.transpose(1, 2))
return outputs
class PostCBHG(nn.Module):
def __init__(self, mel_dim):
super(PostCBHG, self).__init__()
self.cbhg = CBHG(
mel_dim,
K=8,
conv_bank_features=128,
conv_projections=[256, mel_dim],
highway_features=128,
gru_features=128,
num_highways=4)
def forward(self, x):
return self.cbhg(x)
class Decoder(nn.Module):
"""Decoder module.
Args:
in_features (int): input vector (encoder output) sample size.
memory_dim (int): memory vector (prev. time-step output) sample size.
r (int): number of outputs per time step.
memory_size (int): size of the past window. If <= 0, memory_size = r.
TODO: arguments
"""
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing,
attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, speaker_embedding_dim):
super(Decoder, self).__init__()
self.r_init = r
self.r = r
self.in_features = in_features
self.max_decoder_steps = 500
self.use_memory_queue = memory_size > 0
self.memory_size = memory_size if memory_size > 0 else r
self.memory_dim = memory_dim
self.separate_stopnet = separate_stopnet
self.query_dim = 256
# memory -> |Prenet| -> processed_memory
prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
self.prenet = Prenet(
prenet_dim,
prenet_type,
prenet_dropout,
out_features=[256, 128])
# processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
# attention_rnn generates queries for the attention mechanism
self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
self.attention = init_attn(attn_type=attn_type,
query_dim=self.query_dim,
embedding_dim=in_features,
attention_dim=128,
location_attention=location_attn,
attention_location_n_filters=32,
attention_location_kernel_size=31,
windowing=attn_windowing,
norm=attn_norm,
forward_attn=forward_attn,
trans_agent=trans_agent,
forward_attn_mask=forward_attn_mask,
attn_K=attn_K)
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
# decoder_RNN_input -> |RNN| -> RNN_state
self.decoder_rnns = nn.ModuleList(
[nn.GRUCell(256, 256) for _ in range(2)])
# RNN_state -> |Linear| -> mel_spec
self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
# learn init values instead of zero init.
self.stopnet = StopNet(256 + memory_dim * self.r_init)
def set_r(self, new_r):
self.r = new_r
def _reshape_memory(self, memory):
"""
Reshape the spectrograms for given 'r'
"""
# Grouping multiple frames if necessary
if memory.size(-1) == self.memory_dim:
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
# Time first (T_decoder, B, memory_dim)
memory = memory.transpose(0, 1)
return memory
def _init_states(self, inputs):
"""
Initialization of decoder states
"""
B = inputs.size(0)
T = inputs.size(1)
# go frame as zeros matrix
if self.use_memory_queue:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
else:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
# decoder states
self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
self.decoder_rnn_hiddens = [
torch.zeros(1, device=inputs.device).repeat(B, 256)
for idx in range(len(self.decoder_rnns))
]
self.context_vec = inputs.data.new(B, self.in_features).zero_()
# cache attention inputs
self.processed_inputs = self.attention.preprocess_inputs(inputs)
def _parse_outputs(self, outputs, attentions, stop_tokens):
# Back to batch first
attentions = torch.stack(attentions).transpose(0, 1)
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
outputs = outputs.view(
outputs.size(0), -1, self.memory_dim)
outputs = outputs.transpose(1, 2)
return outputs, attentions, stop_tokens
def decode(self, inputs, mask=None):
# Prenet
processed_memory = self.prenet(self.memory_input)
# Attention RNN
self.attention_rnn_hidden = self.attention_rnn(
torch.cat((processed_memory, self.context_vec), -1),
self.attention_rnn_hidden)
self.context_vec = self.attention(
self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
# Concat RNN output and attention context vector
decoder_input = self.project_to_decoder_in(
torch.cat((self.attention_rnn_hidden, self.context_vec), -1))
# Pass through the decoder RNNs
for idx in range(len(self.decoder_rnns)):
self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
decoder_input, self.decoder_rnn_hiddens[idx])
# Residual connection
decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
decoder_output = decoder_input
# predict mel vectors from decoder vectors
output = self.proj_to_mel(decoder_output)
# output = torch.sigmoid(output)
# predict stop token
stopnet_input = torch.cat([decoder_output, output], -1)
if self.separate_stopnet:
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
output = output[:, : self.r * self.memory_dim]
return output, stop_token, self.attention.attention_weights
def _update_memory_input(self, new_memory):
if self.use_memory_queue:
if self.memory_size > self.r:
# memory queue size is larger than number of frames per decoder iter
self.memory_input = torch.cat([
new_memory, self.memory_input[:, :(
self.memory_size - self.r) * self.memory_dim].clone()
], dim=-1)
else:
# memory queue size smaller than number of frames per decoder iter
self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
else:
# use only the last frame prediction
# assert new_memory.shape[-1] == self.r * self.memory_dim
self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
def forward(self, inputs, memory, mask, speaker_embeddings=None):
"""
Args:
inputs: Encoder outputs.
memory: Decoder memory used for teacher forcing (autoregression). If None
(at eval-time), the decoder uses its own last output as the next input.
mask: Attention mask for sequence padding.
Shapes:
- inputs: batch x time x encoder_out_dim
- memory: batch x #mel_specs x mel_spec_dim
"""
# Run greedy decoding if memory is None
memory = self._reshape_memory(memory)
outputs = []
attentions = []
stop_tokens = []
t = 0
self._init_states(inputs)
self.attention.init_states(inputs)
while len(outputs) < memory.size(0):
if t > 0:
new_memory = memory[t - 1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, mask)
outputs += [output]
attentions += [attention]
stop_tokens += [stop_token.squeeze(1)]
t += 1
return self._parse_outputs(outputs, attentions, stop_tokens)
def inference(self, inputs, speaker_embeddings=None):
"""
Args:
inputs: encoder outputs.
speaker_embeddings: speaker vectors.
Shapes:
- inputs: batch x time x encoder_out_dim
- speaker_embeddings: batch x embed_dim
"""
outputs = []
attentions = []
stop_tokens = []
t = 0
self._init_states(inputs)
self.attention.init_win_idx()
self.attention.init_states(inputs)
while True:
if t > 0:
new_memory = outputs[-1]
self._update_memory_input(new_memory)
if speaker_embeddings is not None:
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
output, stop_token, attention = self.decode(inputs, None)
stop_token = torch.sigmoid(stop_token.data)
outputs += [output]
attentions += [attention]
stop_tokens += [stop_token]
t += 1
if t > inputs.shape[1] / 4 and (stop_token > 0.6
or attention[:, -1].item() > 0.6):
break
elif t > self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
return self._parse_outputs(outputs, attentions, stop_tokens)
class StopNet(nn.Module):
r"""
Args:
in_features (int): feature dimension of input.
"""
def __init__(self, in_features):
super(StopNet, self).__init__()
self.dropout = nn.Dropout(0.1)
self.linear = nn.Linear(in_features, 1)
torch.nn.init.xavier_uniform_(
self.linear.weight, gain=torch.nn.init.calculate_gain('linear'))
def forward(self, inputs):
outputs = self.dropout(inputs)
outputs = self.linear(outputs)
return outputs

View File

@ -1,353 +0,0 @@
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from .common_layers import init_attn, Prenet, Linear
class ConvBNBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, activation=None):
super(ConvBNBlock, self).__init__()
assert (kernel_size - 1) % 2 == 0
padding = (kernel_size - 1) // 2
self.convolution1d = nn.Conv1d(in_channels,
out_channels,
kernel_size,
padding=padding)
self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
self.dropout = nn.Dropout(p=0.5)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()
else:
self.activation = nn.Identity()
def forward(self, x):
o = self.convolution1d(x)
o = self.batch_normalization(o)
o = self.activation(o)
o = self.dropout(o)
return o
class Postnet(nn.Module):
def __init__(self, output_dim, num_convs=5):
super(Postnet, self).__init__()
self.convolutions = nn.ModuleList()
self.convolutions.append(
ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
for _ in range(1, num_convs - 1):
self.convolutions.append(
ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
self.convolutions.append(
ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
def forward(self, x):
o = x
for layer in self.convolutions:
o = layer(o)
return o
class Encoder(nn.Module):
def __init__(self, output_input_dim=512):
super(Encoder, self).__init__()
self.convolutions = nn.ModuleList()
for _ in range(3):
self.convolutions.append(
ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
self.lstm = nn.LSTM(output_input_dim,
int(output_input_dim / 2),
num_layers=1,
batch_first=True,
bias=True,
bidirectional=True)
self.rnn_state = None
def forward(self, x, input_lengths):
o = x
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
o = nn.utils.rnn.pack_padded_sequence(o,
input_lengths,
batch_first=True)
self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
return o
def inference(self, x):
o = x
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
# self.lstm.flatten_parameters()
o, _ = self.lstm(o)
return o
# adapted from https://github.com/NVIDIA/tacotron2/
class Decoder(nn.Module):
# Pylint gets confused by PyTorch conventions here
#pylint: disable=attribute-defined-outside-init
def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn, trans_agent,
forward_attn_mask, location_attn, attn_K, separate_stopnet,
speaker_embedding_dim):
super(Decoder, self).__init__()
self.frame_dim = frame_dim
self.r_init = r
self.r = r
self.encoder_embedding_dim = input_dim
self.separate_stopnet = separate_stopnet
self.max_decoder_steps = 1000
self.gate_threshold = 0.5
# model dimensions
self.query_dim = 1024
self.decoder_rnn_dim = 1024
self.prenet_dim = 256
self.attn_dim = 128
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
# memory -> |Prenet| -> processed_memory
prenet_dim = self.frame_dim
self.prenet = Prenet(prenet_dim,
prenet_type,
prenet_dropout,
out_features=[self.prenet_dim, self.prenet_dim],
bias=False)
self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
self.query_dim,
bias=True)
self.attention = init_attn(attn_type=attn_type,
query_dim=self.query_dim,
embedding_dim=input_dim,
attention_dim=128,
location_attention=location_attn,
attention_location_n_filters=32,
attention_location_kernel_size=31,
windowing=attn_win,
norm=attn_norm,
forward_attn=forward_attn,
trans_agent=trans_agent,
forward_attn_mask=forward_attn_mask,
attn_K=attn_K)
self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
self.decoder_rnn_dim,
bias=True)
self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
self.frame_dim * self.r_init)
self.stopnet = nn.Sequential(
nn.Dropout(0.1),
Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
1,
bias=True,
init_gain='sigmoid'))
self.memory_truncated = None
def set_r(self, new_r):
self.r = new_r
def get_go_frame(self, inputs):
B = inputs.size(0)
memory = torch.zeros(1, device=inputs.device).repeat(B,
self.frame_dim * self.r)
return memory
def _init_states(self, inputs, mask, keep_states=False):
B = inputs.size(0)
# T = inputs.size(1)
if not keep_states:
self.query = torch.zeros(1, device=inputs.device).repeat(
B, self.query_dim)
self.attention_rnn_cell_state = torch.zeros(
1, device=inputs.device).repeat(B, self.query_dim)
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
B, self.decoder_rnn_dim)
self.context = torch.zeros(1, device=inputs.device).repeat(
B, self.encoder_embedding_dim)
self.inputs = inputs
self.processed_inputs = self.attention.preprocess_inputs(inputs)
self.mask = mask
def _reshape_memory(self, memory):
"""
Reshape the spectrogram frames for the given reduction factor 'r'.
"""
# Grouping multiple frames if necessary
if memory.size(-1) == self.frame_dim:
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
# Time first (T_decoder, B, frame_dim)
memory = memory.transpose(0, 1)
return memory
def _parse_outputs(self, outputs, stop_tokens, alignments):
alignments = torch.stack(alignments).transpose(0, 1)
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
outputs = outputs.transpose(1, 2)
return outputs, stop_tokens, alignments
def _update_memory(self, memory):
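# keep only the last frame of each r-frame group as the next prenet input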
if len(memory.shape) == 2:
return memory[:, self.frame_dim * (self.r - 1):]
return memory[:, :, self.frame_dim * (self.r - 1):]
def decode(self, memory):
'''
shapes:
- memory: B x r * self.frame_dim
'''
# self.context: B x D_en
# query_input: B x D_en + (r * self.frame_dim)
query_input = torch.cat((memory, self.context), -1)
# self.query and self.attention_rnn_cell_state : B x D_attn_rnn
self.query, self.attention_rnn_cell_state = self.attention_rnn(
query_input, (self.query, self.attention_rnn_cell_state))
self.query = F.dropout(self.query, self.p_attention_dropout,
self.training)
self.attention_rnn_cell_state = F.dropout(
self.attention_rnn_cell_state, self.p_attention_dropout,
self.training)
# B x D_en
self.context = self.attention(self.query, self.inputs,
self.processed_inputs, self.mask)
# B x (D_en + D_attn_rnn)
decoder_rnn_input = torch.cat((self.query, self.context), -1)
# self.decoder_hidden and self.decoder_cell: B x D_decoder_rnn
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
decoder_rnn_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(self.decoder_hidden,
self.p_decoder_dropout, self.training)
# B x (D_decoder_rnn + D_en)
decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
dim=1)
# B x (self.r * self.frame_dim)
decoder_output = self.linear_projection(decoder_hidden_context)
# B x (D_decoder_rnn + (self.r * self.frame_dim))
stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
if self.separate_stopnet:
stop_token = self.stopnet(stopnet_input.detach())
else:
stop_token = self.stopnet(stopnet_input)
# select outputs for the reduction rate self.r
decoder_output = decoder_output[:, :self.r * self.frame_dim]
return decoder_output, self.attention.attention_weights, stop_token
def forward(self, inputs, memories, mask, speaker_embeddings=None):
memory = self.get_go_frame(inputs).unsqueeze(0)
memories = self._reshape_memory(memories)
memories = torch.cat((memory, memories), dim=0)
memories = self._update_memory(memories)
if speaker_embeddings is not None:
memories = torch.cat([memories, speaker_embeddings], dim=-1)
memories = self.prenet(memories)
self._init_states(inputs, mask=mask)
self.attention.init_states(inputs)
outputs, stop_tokens, alignments = [], [], []
while len(outputs) < memories.size(0) - 1:
memory = memories[len(outputs)]
decoder_output, attention_weights, stop_token = self.decode(memory)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token.squeeze(1)]
alignments += [attention_weights]
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference(self, inputs, speaker_embeddings=None):
memory = self.get_go_frame(inputs)
memory = self._update_memory(memory)
self._init_states(inputs, mask=None)
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
while True:
memory = self.prenet(memory)
if speaker_embeddings is not None:
memory = torch.cat([memory, speaker_embeddings], dim=-1)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token]
alignments += [alignment]
if stop_token > 0.7 and t > inputs.shape[0] / 2:
break
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
memory = self._update_memory(decoder_output)
t += 1
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference_truncated(self, inputs):
"""
Preserve decoder states for continuous inference
"""
if self.memory_truncated is None:
self.memory_truncated = self.get_go_frame(inputs)
self._init_states(inputs, mask=None, keep_states=False)
else:
self._init_states(inputs, mask=None, keep_states=True)
self.attention.init_win_idx()
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
while True:
memory = self.prenet(self.memory_truncated)
decoder_output, alignment, stop_token = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
outputs += [decoder_output.squeeze(1)]
stop_tokens += [stop_token]
alignments += [alignment]
if stop_token > 0.7:
break
if len(outputs) == self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
self.memory_truncated = decoder_output
t += 1
outputs, stop_tokens, alignments = self._parse_outputs(
outputs, stop_tokens, alignments)
return outputs, alignments, stop_tokens
def inference_step(self, inputs, t, memory=None):
"""
For debug purposes
"""
if t == 0:
memory = self.get_go_frame(inputs)
self._init_states(inputs, mask=None)
memory = self.prenet(memory)
decoder_output, stop_token, alignment = self.decode(memory)
stop_token = torch.sigmoid(stop_token.data)
memory = decoder_output
return decoder_output, stop_token, alignment

View File

View File

@ -1,160 +0,0 @@
# coding: utf-8
import torch
from torch import nn
from TTS.layers.gst_layers import GST
from TTS.layers.tacotron import Decoder, Encoder, PostCBHG
from TTS.models.tacotron_abstract import TacotronAbstract
class Tacotron(TacotronAbstract):
def __init__(self,
num_chars,
num_speakers,
r=5,
postnet_output_dim=1025,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False,
memory_size=5):
super(Tacotron,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
decoder_output_dim, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 256
encoder_in_features = 512 if num_speakers > 1 else 256
speaker_embedding_dim = 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base model layers
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
memory_size, attn_type, attn_win, attn_norm,
prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
self.postnet = PostCBHG(decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
postnet_output_dim)
# speaker embedding layers
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.speaker_project_mel = nn.Sequential(
nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# global style token layers
if self.gst:
gst_embedding_dim = 256
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, decoder_output_dim, ddc_r, memory_size,
attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask, location_attn,
attn_K, separate_stopnet, proj_speaker_dim)
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
"""
Shapes:
- characters: B x T_in
- text_lengths: B
- mel_specs: B x T_out x D
- speaker_ids: B x 1
"""
self._init_states()
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x T_in x embed_dim
inputs = self.embedding(characters)
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
# B x T_in x encoder_in_features
encoder_outputs = self.encoder(inputs)
# sequence masking
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask,
self.speaker_embeddings_projected)
# sequence masking
if output_mask is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x T_out x decoder_in_features
postnet_outputs = self.postnet(decoder_outputs)
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs)
# B x T_out x posnet_dim
postnet_outputs = self.last_linear(postnet_outputs)
# B x T_out x decoder_in_features
decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, characters, speaker_ids=None, style_mel=None):
inputs = self.embedding(characters)
self._init_states()
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
inputs = self._concat_speaker_embedding(inputs,
self.speaker_embeddings)
encoder_outputs = self.encoder(inputs)
if self.gst and style_mel is not None:
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
if self.num_speakers > 1:
encoder_outputs = self._concat_speaker_embedding(
encoder_outputs, self.speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs, self.speaker_embeddings_projected)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = self.last_linear(postnet_outputs)
decoder_outputs = decoder_outputs.transpose(1, 2)
return decoder_outputs, postnet_outputs, alignments, stop_tokens

View File

@ -1,169 +0,0 @@
import torch
from torch import nn
from TTS.layers.gst_layers import GST
from TTS.layers.tacotron2 import Decoder, Encoder, Postnet
from TTS.models.tacotron_abstract import TacotronAbstract
# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
super(Tacotron2,
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
decoder_output_dim, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 512
encoder_in_features = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base layers
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(num_speakers, 512)
self.speaker_embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
attn_norm, prenet_type, prenet_dropout,
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet, proj_speaker_dim)
self.postnet = Postnet(self.postnet_output_dim)
# global style token layers
if self.gst:
gst_embedding_dim = encoder_in_features
self.gst_layer = GST(num_mel=80,
num_heads=4,
num_style_tokens=10,
embedding_dim=gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
trans_agent, forward_attn_mask, location_attn, attn_K,
separate_stopnet, proj_speaker_dim)
@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
mel_outputs = mel_outputs.transpose(1, 2)
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
self._init_states()
# compute mask for padding
# B x T_in_max (boolean)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x D_embed x T_in_max
embedded_inputs = self.embedding(text).transpose(1, 2)
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
# adding speaker embedding to encoder output
# TODO: multi-speaker
# B x speaker_embed_dim
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
# B x T_in x embed_dim + speaker_embed_dim
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
decoder_outputs, alignments, stop_tokens = self.decoder(
encoder_outputs, mel_specs, input_mask)
# sequence masking
if mel_lengths is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x mel_dim x T_out
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs)
# B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
decoder_outputs, postnet_outputs, alignments)
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
return decoder_outputs, postnet_outputs, alignments, stop_tokens
@torch.no_grad()
def inference(self, text, speaker_ids=None):
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
if speaker_ids is not None:
self.compute_speaker_embedding(speaker_ids)
if self.num_speakers > 1:
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
self.speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
decoder_outputs, postnet_outputs, alignments)
return decoder_outputs, postnet_outputs, alignments, stop_tokens
def inference_truncated(self, text, speaker_ids=None):
"""
Preserve model states for continuous inference
"""
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
speaker_ids)
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
encoder_outputs)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
mel_outputs, mel_outputs_postnet, alignments)
return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
def _speaker_embedding_pass(self, encoder_outputs, speaker_ids):
# TODO: multi-speaker
# if hasattr(self, "speaker_embedding") and speaker_ids is None:
# raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
# if hasattr(self, "speaker_embedding") and speaker_ids is not None:
# speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
# encoder_outputs.size(1),
# -1)
# encoder_outputs = encoder_outputs + speaker_embeddings
# return encoder_outputs
pass

View File

@ -1,180 +0,0 @@
import copy
from abc import ABC, abstractmethod
import torch
from torch import nn
from TTS.utils.generic_utils import sequence_mask
class TacotronAbstract(ABC, nn.Module):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=5,
separate_stopnet=True,
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
gst=False):
""" Abstract Tacotron class """
super().__init__()
self.num_chars = num_chars
self.r = r
self.decoder_output_dim = decoder_output_dim
self.postnet_output_dim = postnet_output_dim
self.gst = gst
self.num_speakers = num_speakers
self.bidirectional_decoder = bidirectional_decoder
self.double_decoder_consistency = double_decoder_consistency
self.ddc_r = ddc_r
self.attn_type = attn_type
self.attn_win = attn_win
self.attn_norm = attn_norm
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
self.forward_attn = forward_attn
self.trans_agent = trans_agent
self.forward_attn_mask = forward_attn_mask
self.location_attn = location_attn
self.attn_K = attn_K
self.separate_stopnet = separate_stopnet
# layers
self.embedding = None
self.encoder = None
self.decoder = None
self.postnet = None
# global style token
if self.gst:
self.gst_layer = None
# model states
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
# additional layers
self.decoder_backward = None
self.coarse_decoder = None
#############################
# INIT FUNCTIONS
#############################
def _init_states(self):
self.speaker_embeddings = None
self.speaker_embeddings_projected = None
def _init_backward_decoder(self):
self.decoder_backward = copy.deepcopy(self.decoder)
def _init_coarse_decoder(self):
self.coarse_decoder = copy.deepcopy(self.decoder)
self.coarse_decoder.r_init = self.ddc_r
self.coarse_decoder.set_r(self.ddc_r)
#############################
# CORE FUNCTIONS
#############################
@abstractmethod
def forward(self):
pass
@abstractmethod
def inference(self):
pass
#############################
# COMMON COMPUTE FUNCTIONS
#############################
def compute_masks(self, text_lengths, mel_lengths):
"""Compute masks against sequence paddings."""
# B x T_in_max (boolean)
device = text_lengths.device
input_mask = sequence_mask(text_lengths).to(device)
output_mask = None
if mel_lengths is not None:
max_len = mel_lengths.max()
r = self.decoder.r
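# pad the target length up to a multiple of r so the mask matches the decoder's frame groups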
max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device)
return input_mask, output_mask
def _backward_pass(self, mel_specs, encoder_outputs, mask):
""" Run backwards decoder """
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
self.speaker_embeddings_projected)
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
return decoder_outputs_b, alignments_b
def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments,
input_mask):
""" Double Decoder Consistency """
T = mel_specs.shape[1]
if T % self.coarse_decoder.r > 0:
padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
mel_specs = torch.nn.functional.pad(mel_specs,
(0, 0, 0, padding_size, 0, 0))
decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
encoder_outputs.detach(), mel_specs, input_mask)
# scale_factor = self.decoder.r_init / self.decoder.r
alignments_backward = torch.nn.functional.interpolate(
alignments_backward.transpose(1, 2),
size=alignments.shape[1],
mode='nearest').transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
return decoder_outputs_backward, alignments_backward
#############################
# EMBEDDING FUNCTIONS
#############################
def compute_speaker_embedding(self, speaker_ids):
""" Compute speaker embedding vectors """
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(
" [!] Model has speaker embedding layer but speaker_id is not provided"
)
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
self.speaker_embeddings_projected = self.speaker_project_mel(
self.speaker_embeddings).squeeze(1)
def compute_gst(self, inputs, mel_specs):
""" Compute global style token """
# pylint: disable=not-callable
gst_outputs = self.gst_layer(mel_specs)
inputs = self._add_speaker_embedding(inputs, gst_outputs)
return inputs
@staticmethod
def _add_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = outputs + speaker_embeddings_
return outputs
@staticmethod
def _concat_speaker_embedding(outputs, speaker_embeddings):
speaker_embeddings_ = speaker_embeddings.expand(
outputs.size(0), outputs.size(1), -1)
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
return outputs

View File

@ -16,9 +16,9 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.visual import plot_spectrogram\n",
"from TTS.utils.generic_utils import load_config\n",
"from TTS.tts.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.visual import plot_spectrogram\n",
"from TTS.tts.utils.generic_utils import load_config\n",
"import glob \n",
"import IPython.display as ipd"
]

View File

@ -22,12 +22,12 @@
"import numpy as np\n",
"from tqdm import tqdm as tqdm\n",
"from torch.utils.data import DataLoader\n",
"from TTS.datasets.TTSDataset import MyDataset\n",
"from TTS.layers.losses import L1LossMasked\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.visual import plot_spectrogram\n",
"from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n",
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
"from TTS.tts.datasets.TTSDataset import MyDataset\n",
"from TTS.tts.layers.losses import L1LossMasked\n",
"from TTS.tts.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.visual import plot_spectrogram\n",
"from TTS.tts.utils.generic_utils import load_config, setup_model, sequence_mask\n",
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
"\n",
"%matplotlib inline\n",
"\n",
@ -108,7 +108,7 @@
"metadata": {},
"outputs": [],
"source": [
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
"preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",

View File

@ -36,14 +36,14 @@
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.layers import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import setup_model\n",
"from TTS.utils.io import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"from TTS.utils.synthesis import synthesis\n",
"from TTS.utils.visual import plot_alignment\n",
"from TTS.utils.measures import alignment_diagonal_score\n",
"from TTS.tts.layers import *\n",
"from TTS.tts.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n",
"from TTS.tts.utils.text import text_to_sequence\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.visual import plot_alignment\n",
"from TTS.tts.utils.measures import alignment_diagonal_score\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
@ -96,7 +96,7 @@
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
"\n",
"# multi speaker \n",
"if CONFIG.use_speaker_embedding:\n",

File diff suppressed because one or more lines are too long

View File

@ -27,7 +27,7 @@
"from multiprocessing import Pool\n",
"from matplotlib import pylab as plt\n",
"from collections import Counter\n",
"from TTS.datasets.preprocess import *\n",
"from TTS.tts.datasets.preprocess import *\n",
"%matplotlib inline"
]
},

run_tests.sh (new executable file, 10 lines added)
View File

@ -0,0 +1,10 @@
# tests
nosetests tests -x
# runtime tests
./tests/test_server_package.sh
./tests/test_tts_train.sh
./tests/test_vocoder_train.sh
# linter check
cardboardlinter --refspec master

View File

@ -1,47 +0,0 @@
## TTS example web-server
You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
Instructions below are based on an Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work.
#### Development server:
##### Using server.py
If you have the environment set already for TTS, then you can directly call ```server.py```.
##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. python -m TTS.server.server
You can now open http://localhost:5002 in a browser
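For a quick check without the browser UI, the `/api/tts` endpoint served by `server.py` can be queried directly. A minimal sketch using only the standard library, assuming the development server is running on the default port 5002 (the input text and output file name here are arbitrary placeholders):

```python
# query the running development server and save the returned WAV
import urllib.parse
import urllib.request

text = "Hello from the TTS server."
url = "http://localhost:5002/api/tts?text=" + urllib.parse.quote(text)

with urllib.request.urlopen(url) as response:  # the endpoint returns audio/wav
    wav_bytes = response.read()

with open("tts_output.wav", "wb") as f:
    f.write(wav_bytes)
print(f"wrote {len(wav_bytes)} bytes to tts_output.wav")
```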
#### Running with nginx/uwsgi:
1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip
7. unzip *-nginx-uwsgi.zip
8. cp tts_site_nginx /etc/nginx/sites-enabled/default
9. service nginx restart
10. uwsgi --ini uwsgi.ini
You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx).
Configure number of workers (number of requests that will be processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting.
#### Creating a server package with an embedded model
[setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command:
- `--checkpoint <path to checkpoint file>` - path to model checkpoint file you want to embed in the package
- `--model_config <path to config.json file>` - path to corresponding config.json file for the checkpoint
To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`.
A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it.

View File

View File

@ -1,16 +0,0 @@
{
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
"tts_file":"best_model.pth.tar", // tts checkpoint file
"tts_config":"config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"vocoder_config":null,
"vocoder_file": null,
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
"wavernn_path":null, // wavernn model root path
"wavernn_file":null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file
"is_wavernn_batched":true,
"port": 5002,
"use_cuda": true,
"debug": true
}

View File

@ -1,86 +0,0 @@
#!flask/bin/python
import argparse
import os
from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
def create_argparser():
def convert_boolean(x):
return x.lower() in ['true', '1', 'yes']
parser = argparse.ArgumentParser()
parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file')
parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
return parser
synthesizer = None
embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
embedded_tts_folder = os.path.join(embedded_models_folder, 'tts')
tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar')
tts_config_file = os.path.join(embedded_tts_folder, 'config.json')
embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder')
vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar')
vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json')
# These models are soon to be deprecated
embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')
args = create_argparser().parse_args()
# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
args.tts_checkpoint = tts_checkpoint_file
if not args.tts_config and os.path.isfile(tts_config_file):
args.tts_config = tts_config_file
if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
args.vocoder_checkpoint = vocoder_checkpoint_file
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
args.vocoder_config = vocoder_config_file
if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
args.wavernn_checkpoint = wavernn_checkpoint_file
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
args.wavernn_config = wavernn_config_file
synthesizer = Synthesizer(args)
app = Flask(__name__)
@app.route('/')
def index():
return render_template('index.html')
@app.route('/api/tts', methods=['GET'])
def tts():
text = request.args.get('text')
print(" > Model input: {}".format(text))
data = synthesizer.tts(text)
return send_file(data, mimetype='audio/wav')
def main():
app.run(debug=args.debug, host='0.0.0.0', port=args.port)
if __name__ == '__main__':
main()

View File

@ -1,194 +0,0 @@
import io
import sys
import time
import numpy as np
import torch
import yaml
import pysbd
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.utils.generic_utils import setup_model
from TTS.utils.speakers import load_speaker_mapping
from TTS.vocoder.utils.generic_utils import setup_generator
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.utils.synthesis import *
from TTS.utils.text import make_symbols, phonemes, symbols
class Synthesizer(object):
def __init__(self, config):
self.wavernn = None
self.vocoder_model = None
self.config = config
print(config)
self.seg = self.get_segmenter("en")
self.use_cuda = self.config.use_cuda
if self.use_cuda:
assert torch.cuda.is_available(), "CUDA is not available on this machine."
self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
self.config.use_cuda)
if self.config.vocoder_checkpoint:
self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
if self.config.wavernn_lib_path:
self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
self.config.wavernn_config, self.config.use_cuda)
@staticmethod
def get_segmenter(lang):
return pysbd.Segmenter(language=lang, clean=True)
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
# pylint: disable=global-statement
global symbols, phonemes
print(" > Loading TTS model ...")
print(" | > model config: ", tts_config)
print(" | > checkpoint file: ", tts_checkpoint)
self.tts_config = load_config(tts_config)
self.use_phonemes = self.tts_config.use_phonemes
self.ap = AudioProcessor(**self.tts_config.audio)
if 'characters' in self.tts_config.keys():
symbols, phonemes = make_symbols(**self.tts_config.characters)
if self.use_phonemes:
self.input_size = len(phonemes)
else:
self.input_size = len(symbols)
# TODO: fix this for multi-speaker model - load speakers
if self.config.tts_speakers is not None:
self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
num_speakers = len(self.tts_speakers)
else:
num_speakers = 0
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
# load model state
cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
# load the model
self.tts_model.load_state_dict(cp['model'])
if use_cuda:
self.tts_model.cuda()
self.tts_model.eval()
self.tts_model.decoder.max_decoder_steps = 3000
if 'r' in cp:
self.tts_model.decoder.set_r(cp['r'])
print(f" > model reduction factor: {cp['r']}")
def load_vocoder(self, model_file, model_config, use_cuda):
self.vocoder_config = load_config(model_config)
self.vocoder_model = setup_generator(self.vocoder_config)
self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
self.vocoder_model.remove_weight_norm()
self.vocoder_model.inference_padding = 0
self.vocoder_config = load_config(model_config)
if use_cuda:
self.vocoder_model.cuda()
self.vocoder_model.eval()
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
# TODO: set a function in wavernn code base for model setup and call it here.
sys.path.append(lib_path) # set this if WaveRNN is not installed globally
#pylint: disable=import-outside-toplevel
from WaveRNN.models.wavernn import Model
print(" > Loading WaveRNN model ...")
print(" | > model config: ", model_config)
print(" | > model file: ", model_file)
self.wavernn_config = load_config(model_config)
# This is the default architecture we use for our models.
# You might need to update it
self.wavernn = Model(
rnn_dims=512,
fc_dims=512,
mode=self.wavernn_config.mode,
mulaw=self.wavernn_config.mulaw,
pad=self.wavernn_config.pad,
use_aux_net=self.wavernn_config.use_aux_net,
use_upsample_net=self.wavernn_config.use_upsample_net,
upsample_factors=self.wavernn_config.upsample_factors,
feat_dims=80,
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=self.ap.hop_length,
sample_rate=self.ap.sample_rate,
).cuda()
check = torch.load(model_file, map_location="cpu")
self.wavernn.load_state_dict(check['model'])
if use_cuda:
self.wavernn.cuda()
self.wavernn.eval()
def save_wav(self, wav, path):
# wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
wav = np.array(wav)
self.ap.save_wav(wav, path)
def split_into_sentences(self, text):
return self.seg.segment(text)
def tts(self, text, speaker_id=None):
start_time = time.time()
wavs = []
sens = self.split_into_sentences(text)
print(sens)
speaker_id = id_to_torch(speaker_id)
if speaker_id is not None and self.use_cuda:
speaker_id = speaker_id.cuda()
for sen in sens:
# preprocess the given text
inputs = text_to_seqvec(sen, self.tts_config)
inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
inputs = inputs.unsqueeze(0)
# synthesize voice
_, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
if self.vocoder_model:
# use native vocoder model
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
wav = self.vocoder_model.inference(vocoder_input)
if self.use_cuda:
wav = wav.cpu().numpy()
else:
wav = wav.numpy()
wav = wav.flatten()
elif self.wavernn:
# use 3rd-party WaveRNN
vocoder_input = None
if self.tts_config.model == "Tacotron":
vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
else:
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
if self.use_cuda:
vocoder_input = vocoder_input.cuda()
wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
else:
# use GL
if self.use_cuda:
postnet_output = postnet_output[0].cpu()
else:
postnet_output = postnet_output[0]
postnet_output = postnet_output.numpy()
wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
# trim silence
wav = trim_silence(wav, self.ap)
wavs += list(wav)
wavs += [0] * 10000
out = io.BytesIO()
self.save_wav(wavs, out)
# compute stats
process_time = time.time() - start_time
audio_time = len(wavs) / self.tts_config.audio['sample_rate']
print(f" > Processing time: {process_time}")
print(f" > Real-time factor: {process_time / audio_time}")
return out

View File

@ -1,111 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="">
<title>Mozilla - Text2Speech engine</title>
<!-- Bootstrap core CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">
<!-- Custom styles for this template -->
<style>
body {
padding-top: 54px;
}
@media (min-width: 992px) {
body {
padding-top: 56px;
}
}
</style>
</head>
<body>
<a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
<!-- Navigation -->
<!--
<nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
<div class="container">
<a class="navbar-brand" href="#">Mozilla TTS</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav ml-auto">
<li class="nav-item active">
<a class="nav-link" href="#">Home
<span class="sr-only">(current)</span>
</a>
</li>
</ul>
</div>
</div>
</nav>
-->
<!-- Page Content -->
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
<h1 class="mt-5">Mozilla TTS</h1>
<ul class="list-unstyled">
</ul>
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
<button id="speak-button" name="speak">Speak</button><br/><br/>
<audio id="audio" controls autoplay hidden></audio>
<p id="message"></p>
</div>
</div>
</div>
<!-- Bootstrap core JavaScript -->
<script>
function q(selector) {return document.querySelector(selector)}
q('#text').focus()
function do_tts(e) {
text = q('#text').value
if (text) {
q('#message').textContent = 'Synthesizing...'
q('#speak-button').disabled = true
q('#audio').hidden = true
synthesize(text)
}
e.preventDefault()
return false
}
q('#speak-button').addEventListener('click', do_tts)
q('#text').addEventListener('keyup', function(e) {
if (e.keyCode == 13) { // enter
do_tts(e)
}
})
function synthesize(text) {
fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
.then(function(res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()
}).then(function(blob) {
q('#message').textContent = ''
q('#speak-button').disabled = false
q('#audio').src = URL.createObjectURL(blob)
q('#audio').hidden = false
}).catch(function(err) {
q('#message').textContent = 'Error: ' + err.message
q('#speak-button').disabled = false
})
}
</script>
</body>
</html>

View File

@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args()
# Remove our arguments from argv so that setuptools doesn't see them
sys.argv = [sys.argv[0]] + unknown_args
version = '0.0.3'
version = '0.0.4'
# Adapted from https://github.com/pytorch/pytorch
cwd = os.path.dirname(os.path.abspath(__file__))
@ -112,6 +112,8 @@ setup(
name='TTS',
version=version,
url='https://github.com/mozilla/TTS',
author='Eren Gölge',
author_email='egolge@mozilla.com',
description='Text to Speech with Deep Learning',
license='MPL-2.0',
entry_points={
@ -119,11 +121,7 @@ setup(
'tts-server = TTS.server.server:main'
]
},
package_dir={'': 'tts_namespace'},
packages=find_packages('tts_namespace'),
package_data={
'TTS': package_data,
},
packages=find_packages(include=['TTS*']),
project_urls={
'Documentation': 'https://github.com/mozilla/TTS/wiki',
'Tracker': 'https://github.com/mozilla/TTS/issues',

View File

@ -1,18 +0,0 @@
### Speaker Encoder
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
![](umap.png)
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
To run the code, you need to follow the same flow as in TTS.
- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files (a short comparison sketch follows this list).
- Watch training on Tensorboard as in TTS
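
The embedding files written by `compute_embeddings.py` are plain NumPy arrays saved with `np.save`, so they can be inspected and compared without any project code. A minimal sketch, assuming two `.npy` files produced by the step above (both paths here are placeholders):

```python
# compare two saved d-vectors with cosine similarity
import numpy as np

# placeholder paths; point these at files generated by compute_embeddings.py
emb_a = np.load("output_path/speaker_a/utt_001.npy").flatten()
emb_b = np.load("output_path/speaker_b/utt_042.npy").flatten()

# cosine similarity: close to 1.0 for utterances of the same speaker
similarity = np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b))
print(f"cosine similarity: {similarity:.3f}")
```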

View File

@ -1,88 +0,0 @@
import argparse
import glob
import os
import numpy as np
from tqdm import tqdm
import torch
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset. ')
parser.add_argument(
'model_path',
type=str,
help='Path to model outputs (checkpoint, tensorboard etc.).')
parser.add_argument(
'config_path',
type=str,
help='Path to config file for training.',
)
parser.add_argument(
'data_path',
type=str,
help='Data path for wav files - directory or CSV file')
parser.add_argument(
'output_path',
type=str,
help='path for training outputs.')
parser.add_argument(
'--use_cuda', type=bool, help='flag to set cuda.', default=False
)
parser.add_argument(
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
)
args = parser.parse_args()
c = load_config(args.config_path)
ap = AudioProcessor(**c['audio'])
data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator
if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
# Parse CSV
print(f'CSV file: {data_path}')
with open(data_path) as f:
wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
wav_files = []
print(f'Separator is: {sep}')
for line in f:
components = line.split(sep)
if len(components) != 2:
print("Invalid line")
continue
wav_file = os.path.join(wav_path, components[0] + '.wav')
#print(f'wav_file: {wav_file}')
if os.path.exists(wav_file):
wav_files.append(wav_file)
print(f'Count of wavs imported: {len(wav_files)}')
else:
# Parse all wav files in data_path
wav_path = data_path
wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
output_files = [wav_file.replace(wav_path, args.output_path).replace(
'.wav', '.npy') for wav_file in wav_files]
for output_file in output_files:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.eval()
if args.use_cuda:
model.cuda()
for idx, wav_file in enumerate(tqdm(wav_files)):
mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
mel_spec = torch.FloatTensor(mel_spec[None, :, :])
if args.use_cuda:
mel_spec = mel_spec.cuda()
embedd = model.compute_embedding(mel_spec)
np.save(output_files[idx], embedd.detach().cpu().numpy())

View File

@ -1,59 +0,0 @@
{
"run_name": "libritts_360-half",
"run_description": "train speaker encoder for libritts 360",
"audio": {
// Audio processing parameters
"num_mels": 40, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"reinit_layers": [],
"grad_clip": 3.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 10, // number of steps to plot embeddings.
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 1, // Number of steps to log traning on console.
"output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"model": {
"input_dim": 40,
"proj_dim": 128,
"lstm_dim": 384,
"num_lstm_layers": 3
},
"datasets":
[
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-360/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "/home/erogol/Data/Libri-TTS/train-clean-100/",
"meta_file_train": null,
"meta_file_val": null
}
]
}

View File

@ -1,123 +0,0 @@
import numpy as np
import torch
import random
from torch.utils.data import Dataset
class MyDataset(Dataset):
def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
num_utter_per_speaker=10, skip_speakers=False, verbose=False):
"""
Args:
ap (TTS.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
voice_len (float): voice segment length in seconds.
verbose (bool): print diagnostic information.
"""
self.items = meta_data
self.sample_rate = ap.sample_rate
self.voice_len = voice_len
self.seq_len = int(voice_len * self.sample_rate)
self.num_speakers_in_batch = num_speakers_in_batch
self.num_utter_per_speaker = num_utter_per_speaker
self.skip_speakers = skip_speakers
self.ap = ap
self.verbose = verbose
self.__parse_items()
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num speakers: {len(self.speakers)}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename)
return audio
def load_data(self, idx):
text, wav_file, speaker_name = self.items[idx]
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
mel = self.ap.melspectrogram(wav).astype("float32")
# sample seq_len
assert len(text) > 0, self.items[idx][1]
assert wav.size > 0, self.items[idx][1]
sample = {
"mel": mel,
"item_idx": self.items[idx][1],
"speaker_name": speaker_name,
}
return sample
def __parse_items(self):
"""
Find unique speaker ids and create a dict mapping utterances from speaker id
"""
speakers = list({item[-1] for item in self.items})
self.speaker_to_utters = {}
self.speakers = []
for speaker in speakers:
speaker_utters = [item[1] for item in self.items if item[2] == speaker]
if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
print(
f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
)
else:
self.speakers.append(speaker)
self.speaker_to_utters[speaker] = speaker_utters
def __len__(self):
return int(1e10)
def __sample_speaker(self):
speaker = random.sample(self.speakers, 1)[0]
if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
utters = random.choices(
self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
)
else:
utters = random.sample(
self.speaker_to_utters[speaker], self.num_utter_per_speaker
)
return speaker, utters
def __sample_speaker_utterances(self, speaker):
"""
Sample all M utterances for the given speaker.
"""
feats = []
labels = []
for _ in range(self.num_utter_per_speaker):
# TODO:dummy but works
while True:
if len(self.speaker_to_utters[speaker]) > 0:
utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
else:
self.speakers.remove(speaker)
speaker, _ = self.__sample_speaker()
continue
wav = self.load_wav(utter)
if wav.shape[0] - self.seq_len > 0:
break
self.speaker_to_utters[speaker].remove(utter)
offset = random.randint(0, wav.shape[0] - self.seq_len)
mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
feats.append(torch.FloatTensor(mel))
labels.append(speaker)
return feats, labels
def __getitem__(self, idx):
speaker, _ = self.__sample_speaker()
return speaker
def collate_fn(self, batch):
labels = []
feats = []
for speaker in batch:
feats_, labels_ = self.__sample_speaker_utterances(speaker)
labels.append(labels_)
feats.extend(feats_)
feats = torch.stack(feats)
return feats.transpose(1, 2), labels
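A minimal sketch (mirroring `setup_loader` in the training script further below) of how `MyDataset` is wired into a `DataLoader`; the config path is illustrative and the import paths are the pre-refactor ones used in the sibling speaker encoder scripts:

```python
from torch.utils.data import DataLoader

from TTS.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

c = load_config("config.json")                       # illustrative config path
ap = AudioProcessor(**c.audio)
meta_data_train, _ = load_meta_data(c.datasets)

dataset = MyDataset(ap, meta_data_train, voice_len=1.6,
                    num_utter_per_speaker=10, skip_speakers=False, verbose=True)
loader = DataLoader(dataset,
                    batch_size=c.num_speakers_in_batch,   # N speakers per batch
                    shuffle=False,
                    num_workers=c.num_loader_workers,
                    collate_fn=dataset.collate_fn)

# feats: (N * num_utter_per_speaker, T, num_mels), labels: list of speaker id lists
feats, labels = next(iter(loader))
```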

View File

@ -1,41 +0,0 @@
import os
import datetime
import torch
def save_checkpoint(model, optimizer, model_loss, out_path,
current_step, epoch):
checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
'model': new_state_dict,
'optimizer': optimizer.state_dict() if optimizer is not None else None,
'step': current_step,
'epoch': epoch,
'GE2Eloss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
torch.save(state, checkpoint_path)
def save_best_model(model, optimizer, model_loss, best_loss, out_path,
current_step):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
'model': new_state_dict,
'optimizer': optimizer.state_dict(),
'step': current_step,
'GE2Eloss': model_loss,
'date': datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = 'best_model.pth.tar'
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
model_loss, bestmodel_path))
torch.save(state, bestmodel_path)
return best_loss

View File

@ -1,121 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
"""
Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
Accepts an input of size (N, M, D)
where N is the number of speakers in the batch,
M is the number of utterances per speaker,
and D is the dimensionality of the embedding vector (e.g. d-vector)
Args:
- init_w (float): defines the initial value of w in Equation (5) of [1]
- init_b (float): defines the initial value of b in Equation (5) of [1]
"""
super(GE2ELoss, self).__init__()
# pylint: disable=E1102
self.w = nn.Parameter(torch.tensor(init_w))
# pylint: disable=E1102
self.b = nn.Parameter(torch.tensor(init_b))
self.loss_method = loss_method
assert self.loss_method in ["softmax", "contrast"]
if self.loss_method == "softmax":
self.embed_loss = self.embed_loss_softmax
if self.loss_method == "contrast":
self.embed_loss = self.embed_loss_contrast
# pylint: disable=R0201
def calc_new_centroids(self, dvecs, centroids, spkr, utt):
"""
Calculates the new centroids excluding the reference utterance
"""
excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
excl = torch.mean(excl, 0)
new_centroids = []
for i, centroid in enumerate(centroids):
if i == spkr:
new_centroids.append(excl)
else:
new_centroids.append(centroid)
return torch.stack(new_centroids)
def calc_cosine_sim(self, dvecs, centroids):
"""
Make the cosine similarity matrix with dims (N,M,N)
"""
cos_sim_matrix = []
for spkr_idx, speaker in enumerate(dvecs):
cs_row = []
for utt_idx, utterance in enumerate(speaker):
new_centroids = self.calc_new_centroids(
dvecs, centroids, spkr_idx, utt_idx
)
# vector based cosine similarity for speed
cs_row.append(
torch.clamp(
torch.mm(
utterance.unsqueeze(1).transpose(0, 1),
new_centroids.transpose(0, 1),
)
/ (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
1e-6,
)
)
cs_row = torch.cat(cs_row, dim=0)
cos_sim_matrix.append(cs_row)
return torch.stack(cos_sim_matrix)
# pylint: disable=R0201
def embed_loss_softmax(self, dvecs, cos_sim_matrix):
"""
Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
"""
N, M, _ = dvecs.shape
L = []
for j in range(N):
L_row = []
for i in range(M):
L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
L_row = torch.stack(L_row)
L.append(L_row)
return torch.stack(L)
# pylint: disable=R0201
def embed_loss_contrast(self, dvecs, cos_sim_matrix):
"""
Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
"""
N, M, _ = dvecs.shape
L = []
for j in range(N):
L_row = []
for i in range(M):
centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
excl_centroids_sigmoids = torch.cat(
(centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])
)
L_row.append(
1.0
- torch.sigmoid(cos_sim_matrix[j, i, j])
+ torch.max(excl_centroids_sigmoids)
)
L_row = torch.stack(L_row)
L.append(L_row)
return torch.stack(L)
def forward(self, dvecs):
"""
Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
centroids = torch.mean(dvecs, 1)
cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
self.w.data.clamp_(min=1e-6)  # keep the similarity scale w positive
cos_sim_matrix = self.w * cos_sim_matrix + self.b
L = self.embed_loss(dvecs, cos_sim_matrix)
return L.mean()
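A quick sanity-check sketch of `GE2ELoss` on random d-vectors; the sizes N (speakers), M (utterances per speaker) and D (embedding dim) are illustrative:

```python
import torch

from TTS.speaker_encoder.loss import GE2ELoss

N, M, D = 4, 5, 128                               # illustrative sizes
criterion = GE2ELoss(init_w=10.0, init_b=-5.0, loss_method="softmax")
dvecs = torch.randn(N, M, D)                      # normally SpeakerEncoder outputs, reshaped
loss = criterion(dvecs)                           # scalar GE2E loss
loss.backward()                                   # gradients flow into w and b
```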

View File

@ -1,88 +0,0 @@
import torch
from torch import nn
class LSTMWithProjection(nn.Module):
def __init__(self, input_size, hidden_size, proj_size):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.proj_size = proj_size
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = nn.Linear(hidden_size, proj_size, bias=False)
def forward(self, x):
self.lstm.flatten_parameters()
o, (_, _) = self.lstm(x)
return self.linear(o)
class SpeakerEncoder(nn.Module):
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
super().__init__()
layers = []
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
self._init_layers()
def _init_layers(self):
for name, param in self.layers.named_parameters():
if "bias" in name:
nn.init.constant_(param, 0.0)
elif "weight" in name:
nn.init.xavier_normal_(param)
def forward(self, x):
# TODO: implement state passing for lstms
d = self.layers(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
return d
def inference(self, x):
d = self.layers.forward(x)
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
return d
def compute_embedding(self, x, num_frames=160, overlap=0.5):
"""
Generate an embedding for a single utterance by averaging the embeddings
of overlapping windows of num_frames frames.
x: 1xTxD
"""
num_overlap = int(num_frames * overlap)
max_len = x.shape[1]
embed = None
cur_iter = 0
for offset in range(0, max_len, num_frames - num_overlap):
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames)
frames = x[:, offset:end_offset]
if embed is None:
embed = self.inference(frames)
else:
embed += self.inference(frames)
return embed / cur_iter
def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
"""
Generate embeddings for a batch of utterances
x: BxTxD
"""
num_overlap = int(num_frames * overlap)
max_len = x.shape[1]
embed = None
num_iters = seq_lens / (num_frames - num_overlap)
cur_iter = 0
for offset in range(0, max_len, num_frames - num_overlap):
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames)
frames = x[:, offset:end_offset]
if embed is None:
embed = self.inference(frames)
else:
embed[cur_iter <= num_iters, :] += self.inference(
frames[cur_iter <= num_iters, :, :]
)
return embed / num_iters
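A minimal inference sketch (mirroring `compute_embeddings.py` earlier in this commit) that embeds a single utterance from its mel spectrogram; paths and config values are illustrative:

```python
import torch

from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config

c = load_config("config.json")                    # illustrative paths
ap = AudioProcessor(**c["audio"])

model = SpeakerEncoder(**c.model)                 # e.g. input_dim=40, proj_dim=128
model.load_state_dict(torch.load("best_model.pth.tar", map_location="cpu")["model"])
model.eval()

mel_spec = ap.melspectrogram(ap.load_wav("sample.wav")).T    # (T, num_mels)
mel_spec = torch.FloatTensor(mel_spec[None, :, :])           # (1, T, num_mels)
with torch.no_grad():
    embedding = model.compute_embedding(mel_spec)            # (1, proj_dim)
```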

View File

@ -1,325 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Overview\n",
"\n",
"This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n",
"\n",
"Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import random\n",
"import numpy as np\n",
"import torch\n",
"import umap\n",
"\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config\n",
"\n",
"from bokeh.io import output_notebook, show\n",
"from bokeh.plotting import figure\n",
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
"from bokeh.transform import factor_cmap, factor_mark\n",
"from bokeh.palettes import Category10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n",
"\n",
"List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
"\n",
"**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"# My single speaker locations\n",
"#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
"#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
"\n",
"# My multi speaker locations\n",
"EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
"AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!ls -1 $MODEL_RUN_PATH"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**CONFIG['audio'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bring in the embeddings created by **compute_embeddings.py**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n",
"print(f'Embeddings found: {len(embed_files)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that we did indeed find an embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process the speakers\n",
"\n",
"Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n",
"speaker_to_utter = {}\n",
"for embed_file in embed_files:\n",
" speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
" try:\n",
" speaker_to_utter[speaker_path].append(embed_file)\n",
" except:\n",
" speaker_to_utter[speaker_path]=[embed_file]\n",
"print(f'Speaker count: {len(speaker_paths)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up the embeddings\n",
"\n",
"Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeds = []\n",
"labels = []\n",
"locations = []\n",
"\n",
"# single speaker \n",
"#num_speakers = 1\n",
"#num_utters = 1000\n",
"\n",
"# multi speaker\n",
"num_speakers = 10\n",
"num_utters = 20\n",
"\n",
"\n",
"speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n",
"\n",
"for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
" speaker_path = speaker_paths[speaker_idx]\n",
" speakers_utter = speaker_to_utter[speaker_path]\n",
" utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n",
" for utter_idx in utter_idxs:\n",
" embed_path = speaker_to_utter[speaker_path][utter_idx]\n",
" embed = np.load(embed_path)\n",
" embeds.append(embed)\n",
" labels.append(str(speaker_num))\n",
" locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n",
"embeds = np.concatenate(embeds)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load embeddings with UMAP"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = umap.UMAP()\n",
"projection = model.fit_transform(embeds)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Interactively charting the data in Bokeh\n",
"\n",
"Set up various details for Bokeh to plot the data\n",
"\n",
"You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n",
"\n",
"Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n",
"\n",
"File location in the tooltip is given relative to **AUDIO_PATH**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source_wav_stems = ColumnDataSource(\n",
" data=dict(\n",
" x = projection.T[0].tolist(),\n",
" y = projection.T[1].tolist(),\n",
" desc=locations,\n",
" label=labels\n",
" )\n",
" )\n",
"\n",
"hover = HoverTool(\n",
" tooltips=[\n",
" (\"file\", \"@desc\"),\n",
" (\"speaker\", \"@label\"),\n",
" ]\n",
" )\n",
"\n",
"# optionally consider adding these to the tooltips if you want additional detail\n",
"# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n",
"# for the index of the embedding / wav file: (\"index\", \"$index\"),\n",
"\n",
"factors = list(set(labels))\n",
"pal_size = max(len(factors), 3)\n",
"pal = Category10[pal_size]\n",
"\n",
"p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n",
"\n",
"\n",
"p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n",
"\n",
"url = \"http://localhost:8000/@desc\"\n",
"taptool = p.select(type=TapTool)\n",
"taptool.callback = OpenURL(url=url)\n",
"\n",
"show(p)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Local server to serve wav files from corpus\n",
"\n",
"This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n",
"\n",
"There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n",
"\n",
"The server will continue to run until stopped. To stop it simply interupt the kernel (ie square button or under Kernel menu)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%cd $AUDIO_PATH\n",
"%pwd\n",
"!python -m http.server"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,2 +0,0 @@
umap-learn
numpy>=1.17.0

View File

@ -1,252 +0,0 @@
import argparse
import os
import sys
import time
import traceback
import torch
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.speaker_encoder.generic_utils import save_best_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters, create_experiment_folder,
get_git_branch, remove_experiment_folder, set_init_dict)
from TTS.utils.io import load_config, copy_config_file
from TTS.utils.training import check_update, NoamLR
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.radam import RAdam
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.manual_seed(54321)
use_cuda = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap, is_val=False, verbose=False):
if is_val:
loader = None
else:
dataset = MyDataset(ap,
meta_data_eval if is_val else meta_data_train,
voice_len=1.6,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=verbose)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(dataset,
batch_size=c.num_speakers_in_batch,
shuffle=False,
num_workers=c.num_loader_workers,
collate_fn=dataset.collate_fn)
return loader
def train(model, criterion, optimizer, scheduler, ap, global_step):
data_loader = setup_loader(ap, is_val=False, verbose=True)
model.train()
epoch_time = 0
best_loss = float('inf')
avg_loss = 0
end_time = time.time()
for _, data in enumerate(data_loader):
start_time = time.time()
# setup input data
inputs = data[0]
loader_time = time.time() - end_time
global_step += 1
# setup lr
if c.lr_decay:
scheduler.step()
optimizer.zero_grad()
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
# labels = labels.cuda(non_blocking=True)
# forward pass model
outputs = model(inputs)
# loss computation
loss = criterion(
outputs.view(c.num_speakers_in_batch,
outputs.shape[0] // c.num_speakers_in_batch, -1))
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
step_time = time.time() - start_time
epoch_time += step_time
avg_loss = 0.01 * loss.item(
) + 0.99 * avg_loss if avg_loss != 0 else loss.item()
current_lr = optimizer.param_groups[0]['lr']
if global_step % c.steps_plot_stats == 0:
# Plot Training Epoch Stats
train_stats = {
"GE2Eloss": avg_loss,
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time
}
tb_logger.tb_train_epoch_stats(global_step, train_stats)
figures = {
# FIXME: not constant
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(),
10),
}
tb_logger.tb_train_figures(global_step, figures)
if global_step % c.print_step == 0:
print(
" | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
"StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
global_step, loss.item(), avg_loss, grad_norm, step_time,
loader_time, current_lr),
flush=True)
# save best model
best_loss = save_best_model(model, optimizer, avg_loss, best_loss,
OUT_PATH, global_step)
end_time = time.time()
return avg_loss, global_step
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train
global meta_data_eval
ap = AudioProcessor(**c.audio)
model = SpeakerEncoder(input_dim=40,
proj_dim=128,
lstm_dim=384,
num_lstm_layers=3)
optimizer = RAdam(model.parameters(), lr=c.lr)
criterion = GE2ELoss(loss_method='softmax')
if args.restore_path:
checkpoint = torch.load(args.restore_path)
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
# optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except KeyError:
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint, c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model = model.cuda()
criterion.cuda()
if c.lr_decay:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
# pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
global_step = args.restore_step
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
global_step)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--restore_path',
type=str,
help='Path to model outputs (checkpoint, tensorboard etc.).',
default=0)
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
)
parser.add_argument('--debug',
type=bool,
default=True,
help='Do not verify commit integrity to run training.')
parser.add_argument(
'--data_path',
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument('--output_path',
type=str,
help='path for training outputs.',
default='')
parser.add_argument('--output_folder',
type=str,
default='',
help='folder name for training outputs.')
args = parser.parse_args()
# setup output paths and read configs
c = load_config(args.config_path)
_ = os.path.dirname(os.path.realpath(__file__))
if args.data_path != '':
c.data_path = args.data_path
if args.output_path == '':
OUT_PATH = os.path.join(_, c.output_path)
else:
OUT_PATH = args.output_path
if args.output_folder == '':
OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
else:
OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
new_fields)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)

Binary file not shown.


View File

@ -1,46 +0,0 @@
import umap
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("Agg")
colormap = (
np.array(
[
[76, 255, 0],
[0, 127, 70],
[255, 0, 0],
[255, 217, 38],
[0, 135, 255],
[165, 0, 165],
[255, 167, 255],
[0, 255, 255],
[255, 96, 38],
[142, 76, 0],
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
],
dtype=np.float,
)
/ 255
)
def plot_embeddings(embeddings, num_utter_per_speaker):
embeddings = embeddings[: 10 * num_utter_per_speaker]
model = umap.UMAP()
projection = model.fit_transform(embeddings)
num_speakers = embeddings.shape[0] // num_utter_per_speaker
ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
colors = [colormap[i] for i in ground_truth]
fig, ax = plt.subplots(figsize=(16, 10))
_ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
plt.gca().set_aspect("equal", "datalim")
plt.title("UMAP projection")
plt.tight_layout()
plt.savefig("umap")
return fig
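A quick sketch of calling `plot_embeddings` on random data (shapes are illustrative); at most 10 speakers are plotted, matching the slicing above:

```python
import numpy as np

from TTS.speaker_encoder.visual import plot_embeddings

num_utter_per_speaker, proj_dim = 20, 128
embeddings = np.random.rand(10 * num_utter_per_speaker, proj_dim)   # 10 speakers
fig = plot_embeddings(embeddings, num_utter_per_speaker)            # also writes "umap"
fig.savefig("umap_example.png")
```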

View File

@ -1,182 +0,0 @@
# pylint: disable=redefined-outer-name, unused-argument
import os
import time
import argparse
import torch
import json
import string
from TTS.utils.synthesis import synthesis
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.utils.text.symbols import make_symbols, symbols, phonemes
from TTS.utils.audio import AudioProcessor
def tts(model,
vocoder_model,
C,
VC,
text,
ap,
ap_vocoder,
use_cuda,
batched_vocoder,
speaker_id=None,
figures=False):
t_1 = time.time()
use_vocoder_model = vocoder_model is not None
waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
model, text, C, use_cuda, ap, speaker_id, style_wav=False,
truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
if C.model == "Tacotron" and use_vocoder_model:
postnet_output = ap.out_linear_to_mel(postnet_output.T).T
# correct if there is a scale difference b/w two models
if use_vocoder_model:
postnet_output = ap._denormalize(postnet_output)
postnet_output = ap_vocoder._normalize(postnet_output)
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
waveform = vocoder_model.generate(
vocoder_input.cuda() if use_cuda else vocoder_input,
batched=batched_vocoder,
target=8000,
overlap=400)
print(" > Run-time: {}".format(time.time() - t_1))
return alignment, postnet_output, stop_tokens, waveform
if __name__ == "__main__":
global symbols, phonemes
parser = argparse.ArgumentParser()
parser.add_argument('text', type=str, help='Text to generate speech.')
parser.add_argument('config_path',
type=str,
help='Path to model config file.')
parser.add_argument(
'model_path',
type=str,
help='Path to model file.',
)
parser.add_argument(
'out_path',
type=str,
help='Path to save the final wav file. The wav file will be named after the given text.',
)
parser.add_argument('--use_cuda',
type=bool,
help='Run model on CUDA.',
default=False)
parser.add_argument(
'--vocoder_path',
type=str,
help=
'Path to vocoder model file. If it is not defined, the model uses GL (Griffin-Lim) as the vocoder. Please make sure that you installed the vocoder library (WaveRNN) beforehand.',
default="",
)
parser.add_argument('--vocoder_config_path',
type=str,
help='Path to vocoder model config file.',
default="")
parser.add_argument(
'--batched_vocoder',
type=bool,
help="If True, vocoder model uses faster batch processing.",
default=True)
parser.add_argument('--speakers_json',
type=str,
help="JSON file for multi-speaker model.",
default="")
parser.add_argument(
'--speaker_id',
type=int,
help="target speaker_id if the model is multi-speaker.",
default=None)
args = parser.parse_args()
if args.vocoder_path != "":
assert args.use_cuda, " [!] Enable cuda for vocoder."
from WaveRNN.models.wavernn import Model as VocoderModel
# load the config
C = load_config(args.config_path)
C.forward_attn_mask = True
# load the audio processor
ap = AudioProcessor(**C.audio)
# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
symbols, phonemes = make_symbols(**C.characters)
# load speakers
if args.speakers_json != '':
speakers = json.load(open(args.speakers_json, 'r'))
num_speakers = len(speakers)
else:
num_speakers = 0
# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(args.model_path)
model.load_state_dict(cp['model'])
model.eval()
if args.use_cuda:
model.cuda()
model.decoder.set_r(cp['r'])
# load vocoder model
if args.vocoder_path != "":
VC = load_config(args.vocoder_config_path)
ap_vocoder = AudioProcessor(**VC.audio)
bits = 10
vocoder_model = VocoderModel(rnn_dims=512,
fc_dims=512,
mode=VC.mode,
mulaw=VC.mulaw,
pad=VC.pad,
upsample_factors=VC.upsample_factors,
feat_dims=VC.audio["num_mels"],
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=ap.hop_length,
sample_rate=ap.sample_rate,
use_aux_net=True,
use_upsample_net=True)
check = torch.load(args.vocoder_path)
vocoder_model.load_state_dict(check['model'])
vocoder_model.eval()
if args.use_cuda:
vocoder_model.cuda()
else:
vocoder_model = None
VC = None
ap_vocoder = None
# synthesize voice
print(" > Text: {}".format(args.text))
_, _, _, wav = tts(model,
vocoder_model,
C,
VC,
args.text,
ap,
ap_vocoder,
args.use_cuda,
args.batched_vocoder,
speaker_id=args.speaker_id,
figures=False)
# save the results
file_name = args.text.replace(" ", "_")
file_name = file_name.translate(
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
out_path = os.path.join(args.out_path, file_name)
print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path)

View File

@ -1,8 +1,8 @@
import unittest
import torch as T
from TTS.utils.generic_utils import save_checkpoint, save_best_model
from TTS.layers.tacotron import Prenet
from TTS.tts.utils.generic_utils import save_checkpoint, save_best_model
from TTS.tts.layers.tacotron import Prenet
OUT_PATH = '/tmp/test.pth.tar'

View File

@ -1,7 +1,7 @@
{
"model": "Tacotron2",
"run_name": "ljspeech-ddc-bn",
"run_description": "tacotron2 with ddc and batch-normalization",
"run_name": "test_sample_dataset_run",
"run_description": "sample dataset test run",
// AUDIO PARAMETERS
"audio":{
@ -61,30 +61,30 @@
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":1,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"epochs": 1, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "bn", // "original" or "bn".
"prenet_dropout": false, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
@ -105,7 +105,7 @@
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"print_step": 1, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
@ -122,10 +122,10 @@
"max_seq_len": 153, // DATASET-RELATED: maximum text length
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/",
"output_path": "tests/train_outputs/",
// PHONEMES
"phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
@ -135,13 +135,15 @@
"use_gst": false, // TACOTRON ONLY: use global style tokens
// DATASETS
"train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "ljspeech",
"path": "/home/erogol/Data/LJSpeech-1.1/",
"path": "tests/data/ljspeech/",
"meta_file_train": "metadata.csv",
"meta_file_val": null
"meta_file_val": "metadata.csv"
}
]

View File

@ -31,7 +31,7 @@
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// DISTRIBUTED TRAINING
@ -90,7 +90,7 @@
},
// DATASET
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
"data_path": "tests/data/ljspeech/wavs/",
"feature_path": null,
"seq_len": 16384,
"pad_short": 2000,
@ -101,7 +101,7 @@
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
// VALIDATION
"run_eval": true,
@ -109,7 +109,7 @@
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"epochs": 10000, // total number of epochs to train.
"epochs": 1, // total number of epochs to train.
"wd": 0.0, // Weight decay weight.
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
@ -127,7 +127,7 @@
"lr_disc": 1e-4,
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log traning on console.
"print_step": 1, // Number of steps to log traning on console.
"print_eval": false, // If True, it prints loss values for each step in eval run.
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
@ -139,6 +139,6 @@
"eval_split_size": 10,
// PATHS
"output_path": "/home/erogol/Models/LJSpeech/"
"output_path": "tests/outputs/train_outputs/"
}

View File

@ -1,88 +0,0 @@
{
"run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
"run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"hop_length": 256,
"win_length": 1024,
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": false, // move normalization to range [-1, 1]
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [],
"model": "Tacotron2", // one of the model in models/
"grad_clip": 1, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"windowing": false, // Enables attention windowing. Used only in eval mode.
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false,
"attention_type": "original",
"attention_heads": 5,
"bidirectional_decoder": false,
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"use_gst": false,
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
"eval_batch_size":16,
"r": 1, // Number of frames to predict for step.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 10, // Number of steps to log traning on console.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"run_eval": true,
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
"dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 150, // DATASET-RELATED: maximum text length
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"text_cleaner": "phoneme_cleaners",
"use_speaker_embedding": false // whether to use additional embeddings for separate speakers
}

View File

@ -1,6 +1,6 @@
import unittest
from TTS.utils.text import phonemes
from TTS.tts.utils.text import phonemes
class SymbolsTest(unittest.TestCase):
def test_uniqueness(self): #pylint: disable=no-self-use

View File

@ -1,7 +1,7 @@
import os
import unittest
from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
# pylint: disable=protected-access

View File

@ -4,10 +4,11 @@ import unittest
import torch as T
from TTS.server.synthesizer import Synthesizer
from TTS.tests import get_tests_input_path, get_tests_output_path
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config, save_checkpoint
from tests import get_tests_input_path, get_tests_output_path
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import save_checkpoint
from TTS.utils.io import load_config
class DemoServerTest(unittest.TestCase):

View File

@ -2,12 +2,13 @@ import os
import unittest
import torch as T
from tests import get_tests_path, get_tests_input_path
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.utils.io import load_config
file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json"))

View File

@ -1,9 +1,9 @@
import unittest
import torch as T
from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.layers.losses import L1LossMasked
from TTS.utils.generic_utils import sequence_mask
from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.utils.generic_utils import sequence_mask
# pylint: disable=unused-variable

View File

@ -4,18 +4,18 @@ import shutil
import torch
import numpy as np
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from torch.utils.data import DataLoader
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
from TTS.datasets import TTSDataset
from TTS.datasets.preprocess import ljspeech
from TTS.tts.datasets import TTSDataset
from TTS.tts.datasets.preprocess import ljspeech
#pylint: disable=unused-variable
file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(file_path, "outputs/loader_tests/")
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
c = load_config(os.path.join(file_path, 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
ok_ljspeech = os.path.exists(c.data_path)
DATA_EXIST = True

View File

@ -1,8 +1,8 @@
import unittest
import os
from TTS.tests import get_tests_input_path
from tests import get_tests_input_path
from TTS.datasets.preprocess import common_voice
from TTS.tts.datasets.preprocess import common_voice
class TestPreprocessors(unittest.TestCase):

View File

@ -1,14 +1,14 @@
import os
import copy
import torch
import os
import unittest
import numpy as np
from torch import optim
from torch import nn
import torch
from tests import get_tests_input_path
from torch import nn, optim
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.io import load_config
from TTS.layers.losses import MSELossMasked
from TTS.models.tacotron2 import Tacotron2
#pylint: disable=unused-variable
@ -16,8 +16,7 @@ torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
class TacotronTrainTest(unittest.TestCase):

View File

@ -5,9 +5,11 @@ import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('INFO')
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
#pylint: disable=unused-variable
@ -15,8 +17,7 @@ torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
c = load_config(os.path.join(file_path, 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
class TacotronTFTrainTest(unittest.TestCase):

View File

@ -1,13 +1,14 @@
import os
import copy
import torch
import os
import unittest
from torch import optim
from torch import nn
import torch
from tests import get_tests_input_path
from torch import nn, optim
from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.models.tacotron import Tacotron
from TTS.utils.io import load_config
from TTS.layers.losses import L1LossMasked
from TTS.models.tacotron import Tacotron
#pylint: disable=unused-variable
@ -15,8 +16,7 @@ torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
def count_parameters(model):

View File

@ -3,12 +3,12 @@ import os
# pylint: disable=wildcard-import
# pylint: disable=unused-import
import unittest
from TTS.utils.text import *
from TTS.tests import get_tests_path
from tests import get_tests_input_path
from TTS.tts.utils.text import *
from tests import get_tests_path
from TTS.utils.io import load_config
TESTS_PATH = get_tests_path()
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
def test_phoneme_to_sequence():
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"

tests/test_tts_train.sh Executable file (13 lines)
View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

View File

@ -1,24 +1,24 @@
import os
import numpy as np
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from torch.utils.data import DataLoader
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data
file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/")
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
C = load_config(os.path.join(file_path, 'test_config.json'))
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/")
test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
ok_ljspeech = os.path.exists(test_data_path)
def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers):
''' run dataloader with given parameters and check conditions '''
ap = AudioProcessor(**C.audio)

View File

@ -1,11 +1,11 @@
import os
import torch
from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from TTS.vocoder.layers.losses import TorchSTFT, STFTLoss, MultiScaleSTFTLoss
from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.layers.losses import MultiScaleSTFTLoss, STFTLoss, TorchSTFT
TESTS_PATH = get_tests_path()
@ -14,8 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
file_path = os.path.dirname(os.path.realpath(__file__))
C = load_config(os.path.join(file_path, 'test_config.json'))
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
ap = AudioProcessor(**C.audio)
@ -53,9 +52,3 @@ def test_multiscale_stft_loss():
loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav))
assert loss_sc < 1.0
assert loss_m + loss_sc > 0

View File

@ -4,7 +4,7 @@ import torch
import soundfile as sf
from librosa.core import load
from TTS.tests import get_tests_path, get_tests_input_path
from tests import get_tests_path, get_tests_input_path
from TTS.vocoder.layers.pqmf import PQMF

15
tests/test_vocoder_train.sh Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# create run dir
mkdir $BASEDIR/train_outputs
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json
# find the training folder
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
echo $LATEST_FOLDER
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
# remove all the outputs
rm -rf $BASEDIR/train_outputs/

View File

@ -1,20 +0,0 @@
## Utilities to Convert Models to Tensorflow2
Here are experimental utilities to convert trained Torch models to TensorFlow (>=2.2).
Converting Torch models to TF makes the whole TF toolkit available for deployment and device-specific optimizations.
Note that we do not plan to share training scripts for TensorFlow in the near future, but any contribution in that direction is more than welcome.
To see how you can use a TF model at inference, check the notebook.
This is an experimental release. If you encounter an error, please open an issue or, even better, send a PR, but you are mostly on your own.
### Converting a Model
- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments.
### Known issues and limitations
- We use a custom model load/save mechanism which enables us to store model-related information together with the model weights (similar to Torch). However, it is prone to random errors.
- The current TF model implementation is slightly slower than the Torch model. Hopefully, it will get better with improving TF support for eager mode and ```tf.function```.
- The TF implementation of Tacotron2 only supports regular Tacotron2 as in the paper.
- You can only convert models trained after the TF model implementation was added, since the model layers have been updated in the Torch model.
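As a rough usage sketch (the paths are placeholders and the single-speaker setup is an assumption), inference with a converted model follows the same steps as the converter scripts and the notebook in this commit:

```python
# Minimal sketch, assuming a single-speaker model and placeholder paths;
# it mirrors the converter scripts and the inference notebook in this commit.
import tensorflow as tf

from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint

CONFIG = load_config('/path/to/model/config.json')
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)

model = setup_model(num_chars, 0, CONFIG)   # 0 -> single speaker
model.build_inference()                     # run the model once to create its variables
model = load_checkpoint(model, '/path/to/output/tf/model')

# dummy character ids; real inputs come from the TTS text processing pipeline
input_ids = tf.random.uniform([1, 16], maxval=num_chars, dtype=tf.int32)
decoder_frames, postnet_frames, attentions, stop_tokens = model(input_ids, training=False)
```

The same ```setup_model```/```load_checkpoint``` pair is what the TF-Lite converter below relies on as well.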

View File

View File

@ -1,37 +0,0 @@
# Convert Tensorflow Tacotron2 model to TF-Lite binary
import argparse
from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite
parser = argparse.ArgumentParser()
parser.add_argument('--tf_model',
type=str,
help='Path to the trained TF model to be converted to a TF-Lite binary.')
parser.add_argument('--config_path',
type=str,
help='Path to the model config file.')
parser.add_argument('--output_path',
type=str,
help='path to tflite output binary.')
args = parser.parse_args()
# Set constants
CONFIG = load_config(args.config_path)
# load the model
c = CONFIG
num_speakers = 0
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, c, enable_tflite=True)
model.build_inference()
model = load_checkpoint(model, args.tf_model)
model.decoder.set_max_decoder_steps(1000)
# create tflite model
tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)
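For reference, a hypothetical sketch of exercising the resulting binary with the stock TF-Lite interpreter; the dummy input ids, the 16-step length, and the output ordering are assumptions, not part of this script:

```python
# Hypothetical sketch: run the produced .tflite binary with the standard
# TF-Lite Interpreter. The input ids are dummies and the output ordering
# depends on how the graph was converted, so inspect output_details first.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='/path/to/output/model.tflite')
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_ids = np.random.randint(0, 10, size=(1, 16), dtype=np.int32)
interpreter.resize_tensor_input(input_details[0]['index'], input_ids.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], input_ids)
interpreter.invoke()

outputs = [interpreter.get_tensor(o['index']) for o in output_details]
print([o.shape for o in outputs])
```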

View File

@ -1,210 +0,0 @@
# %%
import sys
sys.path.append('/home/erogol/Projects')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# %%
import argparse
import numpy as np
import torch
import tensorflow as tf
from fuzzywuzzy import fuzz
from TTS.utils.text.symbols import phonemes, symbols
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
from TTS.tf.utils.generic_utils import save_checkpoint
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
type=str,
help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
type=str,
help='Path to config file of torch model.')
parser.add_argument('--output_path',
type=str,
help='path to output file including file name to save TF model.')
args = parser.parse_args()
# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0
# init torch model
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, c)
checkpoint = torch.load(args.torch_model_path,
map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
# init tf model
model_tf = Tacotron2(num_chars=num_chars,
num_speakers=num_speakers,
r=model.decoder.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder)
# set initial layer mapping - these are not captured by the below heuristic approach
# TODO: set layer names so that we can remove this manual matching
common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
var_map = [
('embedding/embeddings:0', 'embedding.weight'),
('encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
'encoder.lstm.weight_ih_l0'),
('encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
'encoder.lstm.weight_hh_l0'),
('encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
'encoder.lstm.weight_ih_l0_reverse'),
('encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
'encoder.lstm.weight_hh_l0_reverse'),
('encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
('encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
('decoder/linear_projection/kernel:0',
'decoder.linear_projection.linear_layer.weight'),
('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
]
# %%
# get tf_model graph
mel_pred = model_tf.build_inference()
# get tf variables
tf_vars = model_tf.weights
# match variable names with fuzzy logic
torch_var_names = list(state_dict.keys())
tf_var_names = [we.name for we in model_tf.weights]
for tf_name in tf_var_names:
# skip re-mapped layer names
if tf_name in [name[0] for name in var_map]:
continue
tf_name_edited = convert_tf_name(tf_name)
ratios = [
fuzz.ratio(torch_name, tf_name_edited)
for torch_name in torch_var_names
]
max_idx = np.argmax(ratios)
matching_name = torch_var_names[max_idx]
del torch_var_names[max_idx]
var_map.append((tf_name, matching_name))
# %%
# print variable match
from pprint import pprint
pprint(var_map)
pprint(torch_var_names)
# pass weights
tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
# Compare TF and TORCH models
# %%
# check embedding outputs
model.eval()
input_ids = torch.randint(0, 24, (1, 128)).long()
o_t = model.embedding(input_ids)
o_tf = model_tf.embedding(input_ids.detach().numpy())
assert abs(o_t.detach().numpy() -
o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
o_tf.numpy()).sum()
# compare encoder outputs
oo_en = model.encoder.inference(o_t.transpose(1, 2))
ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
assert compare_torch_tf(oo_en, ooo_en) < 1e-5
#pylint: disable=redefined-builtin
# compare decoder.attention_rnn
inp = torch.rand([1, 768])
inp_tf = inp.numpy()
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
output, cell_state = model.decoder.attention_rnn(inp)
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
states[2],
training=False)
assert compare_torch_tf(output, output_tf).mean() < 1e-5
query = output
inputs = torch.rand([1, 128, 512])
query_tf = query.detach().numpy()
inputs_tf = inputs.numpy()
# compare decoder.attention
model.decoder.attention.init_states(inputs)
processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
loc_attn, proc_query = model.decoder.attention.get_location_attention(
query, processes_inputs)
context = model.decoder.attention(query, inputs, processes_inputs, None)
attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1]
model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))
loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states)
context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False)
assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5
assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5
assert compare_torch_tf(context, context_tf) < 1e-5
# compare decoder.decoder_rnn
input = torch.rand([1, 1536])
input_tf = input.numpy()
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
output, cell_state = model.decoder.decoder_rnn(
input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
states[3],
training=False)
assert abs(input - input_tf).mean() < 1e-5
assert compare_torch_tf(output, output_tf).mean() < 1e-5
# compare decoder.linear_projection
input = torch.rand([1, 1536])
input_tf = input.numpy()
output = model.decoder.linear_projection(input)
output_tf = model_tf.decoder.linear_projection(input_tf, training=False)
assert compare_torch_tf(output, output_tf) < 1e-5
# compare decoder outputs
model.decoder.max_decoder_steps = 100
model_tf.decoder.set_max_decoder_steps(100)
output, align, stop = model.decoder.inference(oo_en)
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4
# compare the whole model output
outputs_torch = model.inference(input_ids)
outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
assert compare_torch_tf(outputs_torch[2][:, 50, :],
outputs_tf[2][:, 50, :]) < 1e-5
assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
# %%
# save tf model
save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
checkpoint['r'], args.output_path)
print(' > Model conversion is successfully completed :).')

View File

@ -1,285 +0,0 @@
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention
class Linear(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs):
super(Linear, self).__init__(**kwargs)
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
self.activation = keras.layers.ReLU()
def call(self, x):
"""
shapes:
x: B x T x C
"""
return self.activation(self.linear_layer(x))
class LinearBN(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs):
super(LinearBN, self).__init__(**kwargs)
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization')
self.activation = keras.layers.ReLU()
def call(self, x, training=None):
"""
shapes:
x: B x T x C
"""
out = self.linear_layer(x)
out = self.batch_normalization(out, training=training)
return self.activation(out)
class Prenet(keras.layers.Layer):
def __init__(self,
prenet_type,
prenet_dropout,
units,
bias,
**kwargs):
super(Prenet, self).__init__(**kwargs)
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
self.linear_layers = []
if prenet_type == "bn":
self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
elif prenet_type == "original":
self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
else:
raise RuntimeError(' [!] Unknown prenet type.')
if prenet_dropout:
self.dropout = keras.layers.Dropout(rate=0.5)
def call(self, x, training=None):
"""
shapes:
x: B x T x C
"""
for linear in self.linear_layers:
if self.prenet_dropout:
x = self.dropout(linear(x), training=training)
else:
x = linear(x)
return x
def _sigmoid_norm(score):
attn_weights = tf.nn.sigmoid(score)
attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True)
return attn_weights
class Attention(keras.layers.Layer):
"""TODO: implement forward_attention
TODO: location sensitive attention
TODO: implement attention windowing """
def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters,
loc_attn_kernel_size, use_windowing, norm, use_forward_attn,
use_trans_agent, use_forward_attn_mask, **kwargs):
super(Attention, self).__init__(**kwargs)
self.use_loc_attn = use_loc_attn
self.loc_attn_n_filters = loc_attn_n_filters
self.loc_attn_kernel_size = loc_attn_kernel_size
self.use_windowing = use_windowing
self.norm = norm
self.use_forward_attn = use_forward_attn
self.use_trans_agent = use_trans_agent
self.use_forward_attn_mask = use_forward_attn_mask
self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer')
self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer')
self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer')
if use_loc_attn:
self.location_conv1d = keras.layers.Conv1D(
filters=loc_attn_n_filters,
kernel_size=loc_attn_kernel_size,
padding='same',
use_bias=False,
name='location_layer/location_conv1d')
self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense')
if norm == 'softmax':
self.norm_func = tf.nn.softmax
elif norm == 'sigmoid':
self.norm_func = _sigmoid_norm
else:
raise ValueError("Unknown value for attention norm type")
def init_states(self, batch_size, value_length):
states = []
if self.use_loc_attn:
attention_cum = tf.zeros([batch_size, value_length])
attention_old = tf.zeros([batch_size, value_length])
states = [attention_cum, attention_old]
if self.use_forward_attn:
alpha = tf.concat([
tf.ones([batch_size, 1]),
tf.zeros([batch_size, value_length])[:, :-1] + 1e-7
], 1)
states.append(alpha)
return tuple(states)
def process_values(self, values):
""" cache values for decoder iterations """
#pylint: disable=attribute-defined-outside-init
self.processed_values = self.inputs_layer(values)
self.values = values
def get_loc_attn(self, query, states):
""" compute location attention, query layer and
unnorm. attention weights"""
attention_cum, attention_old = states[:2]
attn_cat = tf.stack([attention_old, attention_cum], axis=2)
processed_query = self.query_layer(tf.expand_dims(query, 1))
processed_attn = self.location_dense(self.location_conv1d(attn_cat))
score = self.v(
tf.nn.tanh(self.processed_values + processed_query +
processed_attn))
score = tf.squeeze(score, axis=2)
return score, processed_query
def get_attn(self, query):
""" compute query layer and unnormalized attention weights """
processed_query = self.query_layer(tf.expand_dims(query, 1))
score = self.v(tf.nn.tanh(self.processed_values + processed_query))
score = tf.squeeze(score, axis=2)
return score, processed_query
def apply_score_masking(self, score, mask): #pylint: disable=no-self-use
""" ignore sequence paddings """
padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
# Bias so padding positions do not contribute to attention distribution.
score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
return score
def apply_forward_attention(self, alignment, alpha): #pylint: disable=no-self-use
# forward attention
fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0)
# compute transition potentials
new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment
# renormalize attention weights
new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True)
return new_alpha
def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None):
states = []
if self.use_loc_attn:
states = [old_states[0] + scores_norm, attn_weights]
if self.use_forward_attn:
states.append(new_alpha)
return tuple(states)
def call(self, query, states):
"""
shapes:
query: B x D
"""
if self.use_loc_attn:
score, _ = self.get_loc_attn(query, states)
else:
score, _ = self.get_attn(query)
# TODO: masking
# if mask is not None:
# self.apply_score_masking(score, mask)
# attn_weights shape == (batch_size, max_length, 1)
# normalize attention scores
scores_norm = self.norm_func(score)
attn_weights = scores_norm
# apply forward attention
new_alpha = None
if self.use_forward_attn:
new_alpha = self.apply_forward_attention(attn_weights, states[-1])
attn_weights = new_alpha
# update states tuple
# states = (cum_attn_weights, attn_weights, new_alpha)
states = self.update_states(states, scores_norm, attn_weights, new_alpha)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False)
context_vector = tf.squeeze(context_vector, axis=1)
return context_vector, attn_weights, states
# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b):
# dtype = processed_query.dtype
# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1]
# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2])
# class LocationSensitiveAttention(BahdanauAttention):
# def __init__(self,
# units,
# memory=None,
# memory_sequence_length=None,
# normalize=False,
# probability_fn="softmax",
# kernel_initializer="glorot_uniform",
# dtype=None,
# name="LocationSensitiveAttention",
# location_attention_filters=32,
# location_attention_kernel_size=31):
# super(LocationSensitiveAttention,
# self).__init__(units=units,
# memory=memory,
# memory_sequence_length=memory_sequence_length,
# normalize=normalize,
# probability_fn='softmax', ## parent module default
# kernel_initializer=kernel_initializer,
# dtype=dtype,
# name=name)
# if probability_fn == 'sigmoid':
# self.probability_fn = lambda score, _: self._sigmoid_normalization(score)
# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False)
# self.location_dense = keras.layers.Dense(units, use_bias=False)
# # self.v = keras.layers.Dense(1, use_bias=True)
# def _location_sensitive_score(self, processed_query, keys, processed_loc):
# processed_query = tf.expand_dims(processed_query, 1)
# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2])
# def _location_sensitive(self, alignment_cum, alignment_old):
# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2)
# return self.location_dense(self.location_conv(alignment_cat))
# def _sigmoid_normalization(self, score):
# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True)
# # def _apply_masking(self, score, mask):
# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
# # # Bias so padding positions do not contribute to attention distribution.
# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
# # return score
# def _calculate_attention(self, query, state):
# alignment_cum, alignment_old = state[:2]
# processed_query = self.query_layer(
# query) if self.query_layer else query
# processed_loc = self._location_sensitive(alignment_cum, alignment_old)
# score = self._location_sensitive_score(
# processed_query,
# self.keys,
# processed_loc)
# alignment = self.probability_fn(score, state)
# alignment_cum = alignment_cum + alignment
# state[0] = alignment_cum
# state[1] = alignment
# return alignment, state
# def compute_context(self, alignments):
# expanded_alignments = tf.expand_dims(alignments, 1)
# context = tf.matmul(expanded_alignments, self.values)
# context = tf.squeeze(context, [1])
# return context
# # def call(self, query, state):
# # alignment, next_state = self._calculate_attention(query, state)
# # return alignment, next_state

View File

@ -1,300 +0,0 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tf.utils.tf_utils import shape_list
from TTS.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper
class ConvBNBlock(keras.layers.Layer):
def __init__(self, filters, kernel_size, activation, **kwargs):
super(ConvBNBlock, self).__init__(**kwargs)
self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d')
self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization')
self.dropout = keras.layers.Dropout(rate=0.5, name='dropout')
self.activation = keras.layers.Activation(activation, name='activation')
def call(self, x, training=None):
o = self.convolution1d(x)
o = self.batch_normalization(o, training=training)
o = self.activation(o)
o = self.dropout(o, training=training)
return o
class Postnet(keras.layers.Layer):
def __init__(self, output_filters, num_convs, **kwargs):
super(Postnet, self).__init__(**kwargs)
self.convolutions = []
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0'))
for idx in range(1, num_convs - 1):
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}'))
self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}'))
def call(self, x, training=None):
o = x
for layer in self.convolutions:
o = layer(o, training=training)
return o
class Encoder(keras.layers.Layer):
def __init__(self, output_input_dim, **kwargs):
super(Encoder, self).__init__(**kwargs)
self.convolutions = []
for idx in range(3):
self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}'))
self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm')
def call(self, x, training=None):
o = x
for layer in self.convolutions:
o = layer(o, training=training)
o = self.lstm(o)
return o
class Decoder(keras.layers.Layer):
#pylint: disable=unused-argument
def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type,
prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask,
use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, enable_tflite, **kwargs):
super(Decoder, self).__init__(**kwargs)
self.frame_dim = frame_dim
self.r_init = tf.constant(r, dtype=tf.int32)
self.r = tf.constant(r, dtype=tf.int32)
self.output_dim = r * self.frame_dim
self.separate_stopnet = separate_stopnet
self.enable_tflite = enable_tflite
# layer constants
self.max_decoder_steps = tf.constant(1000, dtype=tf.int32)
self.stop_thresh = tf.constant(0.5, dtype=tf.float32)
# model dimensions
self.query_dim = 1024
self.decoder_rnn_dim = 1024
self.prenet_dim = 256
self.attn_dim = 128
self.p_attention_dropout = 0.1
self.p_decoder_dropout = 0.1
self.prenet = Prenet(prenet_type,
prenet_dropout,
[self.prenet_dim, self.prenet_dim],
bias=False,
name='prenet')
self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name='attention_rnn', )
self.attention_rnn_dropout = keras.layers.Dropout(0.5)
# TODO: implement other attn options
self.attention = Attention(attn_dim=self.attn_dim,
use_loc_attn=True,
loc_attn_n_filters=32,
loc_attn_kernel_size=31,
use_windowing=False,
norm=attn_norm,
use_forward_attn=use_forward_attn,
use_trans_agent=use_trans_agent,
use_forward_attn_mask=use_forward_attn_mask,
name='attention')
self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name='decoder_rnn')
self.decoder_rnn_dropout = keras.layers.Dropout(0.5)
self.linear_projection = keras.layers.Dense(self.frame_dim * r, name='linear_projection/linear_layer')
self.stopnet = keras.layers.Dense(1, name='stopnet/linear_layer')
def set_max_decoder_steps(self, new_max_steps):
self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32)
def set_r(self, new_r):
self.r = tf.constant(new_r, dtype=tf.int32)
self.output_dim = self.frame_dim * new_r
def build_decoder_initial_states(self, batch_size, memory_dim, memory_length):
zero_frame = tf.zeros([batch_size, self.frame_dim])
zero_context = tf.zeros([batch_size, memory_dim])
attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
attention_states = self.attention.init_states(batch_size, memory_length)
return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states
def step(self, prenet_next, states,
memory_seq_length=None, training=None):
_, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states
attention_rnn_input = tf.concat([prenet_next, context_next], -1)
attention_rnn_output, attention_rnn_state = \
self.attention_rnn(attention_rnn_input,
attention_rnn_state, training=training)
attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training)
context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training)
decoder_rnn_input = tf.concat([attention_rnn_output, context], -1)
decoder_rnn_output, decoder_rnn_state = \
self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training)
decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training)
linear_projection_input = tf.concat([decoder_rnn_output, context], -1)
output_frame = self.linear_projection(linear_projection_input, training=training)
stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1)
stopnet_output = self.stopnet(stopnet_input, training=training)
output_frame = output_frame[:, :self.r * self.frame_dim]
states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states)
return output_frame, stopnet_output, states, attention
def decode(self, memory, states, frames, memory_seq_length=None):
B, _, _ = shape_list(memory)
num_iter = shape_list(frames)[1] // self.r
# init states
frame_zero = tf.expand_dims(states[0], 1)
frames = tf.concat([frame_zero, frames], axis=1)
outputs = tf.TensorArray(dtype=tf.float32, size=num_iter)
attentions = tf.TensorArray(dtype=tf.float32, size=num_iter)
stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter)
# pre-computes
self.attention.process_values(memory)
prenet_output = self.prenet(frames, training=True)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions):
prenet_next = prenet_output[:, step]
output, stop_token, states, attention = self.step(prenet_next,
states,
memory_seq_length)
outputs = outputs.write(step, output)
attentions = attentions.write(step, attention)
stop_tokens = stop_tokens.write(step, stop_token)
return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions
_, memory, _, states, outputs, stop_tokens, attentions = \
tf.while_loop(lambda *arg: True,
_body,
loop_vars=(step_count, memory, prenet_output,
states, outputs, stop_tokens, attentions),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=num_iter)
outputs = outputs.stack()
attentions = attentions.stack()
stop_tokens = stop_tokens.stack()
outputs = tf.transpose(outputs, [1, 0, 2])
attentions = tf.transpose(attentions, [1, 0, 2])
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
stop_tokens = tf.squeeze(stop_tokens, axis=2)
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def decode_inference(self, memory, states):
B, _, _ = shape_list(memory)
# init states
outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
# pre-computes
self.attention.process_values(memory)
# iter vars
stop_flag = tf.constant(False, dtype=tf.bool)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag):
frame_next = states[0]
prenet_next = self.prenet(frame_next, training=False)
output, stop_token, states, attention = self.step(prenet_next,
states,
None,
training=False)
stop_token = tf.math.sigmoid(stop_token)
outputs = outputs.write(step, output)
attentions = attentions.write(step, attention)
stop_tokens = stop_tokens.write(step, stop_token)
stop_flag = tf.greater(stop_token, self.stop_thresh)
stop_flag = tf.reduce_all(stop_flag)
return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag
cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
_, memory, states, outputs, stop_tokens, attentions, stop_flag = \
tf.while_loop(cond,
_body,
loop_vars=(step_count, memory, states, outputs,
stop_tokens, attentions, stop_flag),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=self.max_decoder_steps)
outputs = outputs.stack()
attentions = attentions.stack()
stop_tokens = stop_tokens.stack()
outputs = tf.transpose(outputs, [1, 0, 2])
attentions = tf.transpose(attentions, [1, 0, 2])
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
stop_tokens = tf.squeeze(stop_tokens, axis=2)
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def decode_inference_tflite(self, memory, states):
"""Inference with TF-Lite compatibility. It assumes
batch_size is 1"""
# init states
# dynamic_shape is not supported in TFLite
outputs = tf.TensorArray(dtype=tf.float32,
size=self.max_decoder_steps,
element_shape=tf.TensorShape(
[self.output_dim]),
clear_after_read=False,
dynamic_size=False)
# stop_flags = tf.TensorArray(dtype=tf.bool,
# size=self.max_decoder_steps,
# element_shape=tf.TensorShape(
# []),
# clear_after_read=False,
# dynamic_size=False)
attentions = ()
stop_tokens = ()
# pre-computes
self.attention.process_values(memory)
# iter vars
stop_flag = tf.constant(False, dtype=tf.bool)
step_count = tf.constant(0, dtype=tf.int32)
def _body(step, memory, states, outputs, stop_flag):
frame_next = states[0]
prenet_next = self.prenet(frame_next, training=False)
output, stop_token, states, _ = self.step(prenet_next,
states,
None,
training=False)
stop_token = tf.math.sigmoid(stop_token)
stop_flag = tf.greater(stop_token, self.stop_thresh)
stop_flag = tf.reduce_all(stop_flag)
# stop_flags = stop_flags.write(step, tf.logical_not(stop_flag))
outputs = outputs.write(step, tf.reshape(output, [-1]))
return step + 1, memory, states, outputs, stop_flag
cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
step_count, memory, states, outputs, stop_flag = \
tf.while_loop(cond,
_body,
loop_vars=(step_count, memory, states, outputs,
stop_flag),
parallel_iterations=32,
swap_memory=True,
maximum_iterations=self.max_decoder_steps)
outputs = outputs.stack()
outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter
outputs = tf.expand_dims(outputs, axis=[0])
outputs = tf.transpose(outputs, [1, 0, 2])
outputs = tf.reshape(outputs, [1, -1, self.frame_dim])
return outputs, stop_tokens, attentions
def call(self, memory, states, frames=None, memory_seq_length=None, training=False):
if training:
return self.decode(memory, states, frames, memory_seq_length)
if self.enable_tflite:
return self.decode_inference_tflite(memory, states)
return self.decode_inference(memory, states)

View File

@ -1,108 +0,0 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from TTS.tf.utils.tf_utils import shape_list
#pylint: disable=too-many-ancestors
class Tacotron2(keras.models.Model):
def __init__(self,
num_chars,
num_speakers,
r,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm="softmax",
attn_K=4,
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
separate_stopnet=True,
bidirectional_decoder=False,
enable_tflite=False):
super(Tacotron2, self).__init__()
self.r = r
self.decoder_output_dim = decoder_output_dim
self.postnet_output_dim = postnet_output_dim
self.bidirectional_decoder = bidirectional_decoder
self.num_speakers = num_speakers
self.speaker_embed_dim = 256
self.enable_tflite = enable_tflite
self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
self.encoder = Encoder(512, name='encoder')
# TODO: most of the decoder args have no use at the moment
self.decoder = Decoder(decoder_output_dim,
r,
attn_type=attn_type,
use_attn_win=attn_win,
attn_norm=attn_norm,
prenet_type=prenet_type,
prenet_dropout=prenet_dropout,
use_forward_attn=forward_attn,
use_trans_agent=trans_agent,
use_forward_attn_mask=forward_attn_mask,
use_location_attn=location_attn,
attn_K=attn_K,
separate_stopnet=separate_stopnet,
speaker_emb_dim=self.speaker_embed_dim,
name='decoder',
enable_tflite=enable_tflite)
self.postnet = Postnet(postnet_output_dim, 5, name='postnet')
@tf.function(experimental_relax_shapes=True)
def call(self, characters, text_lengths=None, frames=None, training=None):
if training:
return self.training(characters, text_lengths, frames)
if not training:
return self.inference(characters)
raise RuntimeError(' [!] Set model training mode True or False')
def training(self, characters, text_lengths, frames):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=True)
encoder_output = self.encoder(embedding_vectors, training=True)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True)
postnet_frames = self.postnet(decoder_frames, training=True)
output_frames = decoder_frames + postnet_frames
return decoder_frames, output_frames, attentions, stop_tokens
def inference(self, characters):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=False)
encoder_output = self.encoder(embedding_vectors, training=False)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
postnet_frames = self.postnet(decoder_frames, training=False)
output_frames = decoder_frames + postnet_frames
print(output_frames.shape)
return decoder_frames, output_frames, attentions, stop_tokens
@tf.function(
experimental_relax_shapes=True,
input_signature=[
tf.TensorSpec([1, None], dtype=tf.int32),
],)
def inference_tflite(self, characters):
B, T = shape_list(characters)
embedding_vectors = self.embedding(characters, training=False)
encoder_output = self.encoder(embedding_vectors, training=False)
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
postnet_frames = self.postnet(decoder_frames, training=False)
output_frames = decoder_frames + postnet_frames
print(output_frames.shape)
return decoder_frames, output_frames, attentions, stop_tokens
def build_inference(self, ):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
self(input_ids)
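A rough sketch of driving this model in training mode with dummy tensors (the constructor values below are arbitrary and the remaining arguments keep their defaults), reusing the tf_create_dummy_inputs() helper that also appears in this commit:

```python
# Rough sketch: a training-mode forward pass with dummy tensors, reusing the
# tf_create_dummy_inputs() helper from TTS.tf.utils.convert_torch_to_tf_utils.
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs

model = Tacotron2(num_chars=24, num_speakers=0, r=5)   # other args keep their defaults
input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()

decoder_frames, output_frames, attentions, stop_tokens = model(
    input_ids, text_lengths=input_lengths, frames=mel_outputs, training=True)
print(decoder_frames.shape, output_frames.shape, stop_tokens.shape)
```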

View File

@ -1,714 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"This is to test TTS tensorflow models with benchmark sentences.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related models.\n",
" - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model in the models page).\n",
"- to set the file paths below.\n",
"\n",
"Repositories:\n",
"- TTS: https://github.com/mozilla/TTS\n",
"- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n",
"\n",
"Known Issues:\n",
"- To load the model second time you need to restart the notebook kernel. \n",
"- Some of the advance methods are not yet implemented for Tensorflow."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"\n",
"import sys\n",
"import io\n",
"import torch \n",
"import tensorflow as tf\n",
"print(tf.config.list_physical_devices('GPU'))\n",
"\n",
"import time\n",
"import json\n",
"import yaml\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams[\"figure.figsize\"] = (16,5)\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.tf.models.tacotron2 import Tacotron2\n",
"from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.utils.synthesis import synthesis\n",
"from TTS.utils.visual import visualize\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" print(mel_postnet_spec.shape)\n",
" print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" if figures: \n",
" visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n",
" IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=True)) \n",
" os.makedirs(OUT_FOLDER, exist_ok=True)\n",
" file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
" ap.save_wav(waveform, out_path)\n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '../torch_model/'\n",
"MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"# Run FLAGs\n",
"use_cuda = True # use the available GPU (only for torch)\n",
"# Set the vocoder\n",
"use_gl = True # use GL if True\n",
"BACKEND = 'tf' # set the backend for inference "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"scrolled": true
},
"outputs": [],
"source": [
"from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n",
"from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n",
"c = CONFIG\n",
"num_speakers = 0\n",
"r = 1\n",
"num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, c)\n",
"\n",
"# before loading weights you need to run the model once to generate the variables\n",
"input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n",
"mel_pred = model(input_ids, training=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"scrolled": true
},
"outputs": [],
"source": [
"model = load_checkpoint(model, MODEL_PATH)\n",
"# model = tf.function(model, experimental_relax_shapes=True)\n",
"ap = AudioProcessor(**CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# wrapper class to use tf.function\n",
"class ModelInference(tf.keras.Model):\n",
" def __init__(self, model):\n",
" super(ModelInference, self).__init__()\n",
" self.model = model\n",
" \n",
" @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n",
" def call(self, characters):\n",
" return self.model(characters, training=False)\n",
" \n",
"model = ModelInference(model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# LOAD WAVERNN\n",
"if use_gl == False:\n",
" from parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n",
" \n",
" vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n",
" vocoder_model.remove_weight_norm()\n",
" ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
" if use_cuda:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval();\n",
" print(count_parameters(vocoder_model))"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Comparision with https://mycroft.ai/blog/available-voices/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### https://espnet.github.io/icassp2020-tts/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"The Commission also recommends\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Other examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Comparison with https://keithito.github.io/audio-samples/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Heres a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \" He has read the whole thing.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"He reads books.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Thisss isrealy awhsome.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"This is your internet browser, Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"This is your internet browser Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Eren, how are you?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Hard Sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Encouraged, he started with a minute a day.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"If he decided to watch TV he really watched it.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# for twb dataset\n",
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"wavs = []\n",
"model.eval()\n",
"model.decoder.prenet.eval()\n",
"model.decoder.max_decoder_steps = 2000\n",
"# model.decoder.prenet.train()\n",
"speaker_id = None\n",
"sentence = '''This is App Store Optimization report.\n",
"The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and whats new section.\n",
"In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n",
"Product name is the actual app name on the AppStore or Google Play Store.\n",
"Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n",
"Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n",
"If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
"If you scroll down in the widget, you can see the older app versions for the same apps. Or you can filter Datetime to see a specific timeframe and the apps Store presence back then.\n",
"You can also filter for a specific app using Product Name.\n",
"If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
"'''\n",
"\n",
"for s in sentence.split('\\n'):\n",
" print(s)\n",
" align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n",
" wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1 +0,0 @@

View File

@ -1,81 +0,0 @@
import numpy as np
import tensorflow as tf
def tf_create_dummy_inputs():
""" Create dummy inputs for TF Tacotron2 model """
batch_size = 4
max_input_length = 32
max_mel_length = 128
pad = 1
n_chars = 24
input_ids = tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32)
input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size])
input_lengths[-1] = max_input_length
input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
mel_lengths[-1] = max_mel_length
mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
return input_ids, input_lengths, mel_outputs, mel_lengths
def compare_torch_tf(torch_tensor, tf_tensor):
""" Compute the average absolute difference b/w torch and tf tensors """
return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
def convert_tf_name(tf_name):
""" Convert certain patterns in TF layer names to Torch patterns """
tf_name_tmp = tf_name
tf_name_tmp = tf_name_tmp.replace(':0', '')
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0')
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1')
tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh')
tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight')
tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight')
tf_name_tmp = tf_name_tmp.replace('/beta', '/bias')
tf_name_tmp = tf_name_tmp.replace('/', '.')
return tf_name_tmp
def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
""" Transfer weigths from torch state_dict to TF variables """
print(" > Passing weights from Torch to TF ...")
for tf_var in tf_vars:
torch_var_name = var_map_dict[tf_var.name]
print(f' | > {tf_var.name} <-- {torch_var_name}')
# if tuple, it is a bias variable
if not isinstance(torch_var_name, tuple):
torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
torch_weight = state_dict[torch_var_name]
if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:
# out_dim, in_dim, filter -> filter, in_dim, out_dim
numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy()
elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
# if the variable is a bias vector of a bidirectional lstm, two matching
# torch bias vectors must be pre-defined in the state_dict
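# (torch stores separate bias_ih and bias_hh vectors, while the Keras LSTM cell
# uses a single bias, so the two torch vectors are summed below)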
elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name:
bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name]
assert len(bias_vectors) == 2
numpy_weight = bias_vectors[0] + bias_vectors[1]
elif 'rnn' in tf_var.name and 'kernel' in tf_var.name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
elif 'rnn' in tf_var.name and 'bias' in tf_var.name:
bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key]
assert len(bias_vectors) == 2
numpy_weight = bias_vectors[0] + bias_vectors[1]
elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name:
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
else:
numpy_weight = torch_weight.detach().cpu().numpy()
assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}"
tf.keras.backend.set_value(tf_var, numpy_weight)
return tf_vars
def load_tf_vars(model_tf, tf_vars):
for tf_var in tf_vars:
model_tf.get_layer(tf_var.name).set_weights(tf_var)
return model_tf

View File

@ -1,104 +0,0 @@
import os
import datetime
import importlib
import pickle
import numpy as np
import tensorflow as tf
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
state = {
'model': model.weights,
'optimizer': optimizer,
'step': current_step,
'epoch': epoch,
'date': datetime.date.today().strftime("%B %d, %Y"),
'r': r
}
state.update(kwargs)
pickle.dump(state, open(output_path, 'wb'))
def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
tf_vars = model.weights
for tf_var in tf_vars:
layer_name = tf_var.name
try:
chkp_var_value = chkp_var_dict[layer_name]
except KeyError:
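# checkpoint variable names may be prefixed with the saving model's class scope;
# retry the lookup with that prefix prepended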
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
layer_name = f"{class_name}/{layer_name}"
chkp_var_value = chkp_var_dict[layer_name]
tf.keras.backend.set_value(tf_var, chkp_var_value)
if 'r' in checkpoint.keys():
model.decoder.set_r(checkpoint['r'])
return model
def sequence_mask(sequence_length, max_len=None):
if max_len is None:
max_len = tf.reduce_max(sequence_length)
# B x T_max boolean mask marking the valid (non-padded) positions
return tf.sequence_mask(sequence_length, maxlen=max_len)
# @tf.custom_gradient
def check_gradient(x, grad_clip):
x_normed = tf.clip_by_norm(x, grad_clip)
# report the norm of the unclipped tensor alongside the clipped value
grad_norm = tf.norm(x)
return x_normed, grad_norm
def count_parameters(model, c):
try:
return model.count_params()
except RuntimeError:
input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
input_lengths = np.random.randint(100, 129, (8, ))
input_lengths[-1] = 128
input_lengths = tf.convert_to_tensor(input_lengths.astype('int32'))
mel_spec = np.random.rand(8, 2 * c.r,
c.audio['num_mels']).astype('float32')
mel_spec = tf.convert_to_tensor(mel_spec)
speaker_ids = np.random.randint(
0, 5, (8, )) if c.use_speaker_embedding else None
_ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids)
return model.count_params()
def setup_model(num_chars, num_speakers, c, enable_tflite=False):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() in "tacotron":
raise NotImplementedError(' [!] Tacotron model is not ready.')
# tacotron2
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
enable_tflite=enable_tflite)
return model

View File

@ -1,42 +0,0 @@
import pickle
import datetime
import tensorflow as tf
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
state = {
'model': model.weights,
'optimizer': optimizer,
'step': current_step,
'epoch': epoch,
'date': datetime.date.today().strftime("%B %d, %Y"),
'r': r
}
state.update(kwargs)
pickle.dump(state, open(output_path, 'wb'))
def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
tf_vars = model.weights
for tf_var in tf_vars:
layer_name = tf_var.name
try:
chkp_var_value = chkp_var_dict[layer_name]
except KeyError:
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
layer_name = f"{class_name}/{layer_name}"
chkp_var_value = chkp_var_dict[layer_name]
tf.keras.backend.set_value(tf_var, chkp_var_value)
if 'r' in checkpoint.keys():
model.decoder.set_r(checkpoint['r'])
return model
def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model

View File

@ -1,8 +0,0 @@
import tensorflow as tf
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]

View File

@ -1,31 +0,0 @@
import tensorflow as tf
def convert_tacotron2_to_tflite(model,
output_path=None,
experimental_converter=True):
"""Convert Tensorflow Tacotron2 model to TFLite. Save a binary file if output_path is
provided, else return TFLite model."""
concrete_function = model.inference_tflite.get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions(
[concrete_function])
converter.experimental_new_converter = experimental_converter
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [
tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
]
tflite_model = converter.convert()
print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.')
if output_path is not None:
# save the model binary if output_path is provided
with open(output_path, 'wb') as f:
f.write(tflite_model)
return None
return tflite_model
def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model
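# Illustrative usage sketch: the output path below is a hypothetical placeholder,
# and `model` is assumed to be a TF Tacotron2 instance with inference_tflite defined.
def export_and_load_tflite(model, output_path='tacotron2.tflite'):
    # write the converted binary to disk, then re-open it with the TFLite interpreter
    convert_tacotron2_to_tflite(model, output_path=output_path)
    return load_tflite_model(output_path)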

641
train.py
View File

@ -1,641 +0,0 @@
import argparse
import os
import sys
import glob
import time
import traceback
import numpy as np
import torch
from torch.utils.data import DataLoader
from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor)
from TTS.layers.losses import TacotronLoss
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder,
get_git_branch, set_init_dict,
setup_model, KeepAverage, check_config)
from TTS.utils.io import (save_best_model, save_checkpoint,
load_config, copy_config_file)
from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
gradual_training_scheduler, set_weight_decay,
setup_torch_training_env)
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
get_speakers
from TTS.utils.synthesis import synthesis
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam
from TTS.utils.measures import alignment_diagonal_score
use_cuda, num_gpus = setup_torch_training_env(True, False)
def setup_loader(ap, r, is_val=False, verbose=False):
if is_val and not c.run_eval:
loader = None
else:
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
batch_group_size=0 if is_val else c.batch_group_size *
c.batch_size,
min_seq_len=c.min_seq_len,
max_seq_len=c.max_seq_len,
phoneme_cache_path=c.phoneme_cache_path,
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
verbose=verbose)
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.eval_batch_size if is_val else c.batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=False,
sampler=sampler,
num_workers=c.num_val_loader_workers
if is_val else c.num_loader_workers,
pin_memory=False)
return loader
def format_data(data):
if c.use_speaker_embedding:
speaker_mapping = load_speaker_mapping(OUT_PATH)
# setup input data
text_input = data[0]
text_lengths = data[1]
speaker_names = data[2]
linear_input = data[3] if c.model in ["Tacotron"] else None
mel_input = data[4]
mel_lengths = data[5]
stop_targets = data[6]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())
if c.use_speaker_embedding:
speaker_ids = [
speaker_mapping[speaker_name] for speaker_name in speaker_names
]
speaker_ids = torch.LongTensor(speaker_ids)
else:
speaker_ids = None
# set stop targets view, we predict a single stop token per iteration.
stop_targets = stop_targets.view(text_input.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze(2)
# dispatch data to GPU
if use_cuda:
text_input = text_input.cuda(non_blocking=True)
text_lengths = text_lengths.cuda(non_blocking=True)
mel_input = mel_input.cuda(non_blocking=True)
mel_lengths = mel_lengths.cuda(non_blocking=True)
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
stop_targets = stop_targets.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
def train(model, criterion, optimizer, optimizer_st, scheduler,
ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
verbose=(epoch == 0))
model.train()
epoch_time = 0
keep_avg = KeepAverage()
if use_cuda:
batch_n_iter = int(
len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
c_logger.print_train_start()
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
loader_time = time.time() - end_time
global_step += 1
# setup lr
if c.noam_schedule:
scheduler.step()
optimizer.zero_grad()
if optimizer_st:
optimizer_st.zero_grad()
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
decoder_backward_output = None
alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
alignment_lengths = mel_lengths // model.decoder.r
# compute loss
loss_dict = criterion(postnet_output, decoder_output, mel_input,
linear_input, stop_tokens, stop_targets,
mel_lengths, decoder_backward_output,
alignments, alignment_lengths, alignments_backward,
text_lengths)
# backward pass
loss_dict['loss'].backward()
optimizer, current_lr = adam_weight_decay(optimizer)
grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
optimizer.step()
# compute alignment error (the lower the better)
align_error = 1 - alignment_diagonal_score(alignments)
loss_dict['align_error'] = align_error
# backward pass and grad norm check for the stopnet loss
if c.separate_stopnet:
loss_dict['stopnet_loss'].backward()
optimizer_st, _ = adam_weight_decay(optimizer_st)
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
optimizer_st.step()
else:
grad_norm_st = 0
step_time = time.time() - start_time
epoch_time += step_time
# aggregate losses from processes
if num_gpus > 1:
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
update_train_values['avg_loader_time'] = loader_time
update_train_values['avg_step_time'] = step_time
keep_avg.update_values(update_train_values)
# print training progress
if global_step % c.print_step == 0:
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
avg_spec_length, avg_text_length,
step_time, loader_time, current_lr,
loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Plot Training Iter Stats
# reduce TB load
if global_step % c.tb_plot_step == 0:
iter_stats = {
"lr": current_lr,
"grad_norm": grad_norm,
"grad_norm_st": grad_norm_st,
"step_time": step_time
}
iter_stats.update(loss_dict)
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if global_step % c.save_step == 0:
if c.checkpoint:
# save model
save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
optimizer_st=optimizer_st,
model_loss=loss_dict['postnet_loss'])
# Diagnostic visualizations
const_spec = postnet_output[0].data.cpu().numpy()
gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img),
}
if c.bidirectional_decoder or c.double_decoder_consistency:
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())
tb_logger.tb_train_figures(global_step, figures)
# Sample audio
if c.model in ["Tacotron", "TacotronGST"]:
train_audio = ap.inv_spectrogram(const_spec.T)
else:
train_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio},
c.audio["sample_rate"])
end_time = time.time()
# print epoch stats
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
# Plot Epoch Stats
if args.rank == 0:
epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(keep_avg.avg_values)
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, global_step)
return keep_avg.avg_values, global_step
@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
data_loader = setup_loader(ap, model.decoder.r, is_val=True)
model.eval()
epoch_time = 0
keep_avg = KeepAverage()
c_logger.print_eval_start()
if data_loader is not None:
for num_iter, data in enumerate(data_loader):
start_time = time.time()
# format data
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
assert mel_input.shape[1] % model.decoder.r == 0
# forward pass model
if c.bidirectional_decoder or c.double_decoder_consistency:
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
else:
decoder_output, postnet_output, alignments, stop_tokens = model(
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
decoder_backward_output = None
alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
alignment_lengths = mel_lengths // model.decoder.r
# compute loss
loss_dict = criterion(postnet_output, decoder_output, mel_input,
linear_input, stop_tokens, stop_targets,
mel_lengths, decoder_backward_output,
alignments, alignment_lengths, alignments_backward,
text_lengths)
# step time
step_time = time.time() - start_time
epoch_time += step_time
# compute alignment score
align_error = 1 - alignment_diagonal_score(alignments)
loss_dict['align_error'] = align_error
# aggregate losses from processes
if num_gpus > 1:
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
if c.stopnet:
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
# detach loss values
loss_dict_new = dict()
for key, value in loss_dict.items():
if isinstance(value, (int, float)):
loss_dict_new[key] = value
else:
loss_dict_new[key] = value.item()
loss_dict = loss_dict_new
# update avg stats
update_train_values = dict()
for key, value in loss_dict.items():
update_train_values['avg_' + key] = value
keep_avg.update_values(update_train_values)
if c.print_eval:
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
if args.rank == 0:
# Diagnostic visualizations
idx = np.random.randint(mel_input.shape[0])
const_spec = postnet_output[idx].data.cpu().numpy()
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
"Tacotron", "TacotronGST"
] else mel_input[idx].data.cpu().numpy()
align_img = alignments[idx].data.cpu().numpy()
eval_figures = {
"prediction": plot_spectrogram(const_spec, ap),
"ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img)
}
# Sample audio
if c.model in ["Tacotron", "TacotronGST"]:
eval_audio = ap.inv_spectrogram(const_spec.T)
else:
eval_audio = ap.inv_melspectrogram(const_spec.T)
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
c.audio["sample_rate"])
# Plot Validation Stats
if c.bidirectional_decoder or c.double_decoder_consistency:
align_b_img = alignments_backward[idx].data.cpu().numpy()
eval_figures['alignment2'] = plot_alignment(align_b_img)
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
tb_logger.tb_eval_figures(global_step, eval_figures)
if args.rank == 0 and epoch > c.test_delay_epochs:
if c.test_sentences_file is None:
test_sentences = [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963."
]
else:
with open(c.test_sentences_file, "r") as f:
test_sentences = [s.strip() for s in f.readlines()]
# test sentences
test_audios = {}
test_figures = {}
print(" | > Synthesizing test sentences")
speaker_id = 0 if c.use_speaker_embedding else None
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
model,
test_sentence,
c,
use_cuda,
ap,
speaker_id=speaker_id,
style_wav=style_wav,
truncated=False,
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
use_griffin_lim=True,
do_trim_silence=False)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx))
ap.save_wav(wav, file_path)
test_audios['{}-audio'.format(idx)] = wav
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
postnet_output, ap)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment)
except:
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios,
c.audio['sample_rate'])
tb_logger.tb_test_figures(global_step, test_figures)
return keep_avg.avg_values
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train, meta_data_eval, symbols, phonemes
# Audio processor
ap = AudioProcessor(**c.audio)
if 'characters' in c.keys():
symbols, phonemes = make_symbols(**c.characters)
# DISTRIBUTED
if num_gpus > 1:
init_distributed(args.rank, num_gpus, args.group_id,
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
# parse speakers
if c.use_speaker_embedding:
speakers = get_speakers(meta_data_train)
if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
assert all([speaker in speaker_mapping
for speaker in speakers]), "As of now you, you cannot " \
"introduce new speakers to " \
"a previously trained model."
else:
speaker_mapping = {name: i for i, name in enumerate(speakers)}
save_speaker_mapping(OUT_PATH, speaker_mapping)
num_speakers = len(speaker_mapping)
print("Training with {} speakers: {}".format(num_speakers,
", ".join(speakers)))
else:
num_speakers = 0
model = setup_model(num_chars, num_speakers, c)
params = set_weight_decay(model, c.wd)
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
if c.stopnet and c.separate_stopnet:
optimizer_st = RAdam(model.decoder.stopnet.parameters(),
lr=c.lr,
weight_decay=0)
else:
optimizer_st = None
# setup criterion
criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
if args.restore_path:
checkpoint = torch.load(args.restore_path, map_location='cpu')
try:
# TODO: fix optimizer init, model.cuda() needs to be called before
# optimizer restore
# optimizer.load_state_dict(checkpoint['optimizer'])
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except:
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group['lr'] = c.lr
print(" > Model restored from step %d" % checkpoint['step'],
flush=True)
args.restore_step = checkpoint['step']
else:
args.restore_step = 0
if use_cuda:
model.cuda()
criterion.cuda()
# DISTRIBUTED
if num_gpus > 1:
model = apply_gradient_allreduce(model)
if c.noam_schedule:
scheduler = NoamLR(optimizer,
warmup_steps=c.warmup_steps,
last_epoch=args.restore_step - 1)
else:
scheduler = None
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
if 'best_loss' not in locals():
best_loss = float('inf')
global_step = args.restore_step
for epoch in range(0, c.epochs):
c_logger.print_epoch_start(epoch, c.epochs)
# set gradual training
if c.gradual_training is not None:
r, c.batch_size = gradual_training_scheduler(global_step, c)
c.r = r
model.decoder.set_r(r)
if c.bidirectional_decoder:
model.decoder_backward.set_r(r)
print("\n > Number of output frames:", model.decoder.r)
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
optimizer_st, scheduler, ap,
global_step, epoch)
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
target_loss = train_avg_loss_dict['avg_postnet_loss']
if c.run_eval:
target_loss = eval_avg_loss_dict['avg_postnet_loss']
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
OUT_PATH)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--continue_path',
type=str,
help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
default='',
required='--config_path' not in sys.argv)
parser.add_argument(
'--restore_path',
type=str,
help='Model file to be restored. Use to finetune a model.',
default='')
parser.add_argument(
'--config_path',
type=str,
help='Path to config file for training.',
required='--continue_path' not in sys.argv
)
parser.add_argument('--debug',
type=bool,
default=False,
help='Do not verify commit integrity to run training.')
# DISTRIBUTED
parser.add_argument(
'--rank',
type=int,
default=0,
help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
type=str,
default="",
help='DISTRIBUTED: process group id.')
args = parser.parse_args()
if args.continue_path != '':
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, 'config.json')
list_of_files = glob.glob(args.continue_path + "/*.pth.tar")  # all checkpoint files in the folder
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")
# setup output paths and read configs
c = load_config(args.config_path)
check_config(c)
_ = os.path.dirname(os.path.realpath(__file__))
OUT_PATH = args.continue_path
if args.continue_path == '':
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
c_logger = ConsoleLogger()
if args.rank == 0:
os.makedirs(AUDIO_PATH, exist_ok=True)
new_fields = {}
if args.restore_path:
new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path,
os.path.join(OUT_PATH, 'config.json'), new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)
LOG_DIR = OUT_PATH
tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
# write model desc to tensorboard
tb_logger.tb_add_text('model-description', c['run_description'], 0)
try:
main(args)
except KeyboardInterrupt:
remove_experiment_folder(OUT_PATH)
try:
sys.exit(0)
except SystemExit:
os._exit(0) # pylint: disable=protected-access
except Exception: # pylint: disable=broad-except
remove_experiment_folder(OUT_PATH)
traceback.print_exc()
sys.exit(1)

View File

@ -1,29 +0,0 @@
This folder contains a symlink called TTS to the parent folder:
lrwxr-xr-x TTS -> ..
This is used to appease the distribute/setuptools gods. When the project was
initially set up, the repository folder itself was considered a namespace, and
development was done with `sys.path` hacks. This means if you tried to install
TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of
`TTS.models`, `TTS.utils`...
Installing TTS would then pollute the package namespace with generic names like
those above. In order to make things installable in both install and development
modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed
to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect
using `package_dir` in `setup.py` is not enough because it breaks the editable
installation, which can only handle the simplest of `package_dir` redirects.
Our solution is to use a symlink in order to add the extra `TTS` namespace. In
`setup.py`, we only look for packages inside `tts_namespace` (this folder),
which contains a symlink called TTS pointing to the repository root. The final
result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`...
With this hack, `pip install -e` will then add a symlink to the `tts_namespace`
in your `site-packages` folder, which works properly. It's important not to add
anything else in this folder because it will pollute the package namespace when
installing the project.
This does not work if you check out your project on a filesystem that does not
support symlinks.
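For concreteness, here is a minimal sketch of the `setup.py` arrangement described above; the version string and other metadata are placeholders rather than the project's actual values:

import setuptools

setuptools.setup(
    name="TTS",
    version="0.0.0",  # placeholder version for this sketch
    # look for packages only inside tts_namespace/, which holds the TTS -> .. symlink,
    # so find_packages() reports TTS.models, TTS.utils, ... instead of models, utils, ...
    packages=setuptools.find_packages("tts_namespace"),
    # map the package root onto that folder for regular (non-editable) installs
    package_dir={"": "tts_namespace"},
)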

View File

@ -1 +0,0 @@
..

Binary file not shown.

Binary file not shown.

View File

View File

@ -1,356 +0,0 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
from TTS.utils.data import StandardScaler
class AudioProcessor(object):
def __init__(self,
sample_rate=None,
num_mels=None,
min_level_db=None,
frame_shift_ms=None,
frame_length_ms=None,
hop_length=None,
win_length=None,
ref_level_db=None,
fft_size=1024,
power=None,
preemphasis=0.0,
signal_norm=None,
symmetric_norm=None,
max_norm=None,
mel_fmin=None,
mel_fmax=None,
spec_gain=20,
stft_pad_mode='reflect',
clip_norm=True,
griffin_lim_iters=None,
do_trim_silence=False,
trim_db=60,
do_sound_norm=False,
stats_path=None,
**_):
print(" > Setting up Audio Processor...")
# set up class attributes
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db or 0
self.frame_shift_ms = frame_shift_ms
self.frame_length_ms = frame_length_ms
self.ref_level_db = ref_level_db
self.fft_size = fft_size
self.power = power
self.preemphasis = preemphasis
self.griffin_lim_iters = griffin_lim_iters
self.signal_norm = signal_norm
self.symmetric_norm = symmetric_norm
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain)
self.stft_pad_mode = 'reflect'
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.trim_db = trim_db
self.do_sound_norm = do_sound_norm
self.stats_path = stats_path
# setup stft parameters
if hop_length is None:
# compute stft parameters from given time values
self.hop_length, self.win_length = self._stft_parameters()
else:
# use stft parameters from config file
self.hop_length = hop_length
self.win_length = win_length
assert min_level_db != 0.0, " [!] min_level_db is 0"
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
members = vars(self)
for key, value in members.items():
print(" | > {}:{}".format(key, value))
# create spectrogram utils
self.mel_basis = self._build_mel_basis()
self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
# setup scaler
if stats_path:
mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
self.signal_norm = True
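# mean-var scaling replaces range normalization, so the range-norm parameters below are disabled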
self.max_norm = None
self.clip_norm = None
self.symmetric_norm = None
### setting up the parameters ###
def _build_mel_basis(self, ):
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.fft_size,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _stft_parameters(self, ):
"""Compute necessary stft parameters with given time values"""
factor = self.frame_length_ms / self.frame_shift_ms
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
win_length = int(hop_length * factor)
return hop_length, win_length
### normalization ###
def _normalize(self, S):
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
#pylint: disable=no-else-return
S = S.copy()
if self.signal_norm:
# mean-var scaling
if hasattr(self, 'mel_scaler'):
if S.shape[0] == self.num_mels:
return self.mel_scaler.transform(S.T).T
elif S.shape[0] == self.fft_size / 2:
return self.linear_scaler.transform(S.T).T
else:
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
# range normalization
S -= self.ref_level_db # discard certain range of DB assuming it is air noise
S_norm = ((S - self.min_level_db) / (-self.min_level_db))
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
return S_norm
else:
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
return S_norm
else:
return S
def _denormalize(self, S):
"""denormalize values"""
#pylint: disable=no-else-return
S_denorm = S.copy()
if self.signal_norm:
# mean-var scaling
if hasattr(self, 'mel_scaler'):
if S_denorm.shape[0] == self.num_mels:
return self.mel_scaler.inverse_transform(S_denorm.T).T
elif S_denorm.shape[0] == self.fft_size / 2:
return self.linear_scaler.inverse_transform(S_denorm.T).T
else:
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
return S_denorm + self.ref_level_db
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = (S_denorm * -self.min_level_db /
self.max_norm) + self.min_level_db
return S_denorm + self.ref_level_db
else:
return S_denorm
### Mean-STD scaling ###
def load_stats(self, stats_path):
stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg
mel_mean = stats['mel_mean']
mel_std = stats['mel_std']
linear_mean = stats['linear_mean']
linear_std = stats['linear_std']
stats_config = stats['audio_config']
# check all audio parameters used for computing stats
skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power']
for key in stats_config.keys():
if key in skip_parameters:
continue
assert stats_config[key] == self.__dict__[key],\
f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
return mel_mean, mel_std, linear_mean, linear_std, stats_config
# pylint: disable=attribute-defined-outside-init
def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std):
self.mel_scaler = StandardScaler()
self.mel_scaler.set_stats(mel_mean, mel_std)
self.linear_scaler = StandardScaler()
self.linear_scaler.set_stats(linear_mean, linear_std)
### DB and AMP conversion ###
# pylint: disable=no-self-use
def _amp_to_db(self, x):
return self.spec_gain * np.log10(np.maximum(1e-5, x))
# pylint: disable=no-self-use
def _db_to_amp(self, x):
return np.power(10.0, x / self.spec_gain)
### Preemphasis ###
def apply_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" [!] Preemphasis is set 0.0.")
return scipy.signal.lfilter([1, -self.preemphasis], [1], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0:
raise RuntimeError(" [!] Preemphasis is set 0.0.")
return scipy.signal.lfilter([1], [1, -self.preemphasis], x)
### SPECTROGRAMs ###
def _linear_to_mel(self, spectrogram):
return np.dot(self.mel_basis, spectrogram)
def _mel_to_linear(self, mel_spec):
return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))
def spectrogram(self, y):
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amp_to_db(np.abs(D))
return self._normalize(S)
def melspectrogram(self, y):
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
"""Converts spectrogram to waveform using librosa"""
S = self._denormalize(spectrogram)
S = self._db_to_amp(S)
# Reconstruct phase
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
'''Converts melspectrogram to waveform using librosa'''
D = self._denormalize(mel_spectrogram)
S = self._db_to_amp(D)
S = self._mel_to_linear(S) # Convert back to linear
if self.preemphasis != 0:
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
S = self._denormalize(linear_spec)
S = self._db_to_amp(S)
S = self._linear_to_mel(np.abs(S))
S = self._amp_to_db(S)
mel = self._normalize(S)
return mel
### STFT and ISTFT ###
def _stft(self, y):
return librosa.stft(
y=y,
n_fft=self.fft_size,
hop_length=self.hop_length,
win_length=self.win_length,
pad_mode=self.stft_pad_mode,
)
def _istft(self, y):
return librosa.istft(
y, hop_length=self.hop_length, win_length=self.win_length)
def _griffin_lim(self, S):
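# Griffin-Lim: start from a random phase and iteratively refine it while keeping the magnitude spectrogram fixed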
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(complex)
y = self._istft(S_complex * angles)
for _ in range(self.griffin_lim_iters):
angles = np.exp(1j * np.angle(self._stft(y)))
y = self._istft(S_complex * angles)
return y
def compute_stft_paddings(self, x, pad_sides=1):
'''compute right padding (final frame) or both sides padding (first and final frames)
'''
assert pad_sides in (1, 2)
pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
if pad_sides == 1:
return 0, pad
return pad // 2, pad // 2 + pad % 2
### Audio Processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = self._db_to_amp(threshold_db)
for x in range(hop_length, len(wav) - window_length, hop_length):
if np.max(wav[x:x + window_length]) < threshold:
return x + hop_length
return len(wav)
def trim_silence(self, wav):
""" Trim silent parts with a threshold and 0.01 sec margin """
margin = int(self.sample_rate * 0.01)
wav = wav[margin:-margin]
return librosa.effects.trim(
wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
@staticmethod
def sound_norm(x):
return x / abs(x).max() * 0.9
### save and load ###
def load_wav(self, filename, sr=None):
if sr is None:
x, sr = sf.read(filename)
else:
x, sr = librosa.load(filename, sr=sr)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(f' [!] File cannot be trimmed for silence - {filename}')
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
if self.do_sound_norm:
x = self.sound_norm(x)
return x
def save_wav(self, wav, path):
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
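# mu-law companding: y = sign(x) * ln(1 + mu*|x|) / ln(1 + mu), then mapped to integer levels in [0, mu]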
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal,)
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2 ** qc - 1
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
return x
@staticmethod
def encode_16bits(x):
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
@staticmethod
def quantize(x, bits):
return (x + 1.) * (2**bits - 1) / 2
@staticmethod
def dequantize(x, bits):
return 2 * x / (2**bits - 1) - 1

View File

@ -1,95 +0,0 @@
import datetime
from TTS.utils.io import AttrDict
tcolors = AttrDict({
'OKBLUE': '\033[94m',
'HEADER': '\033[95m',
'OKGREEN': '\033[92m',
'WARNING': '\033[93m',
'FAIL': '\033[91m',
'ENDC': '\033[0m',
'BOLD': '\033[1m',
'UNDERLINE': '\033[4m'
})
class ConsoleLogger():
def __init__(self):
# TODO: color code for value changes
# use these to compare values between iterations
self.old_train_loss_dict = None
self.old_epoch_loss_dict = None
self.old_eval_loss_dict = None
# pylint: disable=no-self-use
def get_time(self):
now = datetime.datetime.now()
return now.strftime("%Y-%m-%d %H:%M:%S")
def print_epoch_start(self, epoch, max_epoch):
print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
epoch, max_epoch, tcolors.ENDC),
flush=True)
def print_train_start(self):
print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
def print_train_step(self, batch_steps, step, global_step, avg_spec_length,
avg_text_length, step_time, loader_time, lr,
loss_dict, avg_loss_dict):
indent = " | > "
print()
log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
for key, value in loss_dict.items():
# print the avg value if given
if f'avg_{key}' in avg_loss_dict.keys():
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\
f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
print(log_text, flush=True)
# pylint: disable=unused-argument
def print_train_epoch_end(self, global_step, epoch, epoch_time,
print_dict):
indent = " | > "
log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
for key, value in print_dict.items():
log_text += "{}{}: {:.5f}\n".format(indent, key, value)
print(log_text, flush=True)
def print_eval_start(self):
print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")
def print_eval_step(self, step, loss_dict, avg_loss_dict):
indent = " | > "
print()
log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
for key, value in loss_dict.items():
# print the avg value if given
if f'avg_{key}' in avg_loss_dict.keys():
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
print(log_text, flush=True)
def print_epoch_end(self, epoch, avg_loss_dict):
indent = " | > "
log_text = " {}--> EVAL PERFORMANCE{}\n".format(
tcolors.BOLD, tcolors.ENDC)
for key, value in avg_loss_dict.items():
# print the avg value if given
color = tcolors.FAIL
sign = '+'
diff = 0
if self.old_eval_loss_dict is not None:
diff = value - self.old_eval_loss_dict[key]
if diff <= 0:
color = tcolors.OKGREEN
sign = ''
log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
self.old_eval_loss_dict = avg_loss_dict
print(log_text, flush=True)

View File

@ -1,77 +0,0 @@
import numpy as np
def _pad_data(x, length):
_pad = 0
assert x.ndim == 1
return np.pad(
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
def prepare_data(inputs):
max_len = max((len(x) for x in inputs))
return np.stack([_pad_data(x, max_len) for x in inputs])
def _pad_tensor(x, length):
_pad = 0.
assert x.ndim == 2
x = np.pad(
x, [[0, 0], [0, length - x.shape[1]]],
mode='constant',
constant_values=_pad)
return x
def prepare_tensor(inputs, out_steps):
max_len = max((x.shape[1] for x in inputs))
remainder = max_len % out_steps
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
def _pad_stop_target(x, length):
_pad = 0.
assert x.ndim == 1
return np.pad(
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
def prepare_stop_target(inputs, out_steps):
""" Pad row vectors with 1. """
max_len = max((x.shape[0] for x in inputs))
remainder = max_len % out_steps
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
def pad_per_step(inputs, pad_len):
return np.pad(
inputs, [[0, 0], [0, 0], [0, pad_len]],
mode='constant',
constant_values=0.0)
# pylint: disable=attribute-defined-outside-init
class StandardScaler():
def set_stats(self, mean, scale):
self.mean_ = mean
self.scale_ = scale
def reset_stats(self):
delattr(self, 'mean_')
delattr(self, 'scale_')
def transform(self, X):
X = np.asarray(X)
X -= self.mean_
X /= self.scale_
return X
def inverse_transform(self, X):
X = np.asarray(X)
X *= self.scale_
X += self.mean_
return X

View File

@ -1,362 +0,0 @@
import os
import glob
import torch
import shutil
import datetime
import subprocess
import importlib
import numpy as np
from collections import Counter
def get_git_branch():
try:
out = subprocess.check_output(["git", "branch"]).decode("utf8")
current = next(line for line in out.split("\n")
if line.startswith("*"))
current.replace("* ", "")
except subprocess.CalledProcessError:
current = "inside_docker"
return current
def get_commit_hash():
"""https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
# try:
# subprocess.check_output(['git', 'diff-index', '--quiet',
# 'HEAD']) # Verify client is clean
# except:
# raise RuntimeError(
# " !! Commit before training to get the commit hash.")
try:
commit = subprocess.check_output(
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
# Not copying .git folder into docker container
except subprocess.CalledProcessError:
commit = "0000000"
print(' > Git Hash: {}'.format(commit))
return commit
def create_experiment_folder(root_path, model_name, debug):
""" Create a folder with the current date and time """
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
if debug:
commit_hash = 'debug'
else:
commit_hash = get_commit_hash()
output_folder = os.path.join(
root_path, model_name + '-' + date_str + '-' + commit_hash)
os.makedirs(output_folder, exist_ok=True)
print(" > Experiment folder: {}".format(output_folder))
return output_folder
def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder"""
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
if not checkpoint_files:
if os.path.exists(experiment_path):
shutil.rmtree(experiment_path, ignore_errors=True)
print(" ! Run is removed from {}".format(experiment_path))
else:
print(" ! Run is kept in {}".format(experiment_path))
def count_parameters(model):
r"""Count number of trainable parameters in a network"""
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def split_dataset(items):
is_multi_speaker = False
speakers = [item[-1] for item in items]
is_multi_speaker = len(set(speakers)) > 1
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
len(items) * 0.01)
assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
np.random.seed(0)
np.random.shuffle(items)
if is_multi_speaker:
items_eval = []
# most stupid code ever -- Fix it !
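# move an item to eval only if its speaker still has more than one sample left in the training split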
while len(items_eval) < eval_split_size:
speakers = [item[-1] for item in items]
speaker_counter = Counter(speakers)
item_idx = np.random.randint(0, len(items))
if speaker_counter[items[item_idx][-1]] > 1:
items_eval.append(items[item_idx])
del items[item_idx]
return items_eval, items
return items[:eval_split_size], items[eval_split_size:]
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
if max_len is None:
max_len = sequence_length.data.max()
batch_size = sequence_length.size(0)
seq_range = torch.arange(0, max_len).long()
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
if sequence_length.is_cuda:
seq_range_expand = seq_range_expand.to(sequence_length.device)
seq_length_expand = (
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
# B x T_max
return seq_range_expand < seq_length_expand
def set_init_dict(model_dict, checkpoint_state, c):
# Partial initialization: layers that mismatch between the new and old model are skipped.
for k, v in checkpoint_state.items():
if k not in model_dict:
print(" | > Layer missing in the model definition: {}".format(k))
# 1. filter out unnecessary keys
pretrained_dict = {
k: v
for k, v in checkpoint_state.items() if k in model_dict
}
# 2. filter out different size layers
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if v.numel() == model_dict[k].numel()
}
# 3. skip reinit layers
if c.reinit_layers is not None:
for reinit_layer_name in c.reinit_layers:
pretrained_dict = {
k: v
for k, v in pretrained_dict.items()
if reinit_layer_name not in k
}
# 4. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
len(model_dict)))
return model_dict
def setup_model(num_chars, num_speakers, c):
print(" > Using model: {}".format(c.model))
MyModel = importlib.import_module('TTS.models.' + c.model.lower())
MyModel = getattr(MyModel, c.model)
if c.model.lower() in "tacotron":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
memory_size=c.memory_size,
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
elif c.model.lower() == "tacotron2":
model = MyModel(num_chars=num_chars,
num_speakers=num_speakers,
r=c.r,
postnet_output_dim=c.audio['num_mels'],
decoder_output_dim=c.audio['num_mels'],
gst=c.use_gst,
attn_type=c.attention_type,
attn_win=c.windowing,
attn_norm=c.attention_norm,
prenet_type=c.prenet_type,
prenet_dropout=c.prenet_dropout,
forward_attn=c.use_forward_attn,
trans_agent=c.transition_agent,
forward_attn_mask=c.forward_attn_mask,
location_attn=c.location_attn,
attn_K=c.attention_heads,
separate_stopnet=c.separate_stopnet,
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r)
return model
class KeepAverage():
def __init__(self):
self.avg_values = {}
self.iters = {}
def __getitem__(self, key):
return self.avg_values[key]
def items(self):
return self.avg_values.items()
def add_value(self, name, init_val=0, init_iter=0):
self.avg_values[name] = init_val
self.iters[name] = init_iter
def update_value(self, name, value, weighted_avg=False):
if name not in self.avg_values:
# add value if not exist before
self.add_value(name, init_val=value)
else:
# else update existing value
if weighted_avg:
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
self.iters[name] += 1
else:
self.avg_values[name] = self.avg_values[name] * \
self.iters[name] + value
self.iters[name] += 1
self.avg_values[name] /= self.iters[name]
def add_values(self, name_dict):
for key, value in name_dict.items():
self.add_value(key, init_val=value)
def update_values(self, value_dict):
for key, value in value_dict.items():
self.update_value(key, value)
def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
if alternative in c.keys() and c[alternative] is not None:
return
if restricted:
assert name in c.keys(), f' [!] {name} not defined in config.json'
if name in c.keys():
if max_val:
assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
if min_val:
assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
if enum_list:
assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
if val_type:
assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'

def check_config(c):
    _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
    _check_argument('run_name', c, restricted=True, val_type=str)
    _check_argument('run_description', c, val_type=str)
    # AUDIO
    _check_argument('audio', c, restricted=True, val_type=dict)
    # audio processing parameters
    _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
    _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
    _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
    _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
    _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
    _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
    _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
    # vocabulary parameters
    _check_argument('characters', c, restricted=False, val_type=dict)
    _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    # normalization parameters
    _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
    _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
    _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
    _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
    _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
    _check_argument('trim_db', c['audio'], restricted=True, val_type=int)
    # training parameters
    _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
    _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
    _check_argument('r', c, restricted=True, val_type=int, min_val=1)
    _check_argument('gradual_training', c, restricted=False, val_type=list)
    _check_argument('loss_masking', c, restricted=True, val_type=bool)
    # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)
    # validation parameters
    _check_argument('run_eval', c, restricted=True, val_type=bool)
    _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
    _check_argument('test_sentences_file', c, restricted=False, val_type=str)
    # optimizer
    _check_argument('noam_schedule', c, restricted=False, val_type=bool)
    _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
    _check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
    _check_argument('lr', c, restricted=True, val_type=float, min_val=0)
    _check_argument('wd', c, restricted=True, val_type=float, min_val=0)
    _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
    _check_argument('seq_len_norm', c, restricted=True, val_type=bool)
    # tacotron prenet
    _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
    _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
    _check_argument('prenet_dropout', c, restricted=True, val_type=bool)
    # attention
    _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
    _check_argument('attention_heads', c, restricted=True, val_type=int)
    _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
    _check_argument('windowing', c, restricted=True, val_type=bool)
    _check_argument('use_forward_attn', c, restricted=True, val_type=bool)
    _check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
    _check_argument('transition_agent', c, restricted=True, val_type=bool)
    _check_argument('location_attn', c, restricted=True, val_type=bool)
    _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
    _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
    _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)
    # stopnet
    _check_argument('stopnet', c, restricted=True, val_type=bool)
    _check_argument('separate_stopnet', c, restricted=True, val_type=bool)
    # tensorboard
    _check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('checkpoint', c, restricted=True, val_type=bool)
    _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
    # dataloading
    # pylint: disable=import-outside-toplevel
    from TTS.utils.text import cleaners
    _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
    _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
    _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
    _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
    _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
    _check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
    _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)
    # paths
    _check_argument('output_path', c, restricted=True, val_type=str)
    # multi-speaker gst
    _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
    _check_argument('style_wav_for_test', c, restricted=True, val_type=str)
    _check_argument('use_gst', c, restricted=True, val_type=bool)
    # datasets - check all entries
    _check_argument('datasets', c, restricted=True, val_type=list)
    for dataset_entry in c['datasets']:
        _check_argument('name', dataset_entry, restricted=True, val_type=str)
        _check_argument('path', dataset_entry, restricted=True, val_type=str)
        _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
        _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
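
For orientation, a minimal usage sketch of this validator. The import paths below are assumptions (this refactor moves modules around) and config.json is a placeholder; the pattern is simply load-then-validate, with the first violated assert aborting.

# Import paths are assumptions for illustration only; adjust to wherever the
# refactor places load_config and check_config.
from TTS.utils.io import load_config
from TTS.tts.utils.generic_utils import check_config

c = load_config('config.json')   # placeholder config path
try:
    check_config(c)
    print(' > config is valid')
except (AssertionError, KeyError) as err:
    print(f' > invalid config: {err}')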

View File

@ -1,78 +0,0 @@
import os
import json
import re
import torch
import datetime


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config
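
As a quick illustration of what load_config tolerates, a hedged sketch with a made-up config string (it assumes load_config is in scope): the regexes above strip backslash line continuations and //-style comments before the JSON parse, and AttrDict exposes the keys as attributes.

import tempfile

# made-up config text with a // comment, which load_config strips out
raw = '{\n  "run_name": "demo",  // experiment tag\n  "lr": 0.0001\n}\n'
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    f.write(raw)

c = load_config(f.name)
print(c.run_name, c['lr'])  # attribute access and key access both work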

def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()
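
A small sketch of the intended call, assuming copy_config_file is in scope; the paths and fields are made up. Each new key/value line is spliced in right after the opening '{' of the copied file.

# placeholder paths and fields, purely for illustration
new_fields = {'github_branch': 'dev', 'restore_step': 1000}
copy_config_file('config.json', 'outputs/config.json', new_fields)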

def load_checkpoint(model, checkpoint_path, use_cuda=False):
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    if use_cuda:
        model.cuda()
    # set model stepsize
    if 'r' in state.keys():
        model.decoder.set_r(state['r'])
    return model, state


def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
    new_state_dict = model.state_dict()
    state = {
        'model': new_state_dict,
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'step': current_step,
        'epoch': epoch,
        'date': datetime.date.today().strftime("%B %d, %Y"),
        'r': r
    }
    state.update(kwargs)
    torch.save(state, output_path)


def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
    file_name = 'checkpoint_{}.pth.tar'.format(current_step)
    checkpoint_path = os.path.join(output_folder, file_name)
    print(" > CHECKPOINT : {}".format(checkpoint_path))
    save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs)


def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
    if target_loss < best_loss:
        file_name = 'best_model.pth.tar'
        checkpoint_path = os.path.join(output_folder, file_name)
        print(" > BEST MODEL : {}".format(checkpoint_path))
        save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
        best_loss = target_loss
    return best_loss
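
To show how these helpers fit together, a hedged end-to-end sketch with throwaway stand-ins (assuming the functions above are in scope); a real trainer passes a Tacotron model whose decoder exposes set_r, and the paths are illustrative only.

import torch


class _DummyDecoder:
    def set_r(self, r):
        self.r = r


class _DummyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 4)
        self.decoder = _DummyDecoder()   # stand-in for a Tacotron decoder


model = _DummyModel()
optimizer = torch.optim.Adam(model.parameters())

save_checkpoint(model, optimizer, current_step=1000, epoch=3, r=2,
                output_folder='.')            # writes ./checkpoint_1000.pth.tar
best = save_best_model(0.41, float('inf'), model, optimizer,
                       current_step=1000, epoch=3, r=2,
                       output_folder='.')     # writes ./best_model.pth.tar
model, state = load_checkpoint(model, 'checkpoint_1000.pth.tar')
print(state['step'], state['r'], best)        # 1000 2 0.41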

View File

@ -1,18 +0,0 @@
import torch


def alignment_diagonal_score(alignments, binary=False):
    """Compute how diagonal the alignment predictions are. Useful for
    measuring the alignment consistency of a model.

    Args:
        alignments (torch.Tensor): batch of alignment maps.
        binary (bool): if True, ignore the attention weights and treat the
            alignment as a binary mask.

    Shape:
        alignments: batch x decoder_steps x encoder_steps
    """
    maxs = alignments.max(dim=1)[0]
    if binary:
        maxs[maxs > 0] = 1
    return maxs.mean(dim=1).mean(dim=0).item()
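
A quick sanity check on synthetic attention maps, assuming the function above is in scope: a perfectly diagonal alignment scores 1.0, uniform attention scores 1/encoder_steps, and binary mode only asks whether any weight was placed at all.

import torch

diagonal = torch.eye(4).unsqueeze(0)       # 1 x 4 decoder steps x 4 encoder steps
uniform = torch.full((1, 4, 4), 0.25)      # attention spread evenly

print(alignment_diagonal_score(diagonal))              # 1.0
print(alignment_diagonal_score(uniform))               # 0.25
print(alignment_diagonal_score(uniform, binary=True))  # 1.0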

Some files were not shown because too many files have changed in this diff.