mirror of https://github.com/coqui-ai/TTS.git
Mass refactoring
commit 82dd465365
parent 3bc38517aa
@@ -17,5 +17,9 @@ matrix:
      python: "3.6"
      install: pip install --quiet -r requirements_tests.txt
      env: TEST_SUITE="unittest"
    - name: "Unit tests"
      python: "3.6"
      install: pip install --quiet -r requirements_tests.txt
      env: TEST_SUITE="testscripts"

script: ./.travis/script
@@ -14,9 +14,15 @@ if [[ "$TEST_SUITE" == "unittest" ]]; then
    pushd tts_namespace
    nosetests TTS.speaker_encoder.tests --nocapture
    nosetests TTS.vocoder.tests --nocapture
    nosetests TTS.tests --nocapture
    nosetests TTS.tf.tests --nocapture
    nosetests TTS.tts.tests --nocapture
    nosetests TTS.tts.tf.tests --nocapture
    popd
    # Test server package
    ./tests/test_server_package.sh
fi

if [[ "$TEST_SUITE" == "testscripts" ]]; then
    # Test server package
    ./tts/tests/test_server_package.sh
    # test model training scripts
    ./tts/tests/test_tts_train.sh
    ./vocoder/tests/test_vocoder_train.sh
fi
@@ -1,85 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import argparse

import numpy as np
from tqdm import tqdm

from TTS.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="TTS config file path to define audio processing parameters.")
    parser.add_argument("--out_path", default=None, type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats['mel_mean'] = mel_mean
    stats['mel_std'] = mel_scale
    stats['linear_mean'] = linear_mean
    stats['linear_std'] = linear_scale

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg linear spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values
    del CONFIG.audio['max_norm']
    del CONFIG.audio['min_level_db']
    del CONFIG.audio['symmetric_norm']
    del CONFIG.audio['clip_norm']
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > scale_stats.npy is saved to {output_file_path}')


if __name__ == "__main__":
    main()
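A minimal sketch (not part of the diff) of consuming the saved scale_stats.npy for mean-variance normalization; the file path and the spectrogram are placeholders, and the normalization step mirrors what AudioProcessor does when `stats_path` is set in the audio config.

import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel = np.random.rand(80, 120)  # placeholder [num_mel, T] spectrogram
# subtract the per-bin mean and divide by the per-bin std computed by the script above
mel_norm = (mel - stats["mel_mean"][:, None]) / stats["mel_std"][:, None]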
@@ -1,240 +0,0 @@
import os
import numpy as np
import collections
import torch
import random
from torch.utils.data import Dataset

from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target


class MyDataset(Dataset):
    def __init__(self,
                 outputs_per_step,
                 text_cleaner,
                 compute_linear_spec,
                 ap,
                 meta_data,
                 tp=None,
                 batch_group_size=0,
                 min_seq_len=0,
                 max_seq_len=float("inf"),
                 use_phonemes=True,
                 phoneme_cache_path=None,
                 phoneme_language="en-us",
                 enable_eos_bos=False,
                 verbose=False):
        """
        Args:
            outputs_per_step (int): number of time frames predicted per step.
            text_cleaner (str): text cleaner used for the dataset.
            compute_linear_spec (bool): compute linear spectrogram if True.
            ap (TTS.utils.AudioProcessor): audio processor object.
            meta_data (list): list of dataset instances.
            batch_group_size (int): (0) range of batch randomization after sorting
                sequences by length.
            min_seq_len (int): (0) minimum sequence length to be processed
                by the loader.
            max_seq_len (int): (float("inf")) maximum sequence length.
            use_phonemes (bool): (true) if true, text converted to phonemes.
            phoneme_cache_path (str): path to cache phoneme features.
            phoneme_language (str): one of the languages from
                https://github.com/bootphon/phonemizer#languages
            enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
            verbose (bool): print diagnostic information.
        """
        self.batch_group_size = batch_group_size
        self.items = meta_data
        self.outputs_per_step = outputs_per_step
        self.sample_rate = ap.sample_rate
        self.cleaners = text_cleaner
        self.compute_linear_spec = compute_linear_spec
        self.min_seq_len = min_seq_len
        self.max_seq_len = max_seq_len
        self.ap = ap
        self.tp = tp
        self.use_phonemes = use_phonemes
        self.phoneme_cache_path = phoneme_cache_path
        self.phoneme_language = phoneme_language
        self.enable_eos_bos = enable_eos_bos
        self.verbose = verbose
        if use_phonemes and not os.path.isdir(phoneme_cache_path):
            os.makedirs(phoneme_cache_path, exist_ok=True)
        if self.verbose:
            print("\n > DataLoader initialization")
            print(" | > Use phonemes: {}".format(self.use_phonemes))
            if use_phonemes:
                print(" | > phoneme language: {}".format(phoneme_language))
            print(" | > Number of instances : {}".format(len(self.items)))
        self.sort_items()

    def load_wav(self, filename):
        audio = self.ap.load_wav(filename)
        return audio

    @staticmethod
    def load_np(filename):
        data = np.load(filename).astype('float32')
        return data

    def _generate_and_cache_phoneme_sequence(self, text, cache_path):
        """Generate a phoneme sequence from text.
        Since the usage is for subsequent caching, we never add bos and
        eos chars here. Instead we add those dynamically later, based on the
        config option."""
        phonemes = phoneme_to_sequence(text, [self.cleaners],
                                       language=self.phoneme_language,
                                       enable_eos_bos=False,
                                       tp=self.tp)
        phonemes = np.asarray(phonemes, dtype=np.int32)
        np.save(cache_path, phonemes)
        return phonemes

    def _load_or_generate_phoneme_sequence(self, wav_file, text):
        file_name = os.path.splitext(os.path.basename(wav_file))[0]
        cache_path = os.path.join(self.phoneme_cache_path,
                                  file_name + '_phoneme.npy')
        try:
            phonemes = np.load(cache_path)
        except FileNotFoundError:
            phonemes = self._generate_and_cache_phoneme_sequence(text,
                                                                 cache_path)
        except (ValueError, IOError):
            print(" > ERROR: failed loading phonemes for {}. "
                  "Recomputing.".format(wav_file))
            phonemes = self._generate_and_cache_phoneme_sequence(text,
                                                                 cache_path)
        if self.enable_eos_bos:
            phonemes = pad_with_eos_bos(phonemes, tp=self.tp)
            phonemes = np.asarray(phonemes, dtype=np.int32)
        return phonemes

    def load_data(self, idx):
        text, wav_file, speaker_name = self.items[idx]
        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        if self.use_phonemes:
            text = self._load_or_generate_phoneme_sequence(wav_file, text)
        else:
            text = np.asarray(
                text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32)

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        sample = {
            'text': text,
            'wav': wav,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker_name
        }
        return sample

    def sort_items(self):
        r"""Sort instances based on text length in ascending order"""
        lengths = np.array([len(ins[0]) for ins in self.items])

        idxs = np.argsort(lengths)
        new_items = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = lengths[idx]
            if length < self.min_seq_len or length > self.max_seq_len:
                ignored.append(idx)
            else:
                new_items.append(self.items[idx])
        # shuffle batch groups
        if self.batch_group_size > 0:
            for i in range(len(new_items) // self.batch_group_size):
                offset = i * self.batch_group_size
                end_offset = offset + self.batch_group_size
                temp_items = new_items[offset:end_offset]
                random.shuffle(temp_items)
                new_items[offset:end_offset] = temp_items
        self.items = new_items

        if self.verbose:
            print(" | > Max length sequence: {}".format(np.max(lengths)))
            print(" | > Min length sequence: {}".format(np.min(lengths)))
            print(" | > Avg length sequence: {}".format(np.mean(lengths)))
            print(" | > Num. instances discarded by max-min (max={}, min={}) seq limits: {}".format(
                self.max_seq_len, self.min_seq_len, len(ignored)))
            print(" | > Batch group size: {}.".format(self.batch_group_size))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.load_data(idx)

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:
        1. Sort batch instances by text-length
        2. Convert Audio signal to Spectrograms.
        3. PAD sequences wrt r.
        4. Load to Torch.
        """

        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.Mapping):

            text_lengths = np.array([len(d["text"]) for d in batch])

            # sort items with text input length for RNN efficiency
            text_lengths, ids_sorted_decreasing = torch.sort(
                torch.LongTensor(text_lengths), dim=0, descending=True)

            wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing]
            item_idxs = [
                batch[idx]['item_idx'] for idx in ids_sorted_decreasing
            ]
            text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
            speaker_name = [batch[idx]['speaker_name']
                            for idx in ids_sorted_decreasing]

            # compute features
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]

            mel_lengths = [m.shape[1] for m in mel]

            # compute 'stop token' targets
            stop_targets = [
                np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths
            ]

            # PAD stop targets
            stop_targets = prepare_stop_target(stop_targets,
                                               self.outputs_per_step)

            # PAD sequences with longest instance in the batch
            text = prepare_data(text).astype(np.int32)

            # PAD features with longest instance
            mel = prepare_tensor(mel, self.outputs_per_step)

            # B x D x T --> B x T x D
            mel = mel.transpose(0, 2, 1)

            # convert things to pytorch
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            mel = torch.FloatTensor(mel).contiguous()
            mel_lengths = torch.LongTensor(mel_lengths)
            stop_targets = torch.FloatTensor(stop_targets)

            # compute linear spectrogram
            if self.compute_linear_spec:
                linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
                linear = prepare_tensor(linear, self.outputs_per_step)
                linear = linear.transpose(0, 2, 1)
                assert mel.shape[1] == linear.shape[1]
                linear = torch.FloatTensor(linear).contiguous()
            else:
                linear = None
            return text, text_lengths, speaker_name, linear, mel, mel_lengths, \
                stop_targets, item_idxs

        raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
            found {}".format(type(batch[0]))))
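A minimal sketch (not part of the diff) of wiring MyDataset into a PyTorch DataLoader with its own collate_fn; the AudioProcessor instance, the meta data list, and all parameter values here are assumptions.

from torch.utils.data import DataLoader

dataset = MyDataset(outputs_per_step=2,
                    text_cleaner="phoneme_cleaners",
                    compute_linear_spec=False,
                    ap=ap,                      # an initialized AudioProcessor (assumed)
                    meta_data=meta_data_train,  # e.g. the train split from load_meta_data (assumed)
                    use_phonemes=False)
loader = DataLoader(dataset,
                    batch_size=32,
                    shuffle=False,              # items are already length-sorted by sort_items()
                    collate_fn=dataset.collate_fn,
                    num_workers=4)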
@@ -1,207 +0,0 @@
import os
from glob import glob
import re
import sys
from TTS.utils.generic_utils import split_dataset


def load_meta_data(datasets):
    meta_data_train_all = []
    meta_data_eval_all = []
    for dataset in datasets:
        name = dataset['name']
        root_path = dataset['path']
        meta_file_train = dataset['meta_file_train']
        meta_file_val = dataset['meta_file_val']
        preprocessor = get_preprocessor_by_name(name)

        meta_data_train = preprocessor(root_path, meta_file_train)
        if meta_file_val is None:
            meta_data_eval, meta_data_train = split_dataset(meta_data_train)
        else:
            meta_data_eval = preprocessor(root_path, meta_file_val)
        meta_data_train_all += meta_data_train
        meta_data_eval_all += meta_data_eval
    return meta_data_train_all, meta_data_eval_all


def get_preprocessor_by_name(name):
    """Returns the respective preprocessing function."""
    thismodule = sys.modules[__name__]
    return getattr(thismodule, name.lower())


def tweb(root_path, meta_file):
    """Normalize TWEB dataset.
    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "tweb"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            cols = line.split('\t')
            wav_file = os.path.join(root_path, cols[0] + '.wav')
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items


# def kusal(root_path, meta_file):
#     txt_file = os.path.join(root_path, meta_file)
#     texts = []
#     wavs = []
#     with open(txt_file, "r", encoding="utf8") as f:
#         frames = [
#             line.split('\t') for line in f
#             if line.split('\t')[0] in self.wav_files_dict.keys()
#         ]
#     # TODO: code the rest
#     return {'text': texts, 'wavs': wavs}


def mozilla(root_path, meta_file):
    """Normalizes Mozilla meta data files to TTS format"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "mozilla"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            cols = line.split('|')
            wav_file = cols[1].strip()
            text = cols[0].strip()
            wav_file = os.path.join(root_path, "wavs", wav_file)
            items.append([text, wav_file, speaker_name])
    return items


def mozilla_de(root_path, meta_file):
    """Normalizes Mozilla meta data files to TTS format"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "mozilla"
    with open(txt_file, 'r', encoding="ISO 8859-1") as ttf:
        for line in ttf:
            cols = line.strip().split('|')
            wav_file = cols[0].strip()
            text = cols[1].strip()
            folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
            wav_file = os.path.join(root_path, folder_name, wav_file)
            items.append([text, wav_file, speaker_name])
    return items


def mailabs(root_path, meta_files=None):
    """Normalizes M-AI-Labs meta data files to TTS format"""
    speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
    if meta_files is None:
        csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
    else:
        csv_files = meta_files
        # meta_files = [f.strip() for f in meta_files.split(",")]
    items = []
    for csv_file in csv_files:
        txt_file = os.path.join(root_path, csv_file)
        folder = os.path.dirname(txt_file)
        # determine speaker based on folder structure...
        speaker_name_match = speaker_regex.search(txt_file)
        if speaker_name_match is None:
            continue
        speaker_name = speaker_name_match.group("speaker_name")
        print(" | > {}".format(csv_file))
        with open(txt_file, 'r') as ttf:
            for line in ttf:
                cols = line.split('|')
                if meta_files is None:
                    wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
                else:
                    wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
                if os.path.isfile(wav_file):
                    text = cols[1].strip()
                    items.append([text, wav_file, speaker_name])
                else:
                    raise RuntimeError("> File %s does not exist!" % (wav_file))
    return items


def ljspeech(root_path, meta_file):
    """Normalizes the LJSpeech meta data file to TTS format"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "ljspeech"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            cols = line.split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items


def nancy(root_path, meta_file):
    """Normalizes the Nancy meta data file to TTS format"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "nancy"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            utt_id = line.split()[1]
            text = line[line.find('"') + 1:line.rfind('"') - 1]
            wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
            items.append([text, wav_file, speaker_name])
    return items


def common_voice(root_path, meta_file):
    """Normalize the common voice meta data file to TTS format."""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            if line.startswith("client_id"):
                continue
            cols = line.split("\t")
            text = cols[2]
            speaker_name = cols[0]
            wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
            items.append([text, wav_file, speaker_name])
    return items


def libri_tts(root_path, meta_files=None):
    """https://ai.google/tools/datasets/libri-tts/"""
    items = []
    if meta_files is None:
        meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
    for meta_file in meta_files:
        _meta_file = os.path.basename(meta_file).split('.')[0]
        speaker_name = _meta_file.split('_')[0]
        chapter_id = _meta_file.split('_')[1]
        _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
        with open(meta_file, 'r') as ttf:
            for line in ttf:
                cols = line.split('\t')
                wav_file = os.path.join(_root_path, cols[0] + '.wav')
                text = cols[1]
                items.append([text, wav_file, speaker_name])
    for item in items:
        assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
    return items


def custom_turkish(root_path, meta_file):
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "turkish-female"
    skipped_files = []
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            cols = line.split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
            if not os.path.exists(wav_file):
                skipped_files.append(wav_file)
                continue
            text = cols[1].strip()
            items.append([text, wav_file, speaker_name])
    print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
    return items
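A short sketch (not part of the diff) of how a dataset config entry maps to one of the formatters above through get_preprocessor_by_name; the paths are placeholders.

datasets = [{
    "name": "ljspeech",                # resolved to the ljspeech() function above
    "path": "/data/LJSpeech-1.1/",
    "meta_file_train": "metadata.csv",
    "meta_file_val": None,             # None -> split_dataset() carves out an eval split
}]
meta_data_train, meta_data_eval = load_meta_data(datasets)
# each item has the form [text, wav_file, speaker_name]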
distribute.py
@@ -1,178 +0,0 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from TTS.utils.generic_utils import create_experiment_folder


class DistributedSampler(Sampler):
    """
    Non shuffling Distributed Sampler
    """

    def __init__(self, dataset, num_replicas=None, rank=None):
        super(DistributedSampler, self).__init__(dataset)
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas

    def __iter__(self):
        indices = torch.arange(len(self.dataset)).tolist()

        # add extra samples to make it evenly divisible
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= num_gpus
    return rt


def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    dist.init_process_group(
        dist_backend,
        init_method=dist_url,
        world_size=num_gpus,
        rank=rank,
        group_name=group_name)


def apply_gradient_allreduce(module):

    # sync model parameters
    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            # bucketing params based on value types
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced, op=dist.reduce_op.SUM)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(
                        grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):

        def allreduce_hook(*_):
            Variable._execution_engine.queue_callback(allreduce_params)

        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, *_):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv
    )
    args = parser.parse_args()

    # OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
    #                                     True)
    # stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()
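A standalone illustration (not part of the diff) of the index partitioning performed in DistributedSampler.__iter__ above, without creating a process group; the dataset size and replica count are toy values.

import math

dataset_len, num_replicas = 10, 3
num_samples = math.ceil(dataset_len / num_replicas)     # 4 samples per replica
total_size = num_samples * num_replicas                 # 12 after padding
indices = list(range(dataset_len))
indices += indices[:total_size - len(indices)]          # pad by wrapping around
for rank in range(num_replicas):
    print(rank, indices[rank:total_size:num_replicas])  # each rank reads a strided slice
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7, 0]
# 2 [2, 5, 8, 1]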
@@ -1,389 +0,0 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F


class Linear(nn.Module):
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 init_gain='linear'):
        super(Linear, self).__init__()
        self.linear_layer = torch.nn.Linear(
            in_features, out_features, bias=bias)
        self._init_w(init_gain)

    def _init_w(self, init_gain):
        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class LinearBN(nn.Module):
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 init_gain='linear'):
        super(LinearBN, self).__init__()
        self.linear_layer = torch.nn.Linear(
            in_features, out_features, bias=bias)
        self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
        self._init_w(init_gain)

    def _init_w(self, init_gain):
        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(init_gain))

    def forward(self, x):
        out = self.linear_layer(x)
        if len(out.shape) == 3:
            out = out.permute(1, 2, 0)
        out = self.batch_normalization(out)
        if len(out.shape) == 3:
            out = out.permute(2, 0, 1)
        return out


class Prenet(nn.Module):
    def __init__(self,
                 in_features,
                 prenet_type="original",
                 prenet_dropout=True,
                 out_features=[256, 256],
                 bias=True):
        super(Prenet, self).__init__()
        self.prenet_type = prenet_type
        self.prenet_dropout = prenet_dropout
        in_features = [in_features] + out_features[:-1]
        if prenet_type == "bn":
            self.linear_layers = nn.ModuleList([
                LinearBN(in_size, out_size, bias=bias)
                for (in_size, out_size) in zip(in_features, out_features)
            ])
        elif prenet_type == "original":
            self.linear_layers = nn.ModuleList([
                Linear(in_size, out_size, bias=bias)
                for (in_size, out_size) in zip(in_features, out_features)
            ])

    def forward(self, x):
        for linear in self.linear_layers:
            if self.prenet_dropout:
                x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training)
            else:
                x = F.relu(linear(x))
        return x


####################
# ATTENTION MODULES
####################


class LocationLayer(nn.Module):
    def __init__(self,
                 attention_dim,
                 attention_n_filters=32,
                 attention_kernel_size=31):
        super(LocationLayer, self).__init__()
        self.location_conv1d = nn.Conv1d(
            in_channels=2,
            out_channels=attention_n_filters,
            kernel_size=attention_kernel_size,
            stride=1,
            padding=(attention_kernel_size - 1) // 2,
            bias=False)
        self.location_dense = Linear(
            attention_n_filters, attention_dim, bias=False, init_gain='tanh')

    def forward(self, attention_cat):
        processed_attention = self.location_conv1d(attention_cat)
        processed_attention = self.location_dense(
            processed_attention.transpose(1, 2))
        return processed_attention


class GravesAttention(nn.Module):
    """Discretized Graves attention:
    - https://arxiv.org/abs/1910.10288
    - https://arxiv.org/pdf/1906.01083.pdf
    """
    COEF = 0.3989422917366028  # numpy.sqrt(1/(2*numpy.pi))

    def __init__(self, query_dim, K):
        super(GravesAttention, self).__init__()
        self._mask_value = 1e-8
        self.K = K
        # self.attention_alignment = 0.05
        self.eps = 1e-5
        self.J = None
        self.N_a = nn.Sequential(
            nn.Linear(query_dim, query_dim, bias=True),
            nn.ReLU(),
            nn.Linear(query_dim, 3*K, bias=True))
        self.attention_weights = None
        self.mu_prev = None
        self.init_layers()

    def init_layers(self):
        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)  # bias mean
        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)  # bias std

    def init_states(self, inputs):
        if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
            self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
        self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
        self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

    # pylint: disable=R0201
    # pylint: disable=unused-argument
    def preprocess_inputs(self, inputs):
        return None

    def forward(self, query, inputs, processed_inputs, mask):
        """
        shapes:
            query: B x D_attention_rnn
            inputs: B x T_in x D_encoder
            processed_inputs: place_holder
            mask: B x T_in
        """
        gbk_t = self.N_a(query)
        gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K)

        # attention model parameters
        # each B x K
        g_t = gbk_t[:, 0, :]
        b_t = gbk_t[:, 1, :]
        k_t = gbk_t[:, 2, :]

        # dropout to decorrelate attention heads
        g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)

        # attention GMM parameters
        sig_t = torch.nn.functional.softplus(b_t) + self.eps

        mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
        g_t = torch.softmax(g_t, dim=-1) + self.eps

        j = self.J[:inputs.size(1)+1]

        # attention weights
        phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))

        # discretize attention weights
        alpha_t = torch.sum(phi_t, 1)
        alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1]
        alpha_t[alpha_t == 0] = 1e-8

        # apply masking
        if mask is not None:
            alpha_t.data.masked_fill_(~mask, self._mask_value)

        context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1)
        self.attention_weights = alpha_t
        self.mu_prev = mu_t
        return context


class OriginalAttention(nn.Module):
    """Following the methods proposed here:
    - https://arxiv.org/abs/1712.05884
    - https://arxiv.org/abs/1807.06736 + state masking at inference
    - Using sigmoid instead of softmax normalization
    - Attention windowing at inference time
    """
    # Pylint gets confused by PyTorch conventions here
    # pylint: disable=attribute-defined-outside-init
    def __init__(self, query_dim, embedding_dim, attention_dim,
                 location_attention, attention_location_n_filters,
                 attention_location_kernel_size, windowing, norm, forward_attn,
                 trans_agent, forward_attn_mask):
        super(OriginalAttention, self).__init__()
        self.query_layer = Linear(
            query_dim, attention_dim, bias=False, init_gain='tanh')
        self.inputs_layer = Linear(
            embedding_dim, attention_dim, bias=False, init_gain='tanh')
        self.v = Linear(attention_dim, 1, bias=True)
        if trans_agent:
            self.ta = nn.Linear(
                query_dim + embedding_dim, 1, bias=True)
        if location_attention:
            self.location_layer = LocationLayer(
                attention_dim,
                attention_location_n_filters,
                attention_location_kernel_size,
            )
        self._mask_value = -float("inf")
        self.windowing = windowing
        self.win_idx = None
        self.norm = norm
        self.forward_attn = forward_attn
        self.trans_agent = trans_agent
        self.forward_attn_mask = forward_attn_mask
        self.location_attention = location_attention

    def init_win_idx(self):
        self.win_idx = -1
        self.win_back = 2
        self.win_front = 6

    def init_forward_attn(self, inputs):
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.alpha = torch.cat(
            [torch.ones([B, 1]),
             torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device)
        self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)

    def init_location_attention(self, inputs):
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_())

    def init_states(self, inputs):
        B = inputs.shape[0]
        T = inputs.shape[1]
        self.attention_weights = Variable(inputs.data.new(B, T).zero_())
        if self.location_attention:
            self.init_location_attention(inputs)
        if self.forward_attn:
            self.init_forward_attn(inputs)
        if self.windowing:
            self.init_win_idx()

    def preprocess_inputs(self, inputs):
        return self.inputs_layer(inputs)

    def update_location_attention(self, alignments):
        self.attention_weights_cum += alignments

    def get_location_attention(self, query, processed_inputs):
        attention_cat = torch.cat((self.attention_weights.unsqueeze(1),
                                   self.attention_weights_cum.unsqueeze(1)),
                                  dim=1)
        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_cat)
        energies = self.v(
            torch.tanh(processed_query + processed_attention_weights +
                       processed_inputs))
        energies = energies.squeeze(-1)
        return energies, processed_query

    def get_attention(self, query, processed_inputs):
        processed_query = self.query_layer(query.unsqueeze(1))
        energies = self.v(torch.tanh(processed_query + processed_inputs))
        energies = energies.squeeze(-1)
        return energies, processed_query

    def apply_windowing(self, attention, inputs):
        back_win = self.win_idx - self.win_back
        front_win = self.win_idx + self.win_front
        if back_win > 0:
            attention[:, :back_win] = -float("inf")
        if front_win < inputs.shape[1]:
            attention[:, front_win:] = -float("inf")
        # this is a trick to solve a special problem.
        # but it does not hurt.
        if self.win_idx == -1:
            attention[:, 0] = attention.max()
        # Update the window
        self.win_idx = torch.argmax(attention, 1).long()[0].item()
        return attention

    def apply_forward_attention(self, alignment):
        # forward attention
        fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
                                  (1, 0, 0, 0))
        # compute transition potentials
        alpha = ((1 - self.u) * self.alpha
                 + self.u * fwd_shifted_alpha
                 + 1e-8) * alignment
        # force incremental alignment
        if not self.training and self.forward_attn_mask:
            _, n = fwd_shifted_alpha.max(1)
            val, n2 = alpha.max(1)
            for b in range(alignment.shape[0]):
                alpha[b, n[b] + 3:] = 0
                alpha[b, :(n[b] - 1)] = 0  # ignore all previous states to prevent repetition.
                alpha[b, (n[b] - 2)] = 0.01 * val[b]  # smoothing factor for the prev step
        # renormalize attention weights
        alpha = alpha / alpha.sum(dim=1, keepdim=True)
        return alpha

    def forward(self, query, inputs, processed_inputs, mask):
        """
        shapes:
            query: B x D_attn_rnn
            inputs: B x T_en x D_en
            processed_inputs: B x T_en x D_attn
            mask: B x T_en
        """
        if self.location_attention:
            attention, _ = self.get_location_attention(
                query, processed_inputs)
        else:
            attention, _ = self.get_attention(
                query, processed_inputs)
        # apply masking
        if mask is not None:
            attention.data.masked_fill_(~mask, self._mask_value)
        # apply windowing - only in eval mode
        if not self.training and self.windowing:
            attention = self.apply_windowing(attention, inputs)

        # normalize attention values
        if self.norm == "softmax":
            alignment = torch.softmax(attention, dim=-1)
        elif self.norm == "sigmoid":
            alignment = torch.sigmoid(attention) / torch.sigmoid(
                attention).sum(
                    dim=1, keepdim=True)
        else:
            raise ValueError("Unknown value for attention norm type")

        if self.location_attention:
            self.update_location_attention(alignment)

        # apply forward attention if enabled
        if self.forward_attn:
            alignment = self.apply_forward_attention(alignment)
            self.alpha = alignment

        context = torch.bmm(alignment.unsqueeze(1), inputs)
        context = context.squeeze(1)
        self.attention_weights = alignment

        # compute transition agent
        if self.forward_attn and self.trans_agent:
            ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
            self.u = torch.sigmoid(self.ta(ta_input))
        return context


def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
              location_attention, attention_location_n_filters,
              attention_location_kernel_size, windowing, norm, forward_attn,
              trans_agent, forward_attn_mask, attn_K):
    if attn_type == "original":
        return OriginalAttention(query_dim, embedding_dim, attention_dim,
                                 location_attention,
                                 attention_location_n_filters,
                                 attention_location_kernel_size, windowing,
                                 norm, forward_attn, trans_agent,
                                 forward_attn_mask)
    if attn_type == "graves":
        return GravesAttention(query_dim, attn_K)
    raise RuntimeError(
        f" [!] Given Attention Type '{attn_type}' does not exist.")
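A minimal sketch (not part of the diff) of instantiating an attention module through init_attn and running a single decoder step on toy tensors; all dimensions and flag values here are assumptions.

import torch

attn = init_attn(attn_type="original", query_dim=256, embedding_dim=128,
                 attention_dim=128, location_attention=True,
                 attention_location_n_filters=32, attention_location_kernel_size=31,
                 windowing=False, norm="softmax", forward_attn=False,
                 trans_agent=False, forward_attn_mask=False, attn_K=5)
inputs = torch.randn(2, 50, 128)                 # B x T_in x D_encoder
query = torch.randn(2, 256)                      # B x D_attention_rnn
mask = torch.ones(2, 50, dtype=torch.bool)
attn.init_states(inputs)
processed = attn.preprocess_inputs(inputs)
context = attn(query, inputs, processed, mask)   # B x D_encoder context vector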
@@ -1,169 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class GST(nn.Module):
    """Global Style Token Module for factorizing prosody in speech.

    See https://arxiv.org/pdf/1803.09017"""

    def __init__(self, num_mel, num_heads, num_style_tokens, embedding_dim):
        super().__init__()
        self.encoder = ReferenceEncoder(num_mel, embedding_dim)
        self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens,
                                                 embedding_dim)

    def forward(self, inputs):
        enc_out = self.encoder(inputs)
        style_embed = self.style_token_layer(enc_out)

        return style_embed


class ReferenceEncoder(nn.Module):
    """NN module creating a fixed size prosody embedding from a spectrogram.

    inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
    outputs: [batch_size, embedding_dim]
    """

    def __init__(self, num_mel, embedding_dim):

        super().__init__()
        self.num_mel = num_mel
        filters = [1] + [32, 32, 64, 64, 128, 128]
        num_layers = len(filters) - 1
        convs = [
            nn.Conv2d(
                in_channels=filters[i],
                out_channels=filters[i + 1],
                kernel_size=(3, 3),
                stride=(2, 2),
                padding=(1, 1)) for i in range(num_layers)
        ]
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList([
            nn.BatchNorm2d(num_features=filter_size)
            for filter_size in filters[1:]
        ])

        post_conv_height = self.calculate_post_conv_height(
            num_mel, 3, 2, 1, num_layers)
        self.recurrence = nn.GRU(
            input_size=filters[-1] * post_conv_height,
            hidden_size=embedding_dim // 2,
            batch_first=True)

    def forward(self, inputs):
        batch_size = inputs.size(0)
        x = inputs.view(batch_size, 1, -1, self.num_mel)
        # x: 4D tensor [batch_size, num_channels==1, num_frames, num_mel]
        for conv, bn in zip(self.convs, self.bns):
            x = conv(x)
            x = bn(x)
            x = F.relu(x)

        x = x.transpose(1, 2)
        # x: 4D tensor [batch_size, post_conv_width,
        #               num_channels==128, post_conv_height]
        post_conv_width = x.size(1)
        x = x.contiguous().view(batch_size, post_conv_width, -1)
        # x: 3D tensor [batch_size, post_conv_width,
        #               num_channels*post_conv_height]
        self.recurrence.flatten_parameters()
        memory, out = self.recurrence(x)
        # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]

        return out.squeeze(0)

    @staticmethod
    def calculate_post_conv_height(height, kernel_size, stride, pad,
                                   n_convs):
        """Height of spec after n convolutions with fixed kernel/stride/pad."""
        for _ in range(n_convs):
            height = (height - kernel_size + 2 * pad) // stride + 1
        return height


class StyleTokenLayer(nn.Module):
    """NN Module attending to style tokens based on prosody encodings."""

    def __init__(self, num_heads, num_style_tokens,
                 embedding_dim):
        super().__init__()
        self.query_dim = embedding_dim // 2
        self.key_dim = embedding_dim // num_heads
        self.style_tokens = nn.Parameter(
            torch.FloatTensor(num_style_tokens, self.key_dim))
        nn.init.orthogonal_(self.style_tokens)
        self.attention = MultiHeadAttention(
            query_dim=self.query_dim,
            key_dim=self.key_dim,
            num_units=embedding_dim,
            num_heads=num_heads)

    def forward(self, inputs):
        batch_size = inputs.size(0)
        prosody_encoding = inputs.unsqueeze(1)
        # prosody_encoding: 3D tensor [batch_size, 1, encoding_size==128]
        tokens = torch.tanh(self.style_tokens) \
            .unsqueeze(0) \
            .expand(batch_size, -1, -1)
        # tokens: 3D tensor [batch_size, num tokens, token embedding size]
        style_embed = self.attention(prosody_encoding, tokens)

        return style_embed


class MultiHeadAttention(nn.Module):
    '''
    input:
        query --- [N, T_q, query_dim]
        key --- [N, T_k, key_dim]
    output:
        out --- [N, T_q, num_units]
    '''

    def __init__(self, query_dim, key_dim, num_units, num_heads):

        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = nn.Linear(
            in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(
            in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(
            in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query, key):
        queries = self.W_query(query)  # [N, T_q, num_units]
        keys = self.W_key(key)  # [N, T_k, num_units]
        values = self.W_value(key)

        split_size = self.num_units // self.num_heads
        queries = torch.stack(
            torch.split(queries, split_size, dim=2),
            dim=0)  # [h, N, T_q, num_units/h]
        keys = torch.stack(
            torch.split(keys, split_size, dim=2),
            dim=0)  # [h, N, T_k, num_units/h]
        values = torch.stack(
            torch.split(values, split_size, dim=2),
            dim=0)  # [h, N, T_k, num_units/h]

        # score = softmax(QK^T / (d_k ** 0.5))
        scores = torch.matmul(queries, keys.transpose(2, 3))  # [h, N, T_q, T_k]
        scores = scores / (self.key_dim**0.5)
        scores = F.softmax(scores, dim=3)

        # out = score * V
        out = torch.matmul(scores, values)  # [h, N, T_q, num_units/h]
        out = torch.cat(
            torch.split(out, 1, dim=0),
            dim=3).squeeze(0)  # [N, T_q, num_units]

        return out
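A minimal sketch (not part of the diff) of running the GST module on a batch of mel spectrograms; the dimensions are toy values.

import torch

gst = GST(num_mel=80, num_heads=4, num_style_tokens=10, embedding_dim=128)
mel = torch.randn(2, 120, 80)   # [batch, num_frames, num_mel]
style = gst(mel)                # [batch, 1, 128] style embedding to condition the decoder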
layers/losses.py
@@ -1,246 +0,0 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional
from TTS.utils.generic_utils import sequence_mask


class L1LossMasked(nn.Module):

    def __init__(self, seq_len_norm):
        super(L1LossMasked, self).__init__()
        self.seq_len_norm = seq_len_norm

    def forward(self, x, target, length):
        """
        Args:
            x: A Variable containing a FloatTensor of size
                (batch, max_len, dim) which contains the
                unnormalized probability for each class.
            target: A Variable containing a LongTensor of size
                (batch, max_len, dim) which contains the index of the true
                class for each corresponding step.
            length: A Variable containing a LongTensor of size (batch,)
                which contains the length of each data in a batch.
        Returns:
            loss: An average loss value in range [0, 1] masked by the length.
        """
        # mask: (batch, max_len, 1)
        target.requires_grad = False
        mask = sequence_mask(
            sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
        if self.seq_len_norm:
            norm_w = mask / mask.sum(dim=1, keepdim=True)
            out_weights = norm_w.div(target.shape[0] * target.shape[2])
            mask = mask.expand_as(x)
            loss = functional.l1_loss(
                x * mask, target * mask, reduction='none')
            loss = loss.mul(out_weights.to(loss.device)).sum()
        else:
            mask = mask.expand_as(x)
            loss = functional.l1_loss(
                x * mask, target * mask, reduction='sum')
            loss = loss / mask.sum()
        return loss


class MSELossMasked(nn.Module):

    def __init__(self, seq_len_norm):
        super(MSELossMasked, self).__init__()
        self.seq_len_norm = seq_len_norm

    def forward(self, x, target, length):
        """
        Args:
            x: A Variable containing a FloatTensor of size
                (batch, max_len, dim) which contains the
                unnormalized probability for each class.
            target: A Variable containing a LongTensor of size
                (batch, max_len, dim) which contains the index of the true
                class for each corresponding step.
            length: A Variable containing a LongTensor of size (batch,)
                which contains the length of each data in a batch.
        Returns:
            loss: An average loss value in range [0, 1] masked by the length.
        """
        # mask: (batch, max_len, 1)
        target.requires_grad = False
        mask = sequence_mask(
            sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
        if self.seq_len_norm:
            norm_w = mask / mask.sum(dim=1, keepdim=True)
            out_weights = norm_w.div(target.shape[0] * target.shape[2])
            mask = mask.expand_as(x)
            loss = functional.mse_loss(
                x * mask, target * mask, reduction='none')
            loss = loss.mul(out_weights.to(loss.device)).sum()
        else:
            mask = mask.expand_as(x)
            loss = functional.mse_loss(
                x * mask, target * mask, reduction='sum')
            loss = loss / mask.sum()
        return loss


class AttentionEntropyLoss(nn.Module):
    # pylint: disable=R0201
    def forward(self, align):
        """
        Forces attention to be more decisive by penalizing
        soft attention weights

        TODO: arguments
        TODO: unit_test
        """
        entropy = torch.distributions.Categorical(probs=align).entropy()
        loss = (entropy / np.log(align.shape[1])).mean()
        return loss


class BCELossMasked(nn.Module):

    def __init__(self, pos_weight):
        super(BCELossMasked, self).__init__()
        self.pos_weight = pos_weight

    def forward(self, x, target, length):
        """
        Args:
            x: A Variable containing a FloatTensor of size
                (batch, max_len) which contains the
                unnormalized probability for each class.
            target: A Variable containing a LongTensor of size
                (batch, max_len) which contains the index of the true
                class for each corresponding step.
            length: A Variable containing a LongTensor of size (batch,)
                which contains the length of each data in a batch.
        Returns:
            loss: An average loss value in range [0, 1] masked by the length.
        """
        # mask: (batch, max_len, 1)
        target.requires_grad = False
        mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float()
        loss = functional.binary_cross_entropy_with_logits(
            x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum')
        loss = loss / mask.sum()
        return loss


class GuidedAttentionLoss(torch.nn.Module):
    def __init__(self, sigma=0.4):
        super(GuidedAttentionLoss, self).__init__()
        self.sigma = sigma

    def _make_ga_masks(self, ilens, olens):
        B = len(ilens)
        max_ilen = max(ilens)
        max_olen = max(olens)
        ga_masks = torch.zeros((B, max_olen, max_ilen))
        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
            ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma)
        return ga_masks

    def forward(self, att_ws, ilens, olens):
        ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device)
        seq_masks = self._make_masks(ilens, olens).to(att_ws.device)
        losses = ga_masks * att_ws
        loss = torch.mean(losses.masked_select(seq_masks))
        return loss

    @staticmethod
    def _make_ga_mask(ilen, olen, sigma):
        grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
        grid_x, grid_y = grid_x.float(), grid_y.float()
        return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))

    @staticmethod
    def _make_masks(ilens, olens):
        in_masks = sequence_mask(ilens)
        out_masks = sequence_mask(olens)
        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)


class TacotronLoss(torch.nn.Module):
    def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
        super(TacotronLoss, self).__init__()
        self.stopnet_pos_weight = stopnet_pos_weight
        self.ga_alpha = c.ga_alpha
        self.config = c
        # postnet decoder loss
        if c.loss_masking:
            self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
                "Tacotron"
            ] else MSELossMasked(c.seq_len_norm)
        else:
            self.criterion = nn.L1Loss() if c.model in ["Tacotron"
                                                        ] else nn.MSELoss()
        # guided attention loss
        if c.ga_alpha > 0:
            self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
        # stopnet loss
        # pylint: disable=not-callable
        self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None

    def forward(self, postnet_output, decoder_output, mel_input, linear_input,
                stopnet_output, stopnet_target, output_lens, decoder_b_output,
                alignments, alignment_lens, alignments_backwards, input_lens):

        return_dict = {}
        # decoder and postnet losses
        if self.config.loss_masking:
            decoder_loss = self.criterion(decoder_output, mel_input,
                                          output_lens)
            if self.config.model in ["Tacotron", "TacotronGST"]:
                postnet_loss = self.criterion(postnet_output, linear_input,
                                              output_lens)
            else:
                postnet_loss = self.criterion(postnet_output, mel_input,
                                              output_lens)
        else:
            decoder_loss = self.criterion(decoder_output, mel_input)
            if self.config.model in ["Tacotron", "TacotronGST"]:
                postnet_loss = self.criterion(postnet_output, linear_input)
            else:
                postnet_loss = self.criterion(postnet_output, mel_input)
        loss = decoder_loss + postnet_loss
        return_dict['decoder_loss'] = decoder_loss
        return_dict['postnet_loss'] = postnet_loss

        # stopnet loss
        stop_loss = self.criterion_st(
            stopnet_output, stopnet_target,
            output_lens) if self.config.stopnet else torch.zeros(1)
        if not self.config.separate_stopnet and self.config.stopnet:
            loss += stop_loss
        return_dict['stopnet_loss'] = stop_loss

        # backward decoder loss (if enabled)
        if self.config.bidirectional_decoder:
            if self.config.loss_masking:
                decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens)
            else:
                decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input)
            decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output)
            loss += decoder_b_loss + decoder_c_loss
            return_dict['decoder_b_loss'] = decoder_b_loss
            return_dict['decoder_c_loss'] = decoder_c_loss

        # double decoder consistency loss (if enabled)
        if self.config.double_decoder_consistency:
            decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens)
            # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output)
            attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards)
            loss += decoder_b_loss + attention_c_loss
            return_dict['decoder_coarse_loss'] = decoder_b_loss
            return_dict['decoder_ddc_loss'] = attention_c_loss

        # guided attention loss (if enabled)
        if self.config.ga_alpha > 0:
            ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
            loss += ga_loss * self.ga_alpha
            return_dict['ga_loss'] = ga_loss * self.ga_alpha

        return_dict['loss'] = loss
        return return_dict
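A small illustration (not part of the diff) of the soft diagonal penalty mask built by GuidedAttentionLoss._make_ga_mask above, i.e. W[t, n] = 1 - exp(-((n/N - t/T)^2) / (2 * sigma^2)); the lengths are toy values.

ga_mask = GuidedAttentionLoss._make_ga_mask(ilen=5, olen=7, sigma=0.4)
print(ga_mask.shape)   # torch.Size([7, 5]); values near zero along the diagonal, larger off it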
@ -1,496 +0,0 @@
|
|||
# coding: utf-8
|
||||
import torch
|
||||
from torch import nn
|
||||
from .common_layers import Prenet, init_attn, Linear
|
||||
|
||||
|
||||
class BatchNormConv1d(nn.Module):
|
||||
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
|
||||
function between Conv and BatchNorm layers. BatchNorm layer
|
||||
is initialized with the TF default values for momentum and eps.
|
||||
|
||||
Args:
|
||||
in_channels: size of each input sample
|
||||
out_channels: size of each output sample
|
||||
kernel_size: kernel size of conv filters
|
||||
stride: stride of conv filters
|
||||
padding: padding of conv filters
|
||||
activation: activation function applied after BatchNorm
|
||||
|
||||
Shapes:
|
||||
- input: B x in_channels x T
|
||||
- output: B x out_channels x T
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride,
|
||||
padding,
|
||||
activation=None):
|
||||
|
||||
super(BatchNormConv1d, self).__init__()
|
||||
self.padding = padding
|
||||
self.padder = nn.ConstantPad1d(padding, 0)
|
||||
self.conv1d = nn.Conv1d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=0,
|
||||
bias=False)
|
||||
# Following tensorflow's default parameters
|
||||
self.bn = nn.BatchNorm1d(out_channels, momentum=0.99, eps=1e-3)
|
||||
self.activation = activation
|
||||
# self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
if type(self.activation) == torch.nn.ReLU:
|
||||
w_gain = 'relu'
|
||||
elif type(self.activation) == torch.nn.Tanh:
|
||||
w_gain = 'tanh'
|
||||
elif self.activation is None:
|
||||
w_gain = 'linear'
|
||||
else:
|
||||
raise RuntimeError('Unknown activation function')
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_gain))
|
||||
|
||||
def forward(self, x):
|
||||
x = self.padder(x)
|
||||
x = self.conv1d(x)
|
||||
x = self.bn(x)
|
||||
if self.activation is not None:
|
||||
x = self.activation(x)
|
||||
return x
|
||||
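Because the wrapper pads explicitly with `ConstantPad1d`, an asymmetric padding such as `[(k - 1) // 2, k // 2]` keeps the time dimension unchanged even for even kernel sizes. A rough usage sketch (the shapes are assumptions for illustration only):

```python
import torch
from torch import nn

# e.g. a conv-bank filter with an even kernel size k=4
layer = BatchNormConv1d(80, 128, kernel_size=4, stride=1,
                        padding=[(4 - 1) // 2, 4 // 2], activation=nn.ReLU())
x = torch.randn(2, 80, 50)   # (B, in_channels, T)
y = layer(x)
print(y.shape)               # expected: torch.Size([2, 128, 50]) – time length preserved
```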
|
||||
|
||||
class Highway(nn.Module):
|
||||
# TODO: Try GLU layer
|
||||
def __init__(self, in_size, out_size):
|
||||
super(Highway, self).__init__()
|
||||
self.H = nn.Linear(in_size, out_size)
|
||||
self.H.bias.data.zero_()
|
||||
self.T = nn.Linear(in_size, out_size)
|
||||
self.T.bias.data.fill_(-1)
|
||||
self.relu = nn.ReLU()
|
||||
self.sigmoid = nn.Sigmoid()
|
||||
# self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.H.weight, gain=torch.nn.init.calculate_gain('relu'))
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.T.weight, gain=torch.nn.init.calculate_gain('sigmoid'))
|
||||
|
||||
def forward(self, inputs):
|
||||
H = self.relu(self.H(inputs))
|
||||
T = self.sigmoid(self.T(inputs))
|
||||
return H * T + inputs * (1.0 - T)
|
||||
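The forward pass above is the standard highway gating `y = H(x) * T(x) + x * (1 - T(x))`; since the bias of `T` starts at -1, the transform gate is mostly closed at initialization and much of the input passes straight through. A quick sanity check (sizes are arbitrary):

```python
import torch

hw = Highway(128, 128)
x = torch.randn(4, 128)
y = hw(x)
print(y.shape)                # torch.Size([4, 128])
print((y - x).abs().mean())   # relatively small at init: T = sigmoid(Wx - 1) is biased low
```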
|
||||
|
||||
class CBHG(nn.Module):
|
||||
"""CBHG module: a recurrent neural network composed of:
|
||||
- 1-d convolution banks
|
||||
- Highway networks + residual connections
|
||||
- Bidirectional gated recurrent units
|
||||
|
||||
Args:
|
||||
in_features (int): sample size
|
||||
K (int): max filter size in conv bank
|
||||
projections (list): conv channel sizes for conv projections
|
||||
num_highways (int): number of highways layers
|
||||
|
||||
Shapes:
|
||||
- input: B x D x T_in
|
||||
- output: B x T_in x D*2
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_features,
|
||||
K=16,
|
||||
conv_bank_features=128,
|
||||
conv_projections=[128, 128],
|
||||
highway_features=128,
|
||||
gru_features=128,
|
||||
num_highways=4):
|
||||
super(CBHG, self).__init__()
|
||||
self.in_features = in_features
|
||||
self.conv_bank_features = conv_bank_features
|
||||
self.highway_features = highway_features
|
||||
self.gru_features = gru_features
|
||||
self.conv_projections = conv_projections
|
||||
self.relu = nn.ReLU()
|
||||
# list of conv1d bank with filter size k=1...K
|
||||
# TODO: try dilational layers instead
|
||||
self.conv1d_banks = nn.ModuleList([
|
||||
BatchNormConv1d(in_features,
|
||||
conv_bank_features,
|
||||
kernel_size=k,
|
||||
stride=1,
|
||||
padding=[(k - 1) // 2, k // 2],
|
||||
activation=self.relu) for k in range(1, K + 1)
|
||||
])
|
||||
# max pooling of conv bank, with padding
|
||||
# TODO: try average pooling OR larger kernel size
|
||||
out_features = [K * conv_bank_features] + conv_projections[:-1]
|
||||
activations = [self.relu] * (len(conv_projections) - 1)
|
||||
activations += [None]
|
||||
# setup conv1d projection layers
|
||||
layer_set = []
|
||||
for (in_size, out_size, ac) in zip(out_features, conv_projections,
|
||||
activations):
|
||||
layer = BatchNormConv1d(in_size,
|
||||
out_size,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=[1, 1],
|
||||
activation=ac)
|
||||
layer_set.append(layer)
|
||||
self.conv1d_projections = nn.ModuleList(layer_set)
|
||||
# setup Highway layers
|
||||
if self.highway_features != conv_projections[-1]:
|
||||
self.pre_highway = nn.Linear(conv_projections[-1],
|
||||
highway_features,
|
||||
bias=False)
|
||||
self.highways = nn.ModuleList([
|
||||
Highway(highway_features, highway_features)
|
||||
for _ in range(num_highways)
|
||||
])
|
||||
# bi-directional GRU layer
|
||||
self.gru = nn.GRU(gru_features,
|
||||
gru_features,
|
||||
1,
|
||||
batch_first=True,
|
||||
bidirectional=True)
|
||||
|
||||
def forward(self, inputs):
|
||||
# (B, in_features, T_in)
|
||||
x = inputs
|
||||
# (B, hid_features*K, T_in)
|
||||
# Concat conv1d bank outputs
|
||||
outs = []
|
||||
for conv1d in self.conv1d_banks:
|
||||
out = conv1d(x)
|
||||
outs.append(out)
|
||||
x = torch.cat(outs, dim=1)
|
||||
assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
|
||||
for conv1d in self.conv1d_projections:
|
||||
x = conv1d(x)
|
||||
x += inputs
|
||||
x = x.transpose(1, 2)
|
||||
if self.highway_features != self.conv_projections[-1]:
|
||||
x = self.pre_highway(x)
|
||||
# Residual connection
|
||||
# TODO: try residual scaling as in Deep Voice 3
|
||||
# TODO: try plain residual layers
|
||||
for highway in self.highways:
|
||||
x = highway(x)
|
||||
# (B, T_in, hid_features*2)
|
||||
# TODO: replace GRU with convolution as in Deep Voice 3
|
||||
self.gru.flatten_parameters()
|
||||
outputs, _ = self.gru(x)
|
||||
return outputs
|
||||
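Per the docstring, CBHG maps `B x D x T_in` to `B x T_in x D*2`: the conv bank and projections keep the time length, and the bidirectional GRU doubles the feature size. A small shape check, assuming the default constructor arguments:

```python
import torch

cbhg = CBHG(128)              # defaults: K=16, conv_projections=[128, 128], gru_features=128
x = torch.randn(2, 128, 37)   # (B, in_features, T_in)
y = cbhg(x)
print(y.shape)                # expected: torch.Size([2, 37, 256])
```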
|
||||
|
||||
class EncoderCBHG(nn.Module):
|
||||
def __init__(self):
|
||||
super(EncoderCBHG, self).__init__()
|
||||
self.cbhg = CBHG(
|
||||
128,
|
||||
K=16,
|
||||
conv_bank_features=128,
|
||||
conv_projections=[128, 128],
|
||||
highway_features=128,
|
||||
gru_features=128,
|
||||
num_highways=4)
|
||||
|
||||
def forward(self, x):
|
||||
return self.cbhg(x)
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
r"""Encapsulate Prenet and CBHG modules for encoder"""
|
||||
|
||||
def __init__(self, in_features):
|
||||
super(Encoder, self).__init__()
|
||||
self.prenet = Prenet(in_features, out_features=[256, 128])
|
||||
self.cbhg = EncoderCBHG()
|
||||
|
||||
def forward(self, inputs):
|
||||
r"""
|
||||
Args:
|
||||
inputs (FloatTensor): embedding features
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x in_features
|
||||
- outputs: batch x time x 128*2
|
||||
"""
|
||||
# B x T x prenet_dim
|
||||
outputs = self.prenet(inputs)
|
||||
outputs = self.cbhg(outputs.transpose(1, 2))
|
||||
return outputs
|
||||
|
||||
|
||||
class PostCBHG(nn.Module):
|
||||
def __init__(self, mel_dim):
|
||||
super(PostCBHG, self).__init__()
|
||||
self.cbhg = CBHG(
|
||||
mel_dim,
|
||||
K=8,
|
||||
conv_bank_features=128,
|
||||
conv_projections=[256, mel_dim],
|
||||
highway_features=128,
|
||||
gru_features=128,
|
||||
num_highways=4)
|
||||
|
||||
def forward(self, x):
|
||||
return self.cbhg(x)
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
"""Decoder module.
|
||||
|
||||
Args:
|
||||
in_features (int): input vector (encoder output) sample size.
|
||||
memory_dim (int): memory vector (prev. time-step output) sample size.
|
||||
r (int): number of outputs per time step.
|
||||
memory_size (int): size of the past window. If <= 0, memory_size defaults to r.
|
||||
TODO: arguments
|
||||
"""
|
||||
|
||||
# Pylint gets confused by PyTorch conventions here
|
||||
#pylint: disable=attribute-defined-outside-init
|
||||
|
||||
def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing,
|
||||
attn_norm, prenet_type, prenet_dropout, forward_attn,
|
||||
trans_agent, forward_attn_mask, location_attn, attn_K,
|
||||
separate_stopnet, speaker_embedding_dim):
|
||||
super(Decoder, self).__init__()
|
||||
self.r_init = r
|
||||
self.r = r
|
||||
self.in_features = in_features
|
||||
self.max_decoder_steps = 500
|
||||
self.use_memory_queue = memory_size > 0
|
||||
self.memory_size = memory_size if memory_size > 0 else r
|
||||
self.memory_dim = memory_dim
|
||||
self.separate_stopnet = separate_stopnet
|
||||
self.query_dim = 256
|
||||
# memory -> |Prenet| -> processed_memory
|
||||
prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
|
||||
self.prenet = Prenet(
|
||||
prenet_dim,
|
||||
prenet_type,
|
||||
prenet_dropout,
|
||||
out_features=[256, 128])
|
||||
# processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
|
||||
# attention_rnn generates queries for the attention mechanism
|
||||
self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
|
||||
|
||||
self.attention = init_attn(attn_type=attn_type,
|
||||
query_dim=self.query_dim,
|
||||
embedding_dim=in_features,
|
||||
attention_dim=128,
|
||||
location_attention=location_attn,
|
||||
attention_location_n_filters=32,
|
||||
attention_location_kernel_size=31,
|
||||
windowing=attn_windowing,
|
||||
norm=attn_norm,
|
||||
forward_attn=forward_attn,
|
||||
trans_agent=trans_agent,
|
||||
forward_attn_mask=forward_attn_mask,
|
||||
attn_K=attn_K)
|
||||
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
|
||||
self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
|
||||
# decoder_RNN_input -> |RNN| -> RNN_state
|
||||
self.decoder_rnns = nn.ModuleList(
|
||||
[nn.GRUCell(256, 256) for _ in range(2)])
|
||||
# RNN_state -> |Linear| -> mel_spec
|
||||
self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
|
||||
# learn init values instead of zero init.
|
||||
self.stopnet = StopNet(256 + memory_dim * self.r_init)
|
||||
|
||||
def set_r(self, new_r):
|
||||
self.r = new_r
|
||||
|
||||
def _reshape_memory(self, memory):
|
||||
"""
|
||||
Reshape the spectrograms for given 'r'
|
||||
"""
|
||||
# Grouping multiple frames if necessary
|
||||
if memory.size(-1) == self.memory_dim:
|
||||
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
|
||||
# Time first (T_decoder, B, memory_dim)
|
||||
memory = memory.transpose(0, 1)
|
||||
return memory
|
||||
|
||||
def _init_states(self, inputs):
|
||||
"""
|
||||
Initialization of decoder states
|
||||
"""
|
||||
B = inputs.size(0)
|
||||
T = inputs.size(1)
|
||||
# go frame as zeros matrix
|
||||
if self.use_memory_queue:
|
||||
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
|
||||
else:
|
||||
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
|
||||
# decoder states
|
||||
self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
|
||||
self.decoder_rnn_hiddens = [
|
||||
torch.zeros(1, device=inputs.device).repeat(B, 256)
|
||||
for idx in range(len(self.decoder_rnns))
|
||||
]
|
||||
self.context_vec = inputs.data.new(B, self.in_features).zero_()
|
||||
# cache attention inputs
|
||||
self.processed_inputs = self.attention.preprocess_inputs(inputs)
|
||||
|
||||
def _parse_outputs(self, outputs, attentions, stop_tokens):
|
||||
# Back to batch first
|
||||
attentions = torch.stack(attentions).transpose(0, 1)
|
||||
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
|
||||
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
|
||||
outputs = outputs.view(
|
||||
outputs.size(0), -1, self.memory_dim)
|
||||
outputs = outputs.transpose(1, 2)
|
||||
return outputs, attentions, stop_tokens
|
||||
|
||||
def decode(self, inputs, mask=None):
|
||||
# Prenet
|
||||
processed_memory = self.prenet(self.memory_input)
|
||||
# Attention RNN
|
||||
self.attention_rnn_hidden = self.attention_rnn(
|
||||
torch.cat((processed_memory, self.context_vec), -1),
|
||||
self.attention_rnn_hidden)
|
||||
self.context_vec = self.attention(
|
||||
self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
|
||||
# Concat RNN output and attention context vector
|
||||
decoder_input = self.project_to_decoder_in(
|
||||
torch.cat((self.attention_rnn_hidden, self.context_vec), -1))
|
||||
|
||||
# Pass through the decoder RNNs
|
||||
for idx in range(len(self.decoder_rnns)):
|
||||
self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
|
||||
decoder_input, self.decoder_rnn_hiddens[idx])
|
||||
# Residual connection
|
||||
decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
|
||||
decoder_output = decoder_input
|
||||
|
||||
# predict mel vectors from decoder vectors
|
||||
output = self.proj_to_mel(decoder_output)
|
||||
# output = torch.sigmoid(output)
|
||||
# predict stop token
|
||||
stopnet_input = torch.cat([decoder_output, output], -1)
|
||||
if self.separate_stopnet:
|
||||
stop_token = self.stopnet(stopnet_input.detach())
|
||||
else:
|
||||
stop_token = self.stopnet(stopnet_input)
|
||||
output = output[:, : self.r * self.memory_dim]
|
||||
return output, stop_token, self.attention.attention_weights
|
||||
|
||||
def _update_memory_input(self, new_memory):
|
||||
if self.use_memory_queue:
|
||||
if self.memory_size > self.r:
|
||||
# memory queue size is larger than number of frames per decoder iter
|
||||
self.memory_input = torch.cat([
|
||||
new_memory, self.memory_input[:, :(
|
||||
self.memory_size - self.r) * self.memory_dim].clone()
|
||||
], dim=-1)
|
||||
else:
|
||||
# memory queue size smaller than number of frames per decoder iter
|
||||
self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
|
||||
else:
|
||||
# use only the last frame prediction
|
||||
# assert new_memory.shape[-1] == self.r * self.memory_dim
|
||||
self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
|
||||
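When the memory queue is enabled (`memory_size > r`), the decoder input is a rolling window over past frames: the newest `r` frames are pushed to the front and the oldest `r` frames fall off the end. A toy illustration of that update, assuming `memory_dim=1` for readability:

```python
import torch

r, memory_size, memory_dim = 2, 6, 1
memory_input = torch.arange(1., 7.).unsqueeze(0)   # previous window: frames [1..6]
new_memory = torch.tensor([[7., 8.]])               # r new frames from the last decoder step
memory_input = torch.cat(
    [new_memory, memory_input[:, :(memory_size - r) * memory_dim]], dim=-1)
print(memory_input)  # tensor([[7., 8., 1., 2., 3., 4.]]) – newest frames first, oldest dropped
```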
|
||||
def forward(self, inputs, memory, mask, speaker_embeddings=None):
|
||||
"""
|
||||
Args:
|
||||
inputs: Encoder outputs.
|
||||
memory: Decoder memory (autoregression). If None (at eval time),
|
||||
the decoder feeds its own previous output back as the next
|
||||
input (greedy decoding).
|
||||
mask: Attention mask for sequence padding.
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x encoder_out_dim
|
||||
- memory: batch x #mel_specs x mel_spec_dim
|
||||
"""
|
||||
# Run greedy decoding if memory is None
|
||||
memory = self._reshape_memory(memory)
|
||||
outputs = []
|
||||
attentions = []
|
||||
stop_tokens = []
|
||||
t = 0
|
||||
self._init_states(inputs)
|
||||
self.attention.init_states(inputs)
|
||||
while len(outputs) < memory.size(0):
|
||||
if t > 0:
|
||||
new_memory = memory[t - 1]
|
||||
self._update_memory_input(new_memory)
|
||||
if speaker_embeddings is not None:
|
||||
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
|
||||
output, stop_token, attention = self.decode(inputs, mask)
|
||||
outputs += [output]
|
||||
attentions += [attention]
|
||||
stop_tokens += [stop_token.squeeze(1)]
|
||||
t += 1
|
||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||
|
||||
def inference(self, inputs, speaker_embeddings=None):
|
||||
"""
|
||||
Args:
|
||||
inputs: encoder outputs.
|
||||
speaker_embeddings: speaker vectors.
|
||||
|
||||
Shapes:
|
||||
- inputs: batch x time x encoder_out_dim
|
||||
- speaker_embeddings: batch x embed_dim
|
||||
"""
|
||||
outputs = []
|
||||
attentions = []
|
||||
stop_tokens = []
|
||||
t = 0
|
||||
self._init_states(inputs)
|
||||
self.attention.init_win_idx()
|
||||
self.attention.init_states(inputs)
|
||||
while True:
|
||||
if t > 0:
|
||||
new_memory = outputs[-1]
|
||||
self._update_memory_input(new_memory)
|
||||
if speaker_embeddings is not None:
|
||||
self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
|
||||
output, stop_token, attention = self.decode(inputs, None)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [output]
|
||||
attentions += [attention]
|
||||
stop_tokens += [stop_token]
|
||||
t += 1
|
||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6
|
||||
or attention[:, -1].item() > 0.6):
|
||||
break
|
||||
elif t > self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||
|
||||
|
||||
class StopNet(nn.Module):
|
||||
r"""
|
||||
Args:
|
||||
in_features (int): feature dimension of input.
|
||||
"""
|
||||
|
||||
def __init__(self, in_features):
|
||||
super(StopNet, self).__init__()
|
||||
self.dropout = nn.Dropout(0.1)
|
||||
self.linear = nn.Linear(in_features, 1)
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.linear.weight, gain=torch.nn.init.calculate_gain('linear'))
|
||||
|
||||
def forward(self, inputs):
|
||||
outputs = self.dropout(inputs)
|
||||
outputs = self.linear(outputs)
|
||||
return outputs
|
|
@ -1,353 +0,0 @@
|
|||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from .common_layers import init_attn, Prenet, Linear
|
||||
|
||||
|
||||
class ConvBNBlock(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, kernel_size, activation=None):
|
||||
super(ConvBNBlock, self).__init__()
|
||||
assert (kernel_size - 1) % 2 == 0
|
||||
padding = (kernel_size - 1) // 2
|
||||
self.convolution1d = nn.Conv1d(in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
padding=padding)
|
||||
self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
|
||||
self.dropout = nn.Dropout(p=0.5)
|
||||
if activation == 'relu':
|
||||
self.activation = nn.ReLU()
|
||||
elif activation == 'tanh':
|
||||
self.activation = nn.Tanh()
|
||||
else:
|
||||
self.activation = nn.Identity()
|
||||
|
||||
def forward(self, x):
|
||||
o = self.convolution1d(x)
|
||||
o = self.batch_normalization(o)
|
||||
o = self.activation(o)
|
||||
o = self.dropout(o)
|
||||
return o
|
||||
|
||||
|
||||
class Postnet(nn.Module):
|
||||
def __init__(self, output_dim, num_convs=5):
|
||||
super(Postnet, self).__init__()
|
||||
self.convolutions = nn.ModuleList()
|
||||
self.convolutions.append(
|
||||
ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
|
||||
for _ in range(1, num_convs - 1):
|
||||
self.convolutions.append(
|
||||
ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
|
||||
self.convolutions.append(
|
||||
ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
|
||||
|
||||
def forward(self, x):
|
||||
o = x
|
||||
for layer in self.convolutions:
|
||||
o = layer(o)
|
||||
return o
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
def __init__(self, output_input_dim=512):
|
||||
super(Encoder, self).__init__()
|
||||
self.convolutions = nn.ModuleList()
|
||||
for _ in range(3):
|
||||
self.convolutions.append(
|
||||
ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
|
||||
self.lstm = nn.LSTM(output_input_dim,
|
||||
int(output_input_dim / 2),
|
||||
num_layers=1,
|
||||
batch_first=True,
|
||||
bias=True,
|
||||
bidirectional=True)
|
||||
self.rnn_state = None
|
||||
|
||||
def forward(self, x, input_lengths):
|
||||
o = x
|
||||
for layer in self.convolutions:
|
||||
o = layer(o)
|
||||
o = o.transpose(1, 2)
|
||||
o = nn.utils.rnn.pack_padded_sequence(o,
|
||||
input_lengths,
|
||||
batch_first=True)
|
||||
self.lstm.flatten_parameters()
|
||||
o, _ = self.lstm(o)
|
||||
o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
|
||||
return o
|
||||
|
||||
def inference(self, x):
|
||||
o = x
|
||||
for layer in self.convolutions:
|
||||
o = layer(o)
|
||||
o = o.transpose(1, 2)
|
||||
# self.lstm.flatten_parameters()
|
||||
o, _ = self.lstm(o)
|
||||
return o
|
||||
|
||||
|
||||
# adapted from https://github.com/NVIDIA/tacotron2/
|
||||
class Decoder(nn.Module):
|
||||
# Pylint gets confused by PyTorch conventions here
|
||||
#pylint: disable=attribute-defined-outside-init
|
||||
def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
|
||||
prenet_type, prenet_dropout, forward_attn, trans_agent,
|
||||
forward_attn_mask, location_attn, attn_K, separate_stopnet,
|
||||
speaker_embedding_dim):
|
||||
super(Decoder, self).__init__()
|
||||
self.frame_dim = frame_dim
|
||||
self.r_init = r
|
||||
self.r = r
|
||||
self.encoder_embedding_dim = input_dim
|
||||
self.separate_stopnet = separate_stopnet
|
||||
self.max_decoder_steps = 1000
|
||||
self.gate_threshold = 0.5
|
||||
|
||||
# model dimensions
|
||||
self.query_dim = 1024
|
||||
self.decoder_rnn_dim = 1024
|
||||
self.prenet_dim = 256
|
||||
self.attn_dim = 128
|
||||
self.p_attention_dropout = 0.1
|
||||
self.p_decoder_dropout = 0.1
|
||||
|
||||
# memory -> |Prenet| -> processed_memory
|
||||
prenet_dim = self.frame_dim
|
||||
self.prenet = Prenet(prenet_dim,
|
||||
prenet_type,
|
||||
prenet_dropout,
|
||||
out_features=[self.prenet_dim, self.prenet_dim],
|
||||
bias=False)
|
||||
|
||||
self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
|
||||
self.query_dim,
|
||||
bias=True)
|
||||
|
||||
self.attention = init_attn(attn_type=attn_type,
|
||||
query_dim=self.query_dim,
|
||||
embedding_dim=input_dim,
|
||||
attention_dim=128,
|
||||
location_attention=location_attn,
|
||||
attention_location_n_filters=32,
|
||||
attention_location_kernel_size=31,
|
||||
windowing=attn_win,
|
||||
norm=attn_norm,
|
||||
forward_attn=forward_attn,
|
||||
trans_agent=trans_agent,
|
||||
forward_attn_mask=forward_attn_mask,
|
||||
attn_K=attn_K)
|
||||
|
||||
self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
|
||||
self.decoder_rnn_dim,
|
||||
bias=True)
|
||||
|
||||
self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
|
||||
self.frame_dim * self.r_init)
|
||||
|
||||
self.stopnet = nn.Sequential(
|
||||
nn.Dropout(0.1),
|
||||
Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
|
||||
1,
|
||||
bias=True,
|
||||
init_gain='sigmoid'))
|
||||
self.memory_truncated = None
|
||||
|
||||
def set_r(self, new_r):
|
||||
self.r = new_r
|
||||
|
||||
def get_go_frame(self, inputs):
|
||||
B = inputs.size(0)
|
||||
memory = torch.zeros(1, device=inputs.device).repeat(B,
|
||||
self.frame_dim * self.r)
|
||||
return memory
|
||||
|
||||
def _init_states(self, inputs, mask, keep_states=False):
|
||||
B = inputs.size(0)
|
||||
# T = inputs.size(1)
|
||||
if not keep_states:
|
||||
self.query = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.query_dim)
|
||||
self.attention_rnn_cell_state = torch.zeros(
|
||||
1, device=inputs.device).repeat(B, self.query_dim)
|
||||
self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.decoder_rnn_dim)
|
||||
self.decoder_cell = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.decoder_rnn_dim)
|
||||
self.context = torch.zeros(1, device=inputs.device).repeat(
|
||||
B, self.encoder_embedding_dim)
|
||||
self.inputs = inputs
|
||||
self.processed_inputs = self.attention.preprocess_inputs(inputs)
|
||||
self.mask = mask
|
||||
|
||||
def _reshape_memory(self, memory):
|
||||
"""
|
||||
Reshape the spectrograms for given 'r'
|
||||
"""
|
||||
# Grouping multiple frames if necessary
|
||||
if memory.size(-1) == self.frame_dim:
|
||||
memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
|
||||
# Time first (T_decoder, B, frame_dim)
|
||||
memory = memory.transpose(0, 1)
|
||||
return memory
|
||||
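`_reshape_memory` groups the target spectrogram into chunks of `r` frames so the decoder predicts `r` frames per step, then moves time to the first axis. A rough numeric sketch of that regrouping (shapes are illustrative):

```python
import torch

B, T, frame_dim, r = 2, 6, 80, 2
memory = torch.randn(B, T, frame_dim)
# (B, T, frame_dim) -> (B, T // r, r * frame_dim) -> (T // r, B, r * frame_dim)
memory = memory.view(B, T // r, -1).transpose(0, 1)
print(memory.shape)   # torch.Size([3, 2, 160])
```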
|
||||
def _parse_outputs(self, outputs, stop_tokens, alignments):
|
||||
alignments = torch.stack(alignments).transpose(0, 1)
|
||||
stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
|
||||
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
|
||||
outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
|
||||
outputs = outputs.transpose(1, 2)
|
||||
return outputs, stop_tokens, alignments
|
||||
|
||||
def _update_memory(self, memory):
|
||||
if len(memory.shape) == 2:
|
||||
return memory[:, self.frame_dim * (self.r - 1):]
|
||||
return memory[:, :, self.frame_dim * (self.r - 1):]
|
||||
|
||||
def decode(self, memory):
|
||||
'''
|
||||
shapes:
|
||||
- memory: B x r * self.frame_dim
|
||||
'''
|
||||
# self.context: B x D_en
|
||||
# query_input: B x D_en + (r * self.frame_dim)
|
||||
query_input = torch.cat((memory, self.context), -1)
|
||||
# self.query and self.attention_rnn_cell_state : B x D_attn_rnn
|
||||
self.query, self.attention_rnn_cell_state = self.attention_rnn(
|
||||
query_input, (self.query, self.attention_rnn_cell_state))
|
||||
self.query = F.dropout(self.query, self.p_attention_dropout,
|
||||
self.training)
|
||||
self.attention_rnn_cell_state = F.dropout(
|
||||
self.attention_rnn_cell_state, self.p_attention_dropout,
|
||||
self.training)
|
||||
# B x D_en
|
||||
self.context = self.attention(self.query, self.inputs,
|
||||
self.processed_inputs, self.mask)
|
||||
# B x (D_en + D_attn_rnn)
|
||||
decoder_rnn_input = torch.cat((self.query, self.context), -1)
|
||||
# self.decoder_hidden and self.decoder_cell: B x D_decoder_rnn
|
||||
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
||||
decoder_rnn_input, (self.decoder_hidden, self.decoder_cell))
|
||||
self.decoder_hidden = F.dropout(self.decoder_hidden,
|
||||
self.p_decoder_dropout, self.training)
|
||||
# B x (D_decoder_rnn + D_en)
|
||||
decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
|
||||
dim=1)
|
||||
# B x (self.r * self.frame_dim)
|
||||
decoder_output = self.linear_projection(decoder_hidden_context)
|
||||
# B x (D_decoder_rnn + (self.r * self.frame_dim))
|
||||
stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
|
||||
if self.separate_stopnet:
|
||||
stop_token = self.stopnet(stopnet_input.detach())
|
||||
else:
|
||||
stop_token = self.stopnet(stopnet_input)
|
||||
# select outputs for the reduction rate self.r
|
||||
decoder_output = decoder_output[:, :self.r * self.frame_dim]
|
||||
return decoder_output, self.attention.attention_weights, stop_token
|
||||
|
||||
def forward(self, inputs, memories, mask, speaker_embeddings=None):
|
||||
memory = self.get_go_frame(inputs).unsqueeze(0)
|
||||
memories = self._reshape_memory(memories)
|
||||
memories = torch.cat((memory, memories), dim=0)
|
||||
memories = self._update_memory(memories)
|
||||
if speaker_embeddings is not None:
|
||||
memories = torch.cat([memories, speaker_embeddings], dim=-1)
|
||||
memories = self.prenet(memories)
|
||||
|
||||
self._init_states(inputs, mask=mask)
|
||||
self.attention.init_states(inputs)
|
||||
|
||||
outputs, stop_tokens, alignments = [], [], []
|
||||
while len(outputs) < memories.size(0) - 1:
|
||||
memory = memories[len(outputs)]
|
||||
decoder_output, attention_weights, stop_token = self.decode(memory)
|
||||
outputs += [decoder_output.squeeze(1)]
|
||||
stop_tokens += [stop_token.squeeze(1)]
|
||||
alignments += [attention_weights]
|
||||
|
||||
outputs, stop_tokens, alignments = self._parse_outputs(
|
||||
outputs, stop_tokens, alignments)
|
||||
return outputs, alignments, stop_tokens
|
||||
|
||||
def inference(self, inputs, speaker_embeddings=None):
|
||||
memory = self.get_go_frame(inputs)
|
||||
memory = self._update_memory(memory)
|
||||
|
||||
self._init_states(inputs, mask=None)
|
||||
self.attention.init_states(inputs)
|
||||
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
while True:
|
||||
memory = self.prenet(memory)
|
||||
if speaker_embeddings is not None:
|
||||
memory = torch.cat([memory, speaker_embeddings], dim=-1)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [decoder_output.squeeze(1)]
|
||||
stop_tokens += [stop_token]
|
||||
alignments += [alignment]
|
||||
|
||||
if stop_token > 0.7 and t > inputs.shape[0] / 2:
|
||||
break
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
|
||||
memory = self._update_memory(decoder_output)
|
||||
t += 1
|
||||
|
||||
outputs, stop_tokens, alignments = self._parse_outputs(
|
||||
outputs, stop_tokens, alignments)
|
||||
|
||||
return outputs, alignments, stop_tokens
|
||||
|
||||
def inference_truncated(self, inputs):
|
||||
"""
|
||||
Preserve decoder states for continuous inference
|
||||
"""
|
||||
if self.memory_truncated is None:
|
||||
self.memory_truncated = self.get_go_frame(inputs)
|
||||
self._init_states(inputs, mask=None, keep_states=False)
|
||||
else:
|
||||
self._init_states(inputs, mask=None, keep_states=True)
|
||||
|
||||
self.attention.init_win_idx()
|
||||
self.attention.init_states(inputs)
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
stop_flags = [True, False, False]
|
||||
while True:
|
||||
memory = self.prenet(self.memory_truncated)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [decoder_output.squeeze(1)]
|
||||
stop_tokens += [stop_token]
|
||||
alignments += [alignment]
|
||||
|
||||
if stop_token > 0.7:
|
||||
break
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
break
|
||||
|
||||
self.memory_truncated = decoder_output
|
||||
t += 1
|
||||
|
||||
outputs, stop_tokens, alignments = self._parse_outputs(
|
||||
outputs, stop_tokens, alignments)
|
||||
|
||||
return outputs, alignments, stop_tokens
|
||||
|
||||
def inference_step(self, inputs, t, memory=None):
|
||||
"""
|
||||
For debug purposes
|
||||
"""
|
||||
if t == 0:
|
||||
memory = self.get_go_frame(inputs)
|
||||
self._init_states(inputs, mask=None)
|
||||
|
||||
memory = self.prenet(memory)
|
||||
decoder_output, stop_token, alignment = self.decode(memory)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
memory = decoder_output
|
||||
return decoder_output, stop_token, alignment
|
|
@ -1,160 +0,0 @@
|
|||
# coding: utf-8
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.layers.gst_layers import GST
|
||||
from TTS.layers.tacotron import Decoder, Encoder, PostCBHG
|
||||
from TTS.models.tacotron_abstract import TacotronAbstract
|
||||
|
||||
|
||||
class Tacotron(TacotronAbstract):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
r=5,
|
||||
postnet_output_dim=1025,
|
||||
decoder_output_dim=80,
|
||||
attn_type='original',
|
||||
attn_win=False,
|
||||
attn_norm="sigmoid",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
forward_attn_mask=False,
|
||||
location_attn=True,
|
||||
attn_K=5,
|
||||
separate_stopnet=True,
|
||||
bidirectional_decoder=False,
|
||||
double_decoder_consistency=False,
|
||||
ddc_r=None,
|
||||
gst=False,
|
||||
memory_size=5):
|
||||
super(Tacotron,
|
||||
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
|
||||
decoder_output_dim, attn_type, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, attn_K, separate_stopnet,
|
||||
bidirectional_decoder, double_decoder_consistency,
|
||||
ddc_r, gst)
|
||||
decoder_in_features = 512 if num_speakers > 1 else 256
|
||||
encoder_in_features = 512 if num_speakers > 1 else 256
|
||||
speaker_embedding_dim = 256
|
||||
proj_speaker_dim = 80 if num_speakers > 1 else 0
|
||||
# base model layers
|
||||
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
|
||||
self.embedding.weight.data.normal_(0, 0.3)
|
||||
self.encoder = Encoder(encoder_in_features)
|
||||
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
|
||||
memory_size, attn_type, attn_win, attn_norm,
|
||||
prenet_type, prenet_dropout, forward_attn,
|
||||
trans_agent, forward_attn_mask, location_attn,
|
||||
attn_K, separate_stopnet, proj_speaker_dim)
|
||||
self.postnet = PostCBHG(decoder_output_dim)
|
||||
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
|
||||
postnet_output_dim)
|
||||
# speaker embedding layers
|
||||
if num_speakers > 1:
|
||||
self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
|
||||
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
||||
self.speaker_project_mel = nn.Sequential(
|
||||
nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
|
||||
self.speaker_embeddings = None
|
||||
self.speaker_embeddings_projected = None
|
||||
# global style token layers
|
||||
if self.gst:
|
||||
gst_embedding_dim = 256
|
||||
self.gst_layer = GST(num_mel=80,
|
||||
num_heads=4,
|
||||
num_style_tokens=10,
|
||||
embedding_dim=gst_embedding_dim)
|
||||
# backward pass decoder
|
||||
if self.bidirectional_decoder:
|
||||
self._init_backward_decoder()
|
||||
# setup DDC
|
||||
if self.double_decoder_consistency:
|
||||
self.coarse_decoder = Decoder(
|
||||
decoder_in_features, decoder_output_dim, ddc_r, memory_size,
|
||||
attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask, location_attn,
|
||||
attn_K, separate_stopnet, proj_speaker_dim)
|
||||
|
||||
|
||||
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
|
||||
"""
|
||||
Shapes:
|
||||
- characters: B x T_in
|
||||
- text_lengths: B
|
||||
- mel_specs: B x T_out x D
|
||||
- speaker_ids: B x 1
|
||||
"""
|
||||
self._init_states()
|
||||
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
|
||||
# B x T_in x embed_dim
|
||||
inputs = self.embedding(characters)
|
||||
# B x speaker_embed_dim
|
||||
if speaker_ids is not None:
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
# B x T_in x embed_dim + speaker_embed_dim
|
||||
inputs = self._concat_speaker_embedding(inputs,
|
||||
self.speaker_embeddings)
|
||||
# B x T_in x encoder_in_features
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
# sequence masking
|
||||
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
|
||||
# global style token
|
||||
if self.gst:
|
||||
# B x gst_dim
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
|
||||
if self.num_speakers > 1:
|
||||
encoder_outputs = self._concat_speaker_embedding(
|
||||
encoder_outputs, self.speaker_embeddings)
|
||||
# decoder_outputs: B x decoder_output_dim x T_out
|
||||
# alignments: B x T_decoder x T_in
|
||||
# stop_tokens: B x T_decoder
|
||||
decoder_outputs, alignments, stop_tokens = self.decoder(
|
||||
encoder_outputs, mel_specs, input_mask,
|
||||
self.speaker_embeddings_projected)
|
||||
# sequence masking
|
||||
if output_mask is not None:
|
||||
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
|
||||
# B x T_out x decoder_in_features
|
||||
postnet_outputs = self.postnet(decoder_outputs)
|
||||
# sequence masking
|
||||
if output_mask is not None:
|
||||
postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs)
|
||||
# B x T_out x postnet_output_dim
|
||||
postnet_outputs = self.last_linear(postnet_outputs)
|
||||
# B x T_out x decoder_output_dim
|
||||
decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
|
||||
if self.bidirectional_decoder:
|
||||
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
|
||||
if self.double_decoder_consistency:
|
||||
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(self, characters, speaker_ids=None, style_mel=None):
|
||||
inputs = self.embedding(characters)
|
||||
self._init_states()
|
||||
if speaker_ids is not None:
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
inputs = self._concat_speaker_embedding(inputs,
|
||||
self.speaker_embeddings)
|
||||
encoder_outputs = self.encoder(inputs)
|
||||
if self.gst and style_mel is not None:
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
|
||||
if self.num_speakers > 1:
|
||||
encoder_outputs = self._concat_speaker_embedding(
|
||||
encoder_outputs, self.speaker_embeddings)
|
||||
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
|
||||
encoder_outputs, self.speaker_embeddings_projected)
|
||||
postnet_outputs = self.postnet(decoder_outputs)
|
||||
postnet_outputs = self.last_linear(postnet_outputs)
|
||||
decoder_outputs = decoder_outputs.transpose(1, 2)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens
|
|
@ -1,169 +0,0 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.layers.gst_layers import GST
|
||||
from TTS.layers.tacotron2 import Decoder, Encoder, Postnet
|
||||
from TTS.models.tacotron_abstract import TacotronAbstract
|
||||
|
||||
|
||||
# TODO: match function arguments with tacotron
|
||||
class Tacotron2(TacotronAbstract):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
r,
|
||||
postnet_output_dim=80,
|
||||
decoder_output_dim=80,
|
||||
attn_type='original',
|
||||
attn_win=False,
|
||||
attn_norm="softmax",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
forward_attn_mask=False,
|
||||
location_attn=True,
|
||||
attn_K=5,
|
||||
separate_stopnet=True,
|
||||
bidirectional_decoder=False,
|
||||
double_decoder_consistency=False,
|
||||
ddc_r=None,
|
||||
gst=False):
|
||||
super(Tacotron2,
|
||||
self).__init__(num_chars, num_speakers, r, postnet_output_dim,
|
||||
decoder_output_dim, attn_type, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, attn_K, separate_stopnet,
|
||||
bidirectional_decoder, double_decoder_consistency,
|
||||
ddc_r, gst)
|
||||
decoder_in_features = 512 if num_speakers > 1 else 512
|
||||
encoder_in_features = 512 if num_speakers > 1 else 512
|
||||
proj_speaker_dim = 80 if num_speakers > 1 else 0
|
||||
# base layers
|
||||
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
|
||||
if num_speakers > 1:
|
||||
self.speaker_embedding = nn.Embedding(num_speakers, 512)
|
||||
self.speaker_embedding.weight.data.normal_(0, 0.3)
|
||||
self.encoder = Encoder(encoder_in_features)
|
||||
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
|
||||
attn_norm, prenet_type, prenet_dropout,
|
||||
forward_attn, trans_agent, forward_attn_mask,
|
||||
location_attn, attn_K, separate_stopnet, proj_speaker_dim)
|
||||
self.postnet = Postnet(self.postnet_output_dim)
|
||||
# global style token layers
|
||||
if self.gst:
|
||||
gst_embedding_dim = encoder_in_features
|
||||
self.gst_layer = GST(num_mel=80,
|
||||
num_heads=4,
|
||||
num_style_tokens=10,
|
||||
embedding_dim=gst_embedding_dim)
|
||||
# backward pass decoder
|
||||
if self.bidirectional_decoder:
|
||||
self._init_backward_decoder()
|
||||
# setup DDC
|
||||
if self.double_decoder_consistency:
|
||||
self.coarse_decoder = Decoder(
|
||||
decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
|
||||
attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
|
||||
trans_agent, forward_attn_mask, location_attn, attn_K,
|
||||
separate_stopnet, proj_speaker_dim)
|
||||
|
||||
@staticmethod
|
||||
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
|
||||
mel_outputs = mel_outputs.transpose(1, 2)
|
||||
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
|
||||
return mel_outputs, mel_outputs_postnet, alignments
|
||||
|
||||
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
|
||||
self._init_states()
|
||||
# compute mask for padding
|
||||
# B x T_in_max (boolean)
|
||||
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
|
||||
# B x D_embed x T_in_max
|
||||
embedded_inputs = self.embedding(text).transpose(1, 2)
|
||||
# B x T_in_max x D_en
|
||||
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
|
||||
# adding speaker embedding to encoder output
|
||||
# TODO: multi-speaker
|
||||
# B x speaker_embed_dim
|
||||
if speaker_ids is not None:
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
# B x T_in x embed_dim + speaker_embed_dim
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
self.speaker_embeddings)
|
||||
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
|
||||
# global style token
|
||||
if self.gst:
|
||||
# B x gst_dim
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
|
||||
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
|
||||
decoder_outputs, alignments, stop_tokens = self.decoder(
|
||||
encoder_outputs, mel_specs, input_mask)
|
||||
# sequence masking
|
||||
if mel_lengths is not None:
|
||||
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
|
||||
# B x mel_dim x T_out
|
||||
postnet_outputs = self.postnet(decoder_outputs)
|
||||
postnet_outputs = decoder_outputs + postnet_outputs
|
||||
# sequence masking
|
||||
if output_mask is not None:
|
||||
postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs)
|
||||
# B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in
|
||||
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
|
||||
decoder_outputs, postnet_outputs, alignments)
|
||||
if self.bidirectional_decoder:
|
||||
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
|
||||
if self.double_decoder_consistency:
|
||||
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(self, text, speaker_ids=None):
|
||||
embedded_inputs = self.embedding(text).transpose(1, 2)
|
||||
encoder_outputs = self.encoder.inference(embedded_inputs)
|
||||
if speaker_ids is not None:
|
||||
self.compute_speaker_embedding(speaker_ids)
|
||||
if self.num_speakers > 1:
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
self.speaker_embeddings)
|
||||
decoder_outputs, alignments, stop_tokens = self.decoder.inference(
|
||||
encoder_outputs)
|
||||
postnet_outputs = self.postnet(decoder_outputs)
|
||||
postnet_outputs = decoder_outputs + postnet_outputs
|
||||
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
|
||||
decoder_outputs, postnet_outputs, alignments)
|
||||
return decoder_outputs, postnet_outputs, alignments, stop_tokens
|
||||
|
||||
def inference_truncated(self, text, speaker_ids=None):
|
||||
"""
|
||||
Preserve model states for continuous inference
|
||||
"""
|
||||
embedded_inputs = self.embedding(text).transpose(1, 2)
|
||||
encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
|
||||
encoder_outputs = self._add_speaker_embedding(encoder_outputs,
|
||||
speaker_ids)
|
||||
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
|
||||
encoder_outputs)
|
||||
mel_outputs_postnet = self.postnet(mel_outputs)
|
||||
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
||||
mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
|
||||
mel_outputs, mel_outputs_postnet, alignments)
|
||||
return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
|
||||
|
||||
|
||||
def _speaker_embedding_pass(self, encoder_outputs, speaker_ids):
|
||||
# TODO: multi-speaker
|
||||
# if hasattr(self, "speaker_embedding") and speaker_ids is None:
|
||||
# raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
|
||||
# if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
||||
|
||||
# speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
|
||||
# encoder_outputs.size(1),
|
||||
# -1)
|
||||
# encoder_outputs = encoder_outputs + speaker_embeddings
|
||||
# return encoder_outputs
|
||||
pass
|
|
@ -1,180 +0,0 @@
|
|||
import copy
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from TTS.utils.generic_utils import sequence_mask
|
||||
|
||||
|
||||
class TacotronAbstract(ABC, nn.Module):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
r,
|
||||
postnet_output_dim=80,
|
||||
decoder_output_dim=80,
|
||||
attn_type='original',
|
||||
attn_win=False,
|
||||
attn_norm="softmax",
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
forward_attn_mask=False,
|
||||
location_attn=True,
|
||||
attn_K=5,
|
||||
separate_stopnet=True,
|
||||
bidirectional_decoder=False,
|
||||
double_decoder_consistency=False,
|
||||
ddc_r=None,
|
||||
gst=False):
|
||||
""" Abstract Tacotron class """
|
||||
super().__init__()
|
||||
self.num_chars = num_chars
|
||||
self.r = r
|
||||
self.decoder_output_dim = decoder_output_dim
|
||||
self.postnet_output_dim = postnet_output_dim
|
||||
self.gst = gst
|
||||
self.num_speakers = num_speakers
|
||||
self.bidirectional_decoder = bidirectional_decoder
|
||||
self.double_decoder_consistency = double_decoder_consistency
|
||||
self.ddc_r = ddc_r
|
||||
self.attn_type = attn_type
|
||||
self.attn_win = attn_win
|
||||
self.attn_norm = attn_norm
|
||||
self.prenet_type = prenet_type
|
||||
self.prenet_dropout = prenet_dropout
|
||||
self.forward_attn = forward_attn
|
||||
self.trans_agent = trans_agent
|
||||
self.forward_attn_mask = forward_attn_mask
|
||||
self.location_attn = location_attn
|
||||
self.attn_K = attn_K
|
||||
self.separate_stopnet = separate_stopnet
|
||||
|
||||
# layers
|
||||
self.embedding = None
|
||||
self.encoder = None
|
||||
self.decoder = None
|
||||
self.postnet = None
|
||||
|
||||
# global style token
|
||||
if self.gst:
|
||||
self.gst_layer = None
|
||||
|
||||
# model states
|
||||
self.speaker_embeddings = None
|
||||
self.speaker_embeddings_projected = None
|
||||
|
||||
# additional layers
|
||||
self.decoder_backward = None
|
||||
self.coarse_decoder = None
|
||||
|
||||
#############################
|
||||
# INIT FUNCTIONS
|
||||
#############################
|
||||
|
||||
def _init_states(self):
|
||||
self.speaker_embeddings = None
|
||||
self.speaker_embeddings_projected = None
|
||||
|
||||
def _init_backward_decoder(self):
|
||||
self.decoder_backward = copy.deepcopy(self.decoder)
|
||||
|
||||
def _init_coarse_decoder(self):
|
||||
self.coarse_decoder = copy.deepcopy(self.decoder)
|
||||
self.coarse_decoder.r_init = self.ddc_r
|
||||
self.coarse_decoder.set_r(self.ddc_r)
|
||||
|
||||
#############################
|
||||
# CORE FUNCTIONS
|
||||
#############################
|
||||
|
||||
@abstractmethod
|
||||
def forward(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def inference(self):
|
||||
pass
|
||||
|
||||
#############################
|
||||
# COMMON COMPUTE FUNCTIONS
|
||||
#############################
|
||||
|
||||
def compute_masks(self, text_lengths, mel_lengths):
|
||||
"""Compute masks against sequence paddings."""
|
||||
# B x T_in_max (boolean)
|
||||
device = text_lengths.device
|
||||
input_mask = sequence_mask(text_lengths).to(device)
|
||||
output_mask = None
|
||||
if mel_lengths is not None:
|
||||
max_len = mel_lengths.max()
|
||||
r = self.decoder.r
|
||||
max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
|
||||
output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device)
|
||||
return input_mask, output_mask
|
||||
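`compute_masks` relies on a `sequence_mask` helper that turns lengths into boolean masks; the output mask length is also rounded up to a multiple of the reduction factor `r` so it lines up with the decoder's grouped frames. A minimal stand-in for such a helper is sketched below (a generic version, not necessarily the exact signature of `TTS.utils.generic_utils.sequence_mask`):

```python
import torch

def sequence_mask(lengths, max_len=None):
    max_len = max_len or int(lengths.max())
    ids = torch.arange(max_len, device=lengths.device)
    return ids.unsqueeze(0) < lengths.unsqueeze(1)   # (B, max_len) boolean mask

print(sequence_mask(torch.tensor([2, 4])))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])
```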
|
||||
def _backward_pass(self, mel_specs, encoder_outputs, mask):
|
||||
""" Run backwards decoder """
|
||||
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
|
||||
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
|
||||
self.speaker_embeddings_projected)
|
||||
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
|
||||
return decoder_outputs_b, alignments_b
|
||||
|
||||
def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments,
|
||||
input_mask):
|
||||
""" Double Decoder Consistency """
|
||||
T = mel_specs.shape[1]
|
||||
if T % self.coarse_decoder.r > 0:
|
||||
padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
|
||||
mel_specs = torch.nn.functional.pad(mel_specs,
|
||||
(0, 0, 0, padding_size, 0, 0))
|
||||
decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
|
||||
encoder_outputs.detach(), mel_specs, input_mask)
|
||||
# scale_factor = self.decoder.r_init / self.decoder.r
|
||||
alignments_backward = torch.nn.functional.interpolate(
|
||||
alignments_backward.transpose(1, 2),
|
||||
size=alignments.shape[1],
|
||||
mode='nearest').transpose(1, 2)
|
||||
decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
|
||||
decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
|
||||
return decoder_outputs_backward, alignments_backward
|
||||
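For the coarse pass, the target length must be divisible by the coarse reduction factor, so the mel target is zero-padded on the time axis first; the coarse alignments are later stretched back to the fine time resolution with nearest-neighbour interpolation. A tiny example of the padding arithmetic (values assumed):

```python
import torch

T, coarse_r = 101, 6
mel_specs = torch.randn(2, T, 80)
if T % coarse_r > 0:
    padding_size = coarse_r - (T % coarse_r)   # 6 - 5 = 1
    mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size))
print(mel_specs.shape)   # torch.Size([2, 102, 80]) – now divisible by 6
```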
|
||||
#############################
|
||||
# EMBEDDING FUNCTIONS
|
||||
#############################
|
||||
|
||||
def compute_speaker_embedding(self, speaker_ids):
|
||||
""" Compute speaker embedding vectors """
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is None:
|
||||
raise RuntimeError(
|
||||
" [!] Model has speaker embedding layer but speaker_id is not provided"
|
||||
)
|
||||
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
|
||||
self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
|
||||
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
|
||||
self.speaker_embeddings_projected = self.speaker_project_mel(
|
||||
self.speaker_embeddings).squeeze(1)
|
||||
|
||||
def compute_gst(self, inputs, mel_specs):
|
||||
""" Compute global style token """
|
||||
# pylint: disable=not-callable
|
||||
gst_outputs = self.gst_layer(mel_specs)
|
||||
inputs = self._add_speaker_embedding(inputs, gst_outputs)
|
||||
return inputs
|
||||
|
||||
@staticmethod
|
||||
def _add_speaker_embedding(outputs, speaker_embeddings):
|
||||
speaker_embeddings_ = speaker_embeddings.expand(
|
||||
outputs.size(0), outputs.size(1), -1)
|
||||
outputs = outputs + speaker_embeddings_
|
||||
return outputs
|
||||
|
||||
@staticmethod
|
||||
def _concat_speaker_embedding(outputs, speaker_embeddings):
|
||||
speaker_embeddings_ = speaker_embeddings.expand(
|
||||
outputs.size(0), outputs.size(1), -1)
|
||||
outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
|
||||
return outputs
|
|
@ -16,9 +16,9 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.utils.generic_utils import load_config\n",
|
||||
"from TTS.tts.utils.audio import AudioProcessor\n",
|
||||
"from TTS.tts.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.tts.utils.generic_utils import load_config\n",
|
||||
"import glob \n",
|
||||
"import IPython.display as ipd"
|
||||
]
|
||||
|
|
|
@ -22,12 +22,12 @@
|
|||
"import numpy as np\n",
|
||||
"from tqdm import tqdm as tqdm\n",
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"from TTS.datasets.TTSDataset import MyDataset\n",
|
||||
"from TTS.layers.losses import L1LossMasked\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"from TTS.tts.datasets.TTSDataset import MyDataset\n",
|
||||
"from TTS.tts.layers.losses import L1LossMasked\n",
|
||||
"from TTS.tts.utils.audio import AudioProcessor\n",
|
||||
"from TTS.tts.utils.visual import plot_spectrogram\n",
|
||||
"from TTS.tts.utils.generic_utils import load_config, setup_model, sequence_mask\n",
|
||||
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
|
@ -108,7 +108,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
|
||||
"preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
|
||||
"preprocessor = getattr(preprocessor, DATASET.lower())\n",
|
||||
"meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n",
|
||||
"dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n",
|
||||
|
|
|
@ -36,14 +36,14 @@
|
|||
"import librosa\n",
|
||||
"import librosa.display\n",
|
||||
"\n",
|
||||
"from TTS.layers import *\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.generic_utils import setup_model\n",
|
||||
"from TTS.utils.io import load_config\n",
|
||||
"from TTS.utils.text import text_to_sequence\n",
|
||||
"from TTS.utils.synthesis import synthesis\n",
|
||||
"from TTS.utils.visual import plot_alignment\n",
|
||||
"from TTS.utils.measures import alignment_diagonal_score\n",
|
||||
"from TTS.tts.layers import *\n",
|
||||
"from TTS.tts.utils.audio import AudioProcessor\n",
|
||||
"from TTS.tts.utils.generic_utils import setup_model\n",
|
||||
"from TTS.tts.utils.io import load_config\n",
|
||||
"from TTS.tts.utils.text import text_to_sequence\n",
|
||||
"from TTS.tts.utils.synthesis import synthesis\n",
|
||||
"from TTS.tts.utils.visual import plot_alignment\n",
|
||||
"from TTS.tts.utils.measures import alignment_diagonal_score\n",
|
||||
"\n",
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
|
@ -96,7 +96,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# LOAD TTS MODEL\n",
|
||||
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
|
||||
"\n",
|
||||
"# multi speaker \n",
|
||||
"if CONFIG.use_speaker_embedding:\n",
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -27,7 +27,7 @@
|
|||
"from multiprocessing import Pool\n",
|
||||
"from matplotlib import pylab as plt\n",
|
||||
"from collections import Counter\n",
|
||||
"from TTS.datasets.preprocess import *\n",
|
||||
"from TTS.tts.datasets.preprocess import *\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
|
@ -0,0 +1,10 @@
|
|||
# tests
|
||||
nosetests tests -x
|
||||
|
||||
# runtime tests
|
||||
./tests/test_server_package.sh
|
||||
./tests/test_tts_train.sh
|
||||
./tests/test_vocoder_train.sh
|
||||
|
||||
# linter check
|
||||
cardboardlinter --refspec master
|
|
@ -1,47 +0,0 @@
|
|||
## TTS example web-server
|
||||
|
||||
You'll need a model package (a zip file that includes the TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
|
||||
|
||||
The instructions below are based on an Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependency versions predate Python 3.7 and would force building from source, which requires extra dependencies and is not guaranteed to work.
|
||||
|
||||
#### Development server:
|
||||
|
||||
##### Using server.py
|
||||
If you already have a working TTS environment, you can call ```server.py``` directly.
|
||||
|
||||
##### Using .whl
|
||||
1. apt-get install -y espeak libsndfile1 python3-venv
|
||||
2. python3 -m venv /tmp/venv
|
||||
3. source /tmp/venv/bin/activate
|
||||
4. pip install -U pip setuptools wheel
|
||||
5. pip install -U https://example.com/url/to/python/package.whl
|
||||
6. python -m TTS.server.server
|
||||
|
||||
You can now open http://localhost:5002 in a browser
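
Once the server is running, you can also fetch synthesized speech programmatically from the `/api/tts` endpoint. A minimal sketch, assuming the third-party `requests` package is installed and the server listens on the default port 5002:

```python
import requests

# Query the /api/tts endpoint; the response body is a WAV file.
resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello, world!"},
)
resp.raise_for_status()

with open("output.wav", "wb") as f:
    f.write(resp.content)
```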
|
||||
|
||||
#### Running with nginx/uwsgi:
|
||||
|
||||
1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
|
||||
2. python3 -m venv /tmp/venv
|
||||
3. source /tmp/venv/bin/activate
|
||||
4. pip install -U pip setuptools wheel
|
||||
5. pip install -U https://example.com/url/to/python/package.whl
|
||||
6. curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip
|
||||
7. unzip *-nginx-uwsgi.zip
|
||||
8. cp tts_site_nginx /etc/nginx/sites-enabled/default
|
||||
9. service nginx restart
|
||||
10. uwsgi --ini uwsgi.ini
|
||||
|
||||
You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx).
|
||||
Configure number of workers (number of requests that will be processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting.
|
||||
|
||||
#### Creating a server package with an embedded model
|
||||
|
||||
[setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command:
|
||||
|
||||
- `--checkpoint <path to checkpoint file>` - path to model checkpoint file you want to embed in the package
|
||||
- `--model_config <path to config.json file>` - path to corresponding config.json file for the checkpoint
|
||||
|
||||
To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`.
|
||||
|
||||
A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it.
|
|
@ -1,16 +0,0 @@
|
|||
{
|
||||
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
|
||||
"tts_file":"best_model.pth.tar", // tts checkpoint file
|
||||
"tts_config":"config.json", // tts config.json file
|
||||
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
||||
"vocoder_config":null,
|
||||
"vocoder_file": null,
|
||||
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
|
||||
"wavernn_path":null, // wavernn model root path
|
||||
"wavernn_file":null, // wavernn checkpoint file name
|
||||
"wavernn_config": null, // wavernn config file
|
||||
"is_wavernn_batched":true,
|
||||
"port": 5002,
|
||||
"use_cuda": true,
|
||||
"debug": true
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
#!flask/bin/python
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from flask import Flask, request, render_template, send_file
|
||||
from TTS.server.synthesizer import Synthesizer
|
||||
|
||||
|
||||
def create_argparser():
|
||||
def convert_boolean(x):
|
||||
return x.lower() in ['true', '1', 'yes']
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file')
|
||||
parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
|
||||
parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
|
||||
parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
|
||||
parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
|
||||
parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
|
||||
parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
|
||||
parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
|
||||
parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
|
||||
parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
|
||||
parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
|
||||
parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
|
||||
return parser
|
||||
|
||||
|
||||
synthesizer = None
|
||||
|
||||
embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model')
|
||||
|
||||
embedded_tts_folder = os.path.join(embedded_models_folder, 'tts')
|
||||
tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar')
|
||||
tts_config_file = os.path.join(embedded_tts_folder, 'config.json')
|
||||
|
||||
embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder')
|
||||
vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar')
|
||||
vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json')
|
||||
|
||||
# These models are soon to be deprecated
|
||||
embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
|
||||
wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
|
||||
wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')
|
||||
|
||||
args = create_argparser().parse_args()
|
||||
|
||||
# If these were not specified in the CLI args, use default values with embedded model files
|
||||
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
|
||||
args.tts_checkpoint = tts_checkpoint_file
|
||||
if not args.tts_config and os.path.isfile(tts_config_file):
|
||||
args.tts_config = tts_config_file
|
||||
|
||||
if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
|
||||
args.vocoder_checkpoint = vocoder_checkpoint_file
|
||||
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
|
||||
args.vocoder_config = vocoder_config_file
|
||||
|
||||
if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
|
||||
args.wavernn_checkpoint = wavernn_checkpoint_file
|
||||
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
|
||||
args.wavernn_config = wavernn_config_file
|
||||
|
||||
synthesizer = Synthesizer(args)
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/')
|
||||
def index():
|
||||
return render_template('index.html')
|
||||
|
||||
|
||||
@app.route('/api/tts', methods=['GET'])
|
||||
def tts():
|
||||
text = request.args.get('text')
|
||||
print(" > Model input: {}".format(text))
|
||||
data = synthesizer.tts(text)
|
||||
return send_file(data, mimetype='audio/wav')
|
||||
|
||||
|
||||
def main():
|
||||
app.run(debug=args.debug, host='0.0.0.0', port=args.port)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,194 +0,0 @@
|
|||
import io
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import yaml
|
||||
import pysbd
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.utils.generic_utils import setup_model
|
||||
from TTS.utils.speakers import load_speaker_mapping
|
||||
from TTS.vocoder.utils.generic_utils import setup_generator
|
||||
# pylint: disable=unused-wildcard-import
|
||||
# pylint: disable=wildcard-import
|
||||
from TTS.utils.synthesis import *
|
||||
|
||||
from TTS.utils.text import make_symbols, phonemes, symbols
|
||||
|
||||
|
||||
class Synthesizer(object):
|
||||
def __init__(self, config):
|
||||
self.wavernn = None
|
||||
self.vocoder_model = None
|
||||
self.config = config
|
||||
print(config)
|
||||
self.seg = self.get_segmenter("en")
|
||||
self.use_cuda = self.config.use_cuda
|
||||
if self.use_cuda:
|
||||
assert torch.cuda.is_available(), "CUDA is not available on this machine."
|
||||
self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
|
||||
self.config.use_cuda)
|
||||
if self.config.vocoder_checkpoint:
|
||||
self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
|
||||
if self.config.wavernn_lib_path:
|
||||
self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
|
||||
self.config.wavernn_config, self.config.use_cuda)
|
||||
|
||||
@staticmethod
|
||||
def get_segmenter(lang):
|
||||
return pysbd.Segmenter(language=lang, clean=True)
|
||||
|
||||
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
|
||||
# pylint: disable=global-statement
|
||||
global symbols, phonemes
|
||||
|
||||
print(" > Loading TTS model ...")
|
||||
print(" | > model config: ", tts_config)
|
||||
print(" | > checkpoint file: ", tts_checkpoint)
|
||||
|
||||
self.tts_config = load_config(tts_config)
|
||||
self.use_phonemes = self.tts_config.use_phonemes
|
||||
self.ap = AudioProcessor(**self.tts_config.audio)
|
||||
|
||||
if 'characters' in self.tts_config.keys():
|
||||
symbols, phonemes = make_symbols(**self.tts_config.characters)
|
||||
|
||||
if self.use_phonemes:
|
||||
self.input_size = len(phonemes)
|
||||
else:
|
||||
self.input_size = len(symbols)
|
||||
# TODO: fix this for multi-speaker model - load speakers
|
||||
if self.config.tts_speakers is not None:
|
||||
self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
|
||||
num_speakers = len(self.tts_speakers)
|
||||
else:
|
||||
num_speakers = 0
|
||||
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
|
||||
# load model state
|
||||
cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
|
||||
# load the model
|
||||
self.tts_model.load_state_dict(cp['model'])
|
||||
if use_cuda:
|
||||
self.tts_model.cuda()
|
||||
self.tts_model.eval()
|
||||
self.tts_model.decoder.max_decoder_steps = 3000
|
||||
if 'r' in cp:
|
||||
self.tts_model.decoder.set_r(cp['r'])
|
||||
print(f" > model reduction factor: {cp['r']}")
|
||||
|
||||
def load_vocoder(self, model_file, model_config, use_cuda):
|
||||
self.vocoder_config = load_config(model_config)
|
||||
self.vocoder_model = setup_generator(self.vocoder_config)
|
||||
self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
|
||||
self.vocoder_model.remove_weight_norm()
|
||||
self.vocoder_model.inference_padding = 0
|
||||
self.vocoder_config = load_config(model_config)
|
||||
|
||||
if use_cuda:
|
||||
self.vocoder_model.cuda()
|
||||
self.vocoder_model.eval()
|
||||
|
||||
def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
|
||||
# TODO: set a function in wavernn code base for model setup and call it here.
|
||||
sys.path.append(lib_path) # set this if WaveRNN is not installed globally
|
||||
#pylint: disable=import-outside-toplevel
|
||||
from WaveRNN.models.wavernn import Model
|
||||
print(" > Loading WaveRNN model ...")
|
||||
print(" | > model config: ", model_config)
|
||||
print(" | > model file: ", model_file)
|
||||
self.wavernn_config = load_config(model_config)
|
||||
# This is the default architecture we use for our models.
|
||||
# You might need to update it
|
||||
self.wavernn = Model(
|
||||
rnn_dims=512,
|
||||
fc_dims=512,
|
||||
mode=self.wavernn_config.mode,
|
||||
mulaw=self.wavernn_config.mulaw,
|
||||
pad=self.wavernn_config.pad,
|
||||
use_aux_net=self.wavernn_config.use_aux_net,
|
||||
use_upsample_net=self.wavernn_config.use_upsample_net,
|
||||
upsample_factors=self.wavernn_config.upsample_factors,
|
||||
feat_dims=80,
|
||||
compute_dims=128,
|
||||
res_out_dims=128,
|
||||
res_blocks=10,
|
||||
hop_length=self.ap.hop_length,
|
||||
sample_rate=self.ap.sample_rate,
|
||||
).cuda()
|
||||
|
||||
check = torch.load(model_file, map_location="cpu")
|
||||
self.wavernn.load_state_dict(check['model'])
|
||||
if use_cuda:
|
||||
self.wavernn.cuda()
|
||||
self.wavernn.eval()
|
||||
|
||||
def save_wav(self, wav, path):
|
||||
# wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
|
||||
wav = np.array(wav)
|
||||
self.ap.save_wav(wav, path)
|
||||
|
||||
def split_into_sentences(self, text):
|
||||
return self.seg.segment(text)
|
||||
|
||||
def tts(self, text, speaker_id=None):
|
||||
start_time = time.time()
|
||||
wavs = []
|
||||
sens = self.split_into_sentences(text)
|
||||
print(sens)
|
||||
speaker_id = id_to_torch(speaker_id)
|
||||
if speaker_id is not None and self.use_cuda:
|
||||
speaker_id = speaker_id.cuda()
|
||||
|
||||
for sen in sens:
|
||||
# preprocess the given text
|
||||
inputs = text_to_seqvec(sen, self.tts_config)
|
||||
inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
|
||||
inputs = inputs.unsqueeze(0)
|
||||
# synthesize voice
|
||||
_, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
|
||||
if self.vocoder_model:
|
||||
# use native vocoder model
|
||||
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
|
||||
wav = self.vocoder_model.inference(vocoder_input)
|
||||
if self.use_cuda:
|
||||
wav = wav.cpu().numpy()
|
||||
else:
|
||||
wav = wav.numpy()
|
||||
wav = wav.flatten()
|
||||
elif self.wavernn:
|
||||
# use 3rd party WaveRNN
|
||||
vocoder_input = None
|
||||
if self.tts_config.model == "Tacotron":
|
||||
vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
|
||||
else:
|
||||
vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
|
||||
if self.use_cuda:
|
||||
vocoder_input.cuda()
|
||||
wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
|
||||
else:
|
||||
# use GL
|
||||
if self.use_cuda:
|
||||
postnet_output = postnet_output[0].cpu()
|
||||
else:
|
||||
postnet_output = postnet_output[0]
|
||||
postnet_output = postnet_output.numpy()
|
||||
wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
|
||||
|
||||
# trim silence
|
||||
wav = trim_silence(wav, self.ap)
|
||||
|
||||
wavs += list(wav)
|
||||
wavs += [0] * 10000
|
||||
|
||||
out = io.BytesIO()
|
||||
self.save_wav(wavs, out)
|
||||
|
||||
# compute stats
|
||||
process_time = time.time() - start_time
|
||||
audio_time = len(wavs) / self.tts_config.audio['sample_rate']
|
||||
print(f" > Processing time: {process_time}")
|
||||
print(f" > Real-time factor: {process_time / audio_time}")
|
||||
return out
|
|
@ -1,111 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<meta name="description" content="">
|
||||
<meta name="author" content="">
|
||||
|
||||
<title>Mozilla - Text2Speech engine</title>
|
||||
|
||||
<!-- Bootstrap core CSS -->
|
||||
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
|
||||
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">
|
||||
|
||||
<!-- Custom styles for this template -->
|
||||
<style>
|
||||
body {
|
||||
padding-top: 54px;
|
||||
}
|
||||
@media (min-width: 992px) {
|
||||
body {
|
||||
padding-top: 56px;
|
||||
}
|
||||
}
|
||||
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<a href="https://github.com/mozilla/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
|
||||
|
||||
<!-- Navigation -->
|
||||
<!--
|
||||
<nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
|
||||
<div class="container">
|
||||
<a class="navbar-brand" href="#">Mozilla TTS</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarResponsive">
|
||||
<ul class="navbar-nav ml-auto">
|
||||
<li class="nav-item active">
|
||||
<a class="nav-link" href="#">Home
|
||||
<span class="sr-only">(current)</span>
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
-->
|
||||
|
||||
<!-- Page Content -->
|
||||
<div class="container">
|
||||
<div class="row">
|
||||
<div class="col-lg-12 text-center">
|
||||
<img class="mt-5" src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" alt=></img>
|
||||
<h1 class="mt-5">Mozilla TTS</h1>
|
||||
<ul class="list-unstyled">
|
||||
</ul>
|
||||
<input id="text" placeholder="Type here..." size=45 type="text" name="text">
|
||||
<button id="speak-button" name="speak">Speak</button><br/><br/>
|
||||
<audio id="audio" controls autoplay hidden></audio>
|
||||
<p id="message"></p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Bootstrap core JavaScript -->
|
||||
<script>
|
||||
function q(selector) {return document.querySelector(selector)}
|
||||
q('#text').focus()
|
||||
function do_tts(e) {
|
||||
text = q('#text').value
|
||||
if (text) {
|
||||
q('#message').textContent = 'Synthesizing...'
|
||||
q('#speak-button').disabled = true
|
||||
q('#audio').hidden = true
|
||||
synthesize(text)
|
||||
}
|
||||
e.preventDefault()
|
||||
return false
|
||||
}
|
||||
q('#speak-button').addEventListener('click', do_tts)
|
||||
q('#text').addEventListener('keyup', function(e) {
|
||||
if (e.keyCode == 13) { // enter
|
||||
do_tts(e)
|
||||
}
|
||||
})
|
||||
function synthesize(text) {
|
||||
fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
|
||||
.then(function(res) {
|
||||
if (!res.ok) throw Error(res.statusText)
|
||||
return res.blob()
|
||||
}).then(function(blob) {
|
||||
q('#message').textContent = ''
|
||||
q('#speak-button').disabled = false
|
||||
q('#audio').src = URL.createObjectURL(blob)
|
||||
q('#audio').hidden = false
|
||||
}).catch(function(err) {
|
||||
q('#message').textContent = 'Error: ' + err.message
|
||||
q('#speak-button').disabled = false
|
||||
})
|
||||
}
|
||||
</script>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
10
setup.py
|
@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args()
|
|||
# Remove our arguments from argv so that setuptools doesn't see them
|
||||
sys.argv = [sys.argv[0]] + unknown_args
|
||||
|
||||
version = '0.0.3'
|
||||
version = '0.0.4'
|
||||
|
||||
# Adapted from https://github.com/pytorch/pytorch
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -112,6 +112,8 @@ setup(
|
|||
name='TTS',
|
||||
version=version,
|
||||
url='https://github.com/mozilla/TTS',
|
||||
author='Eren Gölge',
|
||||
author_email='egolge@mozilla.com',
|
||||
description='Text to Speech with Deep Learning',
|
||||
license='MPL-2.0',
|
||||
entry_points={
|
||||
|
@ -119,11 +121,7 @@ setup(
|
|||
'tts-server = TTS.server.server:main'
|
||||
]
|
||||
},
|
||||
package_dir={'': 'tts_namespace'},
|
||||
packages=find_packages('tts_namespace'),
|
||||
package_data={
|
||||
'TTS': package_data,
|
||||
},
|
||||
packages=find_packages(include=['TTS*']),
|
||||
project_urls={
|
||||
'Documentation': 'https://github.com/mozilla/TTS/wiki',
|
||||
'Tracker': 'https://github.com/mozilla/TTS/issues',
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
### Speaker Encoder
|
||||
|
||||
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
|
||||
|
||||
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
|
||||
|
||||
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
|
||||
|
||||

|
||||
|
||||
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
||||
|
||||
To run the code, you need to follow the same flow as in TTS.
|
||||
|
||||
- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
|
||||
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
||||
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files (see the sketch after this list).
|
||||
- Watch training on Tensorboard as in TTS
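
For a single file, the flow of `compute_embeddings.py` can be reduced to a short script. The following is a minimal sketch, assuming the import paths used by that script and placeholder paths for the checkpoint, config, and input wav:

```python
import torch

from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config

# Placeholder paths - replace with your own checkpoint, config, and audio file.
CONFIG_PATH = "speaker_encoder/config.json"
MODEL_PATH = "best_model.pth.tar"
WAV_PATH = "sample.wav"

c = load_config(CONFIG_PATH)
ap = AudioProcessor(**c["audio"])

model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu")["model"])
model.eval()

# mel spectrogram shaped (1, num_frames, num_mels), as in compute_embeddings.py
mel = ap.melspectrogram(ap.load_wav(WAV_PATH)).T
mel = torch.FloatTensor(mel[None, :, :])
embedding = model.compute_embedding(mel)  # (1, proj_dim) d-vector
print(embedding.shape)
```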
|
|
@ -1,88 +0,0 @@
|
|||
import argparse
|
||||
import glob
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
import torch
|
||||
from TTS.speaker_encoder.model import SpeakerEncoder
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import load_config
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Compute embedding vectors for each wav file in a dataset. ')
|
||||
parser.add_argument(
|
||||
'model_path',
|
||||
type=str,
|
||||
help='Path to model outputs (checkpoint, tensorboard etc.).')
|
||||
parser.add_argument(
|
||||
'config_path',
|
||||
type=str,
|
||||
help='Path to config file for training.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'data_path',
|
||||
type=str,
|
||||
help='Data path for wav files - directory or CSV file')
|
||||
parser.add_argument(
|
||||
'output_path',
|
||||
type=str,
|
||||
help='path for training outputs.')
|
||||
parser.add_argument(
|
||||
'--use_cuda', type=bool, help='flag to set cuda.', default=False
|
||||
)
|
||||
parser.add_argument(
|
||||
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
c = load_config(args.config_path)
|
||||
ap = AudioProcessor(**c['audio'])
|
||||
|
||||
data_path = args.data_path
|
||||
split_ext = os.path.splitext(data_path)
|
||||
sep = args.separator
|
||||
|
||||
if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
|
||||
# Parse CSV
|
||||
print(f'CSV file: {data_path}')
|
||||
with open(data_path) as f:
|
||||
wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
|
||||
wav_files = []
|
||||
print(f'Separator is: {sep}')
|
||||
for line in f:
|
||||
components = line.split(sep)
|
||||
if len(components) != 2:
|
||||
print("Invalid line")
|
||||
continue
|
||||
wav_file = os.path.join(wav_path, components[0] + '.wav')
|
||||
#print(f'wav_file: {wav_file}')
|
||||
if os.path.exists(wav_file):
|
||||
wav_files.append(wav_file)
|
||||
print(f'Count of wavs imported: {len(wav_files)}')
|
||||
else:
|
||||
# Parse all wav files in data_path
|
||||
wav_path = data_path
|
||||
wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
|
||||
|
||||
output_files = [wav_file.replace(wav_path, args.output_path).replace(
|
||||
'.wav', '.npy') for wav_file in wav_files]
|
||||
|
||||
for output_file in output_files:
|
||||
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
||||
|
||||
model = SpeakerEncoder(**c.model)
|
||||
model.load_state_dict(torch.load(args.model_path)['model'])
|
||||
model.eval()
|
||||
if args.use_cuda:
|
||||
model.cuda()
|
||||
|
||||
for idx, wav_file in enumerate(tqdm(wav_files)):
|
||||
mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
|
||||
mel_spec = torch.FloatTensor(mel_spec[None, :, :])
|
||||
if args.use_cuda:
|
||||
mel_spec = mel_spec.cuda()
|
||||
embedd = model.compute_embedding(mel_spec)
|
||||
np.save(output_files[idx], embedd.detach().cpu().numpy())
|
|
@ -1,59 +0,0 @@
|
|||
{
|
||||
"run_name": "libritts_360-half",
|
||||
"run_description": "train speaker encoder for libritts 360",
|
||||
"audio": {
|
||||
// Audio processing parameters
|
||||
"num_mels": 40, // size of the mel spec frame.
|
||||
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"frame_length_ms": 50, // stft window length in ms.
|
||||
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
"reinit_layers": [],
|
||||
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
||||
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
|
||||
"print_step": 1, // Number of steps to log traning on console.
|
||||
"output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"model": {
|
||||
"input_dim": 40,
|
||||
"proj_dim": 128,
|
||||
"lstm_dim": 384,
|
||||
"num_lstm_layers": 3
|
||||
},
|
||||
"datasets":
|
||||
[
|
||||
{
|
||||
"name": "libri_tts",
|
||||
"path": "/home/erogol/Data/Libri-TTS/train-clean-360/",
|
||||
"meta_file_train": null,
|
||||
"meta_file_val": null
|
||||
},
|
||||
{
|
||||
"name": "libri_tts",
|
||||
"path": "/home/erogol/Data/Libri-TTS/train-clean-100/",
|
||||
"meta_file_train": null,
|
||||
"meta_file_val": null
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,123 +0,0 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import random
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class MyDataset(Dataset):
|
||||
def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
|
||||
num_utter_per_speaker=10, skip_speakers=False, verbose=False):
|
||||
"""
|
||||
Args:
|
||||
ap (TTS.utils.AudioProcessor): audio processor object.
|
||||
meta_data (list): list of dataset instances.
|
||||
voice_len (float): voice segment length in seconds.
|
||||
verbose (bool): print diagnostic information.
|
||||
"""
|
||||
self.items = meta_data
|
||||
self.sample_rate = ap.sample_rate
|
||||
self.voice_len = voice_len
|
||||
self.seq_len = int(voice_len * self.sample_rate)
|
||||
self.num_speakers_in_batch = num_speakers_in_batch
|
||||
self.num_utter_per_speaker = num_utter_per_speaker
|
||||
self.skip_speakers = skip_speakers
|
||||
self.ap = ap
|
||||
self.verbose = verbose
|
||||
self.__parse_items()
|
||||
if self.verbose:
|
||||
print("\n > DataLoader initialization")
|
||||
print(f" | > Number of instances : {len(self.items)}")
|
||||
print(f" | > Sequence length: {self.seq_len}")
|
||||
print(f" | > Num speakers: {len(self.speakers)}")
|
||||
|
||||
def load_wav(self, filename):
|
||||
audio = self.ap.load_wav(filename)
|
||||
return audio
|
||||
|
||||
def load_data(self, idx):
|
||||
text, wav_file, speaker_name = self.items[idx]
|
||||
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
|
||||
mel = self.ap.melspectrogram(wav).astype("float32")
|
||||
# sample seq_len
|
||||
|
||||
assert text.size > 0, self.items[idx][1]
|
||||
assert wav.size > 0, self.items[idx][1]
|
||||
|
||||
sample = {
|
||||
"mel": mel,
|
||||
"item_idx": self.items[idx][1],
|
||||
"speaker_name": speaker_name,
|
||||
}
|
||||
return sample
|
||||
|
||||
def __parse_items(self):
|
||||
"""
|
||||
Find unique speaker ids and create a dict mapping utterances from speaker id
|
||||
"""
|
||||
speakers = list({item[-1] for item in self.items})
|
||||
self.speaker_to_utters = {}
|
||||
self.speakers = []
|
||||
for speaker in speakers:
|
||||
speaker_utters = [item[1] for item in self.items if item[2] == speaker]
|
||||
if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
|
||||
print(
|
||||
f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
|
||||
)
|
||||
else:
|
||||
self.speakers.append(speaker)
|
||||
self.speaker_to_utters[speaker] = speaker_utters
|
||||
|
||||
def __len__(self):
|
||||
return int(1e10)
|
||||
|
||||
def __sample_speaker(self):
|
||||
speaker = random.sample(self.speakers, 1)[0]
|
||||
if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
|
||||
utters = random.choices(
|
||||
self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
|
||||
)
|
||||
else:
|
||||
utters = random.sample(
|
||||
self.speaker_to_utters[speaker], self.num_utter_per_speaker
|
||||
)
|
||||
return speaker, utters
|
||||
|
||||
def __sample_speaker_utterances(self, speaker):
|
||||
"""
|
||||
Sample all M utterances for the given speaker.
|
||||
"""
|
||||
feats = []
|
||||
labels = []
|
||||
for _ in range(self.num_utter_per_speaker):
|
||||
# TODO:dummy but works
|
||||
while True:
|
||||
if len(self.speaker_to_utters[speaker]) > 0:
|
||||
utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
|
||||
else:
|
||||
self.speakers.remove(speaker)
|
||||
speaker, _ = self.__sample_speaker()
|
||||
continue
|
||||
wav = self.load_wav(utter)
|
||||
if wav.shape[0] - self.seq_len > 0:
|
||||
break
|
||||
self.speaker_to_utters[speaker].remove(utter)
|
||||
|
||||
offset = random.randint(0, wav.shape[0] - self.seq_len)
|
||||
mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
|
||||
feats.append(torch.FloatTensor(mel))
|
||||
labels.append(speaker)
|
||||
return feats, labels
|
||||
|
||||
def __getitem__(self, idx):
|
||||
speaker, _ = self.__sample_speaker()
|
||||
return speaker
|
||||
|
||||
def collate_fn(self, batch):
|
||||
labels = []
|
||||
feats = []
|
||||
for speaker in batch:
|
||||
feats_, labels_ = self.__sample_speaker_utterances(speaker)
|
||||
labels.append(labels_)
|
||||
feats.extend(feats_)
|
||||
feats = torch.stack(feats)
|
||||
return feats.transpose(1, 2), labels
|
|
@ -1,41 +0,0 @@
|
|||
import os
|
||||
import datetime
|
||||
import torch
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, model_loss, out_path,
|
||||
current_step, epoch):
|
||||
checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
|
||||
checkpoint_path = os.path.join(out_path, checkpoint_path)
|
||||
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
|
||||
|
||||
new_state_dict = model.state_dict()
|
||||
state = {
|
||||
'model': new_state_dict,
|
||||
'optimizer': optimizer.state_dict() if optimizer is not None else None,
|
||||
'step': current_step,
|
||||
'epoch': epoch,
|
||||
'GE2Eloss': model_loss,
|
||||
'date': datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
torch.save(state, checkpoint_path)
|
||||
|
||||
|
||||
def save_best_model(model, optimizer, model_loss, best_loss, out_path,
|
||||
current_step):
|
||||
if model_loss < best_loss:
|
||||
new_state_dict = model.state_dict()
|
||||
state = {
|
||||
'model': new_state_dict,
|
||||
'optimizer': optimizer.state_dict(),
|
||||
'step': current_step,
|
||||
'GE2Eloss': model_loss,
|
||||
'date': datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
best_loss = model_loss
|
||||
bestmodel_path = 'best_model.pth.tar'
|
||||
bestmodel_path = os.path.join(out_path, bestmodel_path)
|
||||
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
|
||||
model_loss, bestmodel_path))
|
||||
torch.save(state, bestmodel_path)
|
||||
return best_loss
|
|
@ -1,121 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
# adapted from https://github.com/cvqluu/GE2E-Loss
|
||||
class GE2ELoss(nn.Module):
|
||||
def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
|
||||
"""
|
||||
Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
|
||||
Accepts an input of size (N, M, D)
|
||||
where N is the number of speakers in the batch,
|
||||
M is the number of utterances per speaker,
|
||||
and D is the dimensionality of the embedding vector (e.g. d-vector)
|
||||
Args:
|
||||
- init_w (float): defines the initial value of w in Equation (5) of [1]
|
||||
- init_b (float): defines the initial value of b in Equation (5) of [1]
|
||||
"""
|
||||
super(GE2ELoss, self).__init__()
|
||||
# pylint: disable=E1102
|
||||
self.w = nn.Parameter(torch.tensor(init_w))
|
||||
# pylint: disable=E1102
|
||||
self.b = nn.Parameter(torch.tensor(init_b))
|
||||
self.loss_method = loss_method
|
||||
|
||||
assert self.loss_method in ["softmax", "contrast"]
|
||||
|
||||
if self.loss_method == "softmax":
|
||||
self.embed_loss = self.embed_loss_softmax
|
||||
if self.loss_method == "contrast":
|
||||
self.embed_loss = self.embed_loss_contrast
|
||||
|
||||
# pylint: disable=R0201
|
||||
def calc_new_centroids(self, dvecs, centroids, spkr, utt):
|
||||
"""
|
||||
Calculates the new centroids excluding the reference utterance
|
||||
"""
|
||||
excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
|
||||
excl = torch.mean(excl, 0)
|
||||
new_centroids = []
|
||||
for i, centroid in enumerate(centroids):
|
||||
if i == spkr:
|
||||
new_centroids.append(excl)
|
||||
else:
|
||||
new_centroids.append(centroid)
|
||||
return torch.stack(new_centroids)
|
||||
|
||||
def calc_cosine_sim(self, dvecs, centroids):
|
||||
"""
|
||||
Make the cosine similarity matrix with dims (N,M,N)
|
||||
"""
|
||||
cos_sim_matrix = []
|
||||
for spkr_idx, speaker in enumerate(dvecs):
|
||||
cs_row = []
|
||||
for utt_idx, utterance in enumerate(speaker):
|
||||
new_centroids = self.calc_new_centroids(
|
||||
dvecs, centroids, spkr_idx, utt_idx
|
||||
)
|
||||
# vector based cosine similarity for speed
|
||||
cs_row.append(
|
||||
torch.clamp(
|
||||
torch.mm(
|
||||
utterance.unsqueeze(1).transpose(0, 1),
|
||||
new_centroids.transpose(0, 1),
|
||||
)
|
||||
/ (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
|
||||
1e-6,
|
||||
)
|
||||
)
|
||||
cs_row = torch.cat(cs_row, dim=0)
|
||||
cos_sim_matrix.append(cs_row)
|
||||
return torch.stack(cos_sim_matrix)
|
||||
|
||||
# pylint: disable=R0201
|
||||
def embed_loss_softmax(self, dvecs, cos_sim_matrix):
|
||||
"""
|
||||
Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
|
||||
"""
|
||||
N, M, _ = dvecs.shape
|
||||
L = []
|
||||
for j in range(N):
|
||||
L_row = []
|
||||
for i in range(M):
|
||||
L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
|
||||
L_row = torch.stack(L_row)
|
||||
L.append(L_row)
|
||||
return torch.stack(L)
|
||||
|
||||
# pylint: disable=R0201
|
||||
def embed_loss_contrast(self, dvecs, cos_sim_matrix):
|
||||
"""
|
||||
Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
|
||||
"""
|
||||
N, M, _ = dvecs.shape
|
||||
L = []
|
||||
for j in range(N):
|
||||
L_row = []
|
||||
for i in range(M):
|
||||
centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
|
||||
excl_centroids_sigmoids = torch.cat(
|
||||
(centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])
|
||||
)
|
||||
L_row.append(
|
||||
1.0
|
||||
- torch.sigmoid(cos_sim_matrix[j, i, j])
|
||||
+ torch.max(excl_centroids_sigmoids)
|
||||
)
|
||||
L_row = torch.stack(L_row)
|
||||
L.append(L_row)
|
||||
return torch.stack(L)
|
||||
|
||||
def forward(self, dvecs):
|
||||
"""
|
||||
Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
|
||||
"""
|
||||
centroids = torch.mean(dvecs, 1)
|
||||
cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
|
||||
torch.clamp(self.w, 1e-6)
|
||||
cos_sim_matrix = self.w * cos_sim_matrix + self.b
|
||||
L = self.embed_loss(dvecs, cos_sim_matrix)
|
||||
return L.mean()
|
|
@ -1,88 +0,0 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
class LSTMWithProjection(nn.Module):
|
||||
def __init__(self, input_size, hidden_size, proj_size):
|
||||
super().__init__()
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.proj_size = proj_size
|
||||
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
|
||||
self.linear = nn.Linear(hidden_size, proj_size, bias=False)
|
||||
|
||||
def forward(self, x):
|
||||
self.lstm.flatten_parameters()
|
||||
o, (_, _) = self.lstm(x)
|
||||
return self.linear(o)
|
||||
|
||||
|
||||
class SpeakerEncoder(nn.Module):
|
||||
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
|
||||
super().__init__()
|
||||
layers = []
|
||||
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
|
||||
for _ in range(num_lstm_layers - 1):
|
||||
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
|
||||
self.layers = nn.Sequential(*layers)
|
||||
self._init_layers()
|
||||
|
||||
def _init_layers(self):
|
||||
for name, param in self.layers.named_parameters():
|
||||
if "bias" in name:
|
||||
nn.init.constant_(param, 0.0)
|
||||
elif "weight" in name:
|
||||
nn.init.xavier_normal_(param)
|
||||
|
||||
def forward(self, x):
|
||||
# TODO: implement state passing for lstms
|
||||
d = self.layers(x)
|
||||
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
|
||||
return d
|
||||
|
||||
def inference(self, x):
|
||||
d = self.layers.forward(x)
|
||||
d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
|
||||
return d
|
||||
|
||||
def compute_embedding(self, x, num_frames=160, overlap=0.5):
|
||||
"""
|
||||
Generate embeddings for a batch of utterances
|
||||
x: 1xTxD
|
||||
"""
|
||||
num_overlap = int(num_frames * overlap)
|
||||
max_len = x.shape[1]
|
||||
embed = None
|
||||
cur_iter = 0
|
||||
for offset in range(0, max_len, num_frames - num_overlap):
|
||||
cur_iter += 1
|
||||
end_offset = min(x.shape[1], offset + num_frames)
|
||||
frames = x[:, offset:end_offset]
|
||||
if embed is None:
|
||||
embed = self.inference(frames)
|
||||
else:
|
||||
embed += self.inference(frames)
|
||||
return embed / cur_iter
|
||||
|
||||
def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
|
||||
"""
|
||||
Generate embeddings for a batch of utterances
|
||||
x: BxTxD
|
||||
"""
|
||||
num_overlap = num_frames * overlap
|
||||
max_len = x.shape[1]
|
||||
embed = None
|
||||
num_iters = seq_lens / (num_frames - num_overlap)
|
||||
cur_iter = 0
|
||||
for offset in range(0, max_len, num_frames - num_overlap):
|
||||
cur_iter += 1
|
||||
end_offset = min(x.shape[1], offset + num_frames)
|
||||
frames = x[:, offset:end_offset]
|
||||
if embed is None:
|
||||
embed = self.inference(frames)
|
||||
else:
|
||||
embed[cur_iter <= num_iters, :] += self.inference(
|
||||
frames[cur_iter <= num_iters, :, :]
|
||||
)
|
||||
return embed / num_iters
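

# Usage sketch (illustrative): shows the tensor shapes the encoder expects,
# (batch, frames, mel_bins) in and one L2-normalised d-vector per utterance out.
# The dimensions follow the speaker_encoder config.json (40 mel bins, 128-d
# projection); the input is random noise, purely for illustration.
if __name__ == "__main__":
    encoder = SpeakerEncoder(input_dim=40, proj_dim=128, lstm_dim=384, num_lstm_layers=3)
    mel = torch.rand(1, 240, 40)  # one utterance, 240 frames, 40 mel bins
    with torch.no_grad():
        d_vector = encoder.compute_embedding(mel)  # averaged over 160-frame windows
    print(d_vector.shape)  # torch.Size([1, 128])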
|
||||
|
|
@ -1,325 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Overview\n",
|
||||
"\n",
|
||||
"This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n",
|
||||
"\n",
|
||||
"Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import glob\n",
|
||||
"import random\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import umap\n",
|
||||
"\n",
|
||||
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.generic_utils import load_config\n",
|
||||
"\n",
|
||||
"from bokeh.io import output_notebook, show\n",
|
||||
"from bokeh.plotting import figure\n",
|
||||
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
|
||||
"from bokeh.transform import factor_cmap, factor_mark\n",
|
||||
"from bokeh.palettes import Category10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n",
|
||||
"\n",
|
||||
"List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
|
||||
"\n",
|
||||
"**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output_notebook()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You should also adjust all the path constants to point at the relevant locations for you locally"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
|
||||
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
|
||||
"\n",
|
||||
"# My single speaker locations\n",
|
||||
"#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
|
||||
"#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
|
||||
"\n",
|
||||
"# My multi speaker locations\n",
|
||||
"EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
|
||||
"AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!ls -1 $MODEL_RUN_PATH"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CONFIG = load_config(CONFIG_PATH)\n",
|
||||
"ap = AudioProcessor(**CONFIG['audio'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Bring in the embeddings created by **compute_embeddings.py**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n",
|
||||
"print(f'Embeddings found: {len(embed_files)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check that we did indeed find an embedding"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embed_files[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Process the speakers\n",
|
||||
"\n",
|
||||
"Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n",
|
||||
"speaker_to_utter = {}\n",
|
||||
"for embed_file in embed_files:\n",
|
||||
" speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
|
||||
" try:\n",
|
||||
" speaker_to_utter[speaker_path].append(embed_file)\n",
|
||||
" except:\n",
|
||||
" speaker_to_utter[speaker_path]=[embed_file]\n",
|
||||
"print(f'Speaker count: {len(speaker_paths)}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set up the embeddings\n",
|
||||
"\n",
|
||||
"Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeds = []\n",
|
||||
"labels = []\n",
|
||||
"locations = []\n",
|
||||
"\n",
|
||||
"# single speaker \n",
|
||||
"#num_speakers = 1\n",
|
||||
"#num_utters = 1000\n",
|
||||
"\n",
|
||||
"# multi speaker\n",
|
||||
"num_speakers = 10\n",
|
||||
"num_utters = 20\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n",
|
||||
"\n",
|
||||
"for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
|
||||
" speaker_path = speaker_paths[speaker_idx]\n",
|
||||
" speakers_utter = speaker_to_utter[speaker_path]\n",
|
||||
" utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n",
|
||||
" for utter_idx in utter_idxs:\n",
|
||||
" embed_path = speaker_to_utter[speaker_path][utter_idx]\n",
|
||||
" embed = np.load(embed_path)\n",
|
||||
" embeds.append(embed)\n",
|
||||
" labels.append(str(speaker_num))\n",
|
||||
" locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n",
|
||||
"embeds = np.concatenate(embeds)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Load embeddings with UMAP"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = umap.UMAP()\n",
|
||||
"projection = model.fit_transform(embeds)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Interactively charting the data in Bokeh\n",
|
||||
"\n",
|
||||
"Set up various details for Bokeh to plot the data\n",
|
||||
"\n",
|
||||
"You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n",
|
||||
"\n",
|
||||
"Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n",
|
||||
"\n",
|
||||
"File location in the tooltip is given relative to **AUDIO_PATH**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"source_wav_stems = ColumnDataSource(\n",
|
||||
" data=dict(\n",
|
||||
" x = projection.T[0].tolist(),\n",
|
||||
" y = projection.T[1].tolist(),\n",
|
||||
" desc=locations,\n",
|
||||
" label=labels\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"hover = HoverTool(\n",
|
||||
" tooltips=[\n",
|
||||
" (\"file\", \"@desc\"),\n",
|
||||
" (\"speaker\", \"@label\"),\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# optionally consider adding these to the tooltips if you want additional detail\n",
|
||||
"# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n",
|
||||
"# for the index of the embedding / wav file: (\"index\", \"$index\"),\n",
|
||||
"\n",
|
||||
"factors = list(set(labels))\n",
|
||||
"pal_size = max(len(factors), 3)\n",
|
||||
"pal = Category10[pal_size]\n",
|
||||
"\n",
|
||||
"p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n",
|
||||
"\n",
|
||||
"url = \"http://localhost:8000/@desc\"\n",
|
||||
"taptool = p.select(type=TapTool)\n",
|
||||
"taptool.callback = OpenURL(url=url)\n",
|
||||
"\n",
|
||||
"show(p)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Local server to serve wav files from corpus\n",
|
||||
"\n",
|
||||
"This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n",
|
||||
"\n",
|
||||
"There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n",
|
||||
"\n",
|
||||
"The server will continue to run until stopped. To stop it simply interupt the kernel (ie square button or under Kernel menu)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%cd $AUDIO_PATH\n",
|
||||
"%pwd\n",
|
||||
"!python -m http.server"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -1,2 +0,0 @@
|
|||
umap-learn
|
||||
numpy>=1.17.0
|
|
@ -1,252 +0,0 @@
|
|||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from TTS.datasets.preprocess import load_meta_data
|
||||
from TTS.speaker_encoder.dataset import MyDataset
|
||||
from TTS.speaker_encoder.loss import GE2ELoss
|
||||
from TTS.speaker_encoder.model import SpeakerEncoder
|
||||
from TTS.speaker_encoder.visual import plot_embeddings
|
||||
from TTS.speaker_encoder.generic_utils import save_best_model
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import (create_experiment_folder, get_git_branch,
|
||||
remove_experiment_folder, set_init_dict)
|
||||
from TTS.utils.io import load_config, copy_config_file
|
||||
from TTS.utils.training import check_update, NoamLR
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.radam import RAdam
|
||||
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = True
|
||||
torch.manual_seed(54321)
|
||||
use_cuda = torch.cuda.is_available()
|
||||
num_gpus = torch.cuda.device_count()
|
||||
print(" > Using CUDA: ", use_cuda)
|
||||
print(" > Number of GPUs: ", num_gpus)
|
||||
|
||||
|
||||
def setup_loader(ap, is_val=False, verbose=False):
|
||||
if is_val:
|
||||
loader = None
|
||||
else:
|
||||
dataset = MyDataset(ap,
|
||||
meta_data_eval if is_val else meta_data_train,
|
||||
voice_len=1.6,
|
||||
num_utter_per_speaker=10,
|
||||
skip_speakers=False,
|
||||
verbose=verbose)
|
||||
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(dataset,
|
||||
batch_size=c.num_speakers_in_batch,
|
||||
shuffle=False,
|
||||
num_workers=c.num_loader_workers,
|
||||
collate_fn=dataset.collate_fn)
|
||||
return loader
|
||||
|
||||
|
||||
def train(model, criterion, optimizer, scheduler, ap, global_step):
|
||||
data_loader = setup_loader(ap, is_val=False, verbose=True)
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
best_loss = float('inf')
|
||||
avg_loss = 0
|
||||
end_time = time.time()
|
||||
for _, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# setup input data
|
||||
inputs = data[0]
|
||||
loader_time = time.time() - end_time
|
||||
global_step += 1
|
||||
|
||||
# setup lr
|
||||
if c.lr_decay:
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
inputs = inputs.cuda(non_blocking=True)
|
||||
# labels = labels.cuda(non_blocking=True)
|
||||
|
||||
# forward pass model
|
||||
outputs = model(inputs)
|
||||
|
||||
# loss computation
|
||||
loss = criterion(
|
||||
outputs.view(c.num_speakers_in_batch,
|
||||
outputs.shape[0] // c.num_speakers_in_batch, -1))
|
||||
loss.backward()
|
||||
grad_norm, _ = check_update(model, c.grad_clip)
|
||||
optimizer.step()
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item()
|
||||
current_lr = optimizer.param_groups[0]['lr']
|
||||
|
||||
if global_step % c.steps_plot_stats == 0:
|
||||
# Plot Training Epoch Stats
|
||||
train_stats = {
|
||||
"GE2Eloss": avg_loss,
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"step_time": step_time
|
||||
}
|
||||
tb_logger.tb_train_epoch_stats(global_step, train_stats)
|
||||
figures = {
|
||||
# FIXME: not constant
|
||||
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(),
|
||||
10),
|
||||
}
|
||||
tb_logger.tb_train_figures(global_step, figures)
|
||||
|
||||
if global_step % c.print_step == 0:
|
||||
print(
|
||||
" | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
|
||||
"StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
|
||||
global_step, loss.item(), avg_loss, grad_norm, step_time,
|
||||
loader_time, current_lr),
|
||||
flush=True)
|
||||
|
||||
# save best model
|
||||
best_loss = save_best_model(model, optimizer, avg_loss, best_loss,
|
||||
OUT_PATH, global_step)
|
||||
|
||||
end_time = time.time()
|
||||
return avg_loss, global_step
|
||||
|
||||
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train
|
||||
global meta_data_eval
|
||||
|
||||
ap = AudioProcessor(**c.audio)
|
||||
model = SpeakerEncoder(input_dim=40,
|
||||
proj_dim=128,
|
||||
lstm_dim=384,
|
||||
num_lstm_layers=3)
|
||||
optimizer = RAdam(model.parameters(), lr=c.lr)
|
||||
criterion = GE2ELoss(loss_method='softmax')
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path)
|
||||
try:
|
||||
# TODO: fix optimizer init, model.cuda() needs to be called before
|
||||
# optimizer restore
|
||||
# optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
if c.reinit_layers:
|
||||
raise RuntimeError
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
except KeyError:
|
||||
print(" > Partial model initialization.")
|
||||
model_dict = model.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint, c)
|
||||
model.load_state_dict(model_dict)
|
||||
del model_dict
|
||||
for group in optimizer.param_groups:
|
||||
group['lr'] = c.lr
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
if use_cuda:
|
||||
model = model.cuda()
|
||||
criterion.cuda()
|
||||
|
||||
if c.lr_decay:
|
||||
scheduler = NoamLR(optimizer,
|
||||
warmup_steps=c.warmup_steps,
|
||||
last_epoch=args.restore_step - 1)
|
||||
else:
|
||||
scheduler = None
|
||||
|
||||
num_params = count_parameters(model)
|
||||
print("\n > Model has {} parameters".format(num_params), flush=True)
|
||||
|
||||
# pylint: disable=redefined-outer-name
|
||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
|
||||
|
||||
global_step = args.restore_step
|
||||
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
|
||||
global_step)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--restore_path',
|
||||
type=str,
|
||||
help='Path to a checkpoint to restore training from.',
|
||||
default=0)
|
||||
parser.add_argument(
|
||||
'--config_path',
|
||||
type=str,
|
||||
help='Path to config file for training.',
|
||||
)
|
||||
parser.add_argument('--debug',
|
||||
type=bool,
|
||||
default=True,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
parser.add_argument(
|
||||
'--data_path',
|
||||
type=str,
|
||||
default='',
|
||||
help='Defines the data path. It overwrites config.json.')
|
||||
parser.add_argument('--output_path',
|
||||
type=str,
|
||||
help='path for training outputs.',
|
||||
default='')
|
||||
parser.add_argument('--output_folder',
|
||||
type=str,
|
||||
default='',
|
||||
help='folder name for training outputs.')
|
||||
args = parser.parse_args()
|
||||
|
||||
# setup output paths and read configs
|
||||
c = load_config(args.config_path)
|
||||
_ = os.path.dirname(os.path.realpath(__file__))
|
||||
if args.data_path != '':
|
||||
c.data_path = args.data_path
|
||||
|
||||
if args.output_path == '':
|
||||
OUT_PATH = os.path.join(_, c.output_path)
|
||||
else:
|
||||
OUT_PATH = args.output_path
|
||||
|
||||
if args.output_folder == '':
|
||||
OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
|
||||
else:
|
||||
OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
|
||||
|
||||
new_fields = {}
|
||||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
|
||||
new_fields)
|
||||
|
||||
LOG_DIR = OUT_PATH
|
||||
tb_logger = TensorboardLogger(LOG_DIR)
|
||||
|
||||
try:
|
||||
main(args)
|
||||
except KeyboardInterrupt:
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
try:
|
||||
sys.exit(0)
|
||||
except SystemExit:
|
||||
os._exit(0) # pylint: disable=protected-access
|
||||
except Exception: # pylint: disable=broad-except
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
|
@ -1,46 +0,0 @@
|
|||
import umap
|
||||
import numpy as np
|
||||
import matplotlib
matplotlib.use("Agg")  # select a non-interactive backend before importing pyplot
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
colormap = (
|
||||
np.array(
|
||||
[
|
||||
[76, 255, 0],
|
||||
[0, 127, 70],
|
||||
[255, 0, 0],
|
||||
[255, 217, 38],
|
||||
[0, 135, 255],
|
||||
[165, 0, 165],
|
||||
[255, 167, 255],
|
||||
[0, 255, 255],
|
||||
[255, 96, 38],
|
||||
[142, 76, 0],
|
||||
[33, 0, 127],
|
||||
[0, 0, 0],
|
||||
[183, 183, 183],
|
||||
],
|
||||
dtype=np.float,
|
||||
)
|
||||
/ 255
|
||||
)
|
||||
|
||||
|
||||
def plot_embeddings(embeddings, num_utter_per_speaker):
|
||||
embeddings = embeddings[: 10 * num_utter_per_speaker]
|
||||
model = umap.UMAP()
|
||||
projection = model.fit_transform(embeddings)
|
||||
num_speakers = embeddings.shape[0] // num_utter_per_speaker
|
||||
ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
|
||||
colors = [colormap[i] for i in ground_truth]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(16, 10))
|
||||
_ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
|
||||
plt.gca().set_aspect("equal", "datalim")
|
||||
plt.title("UMAP projection")
|
||||
plt.tight_layout()
|
||||
plt.savefig("umap")
|
||||
return fig
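# Usage sketch (hypothetical shapes): the colormap above covers at most 13 speakers
# and the slice at the top of the function keeps only the first 10 speakers' utterances, e.g.
#   fig = plot_embeddings(np.random.rand(50, 128), num_utter_per_speaker=10)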
|
synthesize.py (182 lines)
|
@ -1,182 +0,0 @@
|
|||
# pylint: disable=redefined-outer-name, unused-argument
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import torch
|
||||
import json
|
||||
import string
|
||||
|
||||
from TTS.utils.synthesis import synthesis
|
||||
from TTS.utils.generic_utils import setup_model
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.utils.text.symbols import make_symbols, symbols, phonemes
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
def tts(model,
|
||||
vocoder_model,
|
||||
C,
|
||||
VC,
|
||||
text,
|
||||
ap,
|
||||
ap_vocoder,
|
||||
use_cuda,
|
||||
batched_vocoder,
|
||||
speaker_id=None,
|
||||
figures=False):
|
||||
t_1 = time.time()
|
||||
use_vocoder_model = vocoder_model is not None
|
||||
waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
|
||||
model, text, C, use_cuda, ap, speaker_id, style_wav=False,
|
||||
truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
|
||||
use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
|
||||
|
||||
if C.model == "Tacotron" and use_vocoder_model:
|
||||
postnet_output = ap.out_linear_to_mel(postnet_output.T).T
|
||||
# correct if there is a scale difference b/w two models
|
||||
if use_vocoder_model:
|
||||
postnet_output = ap._denormalize(postnet_output)
|
||||
postnet_output = ap_vocoder._normalize(postnet_output)
|
||||
vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
|
||||
waveform = vocoder_model.generate(
|
||||
vocoder_input.cuda() if use_cuda else vocoder_input,
|
||||
batched=batched_vocoder,
|
||||
target=8000,
|
||||
overlap=400)
|
||||
print(" > Run-time: {}".format(time.time() - t_1))
|
||||
return alignment, postnet_output, stop_tokens, waveform
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
global symbols, phonemes
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('text', type=str, help='Text to generate speech.')
|
||||
parser.add_argument('config_path',
|
||||
type=str,
|
||||
help='Path to model config file.')
|
||||
parser.add_argument(
|
||||
'model_path',
|
||||
type=str,
|
||||
help='Path to model file.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'out_path',
|
||||
type=str,
|
||||
help='Path to save the final wav file. The wav file will be named after the given text.',
|
||||
)
|
||||
parser.add_argument('--use_cuda',
|
||||
type=bool,
|
||||
help='Run model on CUDA.',
|
||||
default=False)
|
||||
parser.add_argument(
|
||||
'--vocoder_path',
|
||||
type=str,
|
||||
help=
|
||||
'Path to vocoder model file. If it is not defined, the model uses Griffin-Lim as the vocoder. Make sure the vocoder library (WaveRNN) is installed beforehand.',
|
||||
default="",
|
||||
)
|
||||
parser.add_argument('--vocoder_config_path',
|
||||
type=str,
|
||||
help='Path to vocoder model config file.',
|
||||
default="")
|
||||
parser.add_argument(
|
||||
'--batched_vocoder',
|
||||
type=bool,
|
||||
help="If True, vocoder model uses faster batch processing.",
|
||||
default=True)
|
||||
parser.add_argument('--speakers_json',
|
||||
type=str,
|
||||
help="JSON file for multi-speaker model.",
|
||||
default="")
|
||||
parser.add_argument(
|
||||
'--speaker_id',
|
||||
type=int,
|
||||
help="target speaker_id if the model is multi-speaker.",
|
||||
default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.vocoder_path != "":
|
||||
assert args.use_cuda, " [!] Enable cuda for vocoder."
|
||||
from WaveRNN.models.wavernn import Model as VocoderModel
|
||||
|
||||
# load the config
|
||||
C = load_config(args.config_path)
|
||||
C.forward_attn_mask = True
|
||||
|
||||
# load the audio processor
|
||||
ap = AudioProcessor(**C.audio)
|
||||
|
||||
# if the vocabulary was passed, replace the default
|
||||
if 'characters' in C.keys():
|
||||
symbols, phonemes = make_symbols(**C.characters)
|
||||
|
||||
# load speakers
|
||||
if args.speakers_json != '':
|
||||
speakers = json.load(open(args.speakers_json, 'r'))
|
||||
num_speakers = len(speakers)
|
||||
else:
|
||||
num_speakers = 0
|
||||
|
||||
# load the model
|
||||
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
|
||||
model = setup_model(num_chars, num_speakers, C)
|
||||
cp = torch.load(args.model_path)
|
||||
model.load_state_dict(cp['model'])
|
||||
model.eval()
|
||||
if args.use_cuda:
|
||||
model.cuda()
|
||||
model.decoder.set_r(cp['r'])
|
||||
|
||||
# load vocoder model
|
||||
if args.vocoder_path != "":
|
||||
VC = load_config(args.vocoder_config_path)
|
||||
ap_vocoder = AudioProcessor(**VC.audio)
|
||||
bits = 10
|
||||
vocoder_model = VocoderModel(rnn_dims=512,
|
||||
fc_dims=512,
|
||||
mode=VC.mode,
|
||||
mulaw=VC.mulaw,
|
||||
pad=VC.pad,
|
||||
upsample_factors=VC.upsample_factors,
|
||||
feat_dims=VC.audio["num_mels"],
|
||||
compute_dims=128,
|
||||
res_out_dims=128,
|
||||
res_blocks=10,
|
||||
hop_length=ap.hop_length,
|
||||
sample_rate=ap.sample_rate,
|
||||
use_aux_net=True,
|
||||
use_upsample_net=True)
|
||||
|
||||
check = torch.load(args.vocoder_path)
|
||||
vocoder_model.load_state_dict(check['model'])
|
||||
vocoder_model.eval()
|
||||
if args.use_cuda:
|
||||
vocoder_model.cuda()
|
||||
else:
|
||||
vocoder_model = None
|
||||
VC = None
|
||||
ap_vocoder = None
|
||||
|
||||
# synthesize voice
|
||||
print(" > Text: {}".format(args.text))
|
||||
_, _, _, wav = tts(model,
|
||||
vocoder_model,
|
||||
C,
|
||||
VC,
|
||||
args.text,
|
||||
ap,
|
||||
ap_vocoder,
|
||||
args.use_cuda,
|
||||
args.batched_vocoder,
|
||||
speaker_id=args.speaker_id,
|
||||
figures=False)
|
||||
|
||||
# save the results
|
||||
file_name = args.text.replace(" ", "_")
|
||||
file_name = file_name.translate(
|
||||
str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
|
||||
out_path = os.path.join(args.out_path, file_name)
|
||||
print(" > Saving output to {}".format(out_path))
|
||||
ap.save_wav(wav, out_path)
|
|
@ -1,8 +1,8 @@
|
|||
import unittest
|
||||
import torch as T
|
||||
|
||||
from TTS.utils.generic_utils import save_checkpoint, save_best_model
|
||||
from TTS.layers.tacotron import Prenet
|
||||
from TTS.tts.utils.generic_utils import save_checkpoint, save_best_model
|
||||
from TTS.tts.layers.tacotron import Prenet
|
||||
|
||||
OUT_PATH = '/tmp/test.pth.tar'
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"model": "Tacotron2",
|
||||
"run_name": "ljspeech-ddc-bn",
|
||||
"run_description": "tacotron2 with ddc and batch-normalization",
|
||||
"run_name": "test_sample_dataset_run",
|
||||
"run_description": "sample dataset test run",
|
||||
|
||||
// AUDIO PARAMETERS
|
||||
"audio":{
|
||||
|
@ -61,30 +61,30 @@
|
|||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":16,
|
||||
"batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"eval_batch_size":1,
|
||||
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
|
||||
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
|
||||
"gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"noam_schedule": false, // use noam warmup and lr schedule.
|
||||
"grad_clip": 1.0, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
|
||||
|
||||
// TACOTRON PRENET
|
||||
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
|
||||
"prenet_type": "bn", // "original" or "bn".
|
||||
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
|
||||
"prenet_type": "bn", // "original" or "bn".
|
||||
"prenet_dropout": false, // enable/disable dropout at prenet.
|
||||
|
||||
// TACOTRON ATTENTION
|
||||
|
@ -105,7 +105,7 @@
|
|||
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log training on console.
|
||||
"print_step": 1, // Number of steps to log training on console.
|
||||
"tb_plot_step": 100, // Number of steps to plot TB training figures.
|
||||
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
|
||||
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
|
||||
|
@ -122,10 +122,10 @@
|
|||
"max_seq_len": 153, // DATASET-RELATED: maximum text length
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/",
|
||||
"output_path": "tests/train_outputs/",
|
||||
|
||||
// PHONEMES
|
||||
"phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
|
||||
|
@ -135,13 +135,15 @@
|
|||
"use_gst": false, // TACOTRON ONLY: use global style tokens
|
||||
|
||||
// DATASETS
|
||||
"train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
|
||||
"eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments.
|
||||
"datasets": // List of datasets. They all merged and they get different speaker_ids.
|
||||
[
|
||||
{
|
||||
"name": "ljspeech",
|
||||
"path": "/home/erogol/Data/LJSpeech-1.1/",
|
||||
"path": "tests/data/ljspeech/",
|
||||
"meta_file_train": "metadata.csv",
|
||||
"meta_file_val": null
|
||||
"meta_file_val": "metadata.csv"
|
||||
}
|
||||
]
|
||||
|
|
@ -31,7 +31,7 @@
|
|||
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||
},
|
||||
|
||||
// DISTRIBUTED TRAINING
|
||||
|
@ -90,7 +90,7 @@
|
|||
},
|
||||
|
||||
// DATASET
|
||||
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
|
||||
"data_path": "tests/data/ljspeech/wavs/",
|
||||
"feature_path": null,
|
||||
"seq_len": 16384,
|
||||
"pad_short": 2000,
|
||||
|
@ -101,7 +101,7 @@
|
|||
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||
|
||||
// TRAINING
|
||||
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
"batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||
|
||||
// VALIDATION
|
||||
"run_eval": true,
|
||||
|
@ -109,7 +109,7 @@
|
|||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
|
||||
// OPTIMIZER
|
||||
"epochs": 10000, // total number of epochs to train.
|
||||
"epochs": 1, // total number of epochs to train.
|
||||
"wd": 0.0, // Weight decay weight.
|
||||
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
||||
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
||||
|
@ -127,7 +127,7 @@
|
|||
"lr_disc": 1e-4,
|
||||
|
||||
// TENSORBOARD and LOGGING
|
||||
"print_step": 25, // Number of steps to log traning on console.
|
||||
"print_step": 1, // Number of steps to log traning on console.
|
||||
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
||||
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
|
@ -139,6 +139,6 @@
|
|||
"eval_split_size": 10,
|
||||
|
||||
// PATHS
|
||||
"output_path": "/home/erogol/Models/LJSpeech/"
|
||||
"output_path": "tests/outputs/train_outputs/"
|
||||
}
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
{
|
||||
"run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
|
||||
"run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
|
||||
|
||||
"audio":{
|
||||
// Audio processing parameters
|
||||
"num_mels": 80, // size of the mel spec frame.
|
||||
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||
"min_level_db": -100, // normalization range
|
||||
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||
// Normalization parameters
|
||||
"signal_norm": true, // normalize the spec values in range [0, 1]
|
||||
"symmetric_norm": false, // move normalization to range [-1, 1]
|
||||
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||
"clip_norm": true, // clip normalized values into the range.
|
||||
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||
},
|
||||
|
||||
"distributed":{
|
||||
"backend": "nccl",
|
||||
"url": "tcp:\/\/localhost:54321"
|
||||
},
|
||||
|
||||
"reinit_layers": [],
|
||||
|
||||
"model": "Tacotron2", // one of the model in models/
|
||||
"grad_clip": 1, // upper limit for gradients for clipping.
|
||||
"epochs": 1000, // total number of epochs to train.
|
||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||
"windowing": false, // Enables attention windowing. Used only in eval mode.
|
||||
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
|
||||
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
|
||||
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
|
||||
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
|
||||
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
|
||||
"forward_attn_mask": false,
|
||||
"attention_type": "original",
|
||||
"attention_heads": 5,
|
||||
"bidirectional_decoder": false,
|
||||
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
|
||||
"location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
|
||||
"loss_masking": true, // enable / disable loss masking against the sequence padding.
|
||||
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
|
||||
"stopnet": true, // Train stopnet predicting the end of synthesis.
|
||||
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
|
||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||
"use_gst": false,
|
||||
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
|
||||
"ddc_r": 7, // reduction rate for coarse decoder.
|
||||
|
||||
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
|
||||
"eval_batch_size":16,
|
||||
"r": 1, // Number of frames to predict for step.
|
||||
"wd": 0.000001, // Weight decay weight.
|
||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
|
||||
"print_step": 10, // Number of steps to log traning on console.
|
||||
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
|
||||
|
||||
"run_eval": true,
|
||||
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
|
||||
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument
|
||||
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
|
||||
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
|
||||
"dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
|
||||
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
||||
"max_seq_len": 150, // DATASET-RELATED: maximum text length
|
||||
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
|
||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
|
||||
"use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation.
|
||||
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
|
||||
"text_cleaner": "phoneme_cleaners",
|
||||
"use_speaker_embedding": false // whether to use additional embeddings for separate speakers
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import unittest
|
||||
|
||||
from TTS.utils.text import phonemes
|
||||
from TTS.tts.utils.text import phonemes
|
||||
|
||||
class SymbolsTest(unittest.TestCase):
|
||||
def test_uniqueness(self): #pylint: disable=no-self-use
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import unittest
|
||||
|
||||
from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.io import load_config
|
||||
|
||||
|
@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
|
|||
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
|
||||
|
||||
os.makedirs(OUT_PATH, exist_ok=True)
|
||||
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
|
||||
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
|
||||
# pylint: disable=protected-access
|
||||
|
|
|
@ -4,10 +4,11 @@ import unittest
|
|||
import torch as T
|
||||
|
||||
from TTS.server.synthesizer import Synthesizer
|
||||
from TTS.tests import get_tests_input_path, get_tests_output_path
|
||||
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.generic_utils import setup_model
|
||||
from TTS.utils.io import load_config, save_checkpoint
|
||||
from tests import get_tests_input_path, get_tests_output_path
|
||||
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.tts.utils.generic_utils import setup_model
|
||||
from TTS.tts.utils.io import save_checkpoint
|
||||
from TTS.utils.io import load_config
|
||||
|
||||
|
||||
class DemoServerTest(unittest.TestCase):
|
||||
|
|
|
@ -2,12 +2,13 @@ import os
|
|||
import unittest
|
||||
import torch as T
|
||||
|
||||
from tests import get_tests_path, get_tests_input_path
|
||||
from TTS.speaker_encoder.model import SpeakerEncoder
|
||||
from TTS.speaker_encoder.loss import GE2ELoss
|
||||
from TTS.utils.io import load_config
|
||||
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
|
||||
file_path = get_tests_input_path()
|
||||
c = load_config(os.path.join(file_path, "test_config.json"))
|
||||
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
import unittest
|
||||
import torch as T
|
||||
|
||||
from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
|
||||
from TTS.layers.losses import L1LossMasked
|
||||
from TTS.utils.generic_utils import sequence_mask
|
||||
from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder
|
||||
from TTS.tts.layers.losses import L1LossMasked
|
||||
from TTS.tts.utils.generic_utils import sequence_mask
|
||||
|
||||
# pylint: disable=unused-variable
|
||||
|
||||
|
|
|
@ -4,18 +4,18 @@ import shutil
|
|||
import torch
|
||||
import numpy as np
|
||||
|
||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from torch.utils.data import DataLoader
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.datasets import TTSDataset
|
||||
from TTS.datasets.preprocess import ljspeech
|
||||
from TTS.tts.datasets import TTSDataset
|
||||
from TTS.tts.datasets.preprocess import ljspeech
|
||||
|
||||
#pylint: disable=unused-variable
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
OUTPATH = os.path.join(file_path, "outputs/loader_tests/")
|
||||
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
|
||||
os.makedirs(OUTPATH, exist_ok=True)
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
ok_ljspeech = os.path.exists(c.data_path)
|
||||
|
||||
DATA_EXIST = True
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import unittest
|
||||
import os
|
||||
from TTS.tests import get_tests_input_path
|
||||
from tests import get_tests_input_path
|
||||
|
||||
from TTS.datasets.preprocess import common_voice
|
||||
from TTS.tts.datasets.preprocess import common_voice
|
||||
|
||||
|
||||
class TestPreprocessors(unittest.TestCase):
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import os
|
||||
import copy
|
||||
import torch
|
||||
import os
|
||||
import unittest
|
||||
import numpy as np
|
||||
|
||||
from torch import optim
|
||||
from torch import nn
|
||||
import torch
|
||||
from tests import get_tests_input_path
|
||||
from torch import nn, optim
|
||||
|
||||
from TTS.tts.layers.losses import MSELossMasked
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.layers.losses import MSELossMasked
|
||||
from TTS.models.tacotron2 import Tacotron2
|
||||
|
||||
#pylint: disable=unused-variable
|
||||
|
||||
|
@ -16,8 +16,7 @@ torch.manual_seed(1)
|
|||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
|
||||
class TacotronTrainTest(unittest.TestCase):
|
||||
|
|
|
@ -5,9 +5,11 @@ import numpy as np
|
|||
import tensorflow as tf
|
||||
tf.get_logger().setLevel('INFO')
|
||||
|
||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.tf.models.tacotron2 import Tacotron2
|
||||
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
|
||||
from TTS.tts.tf.models.tacotron2 import Tacotron2
|
||||
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
|
||||
|
||||
#pylint: disable=unused-variable
|
||||
|
||||
|
@ -15,8 +17,7 @@ torch.manual_seed(1)
|
|||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
|
||||
class TacotronTFTrainTest(unittest.TestCase):
|
|
@ -1,13 +1,14 @@
|
|||
import os
|
||||
import copy
|
||||
import torch
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from torch import optim
|
||||
from torch import nn
|
||||
import torch
|
||||
from tests import get_tests_input_path
|
||||
from torch import nn, optim
|
||||
|
||||
from TTS.tts.layers.losses import L1LossMasked
|
||||
from TTS.tts.models.tacotron import Tacotron
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.layers.losses import L1LossMasked
|
||||
from TTS.models.tacotron import Tacotron
|
||||
|
||||
#pylint: disable=unused-variable
|
||||
|
||||
|
@ -15,8 +16,7 @@ torch.manual_seed(1)
|
|||
use_cuda = torch.cuda.is_available()
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
c = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
|
||||
def count_parameters(model):
|
||||
|
|
|
@ -3,12 +3,12 @@ import os
|
|||
# pylint: disable=wildcard-import
|
||||
# pylint: disable=unused-import
|
||||
import unittest
|
||||
from TTS.utils.text import *
|
||||
from TTS.tests import get_tests_path
|
||||
from tests import get_tests_input_path
|
||||
from TTS.tts.utils.text import *
|
||||
from tests import get_tests_path
|
||||
from TTS.utils.io import load_config
|
||||
|
||||
TESTS_PATH = get_tests_path()
|
||||
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
|
||||
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
def test_phoneme_to_sequence():
|
||||
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
BASEDIR=$(dirname "$0")
|
||||
echo "$BASEDIR"
|
||||
# run training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json
|
||||
# find the training folder
|
||||
LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
|
||||
echo $LATEST_FOLDER
|
||||
# continue the previous training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
|
||||
# remove all the outputs
|
||||
rm -rf $BASEDIR/train_outputs/
|
|
@ -1,24 +1,24 @@
|
|||
import os
|
||||
|
||||
import numpy as np
|
||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from TTS.vocoder.datasets.gan_dataset import GANDataset
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.io import load_config
|
||||
|
||||
from TTS.vocoder.datasets.gan_dataset import GANDataset
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/")
|
||||
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
|
||||
os.makedirs(OUTPATH, exist_ok=True)
|
||||
|
||||
C = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
|
||||
test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/")
|
||||
test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
|
||||
ok_ljspeech = os.path.exists(test_data_path)
|
||||
|
||||
|
||||
|
||||
def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers):
|
||||
''' run dataloader with given parameters and check conditions '''
|
||||
ap = AudioProcessor(**C.audio)
|
|
@ -1,11 +1,11 @@
|
|||
import os
|
||||
|
||||
import torch
|
||||
from tests import get_tests_input_path, get_tests_output_path, get_tests_path
|
||||
|
||||
from TTS.vocoder.layers.losses import TorchSTFT, STFTLoss, MultiScaleSTFTLoss
|
||||
|
||||
from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.vocoder.layers.losses import MultiScaleSTFTLoss, STFTLoss, TorchSTFT
|
||||
|
||||
TESTS_PATH = get_tests_path()
|
||||
|
||||
|
@ -14,8 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True)
|
|||
|
||||
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
|
||||
|
||||
file_path = os.path.dirname(os.path.realpath(__file__))
|
||||
C = load_config(os.path.join(file_path, 'test_config.json'))
|
||||
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||
ap = AudioProcessor(**C.audio)
|
||||
|
||||
|
||||
|
@ -53,9 +52,3 @@ def test_multiscale_stft_loss():
|
|||
loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav))
|
||||
assert loss_sc < 1.0
|
||||
assert loss_m + loss_sc > 0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -4,7 +4,7 @@ import torch
|
|||
import soundfile as sf
|
||||
from librosa.core import load
|
||||
|
||||
from TTS.tests import get_tests_path, get_tests_input_path
|
||||
from tests import get_tests_path, get_tests_input_path
|
||||
from TTS.vocoder.layers.pqmf import PQMF
|
||||
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
BASEDIR=$(dirname "$0")
|
||||
echo "$BASEDIR"
|
||||
# create run dir
|
||||
mkdir $BASEDIR/train_outputs
|
||||
# run training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json
|
||||
# find the training folder
|
||||
LATEST_FOLDER=$(ls $BASEDIR/outputs/train_outputs/| sort | tail -1)
|
||||
echo $LATEST_FOLDER
|
||||
# continue the previous training
|
||||
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/outputs/train_outputs/$LATEST_FOLDER
|
||||
# remove all the outputs
|
||||
rm -rf $BASEDIR/train_outputs/
|
tf/README.md (20 lines)
|
@ -1,20 +0,0 @@
|
|||
## Utilities to Convert Models to Tensorflow2
|
||||
Here are experimental utilities to convert trained Torch models to Tensorflow (>=2.2).
|
||||
|
||||
Converting Torch models to TF enables the whole TF toolkit to be used for better deployment and device-specific optimizations.
|
||||
|
||||
Note that we do not plan to share training scripts for Tensorflow in the near future, but any contribution in that direction would be more than welcome.
|
||||
|
||||
To see how you can use the TF model at inference, check the notebook; a minimal loading sketch is also included at the end of this file.
|
||||
|
||||
This is an experimental release. If you encounter an error, please open an issue or, better yet, send a PR, but you are mostly on your own.
|
||||
|
||||
|
||||
### Converting a Model
|
||||
- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments.
|
||||
|
||||
### Known issues and limitations
|
||||
- We use a custom model load/save mechanism which enables us to store model-related information with model weights (similar to Torch). However, it is prone to random errors.
|
||||
- The current TF model implementation is slightly slower than the Torch model. Hopefully, it'll get better as TF support for eager mode and ```tf.function``` improves.
|
||||
- TF implementation of Tacotron2 only supports regular Tacotron2 as in the paper.
|
||||
- You can only convert models trained after the TF model implementation was added, since the model layers have been updated in the Torch model.
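
For a quick sanity check after conversion, a loading sketch like the one below can be used. This is only a sketch, not part of the original instructions: the `config.json` and `tf_model.pkl` paths and the dummy input ids are placeholders, the output tuple order is an assumption, and the helpers are the ones imported by the conversion scripts in this commit.

```python
import numpy as np
import tensorflow as tf

from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint

c = load_config("config.json")                 # training config (placeholder path)
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, c)           # 0 -> single speaker
model.build_inference()
model = load_checkpoint(model, "tf_model.pkl") # converted checkpoint (placeholder path)
model.decoder.set_max_decoder_steps(1000)

# dummy character ids stand in for a real text-to-sequence front-end
input_ids = tf.convert_to_tensor(np.random.randint(0, num_chars, size=(1, 64)))
outputs = model(input_ids)  # tuple of decoder/postnet outputs, alignments, stop tokens (order assumed)
```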
|
|
@ -1,37 +0,0 @@
|
|||
# Convert Tensorflow Tacotron2 model to TF-Lite binary
|
||||
|
||||
import argparse
|
||||
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.utils.text.symbols import symbols, phonemes
|
||||
from TTS.tf.utils.generic_utils import setup_model
|
||||
from TTS.tf.utils.io import load_checkpoint
|
||||
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--tf_model',
|
||||
type=str,
|
||||
help='Path to target torch model to be converted to TF.')
|
||||
parser.add_argument('--config_path',
|
||||
type=str,
|
||||
help='Path to config file of torch model.')
|
||||
parser.add_argument('--output_path',
|
||||
type=str,
|
||||
help='path to tflite output binary.')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set constants
|
||||
CONFIG = load_config(args.config_path)
|
||||
|
||||
# load the model
|
||||
c = CONFIG
|
||||
num_speakers = 0
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
model = setup_model(num_chars, num_speakers, c, enable_tflite=True)
|
||||
model.build_inference()
|
||||
model = load_checkpoint(model, args.tf_model)
|
||||
model.decoder.set_max_decoder_steps(1000)
|
||||
|
||||
# create tflite model
|
||||
tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)
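# Optional smoke-test sketch for the exported binary with the stock TF-Lite interpreter.
# It assumes the converter wrote the flatbuffer to `args.output_path`; the exact
# input/output shapes depend on how the model was exported.
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path=args.output_path)
interpreter.allocate_tensors()
print(interpreter.get_input_details())    # expected input shapes and dtypes
print(interpreter.get_output_details())   # decoder / postnet / alignment outputs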
|
|
@ -1,210 +0,0 @@
|
|||
# %%
|
||||
import sys
|
||||
sys.path.append('/home/erogol/Projects')
|
||||
import os
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = ''
|
||||
# %%
|
||||
import argparse
|
||||
import numpy as np
|
||||
import torch
|
||||
import tensorflow as tf
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from TTS.utils.text.symbols import phonemes, symbols
|
||||
from TTS.utils.generic_utils import setup_model
|
||||
from TTS.utils.io import load_config
|
||||
from TTS.tf.models.tacotron2 import Tacotron2
|
||||
from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
|
||||
from TTS.tf.utils.generic_utils import save_checkpoint
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--torch_model_path',
|
||||
type=str,
|
||||
help='Path to target torch model to be converted to TF.')
|
||||
parser.add_argument('--config_path',
|
||||
type=str,
|
||||
help='Path to config file of torch model.')
|
||||
parser.add_argument('--output_path',
|
||||
type=str,
|
||||
help='path to output file including file name to save TF model.')
|
||||
args = parser.parse_args()
|
||||
|
||||
# load model config
|
||||
config_path = args.config_path
|
||||
c = load_config(config_path)
|
||||
num_speakers = 0
|
||||
|
||||
# init torch model
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
model = setup_model(num_chars, num_speakers, c)
|
||||
checkpoint = torch.load(args.torch_model_path,
|
||||
map_location=torch.device('cpu'))
|
||||
state_dict = checkpoint['model']
|
||||
model.load_state_dict(state_dict)
|
||||
|
||||
# init tf model
|
||||
model_tf = Tacotron2(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=model.decoder.r,
|
||||
postnet_output_dim=c.audio['num_mels'],
|
||||
decoder_output_dim=c.audio['num_mels'],
|
||||
attn_type=c.attention_type,
|
||||
attn_win=c.windowing,
|
||||
attn_norm=c.attention_norm,
|
||||
prenet_type=c.prenet_type,
|
||||
prenet_dropout=c.prenet_dropout,
|
||||
forward_attn=c.use_forward_attn,
|
||||
trans_agent=c.transition_agent,
|
||||
forward_attn_mask=c.forward_attn_mask,
|
||||
location_attn=c.location_attn,
|
||||
attn_K=c.attention_heads,
|
||||
separate_stopnet=c.separate_stopnet,
|
||||
bidirectional_decoder=c.bidirectional_decoder)
|
||||
|
||||
# set initial layer mapping - these are not captured by the below heuristic approach
|
||||
# TODO: set layer names so that we can remove these manual matching
|
||||
common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
|
||||
var_map = [
|
||||
('embedding/embeddings:0', 'embedding.weight'),
|
||||
('encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
|
||||
'encoder.lstm.weight_ih_l0'),
|
||||
('encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
|
||||
'encoder.lstm.weight_hh_l0'),
|
||||
('encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
|
||||
'encoder.lstm.weight_ih_l0_reverse'),
|
||||
('encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
|
||||
'encoder.lstm.weight_hh_l0_reverse'),
|
||||
('encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
|
||||
('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
|
||||
('encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
|
||||
('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
|
||||
('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
|
||||
('decoder/linear_projection/kernel:0',
|
||||
'decoder.linear_projection.linear_layer.weight'),
|
||||
('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
|
||||
]
|
||||
|
||||
# %%
|
||||
# get tf_model graph
|
||||
mel_pred = model_tf.build_inference()
|
||||
|
||||
# get tf variables
|
||||
tf_vars = model_tf.weights
|
||||
|
||||
# match variable names with fuzzy logic
|
||||
torch_var_names = list(state_dict.keys())
|
||||
tf_var_names = [we.name for we in model_tf.weights]
|
||||
for tf_name in tf_var_names:
|
||||
# skip re-mapped layer names
|
||||
if tf_name in [name[0] for name in var_map]:
|
||||
continue
|
||||
tf_name_edited = convert_tf_name(tf_name)
|
||||
ratios = [
|
||||
fuzz.ratio(torch_name, tf_name_edited)
|
||||
for torch_name in torch_var_names
|
||||
]
|
||||
max_idx = np.argmax(ratios)
|
||||
matching_name = torch_var_names[max_idx]
|
||||
del torch_var_names[max_idx]
|
||||
var_map.append((tf_name, matching_name))
|
||||
|
||||
# %%
|
||||
# print variable match
|
||||
from pprint import pprint
|
||||
pprint(var_map)
|
||||
pprint(torch_var_names)
|
||||
|
||||
# pass weights
|
||||
tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)
|
||||
|
||||
# Compare TF and TORCH models
|
||||
# %%
|
||||
# check embedding outputs
|
||||
model.eval()
|
||||
input_ids = torch.randint(0, 24, (1, 128)).long()
|
||||
|
||||
o_t = model.embedding(input_ids)
|
||||
o_tf = model_tf.embedding(input_ids.detach().numpy())
|
||||
assert abs(o_t.detach().numpy() -
|
||||
o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
|
||||
o_tf.numpy()).sum()
|
||||
|
||||
# compare encoder outputs
|
||||
oo_en = model.encoder.inference(o_t.transpose(1, 2))
|
||||
ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
|
||||
assert compare_torch_tf(oo_en, ooo_en) < 1e-5
|
||||
|
||||
#pylint: disable=redefined-builtin
|
||||
# compare decoder.attention_rnn
|
||||
inp = torch.rand([1, 768])
|
||||
inp_tf = inp.numpy()
|
||||
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
|
||||
output, cell_state = model.decoder.attention_rnn(inp)
|
||||
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
|
||||
output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
|
||||
states[2],
|
||||
training=False)
|
||||
assert compare_torch_tf(output, output_tf).mean() < 1e-5
|
||||
|
||||
query = output
|
||||
inputs = torch.rand([1, 128, 512])
|
||||
query_tf = query.detach().numpy()
|
||||
inputs_tf = inputs.numpy()
|
||||
|
||||
# compare decoder.attention
|
||||
model.decoder.attention.init_states(inputs)
|
||||
processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
|
||||
loc_attn, proc_query = model.decoder.attention.get_location_attention(
|
||||
query, processes_inputs)
|
||||
context = model.decoder.attention(query, inputs, processes_inputs, None)
|
||||
|
||||
attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1]
|
||||
model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))
|
||||
loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states)
|
||||
context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False)
|
||||
|
||||
assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5
|
||||
assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5
|
||||
assert compare_torch_tf(context, context_tf) < 1e-5
|
||||
|
||||
# compare decoder.decoder_rnn
|
||||
input = torch.rand([1, 1536])
|
||||
input_tf = input.numpy()
|
||||
model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
|
||||
output, cell_state = model.decoder.decoder_rnn(
|
||||
input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
|
||||
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
|
||||
output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
|
||||
states[3],
|
||||
training=False)
|
||||
assert abs(input - input_tf).mean() < 1e-5
|
||||
assert compare_torch_tf(output, output_tf).mean() < 1e-5
|
||||
|
||||
# compare decoder.linear_projection
|
||||
input = torch.rand([1, 1536])
|
||||
input_tf = input.numpy()
|
||||
output = model.decoder.linear_projection(input)
|
||||
output_tf = model_tf.decoder.linear_projection(input_tf, training=False)
|
||||
assert compare_torch_tf(output, output_tf) < 1e-5
|
||||
|
||||
# compare decoder outputs
|
||||
model.decoder.max_decoder_steps = 100
|
||||
model_tf.decoder.set_max_decoder_steps(100)
|
||||
output, align, stop = model.decoder.inference(oo_en)
|
||||
states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
|
||||
output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
|
||||
assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4
|
||||
|
||||
# compare the whole model output
|
||||
outputs_torch = model.inference(input_ids)
|
||||
outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
|
||||
print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
|
||||
assert compare_torch_tf(outputs_torch[2][:, 50, :],
|
||||
outputs_tf[2][:, 50, :]) < 1e-5
|
||||
assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
|
||||
|
||||
# %%
|
||||
# save tf model
|
||||
save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
|
||||
checkpoint['r'], args.output_path)
|
||||
print(' > Model conversion is successfully completed :).')
|
|
@ -1,285 +0,0 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
from tensorflow.python.ops import math_ops
|
||||
# from tensorflow_addons.seq2seq import BahdanauAttention
|
||||
|
||||
|
||||
class Linear(keras.layers.Layer):
|
||||
def __init__(self, units, use_bias, **kwargs):
|
||||
super(Linear, self).__init__(**kwargs)
|
||||
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
|
||||
self.activation = keras.layers.ReLU()
|
||||
|
||||
def call(self, x):
|
||||
"""
|
||||
shapes:
|
||||
x: B x T x C
|
||||
"""
|
||||
return self.activation(self.linear_layer(x))
|
||||
|
||||
|
||||
class LinearBN(keras.layers.Layer):
|
||||
def __init__(self, units, use_bias, **kwargs):
|
||||
super(LinearBN, self).__init__(**kwargs)
|
||||
self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
|
||||
self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization')
|
||||
self.activation = keras.layers.ReLU()
|
||||
|
||||
def call(self, x, training=None):
|
||||
"""
|
||||
shapes:
|
||||
x: B x T x C
|
||||
"""
|
||||
out = self.linear_layer(x)
|
||||
out = self.batch_normalization(out, training=training)
|
||||
return self.activation(out)
|
||||
|
||||
|
||||
class Prenet(keras.layers.Layer):
|
||||
def __init__(self,
|
||||
prenet_type,
|
||||
prenet_dropout,
|
||||
units,
|
||||
bias,
|
||||
**kwargs):
|
||||
super(Prenet, self).__init__(**kwargs)
|
||||
self.prenet_type = prenet_type
|
||||
self.prenet_dropout = prenet_dropout
|
||||
self.linear_layers = []
|
||||
if prenet_type == "bn":
|
||||
self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
|
||||
elif prenet_type == "original":
|
||||
self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)]
|
||||
else:
|
||||
raise RuntimeError(' [!] Unknown prenet type.')
|
||||
if prenet_dropout:
|
||||
self.dropout = keras.layers.Dropout(rate=0.5)
|
||||
|
||||
def call(self, x, training=None):
|
||||
"""
|
||||
shapes:
|
||||
x: B x T x C
|
||||
"""
|
||||
for linear in self.linear_layers:
|
||||
if self.prenet_dropout:
|
||||
x = self.dropout(linear(x), training=training)
|
||||
else:
|
||||
x = linear(x)
|
||||
return x
|
||||
|
||||
|
||||
def _sigmoid_norm(score):
|
||||
attn_weights = tf.nn.sigmoid(score)
|
||||
attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True)
|
||||
return attn_weights
|
||||
|
||||
|
||||
class Attention(keras.layers.Layer):
|
||||
"""TODO: implement forward_attention
|
||||
TODO: location sensitive attention
|
||||
TODO: implement attention windowing """
|
||||
def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters,
|
||||
loc_attn_kernel_size, use_windowing, norm, use_forward_attn,
|
||||
use_trans_agent, use_forward_attn_mask, **kwargs):
|
||||
super(Attention, self).__init__(**kwargs)
|
||||
self.use_loc_attn = use_loc_attn
|
||||
self.loc_attn_n_filters = loc_attn_n_filters
|
||||
self.loc_attn_kernel_size = loc_attn_kernel_size
|
||||
self.use_windowing = use_windowing
|
||||
self.norm = norm
|
||||
self.use_forward_attn = use_forward_attn
|
||||
self.use_trans_agent = use_trans_agent
|
||||
self.use_forward_attn_mask = use_forward_attn_mask
|
||||
self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer')
|
||||
self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer')
|
||||
self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer')
|
||||
if use_loc_attn:
|
||||
self.location_conv1d = keras.layers.Conv1D(
|
||||
filters=loc_attn_n_filters,
|
||||
kernel_size=loc_attn_kernel_size,
|
||||
padding='same',
|
||||
use_bias=False,
|
||||
name='location_layer/location_conv1d')
|
||||
self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense')
|
||||
if norm == 'softmax':
|
||||
self.norm_func = tf.nn.softmax
|
||||
elif norm == 'sigmoid':
|
||||
self.norm_func = _sigmoid_norm
|
||||
else:
|
||||
raise ValueError("Unknown value for attention norm type")
|
||||
|
||||
def init_states(self, batch_size, value_length):
|
||||
states = []
|
||||
if self.use_loc_attn:
|
||||
attention_cum = tf.zeros([batch_size, value_length])
|
||||
attention_old = tf.zeros([batch_size, value_length])
|
||||
states = [attention_cum, attention_old]
|
||||
if self.use_forward_attn:
|
||||
alpha = tf.concat([
|
||||
tf.ones([batch_size, 1]),
|
||||
tf.zeros([batch_size, value_length])[:, :-1] + 1e-7
|
||||
], 1)
|
||||
states.append(alpha)
|
||||
return tuple(states)
|
||||
|
||||
def process_values(self, values):
|
||||
""" cache values for decoder iterations """
|
||||
#pylint: disable=attribute-defined-outside-init
|
||||
self.processed_values = self.inputs_layer(values)
|
||||
self.values = values
|
||||
|
||||
def get_loc_attn(self, query, states):
|
||||
""" compute location attention, query layer and
|
||||
unnorm. attention weights"""
|
||||
attention_cum, attention_old = states[:2]
|
||||
attn_cat = tf.stack([attention_old, attention_cum], axis=2)
|
||||
|
||||
processed_query = self.query_layer(tf.expand_dims(query, 1))
|
||||
processed_attn = self.location_dense(self.location_conv1d(attn_cat))
|
||||
score = self.v(
|
||||
tf.nn.tanh(self.processed_values + processed_query +
|
||||
processed_attn))
|
||||
score = tf.squeeze(score, axis=2)
|
||||
return score, processed_query
|
||||
|
||||
def get_attn(self, query):
|
||||
""" compute query layer and unnormalized attention weights """
|
||||
processed_query = self.query_layer(tf.expand_dims(query, 1))
|
||||
score = self.v(tf.nn.tanh(self.processed_values + processed_query))
|
||||
score = tf.squeeze(score, axis=2)
|
||||
return score, processed_query
|
||||
|
||||
def apply_score_masking(self, score, mask): #pylint: disable=no-self-use
|
||||
""" ignore sequence paddings """
|
||||
padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
|
||||
# Bias so padding positions do not contribute to attention distribution.
|
||||
score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
|
||||
return score
|
||||
|
||||
def apply_forward_attention(self, alignment, alpha): #pylint: disable=no-self-use
|
||||
# forward attention
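# blend the previous attention weights with their right-shifted copy (equal 0.5/0.5
# transition weights), gate by the current alignment, then renormalize below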
|
||||
fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0)
|
||||
# compute transition potentials
|
||||
new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment
|
||||
# renormalize attention weights
|
||||
new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True)
|
||||
return new_alpha
|
||||
|
||||
def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None):
|
||||
states = []
|
||||
if self.use_loc_attn:
|
||||
states = [old_states[0] + scores_norm, attn_weights]
|
||||
if self.use_forward_attn:
|
||||
states.append(new_alpha)
|
||||
return tuple(states)
|
||||
|
||||
def call(self, query, states):
|
||||
"""
|
||||
shapes:
|
||||
query: B x D
|
||||
"""
|
||||
if self.use_loc_attn:
|
||||
score, _ = self.get_loc_attn(query, states)
|
||||
else:
|
||||
score, _ = self.get_attn(query)
|
||||
|
||||
# TODO: masking
|
||||
# if mask is not None:
|
||||
# self.apply_score_masking(score, mask)
|
||||
# attn_weights shape == (batch_size, max_length, 1)
|
||||
|
||||
# normalize attention scores
|
||||
scores_norm = self.norm_func(score)
|
||||
attn_weights = scores_norm
|
||||
|
||||
# apply forward attention
|
||||
new_alpha = None
|
||||
if self.use_forward_attn:
|
||||
new_alpha = self.apply_forward_attention(attn_weights, states[-1])
|
||||
attn_weights = new_alpha
|
||||
|
||||
# update states tuple
|
||||
# states = (cum_attn_weights, attn_weights, new_alpha)
|
||||
states = self.update_states(states, scores_norm, attn_weights, new_alpha)
|
||||
|
||||
# context_vector shape after sum == (batch_size, hidden_size)
|
||||
context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False)
|
||||
context_vector = tf.squeeze(context_vector, axis=1)
|
||||
return context_vector, attn_weights, states
|
||||
|
||||
|
||||
# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b):
|
||||
# dtype = processed_query.dtype
|
||||
# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1]
|
||||
# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2])
|
||||
|
||||
|
||||
# class LocationSensitiveAttention(BahdanauAttention):
|
||||
# def __init__(self,
|
||||
# units,
|
||||
# memory=None,
|
||||
# memory_sequence_length=None,
|
||||
# normalize=False,
|
||||
# probability_fn="softmax",
|
||||
# kernel_initializer="glorot_uniform",
|
||||
# dtype=None,
|
||||
# name="LocationSensitiveAttention",
|
||||
# location_attention_filters=32,
|
||||
# location_attention_kernel_size=31):
|
||||
|
||||
# super(LocationSensitiveAttention,
|
||||
# self).__init__(units=units,
|
||||
# memory=memory,
|
||||
# memory_sequence_length=memory_sequence_length,
|
||||
# normalize=normalize,
|
||||
# probability_fn='softmax', ## parent module default
|
||||
# kernel_initializer=kernel_initializer,
|
||||
# dtype=dtype,
|
||||
# name=name)
|
||||
# if probability_fn == 'sigmoid':
|
||||
# self.probability_fn = lambda score, _: self._sigmoid_normalization(score)
|
||||
# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False)
|
||||
# self.location_dense = keras.layers.Dense(units, use_bias=False)
|
||||
# # self.v = keras.layers.Dense(1, use_bias=True)
|
||||
|
||||
# def _location_sensitive_score(self, processed_query, keys, processed_loc):
|
||||
# processed_query = tf.expand_dims(processed_query, 1)
|
||||
# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2])
|
||||
|
||||
# def _location_sensitive(self, alignment_cum, alignment_old):
|
||||
# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2)
|
||||
# return self.location_dense(self.location_conv(alignment_cat))
|
||||
|
||||
# def _sigmoid_normalization(self, score):
|
||||
# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True)
|
||||
|
||||
# # def _apply_masking(self, score, mask):
|
||||
# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
|
||||
# # # Bias so padding positions do not contribute to attention distribution.
|
||||
# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32)
|
||||
# # return score
|
||||
|
||||
# def _calculate_attention(self, query, state):
|
||||
# alignment_cum, alignment_old = state[:2]
|
||||
# processed_query = self.query_layer(
|
||||
# query) if self.query_layer else query
|
||||
# processed_loc = self._location_sensitive(alignment_cum, alignment_old)
|
||||
# score = self._location_sensitive_score(
|
||||
# processed_query,
|
||||
# self.keys,
|
||||
# processed_loc)
|
||||
# alignment = self.probability_fn(score, state)
|
||||
# alignment_cum = alignment_cum + alignment
|
||||
# state[0] = alignment_cum
|
||||
# state[1] = alignment
|
||||
# return alignment, state
|
||||
|
||||
# def compute_context(self, alignments):
|
||||
# expanded_alignments = tf.expand_dims(alignments, 1)
|
||||
# context = tf.matmul(expanded_alignments, self.values)
|
||||
# context = tf.squeeze(context, [1])
|
||||
# return context
|
||||
|
||||
# # def call(self, query, state):
|
||||
# # alignment, next_state = self._calculate_attention(query, state)
|
||||
# # return alignment, next_state
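For quick reference, here is a minimal sketch (not part of this file) of the forward-attention update that apply_forward_attention implements above, reproduced with NumPy on toy values; the 0.5/0.5 mixing weights and the 1e-8 floor mirror the constants hard-coded in that method.

import numpy as np

def forward_attention_step(alignment, alpha):
    # shift the previous forward weights one step to the right along time
    fwd_shifted_alpha = np.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0)
    # mix "stay" and "move forward" probabilities (both fixed to 0.5 in the layer)
    new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment
    # renormalize so every row sums to one
    return new_alpha / new_alpha.sum(axis=1, keepdims=True)

alignment = np.array([[0.1, 0.7, 0.2]])  # current normalized scores (B=1, T=3)
alpha = np.array([[1.0, 0.0, 0.0]])      # previous forward-attention state
print(forward_attention_step(alignment, alpha))  # mass only stays put or moves right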
|
|
@ -1,300 +0,0 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
from TTS.tf.utils.tf_utils import shape_list
|
||||
from TTS.tf.layers.common_layers import Prenet, Attention
|
||||
# from tensorflow_addons.seq2seq import AttentionWrapper
|
||||
|
||||
|
||||
class ConvBNBlock(keras.layers.Layer):
|
||||
def __init__(self, filters, kernel_size, activation, **kwargs):
|
||||
super(ConvBNBlock, self).__init__(**kwargs)
|
||||
self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d')
|
||||
self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization')
|
||||
self.dropout = keras.layers.Dropout(rate=0.5, name='dropout')
|
||||
self.activation = keras.layers.Activation(activation, name='activation')
|
||||
|
||||
def call(self, x, training=None):
|
||||
o = self.convolution1d(x)
|
||||
o = self.batch_normalization(o, training=training)
|
||||
o = self.activation(o)
|
||||
o = self.dropout(o, training=training)
|
||||
return o
|
||||
|
||||
|
||||
class Postnet(keras.layers.Layer):
|
||||
def __init__(self, output_filters, num_convs, **kwargs):
|
||||
super(Postnet, self).__init__(**kwargs)
|
||||
self.convolutions = []
|
||||
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0'))
|
||||
for idx in range(1, num_convs - 1):
|
||||
self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}'))
|
||||
self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}'))
|
||||
|
||||
def call(self, x, training=None):
|
||||
o = x
|
||||
for layer in self.convolutions:
|
||||
o = layer(o, training=training)
|
||||
return o
|
||||
|
||||
|
||||
class Encoder(keras.layers.Layer):
|
||||
def __init__(self, output_input_dim, **kwargs):
|
||||
super(Encoder, self).__init__(**kwargs)
|
||||
self.convolutions = []
|
||||
for idx in range(3):
|
||||
self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}'))
|
||||
self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm')
|
||||
|
||||
def call(self, x, training=None):
|
||||
o = x
|
||||
for layer in self.convolutions:
|
||||
o = layer(o, training=training)
|
||||
o = self.lstm(o)
|
||||
return o
|
||||
|
||||
|
||||
class Decoder(keras.layers.Layer):
|
||||
#pylint: disable=unused-argument
|
||||
def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type,
|
||||
prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask,
|
||||
use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, enable_tflite, **kwargs):
|
||||
super(Decoder, self).__init__(**kwargs)
|
||||
self.frame_dim = frame_dim
|
||||
self.r_init = tf.constant(r, dtype=tf.int32)
|
||||
self.r = tf.constant(r, dtype=tf.int32)
|
||||
self.output_dim = r * self.frame_dim
|
||||
self.separate_stopnet = separate_stopnet
|
||||
self.enable_tflite = enable_tflite
|
||||
|
||||
# layer constants
|
||||
self.max_decoder_steps = tf.constant(1000, dtype=tf.int32)
|
||||
self.stop_thresh = tf.constant(0.5, dtype=tf.float32)
|
||||
|
||||
# model dimensions
|
||||
self.query_dim = 1024
|
||||
self.decoder_rnn_dim = 1024
|
||||
self.prenet_dim = 256
|
||||
self.attn_dim = 128
|
||||
self.p_attention_dropout = 0.1
|
||||
self.p_decoder_dropout = 0.1
|
||||
|
||||
self.prenet = Prenet(prenet_type,
|
||||
prenet_dropout,
|
||||
[self.prenet_dim, self.prenet_dim],
|
||||
bias=False,
|
||||
name='prenet')
|
||||
self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name='attention_rnn', )
|
||||
self.attention_rnn_dropout = keras.layers.Dropout(0.5)
|
||||
|
||||
# TODO: implement other attn options
|
||||
self.attention = Attention(attn_dim=self.attn_dim,
|
||||
use_loc_attn=True,
|
||||
loc_attn_n_filters=32,
|
||||
loc_attn_kernel_size=31,
|
||||
use_windowing=False,
|
||||
norm=attn_norm,
|
||||
use_forward_attn=use_forward_attn,
|
||||
use_trans_agent=use_trans_agent,
|
||||
use_forward_attn_mask=use_forward_attn_mask,
|
||||
name='attention')
|
||||
self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name='decoder_rnn')
|
||||
self.decoder_rnn_dropout = keras.layers.Dropout(0.5)
|
||||
self.linear_projection = keras.layers.Dense(self.frame_dim * r, name='linear_projection/linear_layer')
|
||||
self.stopnet = keras.layers.Dense(1, name='stopnet/linear_layer')
|
||||
|
||||
|
||||
def set_max_decoder_steps(self, new_max_steps):
|
||||
self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32)
|
||||
|
||||
def set_r(self, new_r):
|
||||
self.r = tf.constant(new_r, dtype=tf.int32)
|
||||
self.output_dim = self.frame_dim * new_r
|
||||
|
||||
def build_decoder_initial_states(self, batch_size, memory_dim, memory_length):
|
||||
zero_frame = tf.zeros([batch_size, self.frame_dim])
|
||||
zero_context = tf.zeros([batch_size, memory_dim])
|
||||
attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
|
||||
decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32)
|
||||
attention_states = self.attention.init_states(batch_size, memory_length)
|
||||
return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states
|
||||
|
||||
def step(self, prenet_next, states,
|
||||
memory_seq_length=None, training=None):
|
||||
_, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states
|
||||
attention_rnn_input = tf.concat([prenet_next, context_next], -1)
|
||||
attention_rnn_output, attention_rnn_state = \
|
||||
self.attention_rnn(attention_rnn_input,
|
||||
attention_rnn_state, training=training)
|
||||
attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training)
|
||||
context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training)
|
||||
decoder_rnn_input = tf.concat([attention_rnn_output, context], -1)
|
||||
decoder_rnn_output, decoder_rnn_state = \
|
||||
self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training)
|
||||
decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training)
|
||||
linear_projection_input = tf.concat([decoder_rnn_output, context], -1)
|
||||
output_frame = self.linear_projection(linear_projection_input, training=training)
|
||||
stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1)
|
||||
stopnet_output = self.stopnet(stopnet_input, training=training)
|
||||
output_frame = output_frame[:, :self.r * self.frame_dim]
|
||||
states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states)
|
||||
return output_frame, stopnet_output, states, attention
|
||||
|
||||
def decode(self, memory, states, frames, memory_seq_length=None):
|
||||
B, _, _ = shape_list(memory)
|
||||
num_iter = shape_list(frames)[1] // self.r
|
||||
# init states
|
||||
frame_zero = tf.expand_dims(states[0], 1)
|
||||
frames = tf.concat([frame_zero, frames], axis=1)
|
||||
outputs = tf.TensorArray(dtype=tf.float32, size=num_iter)
|
||||
attentions = tf.TensorArray(dtype=tf.float32, size=num_iter)
|
||||
stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter)
|
||||
# pre-computes
|
||||
self.attention.process_values(memory)
|
||||
prenet_output = self.prenet(frames, training=True)
|
||||
step_count = tf.constant(0, dtype=tf.int32)
|
||||
|
||||
def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions):
|
||||
prenet_next = prenet_output[:, step]
|
||||
output, stop_token, states, attention = self.step(prenet_next,
|
||||
states,
|
||||
memory_seq_length)
|
||||
outputs = outputs.write(step, output)
|
||||
attentions = attentions.write(step, attention)
|
||||
stop_tokens = stop_tokens.write(step, stop_token)
|
||||
return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions
|
||||
_, memory, _, states, outputs, stop_tokens, attentions = \
|
||||
tf.while_loop(lambda *arg: True,
|
||||
_body,
|
||||
loop_vars=(step_count, memory, prenet_output,
|
||||
states, outputs, stop_tokens, attentions),
|
||||
parallel_iterations=32,
|
||||
swap_memory=True,
|
||||
maximum_iterations=num_iter)
|
||||
|
||||
outputs = outputs.stack()
|
||||
attentions = attentions.stack()
|
||||
stop_tokens = stop_tokens.stack()
|
||||
outputs = tf.transpose(outputs, [1, 0, 2])
|
||||
attentions = tf.transpose(attentions, [1, 0, 2])
|
||||
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
|
||||
stop_tokens = tf.squeeze(stop_tokens, axis=2)
|
||||
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
|
||||
return outputs, stop_tokens, attentions
|
||||
|
||||
def decode_inference(self, memory, states):
|
||||
B, _, _ = shape_list(memory)
|
||||
# init states
|
||||
outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
|
||||
attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
|
||||
stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
|
||||
|
||||
# pre-computes
|
||||
self.attention.process_values(memory)
|
||||
|
||||
# iter vars
|
||||
stop_flag = tf.constant(False, dtype=tf.bool)
|
||||
step_count = tf.constant(0, dtype=tf.int32)
|
||||
|
||||
def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag):
|
||||
frame_next = states[0]
|
||||
prenet_next = self.prenet(frame_next, training=False)
|
||||
output, stop_token, states, attention = self.step(prenet_next,
|
||||
states,
|
||||
None,
|
||||
training=False)
|
||||
stop_token = tf.math.sigmoid(stop_token)
|
||||
outputs = outputs.write(step, output)
|
||||
attentions = attentions.write(step, attention)
|
||||
stop_tokens = stop_tokens.write(step, stop_token)
|
||||
stop_flag = tf.greater(stop_token, self.stop_thresh)
|
||||
stop_flag = tf.reduce_all(stop_flag)
|
||||
return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag
|
||||
|
||||
cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
|
||||
_, memory, states, outputs, stop_tokens, attentions, stop_flag = \
|
||||
tf.while_loop(cond,
|
||||
_body,
|
||||
loop_vars=(step_count, memory, states, outputs,
|
||||
stop_tokens, attentions, stop_flag),
|
||||
parallel_iterations=32,
|
||||
swap_memory=True,
|
||||
maximum_iterations=self.max_decoder_steps)
|
||||
|
||||
outputs = outputs.stack()
|
||||
attentions = attentions.stack()
|
||||
stop_tokens = stop_tokens.stack()
|
||||
|
||||
outputs = tf.transpose(outputs, [1, 0, 2])
|
||||
attentions = tf.transpose(attentions, [1, 0, 2])
|
||||
stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
|
||||
stop_tokens = tf.squeeze(stop_tokens, axis=2)
|
||||
outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
|
||||
return outputs, stop_tokens, attentions
|
||||
|
||||
def decode_inference_tflite(self, memory, states):
|
||||
"""Inference with TF-Lite compatibility. It assumes
|
||||
batch_size is 1"""
|
||||
# init states
|
||||
# dynamic_shape is not supported in TFLite
|
||||
outputs = tf.TensorArray(dtype=tf.float32,
|
||||
size=self.max_decoder_steps,
|
||||
element_shape=tf.TensorShape(
|
||||
[self.output_dim]),
|
||||
clear_after_read=False,
|
||||
dynamic_size=False)
|
||||
# stop_flags = tf.TensorArray(dtype=tf.bool,
|
||||
# size=self.max_decoder_steps,
|
||||
# element_shape=tf.TensorShape(
|
||||
# []),
|
||||
# clear_after_read=False,
|
||||
# dynamic_size=False)
|
||||
attentions = ()
|
||||
stop_tokens = ()
|
||||
|
||||
# pre-computes
|
||||
self.attention.process_values(memory)
|
||||
|
||||
# iter vars
|
||||
stop_flag = tf.constant(False, dtype=tf.bool)
|
||||
step_count = tf.constant(0, dtype=tf.int32)
|
||||
|
||||
def _body(step, memory, states, outputs, stop_flag):
|
||||
frame_next = states[0]
|
||||
prenet_next = self.prenet(frame_next, training=False)
|
||||
output, stop_token, states, _ = self.step(prenet_next,
|
||||
states,
|
||||
None,
|
||||
training=False)
|
||||
stop_token = tf.math.sigmoid(stop_token)
|
||||
stop_flag = tf.greater(stop_token, self.stop_thresh)
|
||||
stop_flag = tf.reduce_all(stop_flag)
|
||||
# stop_flags = stop_flags.write(step, tf.logical_not(stop_flag))
|
||||
|
||||
outputs = outputs.write(step, tf.reshape(output, [-1]))
|
||||
return step + 1, memory, states, outputs, stop_flag
|
||||
|
||||
cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
|
||||
step_count, memory, states, outputs, stop_flag = \
|
||||
tf.while_loop(cond,
|
||||
_body,
|
||||
loop_vars=(step_count, memory, states, outputs,
|
||||
stop_flag),
|
||||
parallel_iterations=32,
|
||||
swap_memory=True,
|
||||
maximum_iterations=self.max_decoder_steps)
|
||||
|
||||
|
||||
outputs = outputs.stack()
|
||||
outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter
|
||||
outputs = tf.expand_dims(outputs, axis=[0])
|
||||
outputs = tf.transpose(outputs, [1, 0, 2])
|
||||
outputs = tf.reshape(outputs, [1, -1, self.frame_dim])
|
||||
return outputs, stop_tokens, attentions
|
||||
|
||||
|
||||
def call(self, memory, states, frames=None, memory_seq_length=None, training=False):
|
||||
if training:
|
||||
return self.decode(memory, states, frames, memory_seq_length)
|
||||
if self.enable_tflite:
|
||||
return self.decode_inference_tflite(memory, states)
|
||||
return self.decode_inference(memory, states)
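A small hedged illustration of the stopping rule used by decode_inference and decode_inference_tflite above: the raw stopnet logit is squashed with a sigmoid, and the while_loop ends once every item in the batch exceeds stop_thresh (0.5) or max_decoder_steps is reached. The values below are toy numbers, not repo outputs.

import tensorflow as tf

stop_logit = tf.constant([[2.3]])                      # toy stopnet output, batch of 1
stop_prob = tf.math.sigmoid(stop_logit)                # ~0.91
stop_flag = tf.reduce_all(tf.greater(stop_prob, 0.5))  # all batch items past threshold?
print(bool(stop_flag))                                 # True -> decoding halts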
|
|
@ -1,108 +0,0 @@
|
|||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
|
||||
from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
|
||||
from TTS.tf.utils.tf_utils import shape_list
|
||||
|
||||
|
||||
#pylint: disable=too-many-ancestors
|
||||
class Tacotron2(keras.models.Model):
|
||||
def __init__(self,
|
||||
num_chars,
|
||||
num_speakers,
|
||||
r,
|
||||
postnet_output_dim=80,
|
||||
decoder_output_dim=80,
|
||||
attn_type='original',
|
||||
attn_win=False,
|
||||
attn_norm="softmax",
|
||||
attn_K=4,
|
||||
prenet_type="original",
|
||||
prenet_dropout=True,
|
||||
forward_attn=False,
|
||||
trans_agent=False,
|
||||
forward_attn_mask=False,
|
||||
location_attn=True,
|
||||
separate_stopnet=True,
|
||||
bidirectional_decoder=False,
|
||||
enable_tflite=False):
|
||||
super(Tacotron2, self).__init__()
|
||||
self.r = r
|
||||
self.decoder_output_dim = decoder_output_dim
|
||||
self.postnet_output_dim = postnet_output_dim
|
||||
self.bidirectional_decoder = bidirectional_decoder
|
||||
self.num_speakers = num_speakers
|
||||
self.speaker_embed_dim = 256
|
||||
self.enable_tflite = enable_tflite
|
||||
|
||||
self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
|
||||
self.encoder = Encoder(512, name='encoder')
|
||||
# TODO: most of the decoder args have no use at the moment
|
||||
self.decoder = Decoder(decoder_output_dim,
|
||||
r,
|
||||
attn_type=attn_type,
|
||||
use_attn_win=attn_win,
|
||||
attn_norm=attn_norm,
|
||||
prenet_type=prenet_type,
|
||||
prenet_dropout=prenet_dropout,
|
||||
use_forward_attn=forward_attn,
|
||||
use_trans_agent=trans_agent,
|
||||
use_forward_attn_mask=forward_attn_mask,
|
||||
use_location_attn=location_attn,
|
||||
attn_K=attn_K,
|
||||
separate_stopnet=separate_stopnet,
|
||||
speaker_emb_dim=self.speaker_embed_dim,
|
||||
name='decoder',
|
||||
enable_tflite=enable_tflite)
|
||||
self.postnet = Postnet(postnet_output_dim, 5, name='postnet')
|
||||
|
||||
@tf.function(experimental_relax_shapes=True)
|
||||
def call(self, characters, text_lengths=None, frames=None, training=None):
|
||||
if training:
|
||||
return self.training(characters, text_lengths, frames)
|
||||
if not training:
|
||||
return self.inference(characters)
|
||||
raise RuntimeError(' [!] Set model training mode True or False')
|
||||
|
||||
def training(self, characters, text_lengths, frames):
|
||||
B, T = shape_list(characters)
|
||||
embedding_vectors = self.embedding(characters, training=True)
|
||||
encoder_output = self.encoder(embedding_vectors, training=True)
|
||||
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
|
||||
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True)
|
||||
postnet_frames = self.postnet(decoder_frames, training=True)
|
||||
output_frames = decoder_frames + postnet_frames
|
||||
return decoder_frames, output_frames, attentions, stop_tokens
|
||||
|
||||
def inference(self, characters):
|
||||
B, T = shape_list(characters)
|
||||
embedding_vectors = self.embedding(characters, training=False)
|
||||
encoder_output = self.encoder(embedding_vectors, training=False)
|
||||
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
|
||||
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
|
||||
postnet_frames = self.postnet(decoder_frames, training=False)
|
||||
output_frames = decoder_frames + postnet_frames
|
||||
print(output_frames.shape)
|
||||
return decoder_frames, output_frames, attentions, stop_tokens
|
||||
|
||||
@tf.function(
|
||||
experimental_relax_shapes=True,
|
||||
input_signature=[
|
||||
tf.TensorSpec([1, None], dtype=tf.int32),
|
||||
],)
|
||||
def inference_tflite(self, characters):
|
||||
B, T = shape_list(characters)
|
||||
embedding_vectors = self.embedding(characters, training=False)
|
||||
encoder_output = self.encoder(embedding_vectors, training=False)
|
||||
decoder_states = self.decoder.build_decoder_initial_states(B, 512, T)
|
||||
decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False)
|
||||
postnet_frames = self.postnet(decoder_frames, training=False)
|
||||
output_frames = decoder_frames + postnet_frames
|
||||
print(output_frames.shape)
|
||||
return decoder_frames, output_frames, attentions, stop_tokens
|
||||
|
||||
def build_inference(self, ):
|
||||
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
|
||||
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
|
||||
self(input_ids)
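For orientation, a hedged usage sketch of this model (it mirrors the benchmark notebook further below); the checkpoint path and the vocabulary size are placeholders, not files or values shipped with the repo.

import tensorflow as tf
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.generic_utils import load_checkpoint

model = Tacotron2(num_chars=130, num_speakers=0, r=1)   # 130 is an assumed vocab size
model.build_inference()                                  # dummy forward pass creates the variables
model = load_checkpoint(model, '/path/to/tts_tf_checkpoint.pkl')  # placeholder path
ids = tf.convert_to_tensor([[10, 4, 22, 7]], dtype=tf.int32)      # toy character ids
decoder_frames, output_frames, alignments, stop_tokens = model(ids, training=False)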
|
||||
|
|
@ -1,714 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"This is to test TTS tensorflow models with benchmark sentences.\n",
|
||||
"\n",
|
||||
"Before running this script please DON'T FORGET: \n",
|
||||
"- to set file paths.\n",
|
||||
"- to download related models.\n",
|
||||
" - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n",
|
||||
"- download or clone related repos, linked below.\n",
|
||||
"- setup the repositories. ```python setup.py install```\n",
|
||||
"- to checkout right commit versions (given next to the model in the models page).\n",
|
||||
"- to set the file paths below.\n",
|
||||
"\n",
|
||||
"Repositories:\n",
|
||||
"- TTS: https://github.com/mozilla/TTS\n",
|
||||
"- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n",
|
||||
"\n",
|
||||
"Known Issues:\n",
|
||||
"- To load the model second time you need to restart the notebook kernel. \n",
|
||||
"- Some of the advance methods are not yet implemented for Tensorflow."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false",
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# you may need to change this depending on your system\n",
|
||||
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
|
||||
"\n",
|
||||
"import sys\n",
|
||||
"import io\n",
|
||||
"import torch \n",
|
||||
"import tensorflow as tf\n",
|
||||
"print(tf.config.list_physical_devices('GPU'))\n",
|
||||
"\n",
|
||||
"import time\n",
|
||||
"import json\n",
|
||||
"import yaml\n",
|
||||
"import numpy as np\n",
|
||||
"from collections import OrderedDict\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"plt.rcParams[\"figure.figsize\"] = (16,5)\n",
|
||||
"\n",
|
||||
"import librosa\n",
|
||||
"import librosa.display\n",
|
||||
"\n",
|
||||
"from TTS.tf.models.tacotron2 import Tacotron2\n",
|
||||
"from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.io import load_config\n",
|
||||
"from TTS.utils.synthesis import synthesis\n",
|
||||
"from TTS.utils.visual import visualize\n",
|
||||
"\n",
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
|
||||
" t_1 = time.time()\n",
|
||||
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n",
|
||||
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
|
||||
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
|
||||
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
|
||||
" print(mel_postnet_spec.shape)\n",
|
||||
" print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
||||
" mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n",
|
||||
" if use_cuda and not use_gl:\n",
|
||||
" waveform = waveform.cpu()\n",
|
||||
" waveform = waveform.numpy()\n",
|
||||
" waveform = waveform.squeeze()\n",
|
||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
||||
" print(waveform.shape)\n",
|
||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
||||
" if figures: \n",
|
||||
" visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n",
|
||||
" IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=True)) \n",
|
||||
" os.makedirs(OUT_FOLDER, exist_ok=True)\n",
|
||||
" file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
|
||||
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
|
||||
" ap.save_wav(waveform, out_path)\n",
|
||||
" return alignment, mel_postnet_spec, stop_tokens, waveform"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set constants\n",
|
||||
"ROOT_PATH = '../torch_model/'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n",
|
||||
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
|
||||
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
|
||||
"CONFIG = load_config(CONFIG_PATH)\n",
|
||||
"# Run FLAGs\n",
|
||||
"use_cuda = True # use the available GPU (only for torch)\n",
|
||||
"# Set the vocoder\n",
|
||||
"use_gl = True # use GL if True\n",
|
||||
"BACKEND = 'tf' # set the backend for inference "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false",
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n",
|
||||
"from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n",
|
||||
"c = CONFIG\n",
|
||||
"num_speakers = 0\n",
|
||||
"r = 1\n",
|
||||
"num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, num_speakers, c)\n",
|
||||
"\n",
|
||||
"# before loading weights you need to run the model once to generate the variables\n",
|
||||
"input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n",
|
||||
"mel_pred = model(input_ids, training=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false",
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = load_checkpoint(model, MODEL_PATH)\n",
|
||||
"# model = tf.function(model, experimental_relax_shapes=True)\n",
|
||||
"ap = AudioProcessor(**CONFIG.audio) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# wrapper class to use tf.function\n",
|
||||
"class ModelInference(tf.keras.Model):\n",
|
||||
" def __init__(self, model):\n",
|
||||
" super(ModelInference, self).__init__()\n",
|
||||
" self.model = model\n",
|
||||
" \n",
|
||||
" @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n",
|
||||
" def call(self, characters):\n",
|
||||
" return self.model(characters, training=False)\n",
|
||||
" \n",
|
||||
"model = ModelInference(model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# LOAD WAVERNN\n",
|
||||
"if use_gl == False:\n",
|
||||
" from parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n",
|
||||
" \n",
|
||||
" vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n",
|
||||
" vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n",
|
||||
" vocoder_model.remove_weight_norm()\n",
|
||||
" ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
|
||||
" if use_cuda:\n",
|
||||
" vocoder_model.cuda()\n",
|
||||
" vocoder_model.eval();\n",
|
||||
" print(count_parameters(vocoder_model))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Comparision with https://mycroft.ai/blog/available-voices/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### https://espnet.github.io/icassp2020-tts/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The Commission also recommends\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Other examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Comparison with https://keithito.github.io/audio-samples/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \" He has read the whole thing.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"He reads books.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Thisss isrealy awhsome.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This is your internet browser, Firefox.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"This is your internet browser Firefox.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Eren, how are you?\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Hard Sentences"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Encouraged, he started with a minute a day.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"If he decided to watch TV he really watched it.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# for twb dataset\n",
|
||||
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
|
||||
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wavs = []\n",
|
||||
"model.eval()\n",
|
||||
"model.decoder.prenet.eval()\n",
|
||||
"model.decoder.max_decoder_steps = 2000\n",
|
||||
"# model.decoder.prenet.train()\n",
|
||||
"speaker_id = None\n",
|
||||
"sentence = '''This is App Store Optimization report.\n",
|
||||
"The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and what’s new section.\n",
|
||||
"In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n",
|
||||
"Product name is the actual app name on the AppStore or Google Play Store.\n",
|
||||
"Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n",
|
||||
"Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n",
|
||||
"If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
|
||||
"If you scroll down in the widget, you can see the older app versions for the same apps. Or you can filter Datetime to see a specific timeframe and the apps’ Store presence back then.\n",
|
||||
"You can also filter for a specific app using Product Name.\n",
|
||||
"If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n",
|
||||
"'''\n",
|
||||
"\n",
|
||||
"for s in sentence.split('\\n'):\n",
|
||||
" print(s)\n",
|
||||
" align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n",
|
||||
" wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
|
|
@ -1,81 +0,0 @@
|
|||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def tf_create_dummy_inputs():
|
||||
""" Create dummy inputs for TF Tacotron2 model """
|
||||
batch_size = 4
|
||||
max_input_length = 32
|
||||
max_mel_length = 128
|
||||
pad = 1
|
||||
n_chars = 24
|
||||
input_ids = tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32)
|
||||
input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size])
|
||||
input_lengths[-1] = max_input_length
|
||||
input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
|
||||
mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
|
||||
mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
|
||||
mel_lengths[-1] = max_mel_length
|
||||
mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
|
||||
return input_ids, input_lengths, mel_outputs, mel_lengths
|
||||
|
||||
|
||||
def compare_torch_tf(torch_tensor, tf_tensor):
|
||||
""" Compute the average absolute difference b/w torch and tf tensors """
|
||||
return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()
|
||||
|
||||
|
||||
def convert_tf_name(tf_name):
|
||||
""" Convert certain patterns in TF layer names to Torch patterns """
|
||||
tf_name_tmp = tf_name
|
||||
tf_name_tmp = tf_name_tmp.replace(':0', '')
|
||||
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0')
|
||||
tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1')
|
||||
tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh')
|
||||
tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight')
|
||||
tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight')
|
||||
tf_name_tmp = tf_name_tmp.replace('/beta', '/bias')
|
||||
tf_name_tmp = tf_name_tmp.replace('/', '.')
|
||||
return tf_name_tmp
|
||||
|
||||
|
||||
def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
|
||||
""" Transfer weigths from torch state_dict to TF variables """
|
||||
print(" > Passing weights from Torch to TF ...")
|
||||
for tf_var in tf_vars:
|
||||
torch_var_name = var_map_dict[tf_var.name]
|
||||
print(f' | > {tf_var.name} <-- {torch_var_name}')
|
||||
# if tuple, it is a bias variable
|
||||
if not isinstance(torch_var_name, tuple):
|
||||
torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
|
||||
torch_weight = state_dict[torch_var_name]
|
||||
if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:
|
||||
# out_dim, in_dim, filter -> filter, in_dim, out_dim
|
||||
numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy()
|
||||
elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name:
|
||||
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
|
||||
# if variable is for bidirectional lstm and it is a bias vector there
|
||||
# needs to be pre-defined two matching torch bias vectors
|
||||
elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name:
|
||||
bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name]
|
||||
assert len(bias_vectors) == 2
|
||||
numpy_weight = bias_vectors[0] + bias_vectors[1]
|
||||
elif 'rnn' in tf_var.name and 'kernel' in tf_var.name:
|
||||
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
|
||||
elif 'rnn' in tf_var.name and 'bias' in tf_var.name:
|
||||
bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key]
|
||||
assert len(bias_vectors) == 2
|
||||
numpy_weight = bias_vectors[0] + bias_vectors[1]
|
||||
elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name:
|
||||
numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy()
|
||||
else:
|
||||
numpy_weight = torch_weight.detach().cpu().numpy()
|
||||
assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes do not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}"
|
||||
tf.keras.backend.set_value(tf_var, numpy_weight)
|
||||
return tf_vars
|
||||
|
||||
|
||||
def load_tf_vars(model_tf, tf_vars):
|
||||
for tf_var in tf_vars:
|
||||
model_tf.get_layer(tf_var.name).set_weights(tf_var)
|
||||
return model_tf
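Two of the helpers above are easiest to understand from a tiny hedged example; the layer name and tensors below are made up for illustration and are not taken from a real checkpoint.

import numpy as np
import torch
import tensorflow as tf

print(convert_tf_name('encoder/convolutions_0/convolution1d/kernel:0'))
# -> 'encoder.convolutions_0.convolution1d.weight'

x = np.random.rand(2, 3).astype('float32')
print(compare_torch_tf(torch.tensor(x), tf.convert_to_tensor(x)))  # ~0.0 for identical tensors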
|
|
@ -1,104 +0,0 @@
|
|||
import os
|
||||
import datetime
|
||||
import importlib
|
||||
import pickle
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
|
||||
state = {
|
||||
'model': model.weights,
|
||||
'optimizer': optimizer,
|
||||
'step': current_step,
|
||||
'epoch': epoch,
|
||||
'date': datetime.date.today().strftime("%B %d, %Y"),
|
||||
'r': r
|
||||
}
|
||||
state.update(kwargs)
|
||||
pickle.dump(state, open(output_path, 'wb'))
|
||||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path):
|
||||
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
|
||||
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
|
||||
tf_vars = model.weights
|
||||
for tf_var in tf_vars:
|
||||
layer_name = tf_var.name
|
||||
try:
|
||||
chkp_var_value = chkp_var_dict[layer_name]
|
||||
except KeyError:
|
||||
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
|
||||
layer_name = f"{class_name}/{layer_name}"
|
||||
chkp_var_value = chkp_var_dict[layer_name]
|
||||
|
||||
tf.keras.backend.set_value(tf_var, chkp_var_value)
|
||||
if 'r' in checkpoint.keys():
|
||||
model.decoder.set_r(checkpoint['r'])
|
||||
return model
|
||||
|
||||
|
||||
def sequence_mask(sequence_length, max_len=None):
|
||||
if max_len is None:
|
||||
max_len = sequence_length.max()
|
||||
batch_size = sequence_length.size(0)
|
||||
seq_range = np.empty([0, max_len], dtype=np.int8)
|
||||
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
|
||||
if sequence_length.is_cuda:
|
||||
seq_range_expand = seq_range_expand.cuda()
|
||||
seq_length_expand = (
|
||||
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
|
||||
# B x T_max
|
||||
return seq_range_expand < seq_length_expand
|
||||
|
||||
|
||||
# @tf.custom_gradient
|
||||
def check_gradient(x, grad_clip):
|
||||
x_normed = tf.clip_by_norm(x, grad_clip)
|
||||
grad_norm = tf.norm(grad_clip)
|
||||
return x_normed, grad_norm
|
||||
|
||||
|
||||
def count_parameters(model, c):
|
||||
try:
|
||||
return model.count_params()
|
||||
except RuntimeError:
|
||||
input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
|
||||
input_lengths = np.random.randint(100, 129, (8, ))
|
||||
input_lengths[-1] = 128
|
||||
input_lengths = tf.convert_to_tensor(input_lengths.astype('int32'))
|
||||
mel_spec = np.random.rand(8, 2 * c.r,
|
||||
c.audio['num_mels']).astype('float32')
|
||||
mel_spec = tf.convert_to_tensor(mel_spec)
|
||||
speaker_ids = np.random.randint(
|
||||
0, 5, (8, )) if c.use_speaker_embedding else None
|
||||
_ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids)
|
||||
return model.count_params()
|
||||
|
||||
|
||||
def setup_model(num_chars, num_speakers, c, enable_tflite=False):
|
||||
print(" > Using model: {}".format(c.model))
|
||||
MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
|
||||
MyModel = getattr(MyModel, c.model)
|
||||
if c.model.lower() in "tacotron":
|
||||
raise NotImplementedError(' [!] Tacotron model is not ready.')
|
||||
# tacotron2
|
||||
model = MyModel(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=c.r,
|
||||
postnet_output_dim=c.audio['num_mels'],
|
||||
decoder_output_dim=c.audio['num_mels'],
|
||||
attn_type=c.attention_type,
|
||||
attn_win=c.windowing,
|
||||
attn_norm=c.attention_norm,
|
||||
prenet_type=c.prenet_type,
|
||||
prenet_dropout=c.prenet_dropout,
|
||||
forward_attn=c.use_forward_attn,
|
||||
trans_agent=c.transition_agent,
|
||||
forward_attn_mask=c.forward_attn_mask,
|
||||
location_attn=c.location_attn,
|
||||
attn_K=c.attention_heads,
|
||||
separate_stopnet=c.separate_stopnet,
|
||||
bidirectional_decoder=c.bidirectional_decoder,
|
||||
enable_tflite=enable_tflite)
|
||||
return model
|
|
@ -1,42 +0,0 @@
|
|||
import pickle
|
||||
import datetime
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs):
|
||||
state = {
|
||||
'model': model.weights,
|
||||
'optimizer': optimizer,
|
||||
'step': current_step,
|
||||
'epoch': epoch,
|
||||
'date': datetime.date.today().strftime("%B %d, %Y"),
|
||||
'r': r
|
||||
}
|
||||
state.update(kwargs)
|
||||
pickle.dump(state, open(output_path, 'wb'))
|
||||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path):
|
||||
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
|
||||
chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
|
||||
tf_vars = model.weights
|
||||
for tf_var in tf_vars:
|
||||
layer_name = tf_var.name
|
||||
try:
|
||||
chkp_var_value = chkp_var_dict[layer_name]
|
||||
except KeyError:
|
||||
class_name = list(chkp_var_dict.keys())[0].split("/")[0]
|
||||
layer_name = f"{class_name}/{layer_name}"
|
||||
chkp_var_value = chkp_var_dict[layer_name]
|
||||
|
||||
tf.keras.backend.set_value(tf_var, chkp_var_value)
|
||||
if 'r' in checkpoint.keys():
|
||||
model.decoder.set_r(checkpoint['r'])
|
||||
return model
|
||||
|
||||
|
||||
def load_tflite_model(tflite_path):
|
||||
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
|
||||
tflite_model.allocate_tensors()
|
||||
return tflite_model
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
import tensorflow as tf
|
||||
|
||||
|
||||
def shape_list(x):
|
||||
"""Deal with dynamic shape in tensorflow cleanly."""
|
||||
static = x.shape.as_list()
|
||||
dynamic = tf.shape(x)
|
||||
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
|
|
@ -1,31 +0,0 @@
|
|||
import tensorflow as tf
|
||||
|
||||
|
||||
def convert_tacotron2_to_tflite(model,
|
||||
output_path=None,
|
||||
experimental_converter=True):
|
||||
"""Convert Tensorflow Tacotron2 model to TFLite. Save a binary file if output_path is
|
||||
provided, else return TFLite model."""
|
||||
|
||||
concrete_function = model.inference_tflite.get_concrete_function()
|
||||
converter = tf.lite.TFLiteConverter.from_concrete_functions(
|
||||
[concrete_function])
|
||||
converter.experimental_new_converter = experimental_converter
|
||||
converter.optimizations = [tf.lite.Optimize.DEFAULT]
|
||||
converter.target_spec.supported_ops = [
|
||||
tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
|
||||
]
|
||||
tflite_model = converter.convert()
|
||||
print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.')
|
||||
if output_path is not None:
|
||||
# save the model binary if output_path is provided
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(tflite_model)
|
||||
return None
|
||||
return tflite_model
|
||||
|
||||
|
||||
def load_tflite_model(tflite_path):
|
||||
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
|
||||
tflite_model.allocate_tensors()
|
||||
return tflite_model
|
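For orientation, a minimal sketch of driving the interpreter returned by `load_tflite_model` above; the model path, the dummy input, and the assumption of a fully static input shape are placeholders, not code from this repository:

import numpy as np
import tensorflow as tf

# mirrors load_tflite_model(); "tacotron2.tflite" is a placeholder path
interpreter = tf.lite.Interpreter(model_path="tacotron2.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# dummy input built from the model's declared spec; models with dynamic
# sequence lengths would need interpreter.resize_tensor_input() first
dummy_input = np.zeros(input_details[0]['shape'], dtype=input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], dummy_input)
interpreter.invoke()
output = interpreter.get_tensor(output_details[0]['index'])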
train.py
|
@ -1,641 +0,0 @@
|
|||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from TTS.datasets.TTSDataset import MyDataset
|
||||
from distribute import (DistributedSampler, apply_gradient_allreduce,
|
||||
init_distributed, reduce_tensor)
|
||||
from TTS.layers.losses import TacotronLoss
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder,
|
||||
get_git_branch, set_init_dict,
|
||||
setup_model, KeepAverage, check_config)
|
||||
from TTS.utils.io import (save_best_model, save_checkpoint,
|
||||
load_config, copy_config_file)
|
||||
from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
|
||||
gradual_training_scheduler, set_weight_decay,
|
||||
setup_torch_training_env)
|
||||
from TTS.utils.tensorboard_logger import TensorboardLogger
|
||||
from TTS.utils.console_logger import ConsoleLogger
|
||||
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
|
||||
get_speakers
|
||||
from TTS.utils.synthesis import synthesis
|
||||
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
|
||||
from TTS.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.datasets.preprocess import load_meta_data
|
||||
from TTS.utils.radam import RAdam
|
||||
from TTS.utils.measures import alignment_diagonal_score
|
||||
|
||||
|
||||
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
||||
|
||||
|
||||
def setup_loader(ap, r, is_val=False, verbose=False):
|
||||
if is_val and not c.run_eval:
|
||||
loader = None
|
||||
else:
|
||||
dataset = MyDataset(
|
||||
r,
|
||||
c.text_cleaner,
|
||||
compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
|
||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||
ap=ap,
|
||||
tp=c.characters if 'characters' in c.keys() else None,
|
||||
batch_group_size=0 if is_val else c.batch_group_size *
|
||||
c.batch_size,
|
||||
min_seq_len=c.min_seq_len,
|
||||
max_seq_len=c.max_seq_len,
|
||||
phoneme_cache_path=c.phoneme_cache_path,
|
||||
use_phonemes=c.use_phonemes,
|
||||
phoneme_language=c.phoneme_language,
|
||||
enable_eos_bos=c.enable_eos_bos_chars,
|
||||
verbose=verbose)
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
loader = DataLoader(
|
||||
dataset,
|
||||
batch_size=c.eval_batch_size if is_val else c.batch_size,
|
||||
shuffle=False,
|
||||
collate_fn=dataset.collate_fn,
|
||||
drop_last=False,
|
||||
sampler=sampler,
|
||||
num_workers=c.num_val_loader_workers
|
||||
if is_val else c.num_loader_workers,
|
||||
pin_memory=False)
|
||||
return loader
|
||||
|
||||
|
||||
def format_data(data):
|
||||
if c.use_speaker_embedding:
|
||||
speaker_mapping = load_speaker_mapping(OUT_PATH)
|
||||
|
||||
# setup input data
|
||||
text_input = data[0]
|
||||
text_lengths = data[1]
|
||||
speaker_names = data[2]
|
||||
linear_input = data[3] if c.model in ["Tacotron"] else None
|
||||
mel_input = data[4]
|
||||
mel_lengths = data[5]
|
||||
stop_targets = data[6]
|
||||
avg_text_length = torch.mean(text_lengths.float())
|
||||
avg_spec_length = torch.mean(mel_lengths.float())
|
||||
|
||||
if c.use_speaker_embedding:
|
||||
speaker_ids = [
|
||||
speaker_mapping[speaker_name] for speaker_name in speaker_names
|
||||
]
|
||||
speaker_ids = torch.LongTensor(speaker_ids)
|
||||
else:
|
||||
speaker_ids = None
|
||||
|
||||
# set stop targets view, we predict a single stop token per iteration.
|
||||
stop_targets = stop_targets.view(text_input.shape[0],
|
||||
stop_targets.size(1) // c.r, -1)
|
||||
stop_targets = (stop_targets.sum(2) >
|
||||
0.0).unsqueeze(2).float().squeeze(2)
|
||||
|
||||
# dispatch data to GPU
|
||||
if use_cuda:
|
||||
text_input = text_input.cuda(non_blocking=True)
|
||||
text_lengths = text_lengths.cuda(non_blocking=True)
|
||||
mel_input = mel_input.cuda(non_blocking=True)
|
||||
mel_lengths = mel_lengths.cuda(non_blocking=True)
|
||||
linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
|
||||
stop_targets = stop_targets.cuda(non_blocking=True)
|
||||
if speaker_ids is not None:
|
||||
speaker_ids = speaker_ids.cuda(non_blocking=True)
|
||||
return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
|
||||
|
||||
|
||||
def train(model, criterion, optimizer, optimizer_st, scheduler,
|
||||
ap, global_step, epoch):
|
||||
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
|
||||
verbose=(epoch == 0))
|
||||
model.train()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
if use_cuda:
|
||||
batch_n_iter = int(
|
||||
len(data_loader.dataset) / (c.batch_size * num_gpus))
|
||||
else:
|
||||
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
|
||||
end_time = time.time()
|
||||
c_logger.print_train_start()
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
|
||||
loader_time = time.time() - end_time
|
||||
|
||||
global_step += 1
|
||||
|
||||
# setup lr
|
||||
if c.noam_schedule:
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
if optimizer_st:
|
||||
optimizer_st.zero_grad()
|
||||
|
||||
# forward pass model
|
||||
if c.bidirectional_decoder or c.double_decoder_consistency:
|
||||
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
|
||||
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
|
||||
else:
|
||||
decoder_output, postnet_output, alignments, stop_tokens = model(
|
||||
text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
|
||||
decoder_backward_output = None
|
||||
alignments_backward = None
|
||||
|
||||
# set the alignment lengths wrt reduction factor for guided attention
|
||||
if mel_lengths.max() % model.decoder.r != 0:
|
||||
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
|
||||
else:
|
||||
alignment_lengths = mel_lengths // model.decoder.r
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(postnet_output, decoder_output, mel_input,
|
||||
linear_input, stop_tokens, stop_targets,
|
||||
mel_lengths, decoder_backward_output,
|
||||
alignments, alignment_lengths, alignments_backward,
|
||||
text_lengths)
|
||||
|
||||
# backward pass
|
||||
loss_dict['loss'].backward()
|
||||
optimizer, current_lr = adam_weight_decay(optimizer)
|
||||
grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
|
||||
optimizer.step()
|
||||
|
||||
# compute alignment error (the lower, the better)
|
||||
align_error = 1 - alignment_diagonal_score(alignments)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
# backward pass and grad norm check for the stopnet loss
|
||||
if c.separate_stopnet:
|
||||
loss_dict['stopnet_loss'].backward()
|
||||
optimizer_st, _ = adam_weight_decay(optimizer_st)
|
||||
grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
|
||||
optimizer_st.step()
|
||||
else:
|
||||
grad_norm_st = 0
|
||||
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
|
||||
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
|
||||
loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus)
|
||||
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
update_train_values['avg_loader_time'] = loader_time
|
||||
update_train_values['avg_step_time'] = step_time
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
# print training progress
|
||||
if global_step % c.print_step == 0:
|
||||
c_logger.print_train_step(batch_n_iter, num_iter, global_step,
|
||||
avg_spec_length, avg_text_length,
|
||||
step_time, loader_time, current_lr,
|
||||
loss_dict, keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Plot Training Iter Stats
|
||||
# reduce TB load
|
||||
if global_step % c.tb_plot_step == 0:
|
||||
iter_stats = {
|
||||
"lr": current_lr,
|
||||
"grad_norm": grad_norm,
|
||||
"grad_norm_st": grad_norm_st,
|
||||
"step_time": step_time
|
||||
}
|
||||
iter_stats.update(loss_dict)
|
||||
tb_logger.tb_train_iter_stats(global_step, iter_stats)
|
||||
|
||||
if global_step % c.save_step == 0:
|
||||
if c.checkpoint:
|
||||
# save model
|
||||
save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
|
||||
optimizer_st=optimizer_st,
|
||||
model_loss=loss_dict['postnet_loss'])
|
||||
|
||||
# Diagnostic visualizations
|
||||
const_spec = postnet_output[0].data.cpu().numpy()
|
||||
gt_spec = linear_input[0].data.cpu().numpy() if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else mel_input[0].data.cpu().numpy()
|
||||
align_img = alignments[0].data.cpu().numpy()
|
||||
|
||||
figures = {
|
||||
"prediction": plot_spectrogram(const_spec, ap),
|
||||
"ground_truth": plot_spectrogram(gt_spec, ap),
|
||||
"alignment": plot_alignment(align_img),
|
||||
}
|
||||
|
||||
if c.bidirectional_decoder or c.double_decoder_consistency:
|
||||
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())
|
||||
|
||||
tb_logger.tb_train_figures(global_step, figures)
|
||||
|
||||
# Sample audio
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
train_audio = ap.inv_spectrogram(const_spec.T)
|
||||
else:
|
||||
train_audio = ap.inv_melspectrogram(const_spec.T)
|
||||
tb_logger.tb_train_audios(global_step,
|
||||
{'TrainAudio': train_audio},
|
||||
c.audio["sample_rate"])
|
||||
end_time = time.time()
|
||||
|
||||
# print epoch stats
|
||||
c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
|
||||
|
||||
# Plot Epoch Stats
|
||||
if args.rank == 0:
|
||||
epoch_stats = {"epoch_time": epoch_time}
|
||||
epoch_stats.update(keep_avg.avg_values)
|
||||
tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
|
||||
if c.tb_model_param_stats:
|
||||
tb_logger.tb_model_weights(model, global_step)
|
||||
return keep_avg.avg_values, global_step
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate(model, criterion, ap, global_step, epoch):
|
||||
data_loader = setup_loader(ap, model.decoder.r, is_val=True)
|
||||
model.eval()
|
||||
epoch_time = 0
|
||||
keep_avg = KeepAverage()
|
||||
c_logger.print_eval_start()
|
||||
if data_loader is not None:
|
||||
for num_iter, data in enumerate(data_loader):
|
||||
start_time = time.time()
|
||||
|
||||
# format data
|
||||
text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
|
||||
assert mel_input.shape[1] % model.decoder.r == 0
|
||||
|
||||
# forward pass model
|
||||
if c.bidirectional_decoder or c.double_decoder_consistency:
|
||||
decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
|
||||
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
|
||||
else:
|
||||
decoder_output, postnet_output, alignments, stop_tokens = model(
|
||||
text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
|
||||
decoder_backward_output = None
|
||||
alignments_backward = None
|
||||
|
||||
# set the alignment lengths wrt reduction factor for guided attention
|
||||
if mel_lengths.max() % model.decoder.r != 0:
|
||||
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
|
||||
else:
|
||||
alignment_lengths = mel_lengths // model.decoder.r
|
||||
|
||||
# compute loss
|
||||
loss_dict = criterion(postnet_output, decoder_output, mel_input,
|
||||
linear_input, stop_tokens, stop_targets,
|
||||
mel_lengths, decoder_backward_output,
|
||||
alignments, alignment_lengths, alignments_backward,
|
||||
text_lengths)
|
||||
|
||||
# step time
|
||||
step_time = time.time() - start_time
|
||||
epoch_time += step_time
|
||||
|
||||
# compute alignment score
|
||||
align_error = 1 - alignment_diagonal_score(alignments)
|
||||
loss_dict['align_error'] = align_error
|
||||
|
||||
# aggregate losses from processes
|
||||
if num_gpus > 1:
|
||||
loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
|
||||
loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
|
||||
if c.stopnet:
|
||||
loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
|
||||
|
||||
# detach loss values
|
||||
loss_dict_new = dict()
|
||||
for key, value in loss_dict.items():
|
||||
if isinstance(value, (int, float)):
|
||||
loss_dict_new[key] = value
|
||||
else:
|
||||
loss_dict_new[key] = value.item()
|
||||
loss_dict = loss_dict_new
|
||||
|
||||
# update avg stats
|
||||
update_train_values = dict()
|
||||
for key, value in loss_dict.items():
|
||||
update_train_values['avg_' + key] = value
|
||||
keep_avg.update_values(update_train_values)
|
||||
|
||||
if c.print_eval:
|
||||
c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
|
||||
|
||||
if args.rank == 0:
|
||||
# Diagnostic visualizations
|
||||
idx = np.random.randint(mel_input.shape[0])
|
||||
const_spec = postnet_output[idx].data.cpu().numpy()
|
||||
gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
|
||||
"Tacotron", "TacotronGST"
|
||||
] else mel_input[idx].data.cpu().numpy()
|
||||
align_img = alignments[idx].data.cpu().numpy()
|
||||
|
||||
eval_figures = {
|
||||
"prediction": plot_spectrogram(const_spec, ap),
|
||||
"ground_truth": plot_spectrogram(gt_spec, ap),
|
||||
"alignment": plot_alignment(align_img)
|
||||
}
|
||||
|
||||
# Sample audio
|
||||
if c.model in ["Tacotron", "TacotronGST"]:
|
||||
eval_audio = ap.inv_spectrogram(const_spec.T)
|
||||
else:
|
||||
eval_audio = ap.inv_melspectrogram(const_spec.T)
|
||||
tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
|
||||
c.audio["sample_rate"])
|
||||
|
||||
# Plot Validation Stats
|
||||
|
||||
if c.bidirectional_decoder or c.double_decoder_consistency:
|
||||
align_b_img = alignments_backward[idx].data.cpu().numpy()
|
||||
eval_figures['alignment2'] = plot_alignment(align_b_img)
|
||||
tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
|
||||
tb_logger.tb_eval_figures(global_step, eval_figures)
|
||||
|
||||
if args.rank == 0 and epoch > c.test_delay_epochs:
|
||||
if c.test_sentences_file is None:
|
||||
test_sentences = [
|
||||
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
||||
"Be a voice, not an echo.",
|
||||
"I'm sorry Dave. I'm afraid I can't do that.",
|
||||
"This cake is great. It's so delicious and moist.",
|
||||
"Prior to November 22, 1963."
|
||||
]
|
||||
else:
|
||||
with open(c.test_sentences_file, "r") as f:
|
||||
test_sentences = [s.strip() for s in f.readlines()]
|
||||
|
||||
# test sentences
|
||||
test_audios = {}
|
||||
test_figures = {}
|
||||
print(" | > Synthesizing test sentences")
|
||||
speaker_id = 0 if c.use_speaker_embedding else None
|
||||
style_wav = c.get("style_wav_for_test")
|
||||
for idx, test_sentence in enumerate(test_sentences):
|
||||
try:
|
||||
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
|
||||
model,
|
||||
test_sentence,
|
||||
c,
|
||||
use_cuda,
|
||||
ap,
|
||||
speaker_id=speaker_id,
|
||||
style_wav=style_wav,
|
||||
truncated=False,
|
||||
enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument
|
||||
use_griffin_lim=True,
|
||||
do_trim_silence=False)
|
||||
|
||||
file_path = os.path.join(AUDIO_PATH, str(global_step))
|
||||
os.makedirs(file_path, exist_ok=True)
|
||||
file_path = os.path.join(file_path,
|
||||
"TestSentence_{}.wav".format(idx))
|
||||
ap.save_wav(wav, file_path)
|
||||
test_audios['{}-audio'.format(idx)] = wav
|
||||
test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
|
||||
postnet_output, ap)
|
||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||
alignment)
|
||||
except:
|
||||
print(" !! Error creating Test Sentence -", idx)
|
||||
traceback.print_exc()
|
||||
tb_logger.tb_test_audios(global_step, test_audios,
|
||||
c.audio['sample_rate'])
|
||||
tb_logger.tb_test_figures(global_step, test_figures)
|
||||
return keep_avg.avg_values
|
||||
|
||||
|
||||
# FIXME: move args definition/parsing inside of main?
|
||||
def main(args): # pylint: disable=redefined-outer-name
|
||||
# pylint: disable=global-variable-undefined
|
||||
global meta_data_train, meta_data_eval, symbols, phonemes
|
||||
# Audio processor
|
||||
ap = AudioProcessor(**c.audio)
|
||||
if 'characters' in c.keys():
|
||||
symbols, phonemes = make_symbols(**c.characters)
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
init_distributed(args.rank, num_gpus, args.group_id,
|
||||
c.distributed["backend"], c.distributed["url"])
|
||||
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
|
||||
|
||||
# load data instances
|
||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
|
||||
|
||||
# parse speakers
|
||||
if c.use_speaker_embedding:
|
||||
speakers = get_speakers(meta_data_train)
|
||||
if args.restore_path:
|
||||
prev_out_path = os.path.dirname(args.restore_path)
|
||||
speaker_mapping = load_speaker_mapping(prev_out_path)
|
||||
assert all([speaker in speaker_mapping
|
||||
for speaker in speakers]), "As of now, you cannot " \
|
||||
"introduce new speakers to " \
|
||||
"a previously trained model."
|
||||
else:
|
||||
speaker_mapping = {name: i for i, name in enumerate(speakers)}
|
||||
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
||||
num_speakers = len(speaker_mapping)
|
||||
print("Training with {} speakers: {}".format(num_speakers,
|
||||
", ".join(speakers)))
|
||||
else:
|
||||
num_speakers = 0
|
||||
|
||||
model = setup_model(num_chars, num_speakers, c)
|
||||
|
||||
params = set_weight_decay(model, c.wd)
|
||||
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
|
||||
if c.stopnet and c.separate_stopnet:
|
||||
optimizer_st = RAdam(model.decoder.stopnet.parameters(),
|
||||
lr=c.lr,
|
||||
weight_decay=0)
|
||||
else:
|
||||
optimizer_st = None
|
||||
|
||||
# setup criterion
|
||||
criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
|
||||
|
||||
if args.restore_path:
|
||||
checkpoint = torch.load(args.restore_path, map_location='cpu')
|
||||
try:
|
||||
# TODO: fix optimizer init, model.cuda() needs to be called before
|
||||
# optimizer restore
|
||||
# optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
if c.reinit_layers:
|
||||
raise RuntimeError
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
except:
|
||||
print(" > Partial model initialization.")
|
||||
model_dict = model.state_dict()
|
||||
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
||||
model.load_state_dict(model_dict)
|
||||
del model_dict
|
||||
for group in optimizer.param_groups:
|
||||
group['lr'] = c.lr
|
||||
print(" > Model restored from step %d" % checkpoint['step'],
|
||||
flush=True)
|
||||
args.restore_step = checkpoint['step']
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
||||
if use_cuda:
|
||||
model.cuda()
|
||||
criterion.cuda()
|
||||
|
||||
# DISTRIBUTED
|
||||
if num_gpus > 1:
|
||||
model = apply_gradient_allreduce(model)
|
||||
|
||||
if c.noam_schedule:
|
||||
scheduler = NoamLR(optimizer,
|
||||
warmup_steps=c.warmup_steps,
|
||||
last_epoch=args.restore_step - 1)
|
||||
else:
|
||||
scheduler = None
|
||||
|
||||
num_params = count_parameters(model)
|
||||
print("\n > Model has {} parameters".format(num_params), flush=True)
|
||||
|
||||
if 'best_loss' not in locals():
|
||||
best_loss = float('inf')
|
||||
|
||||
global_step = args.restore_step
|
||||
for epoch in range(0, c.epochs):
|
||||
c_logger.print_epoch_start(epoch, c.epochs)
|
||||
# set gradual training
|
||||
if c.gradual_training is not None:
|
||||
r, c.batch_size = gradual_training_scheduler(global_step, c)
|
||||
c.r = r
|
||||
model.decoder.set_r(r)
|
||||
if c.bidirectional_decoder:
|
||||
model.decoder_backward.set_r(r)
|
||||
print("\n > Number of output frames:", model.decoder.r)
|
||||
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
|
||||
optimizer_st, scheduler, ap,
|
||||
global_step, epoch)
|
||||
eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
|
||||
c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
|
||||
target_loss = train_avg_loss_dict['avg_postnet_loss']
|
||||
if c.run_eval:
|
||||
target_loss = eval_avg_loss_dict['avg_postnet_loss']
|
||||
best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
|
||||
OUT_PATH)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--continue_path',
|
||||
type=str,
|
||||
help='Training output folder to continue a previous training run. If set, "config_path" is ignored.',
|
||||
default='',
|
||||
required='--config_path' not in sys.argv)
|
||||
parser.add_argument(
|
||||
'--restore_path',
|
||||
type=str,
|
||||
help='Model file to be restored. Use to finetune a model.',
|
||||
default='')
|
||||
parser.add_argument(
|
||||
'--config_path',
|
||||
type=str,
|
||||
help='Path to config file for training.',
|
||||
required='--continue_path' not in sys.argv
|
||||
)
|
||||
parser.add_argument('--debug',
|
||||
type=bool,
|
||||
default=False,
|
||||
help='Do not verify commit integrity to run training.')
|
||||
|
||||
# DISTRIBUTED
|
||||
parser.add_argument(
|
||||
'--rank',
|
||||
type=int,
|
||||
default=0,
|
||||
help='DISTRIBUTED: process rank for distributed training.')
|
||||
parser.add_argument('--group_id',
|
||||
type=str,
|
||||
default="",
|
||||
help='DISTRIBUTED: process group id.')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.continue_path != '':
|
||||
args.output_path = args.continue_path
|
||||
args.config_path = os.path.join(args.continue_path, 'config.json')
|
||||
list_of_files = glob.glob(args.continue_path + "/*.pth.tar")  # all checkpoints saved in the run folder
|
||||
latest_model_file = max(list_of_files, key=os.path.getctime)
|
||||
args.restore_path = latest_model_file
|
||||
print(f" > Training continues for {args.restore_path}")
|
||||
|
||||
# setup output paths and read configs
|
||||
c = load_config(args.config_path)
|
||||
check_config(c)
|
||||
_ = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
OUT_PATH = args.continue_path
|
||||
if args.continue_path == '':
|
||||
OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)
|
||||
|
||||
AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
|
||||
|
||||
c_logger = ConsoleLogger()
|
||||
|
||||
if args.rank == 0:
|
||||
os.makedirs(AUDIO_PATH, exist_ok=True)
|
||||
new_fields = {}
|
||||
if args.restore_path:
|
||||
new_fields["restore_path"] = args.restore_path
|
||||
new_fields["github_branch"] = get_git_branch()
|
||||
copy_config_file(args.config_path,
|
||||
os.path.join(OUT_PATH, 'config.json'), new_fields)
|
||||
os.chmod(AUDIO_PATH, 0o775)
|
||||
os.chmod(OUT_PATH, 0o775)
|
||||
|
||||
LOG_DIR = OUT_PATH
|
||||
tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')
|
||||
|
||||
# write model desc to tensorboard
|
||||
tb_logger.tb_add_text('model-description', c['run_description'], 0)
|
||||
|
||||
try:
|
||||
main(args)
|
||||
except KeyboardInterrupt:
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
try:
|
||||
sys.exit(0)
|
||||
except SystemExit:
|
||||
os._exit(0) # pylint: disable=protected-access
|
||||
except Exception: # pylint: disable=broad-except
|
||||
remove_experiment_folder(OUT_PATH)
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
|
@ -1,29 +0,0 @@
|
|||
This folder contains a symlink called TTS to the parent folder:
|
||||
|
||||
lrwxr-xr-x TTS -> ..
|
||||
|
||||
This is used to appease the distribute/setuptools gods. When the project was
|
||||
initially set up, the repository folder itself was considered a namespace, and
|
||||
development was done with `sys.path` hacks. This means if you tried to install
|
||||
TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of
|
||||
`TTS.models`, `TTS.utils`...
|
||||
|
||||
Installing TTS would then pollute the package namespace with generic names like
|
||||
those above. In order to make things installable in both install and development
|
||||
modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed
|
||||
to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect
|
||||
using `package_dir` in `setup.py` is not enough because it breaks the editable
|
||||
installation, which can only handle the simplest of `package_dir` redirects.
|
||||
|
||||
Our solution is to use a symlink in order to add the extra `TTS` namespace. In
|
||||
`setup.py`, we only look for packages inside `tts_namespace` (this folder),
|
||||
which contains a symlink called TTS pointing to the repository root. The final
|
||||
result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`...
|
||||
|
||||
With this hack, `pip install -e` will then add a symlink to the `tts_namespace`
|
||||
in your `site-packages` folder, which works properly. It's important not to add
|
||||
anything else in this folder because it will pollute the package namespace when
|
||||
installing the project.
|
||||
|
||||
This does not work if you check out your project on a filesystem that does not
|
||||
support symlinks.
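For reference, the redirect described above boils down to something like the following sketch of a `setup.py` (the name and version fields are placeholders, not the project's actual values):

import setuptools

setuptools.setup(
    name="TTS",
    version="0.0.0",  # placeholder
    # search only under tts_namespace/; the TTS -> .. symlink lets
    # find_packages discover TTS.models, TTS.utils, ...
    packages=setuptools.find_packages("tts_namespace"),
    package_dir={"": "tts_namespace"},
)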
|
|
@ -1 +0,0 @@
|
|||
..
|
Binary file not shown.
Binary file not shown.
utils/audio.py
|
@ -1,356 +0,0 @@
|
|||
import librosa
|
||||
import soundfile as sf
|
||||
import numpy as np
|
||||
import scipy.io
|
||||
import scipy.signal
|
||||
|
||||
from TTS.utils.data import StandardScaler
|
||||
|
||||
|
||||
class AudioProcessor(object):
|
||||
def __init__(self,
|
||||
sample_rate=None,
|
||||
num_mels=None,
|
||||
min_level_db=None,
|
||||
frame_shift_ms=None,
|
||||
frame_length_ms=None,
|
||||
hop_length=None,
|
||||
win_length=None,
|
||||
ref_level_db=None,
|
||||
fft_size=1024,
|
||||
power=None,
|
||||
preemphasis=0.0,
|
||||
signal_norm=None,
|
||||
symmetric_norm=None,
|
||||
max_norm=None,
|
||||
mel_fmin=None,
|
||||
mel_fmax=None,
|
||||
spec_gain=20,
|
||||
stft_pad_mode='reflect',
|
||||
clip_norm=True,
|
||||
griffin_lim_iters=None,
|
||||
do_trim_silence=False,
|
||||
trim_db=60,
|
||||
do_sound_norm=False,
|
||||
stats_path=None,
|
||||
**_):
|
||||
|
||||
print(" > Setting up Audio Processor...")
|
||||
# setup class attributes
|
||||
self.sample_rate = sample_rate
|
||||
self.num_mels = num_mels
|
||||
self.min_level_db = min_level_db or 0
|
||||
self.frame_shift_ms = frame_shift_ms
|
||||
self.frame_length_ms = frame_length_ms
|
||||
self.ref_level_db = ref_level_db
|
||||
self.fft_size = fft_size
|
||||
self.power = power
|
||||
self.preemphasis = preemphasis
|
||||
self.griffin_lim_iters = griffin_lim_iters
|
||||
self.signal_norm = signal_norm
|
||||
self.symmetric_norm = symmetric_norm
|
||||
self.mel_fmin = mel_fmin or 0
|
||||
self.mel_fmax = mel_fmax
|
||||
self.spec_gain = float(spec_gain)
|
||||
self.stft_pad_mode = stft_pad_mode
|
||||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||
self.clip_norm = clip_norm
|
||||
self.do_trim_silence = do_trim_silence
|
||||
self.trim_db = trim_db
|
||||
self.do_sound_norm = do_sound_norm
|
||||
self.stats_path = stats_path
|
||||
# setup stft parameters
|
||||
if hop_length is None:
|
||||
# compute stft parameters from given time values
|
||||
self.hop_length, self.win_length = self._stft_parameters()
|
||||
else:
|
||||
# use stft parameters from config file
|
||||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
||||
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
|
||||
members = vars(self)
|
||||
for key, value in members.items():
|
||||
print(" | > {}:{}".format(key, value))
|
||||
# create spectrogram utils
|
||||
self.mel_basis = self._build_mel_basis()
|
||||
self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
|
||||
# setup scaler
|
||||
if stats_path:
|
||||
mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
|
||||
self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
|
||||
self.signal_norm = True
|
||||
self.max_norm = None
|
||||
self.clip_norm = None
|
||||
self.symmetric_norm = None
|
||||
|
||||
### setting up the parameters ###
|
||||
def _build_mel_basis(self, ):
|
||||
if self.mel_fmax is not None:
|
||||
assert self.mel_fmax <= self.sample_rate // 2
|
||||
return librosa.filters.mel(
|
||||
self.sample_rate,
|
||||
self.fft_size,
|
||||
n_mels=self.num_mels,
|
||||
fmin=self.mel_fmin,
|
||||
fmax=self.mel_fmax)
|
||||
|
||||
def _stft_parameters(self, ):
|
||||
"""Compute necessary stft parameters with given time values"""
|
||||
factor = self.frame_length_ms / self.frame_shift_ms
|
||||
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
|
||||
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
|
||||
win_length = int(hop_length * factor)
|
||||
return hop_length, win_length
|
||||
|
||||
### normalization ###
|
||||
def _normalize(self, S):
|
||||
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
|
||||
#pylint: disable=no-else-return
|
||||
S = S.copy()
|
||||
if self.signal_norm:
|
||||
# mean-var scaling
|
||||
if hasattr(self, 'mel_scaler'):
|
||||
if S.shape[0] == self.num_mels:
|
||||
return self.mel_scaler.transform(S.T).T
|
||||
elif S.shape[0] == self.fft_size // 2 + 1:  # linear spec has fft_size // 2 + 1 bins
|
||||
return self.linear_scaler.transform(S.T).T
|
||||
else:
|
||||
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
|
||||
# range normalization
|
||||
S -= self.ref_level_db # discard certain range of DB assuming it is air noise
|
||||
S_norm = ((S - self.min_level_db) / (-self.min_level_db))
|
||||
if self.symmetric_norm:
|
||||
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
S_norm = self.max_norm * S_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, 0, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
return S
|
||||
|
||||
def _denormalize(self, S):
|
||||
"""denormalize values"""
|
||||
#pylint: disable=no-else-return
|
||||
S_denorm = S.copy()
|
||||
if self.signal_norm:
|
||||
# mean-var scaling
|
||||
if hasattr(self, 'mel_scaler'):
|
||||
if S_denorm.shape[0] == self.num_mels:
|
||||
return self.mel_scaler.inverse_transform(S_denorm.T).T
|
||||
elif S_denorm.shape[0] == self.fft_size // 2 + 1:  # linear spec has fft_size // 2 + 1 bins
|
||||
return self.linear_scaler.inverse_transform(S_denorm.T).T
|
||||
else:
|
||||
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
|
||||
if self.symmetric_norm:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
|
||||
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
|
||||
return S_denorm + self.ref_level_db
|
||||
else:
|
||||
if self.clip_norm:
|
||||
S_denorm = np.clip(S_denorm, 0, self.max_norm)
|
||||
S_denorm = (S_denorm * -self.min_level_db /
|
||||
self.max_norm) + self.min_level_db
|
||||
return S_denorm + self.ref_level_db
|
||||
else:
|
||||
return S_denorm
|
||||
|
||||
### Mean-STD scaling ###
|
||||
def load_stats(self, stats_path):
|
||||
stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg
|
||||
mel_mean = stats['mel_mean']
|
||||
mel_std = stats['mel_std']
|
||||
linear_mean = stats['linear_mean']
|
||||
linear_std = stats['linear_std']
|
||||
stats_config = stats['audio_config']
|
||||
# check all audio parameters used for computing stats
|
||||
skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power']
|
||||
for key in stats_config.keys():
|
||||
if key in skip_parameters:
|
||||
continue
|
||||
assert stats_config[key] == self.__dict__[key],\
|
||||
f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
|
||||
return mel_mean, mel_std, linear_mean, linear_std, stats_config
|
||||
|
||||
# pylint: disable=attribute-defined-outside-init
|
||||
def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std):
|
||||
self.mel_scaler = StandardScaler()
|
||||
self.mel_scaler.set_stats(mel_mean, mel_std)
|
||||
self.linear_scaler = StandardScaler()
|
||||
self.linear_scaler.set_stats(linear_mean, linear_std)
|
||||
|
||||
### DB and AMP conversion ###
|
||||
# pylint: disable=no-self-use
|
||||
def _amp_to_db(self, x):
|
||||
return self.spec_gain * np.log10(np.maximum(1e-5, x))
|
||||
|
||||
# pylint: disable=no-self-use
|
||||
def _db_to_amp(self, x):
|
||||
return np.power(10.0, x / self.spec_gain)
|
||||
|
||||
### Preemphasis ###
|
||||
def apply_preemphasis(self, x):
|
||||
if self.preemphasis == 0:
|
||||
raise RuntimeError(" [!] Preemphasis is set 0.0.")
|
||||
return scipy.signal.lfilter([1, -self.preemphasis], [1], x)
|
||||
|
||||
def apply_inv_preemphasis(self, x):
|
||||
if self.preemphasis == 0:
|
||||
raise RuntimeError(" [!] Preemphasis is set 0.0.")
|
||||
return scipy.signal.lfilter([1], [1, -self.preemphasis], x)
|
||||
|
||||
### SPECTROGRAMs ###
|
||||
def _linear_to_mel(self, spectrogram):
|
||||
return np.dot(self.mel_basis, spectrogram)
|
||||
|
||||
def _mel_to_linear(self, mel_spec):
|
||||
return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))
|
||||
|
||||
def spectrogram(self, y):
|
||||
if self.preemphasis != 0:
|
||||
D = self._stft(self.apply_preemphasis(y))
|
||||
else:
|
||||
D = self._stft(y)
|
||||
S = self._amp_to_db(np.abs(D))
|
||||
return self._normalize(S)
|
||||
|
||||
def melspectrogram(self, y):
|
||||
if self.preemphasis != 0:
|
||||
D = self._stft(self.apply_preemphasis(y))
|
||||
else:
|
||||
D = self._stft(y)
|
||||
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
|
||||
return self._normalize(S)
|
||||
|
||||
def inv_spectrogram(self, spectrogram):
|
||||
"""Converts spectrogram to waveform using librosa"""
|
||||
S = self._denormalize(spectrogram)
|
||||
S = self._db_to_amp(S)
|
||||
# Reconstruct phase
|
||||
if self.preemphasis != 0:
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def inv_melspectrogram(self, mel_spectrogram):
|
||||
'''Converts melspectrogram to waveform using librosa'''
|
||||
D = self._denormalize(mel_spectrogram)
|
||||
S = self._db_to_amp(D)
|
||||
S = self._mel_to_linear(S) # Convert back to linear
|
||||
if self.preemphasis != 0:
|
||||
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
|
||||
return self._griffin_lim(S**self.power)
|
||||
|
||||
def out_linear_to_mel(self, linear_spec):
|
||||
S = self._denormalize(linear_spec)
|
||||
S = self._db_to_amp(S)
|
||||
S = self._linear_to_mel(np.abs(S))
|
||||
S = self._amp_to_db(S)
|
||||
mel = self._normalize(S)
|
||||
return mel
|
||||
|
||||
### STFT and ISTFT ###
|
||||
def _stft(self, y):
|
||||
return librosa.stft(
|
||||
y=y,
|
||||
n_fft=self.fft_size,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
pad_mode=self.stft_pad_mode,
|
||||
)
|
||||
|
||||
def _istft(self, y):
|
||||
return librosa.istft(
|
||||
y, hop_length=self.hop_length, win_length=self.win_length)
|
||||
|
||||
def _griffin_lim(self, S):
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
y = self._istft(S_complex * angles)
|
||||
for _ in range(self.griffin_lim_iters):
|
||||
angles = np.exp(1j * np.angle(self._stft(y)))
|
||||
y = self._istft(S_complex * angles)
|
||||
return y
|
||||
|
||||
def compute_stft_paddings(self, x, pad_sides=1):
|
||||
'''compute right padding (final frame) or both sides padding (first and final frames)
|
||||
'''
|
||||
assert pad_sides in (1, 2)
|
||||
pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
|
||||
if pad_sides == 1:
|
||||
return 0, pad
|
||||
return pad // 2, pad // 2 + pad % 2
|
||||
|
||||
### Audio Processing ###
|
||||
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
|
||||
window_length = int(self.sample_rate * min_silence_sec)
|
||||
hop_length = int(window_length / 4)
|
||||
threshold = self._db_to_amp(threshold_db)
|
||||
for x in range(hop_length, len(wav) - window_length, hop_length):
|
||||
if np.max(wav[x:x + window_length]) < threshold:
|
||||
return x + hop_length
|
||||
return len(wav)
|
||||
|
||||
def trim_silence(self, wav):
|
||||
""" Trim silent parts with a threshold and 0.01 sec margin """
|
||||
margin = int(self.sample_rate * 0.01)
|
||||
wav = wav[margin:-margin]
|
||||
return librosa.effects.trim(
|
||||
wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
|
||||
|
||||
@staticmethod
|
||||
def sound_norm(x):
|
||||
return x / abs(x).max() * 0.9
|
||||
|
||||
### save and load ###
|
||||
def load_wav(self, filename, sr=None):
|
||||
if sr is None:
|
||||
x, sr = sf.read(filename)
|
||||
else:
|
||||
x, sr = librosa.load(filename, sr=sr)
|
||||
if self.do_trim_silence:
|
||||
try:
|
||||
x = self.trim_silence(x)
|
||||
except ValueError:
|
||||
print(f' [!] File cannot be trimmed for silence - {filename}')
|
||||
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
|
||||
if self.do_sound_norm:
|
||||
x = self.sound_norm(x)
|
||||
return x
|
||||
|
||||
def save_wav(self, wav, path):
|
||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
|
||||
|
||||
@staticmethod
|
||||
def mulaw_encode(wav, qc):
|
||||
mu = 2 ** qc - 1
|
||||
# wav_abs = np.minimum(np.abs(wav), 1.0)
|
||||
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
|
||||
# Quantize signal to the specified number of levels.
|
||||
signal = (signal + 1) / 2 * mu + 0.5
|
||||
return np.floor(signal)
|
||||
|
||||
@staticmethod
|
||||
def mulaw_decode(wav, qc):
|
||||
"""Recovers waveform from quantized values."""
|
||||
mu = 2 ** qc - 1
|
||||
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
|
||||
return x
|
||||
|
||||
|
||||
@staticmethod
|
||||
def encode_16bits(x):
|
||||
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
|
||||
|
||||
@staticmethod
|
||||
def quantize(x, bits):
|
||||
return (x + 1.) * (2**bits - 1) / 2
|
||||
|
||||
@staticmethod
|
||||
def dequantize(x, bits):
|
||||
return 2 * x / (2**bits - 1) - 1
|
|
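To illustrate how the mu-law helpers in `AudioProcessor` pair up, a small round-trip sketch (the 10-bit setting, the toy signal, and the rescaling step reflect typical vocoder usage and are assumptions, not code from this file):

import numpy as np

qc = 10                      # assumed bit depth
mu = 2 ** qc - 1
wav = np.sin(np.linspace(0, 2 * np.pi, 16)).astype(np.float32)  # toy signal in [-1, 1]

levels = AudioProcessor.mulaw_encode(wav, qc)   # integer levels in [0, mu]
companded = 2 * levels / mu - 1                 # rescale to [-1, 1] before decoding
recovered = AudioProcessor.mulaw_decode(companded, qc)
# recovered approximates wav up to quantization error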
@ -1,95 +0,0 @@
|
|||
import datetime
|
||||
from TTS.utils.io import AttrDict
|
||||
|
||||
|
||||
tcolors = AttrDict({
|
||||
'OKBLUE': '\033[94m',
|
||||
'HEADER': '\033[95m',
|
||||
'OKGREEN': '\033[92m',
|
||||
'WARNING': '\033[93m',
|
||||
'FAIL': '\033[91m',
|
||||
'ENDC': '\033[0m',
|
||||
'BOLD': '\033[1m',
|
||||
'UNDERLINE': '\033[4m'
|
||||
})
|
||||
|
||||
|
||||
class ConsoleLogger():
|
||||
def __init__(self):
|
||||
# TODO: color code for value changes
|
||||
# use these to compare values between iterations
|
||||
self.old_train_loss_dict = None
|
||||
self.old_epoch_loss_dict = None
|
||||
self.old_eval_loss_dict = None
|
||||
|
||||
# pylint: disable=no-self-use
|
||||
def get_time(self):
|
||||
now = datetime.datetime.now()
|
||||
return now.strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
def print_epoch_start(self, epoch, max_epoch):
|
||||
print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
|
||||
epoch, max_epoch, tcolors.ENDC),
|
||||
flush=True)
|
||||
|
||||
def print_train_start(self):
|
||||
print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
|
||||
|
||||
def print_train_step(self, batch_steps, step, global_step, avg_spec_length,
|
||||
avg_text_length, step_time, loader_time, lr,
|
||||
loss_dict, avg_loss_dict):
|
||||
indent = " | > "
|
||||
print()
|
||||
log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
|
||||
tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
|
||||
for key, value in loss_dict.items():
|
||||
# print the avg value if given
|
||||
if f'avg_{key}' in avg_loss_dict.keys():
|
||||
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
|
||||
else:
|
||||
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
|
||||
log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\
|
||||
f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
|
||||
print(log_text, flush=True)
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def print_train_epoch_end(self, global_step, epoch, epoch_time,
|
||||
print_dict):
|
||||
indent = " | > "
|
||||
log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
|
||||
for key, value in print_dict.items():
|
||||
log_text += "{}{}: {:.5f}\n".format(indent, key, value)
|
||||
print(log_text, flush=True)
|
||||
|
||||
def print_eval_start(self):
|
||||
print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")
|
||||
|
||||
def print_eval_step(self, step, loss_dict, avg_loss_dict):
|
||||
indent = " | > "
|
||||
print()
|
||||
log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
|
||||
for key, value in loss_dict.items():
|
||||
# print the avg value if given
|
||||
if f'avg_{key}' in avg_loss_dict.keys():
|
||||
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
|
||||
else:
|
||||
log_text += "{}{}: {:.5f} \n".format(indent, key, value)
|
||||
print(log_text, flush=True)
|
||||
|
||||
def print_epoch_end(self, epoch, avg_loss_dict):
|
||||
indent = " | > "
|
||||
log_text = " {}--> EVAL PERFORMANCE{}\n".format(
|
||||
tcolors.BOLD, tcolors.ENDC)
|
||||
for key, value in avg_loss_dict.items():
|
||||
# print the avg value if given
|
||||
color = tcolors.FAIL
|
||||
sign = '+'
|
||||
diff = 0
|
||||
if self.old_eval_loss_dict is not None:
|
||||
diff = value - self.old_eval_loss_dict[key]
|
||||
if diff <= 0:
|
||||
color = tcolors.OKGREEN
|
||||
sign = ''
|
||||
log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
|
||||
self.old_eval_loss_dict = avg_loss_dict
|
||||
print(log_text, flush=True)
|
|
@ -1,77 +0,0 @@
|
|||
import numpy as np
|
||||
|
||||
|
||||
def _pad_data(x, length):
|
||||
_pad = 0
|
||||
assert x.ndim == 1
|
||||
return np.pad(
|
||||
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
|
||||
|
||||
|
||||
def prepare_data(inputs):
|
||||
max_len = max((len(x) for x in inputs))
|
||||
return np.stack([_pad_data(x, max_len) for x in inputs])
|
||||
|
||||
|
||||
def _pad_tensor(x, length):
|
||||
_pad = 0.
|
||||
assert x.ndim == 2
|
||||
x = np.pad(
|
||||
x, [[0, 0], [0, length - x.shape[1]]],
|
||||
mode='constant',
|
||||
constant_values=_pad)
|
||||
return x
|
||||
|
||||
|
||||
def prepare_tensor(inputs, out_steps):
|
||||
max_len = max((x.shape[1] for x in inputs))
|
||||
remainder = max_len % out_steps
|
||||
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
|
||||
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
|
||||
|
||||
|
||||
def _pad_stop_target(x, length):
|
||||
_pad = 0.
|
||||
assert x.ndim == 1
|
||||
return np.pad(
|
||||
x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
|
||||
|
||||
|
||||
def prepare_stop_target(inputs, out_steps):
|
||||
""" Pad row vectors with 1. """
|
||||
max_len = max((x.shape[0] for x in inputs))
|
||||
remainder = max_len % out_steps
|
||||
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
|
||||
return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
|
||||
|
||||
|
||||
def pad_per_step(inputs, pad_len):
|
||||
return np.pad(
|
||||
inputs, [[0, 0], [0, 0], [0, pad_len]],
|
||||
mode='constant',
|
||||
constant_values=0.0)
|
||||
|
||||
|
||||
# pylint: disable=attribute-defined-outside-init
|
||||
class StandardScaler():
|
||||
|
||||
def set_stats(self, mean, scale):
|
||||
self.mean_ = mean
|
||||
self.scale_ = scale
|
||||
|
||||
def reset_stats(self):
|
||||
delattr(self, 'mean_')
|
||||
delattr(self, 'scale_')
|
||||
|
||||
def transform(self, X):
|
||||
X = np.asarray(X)
|
||||
X -= self.mean_
|
||||
X /= self.scale_
|
||||
return X
|
||||
|
||||
def inverse_transform(self, X):
|
||||
X = np.asarray(X)
|
||||
X *= self.scale_
|
||||
X += self.mean_
|
||||
return X
|
||||
|
|
@ -1,362 +0,0 @@
|
|||
import os
|
||||
import glob
|
||||
import torch
|
||||
import shutil
|
||||
import datetime
|
||||
import subprocess
|
||||
import importlib
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def get_git_branch():
|
||||
try:
|
||||
out = subprocess.check_output(["git", "branch"]).decode("utf8")
|
||||
current = next(line for line in out.split("\n")
|
||||
if line.startswith("*"))
|
||||
current.replace("* ", "")
|
||||
except subprocess.CalledProcessError:
|
||||
current = "inside_docker"
|
||||
return current
|
||||
|
||||
|
||||
def get_commit_hash():
|
||||
"""https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
|
||||
# try:
|
||||
# subprocess.check_output(['git', 'diff-index', '--quiet',
|
||||
# 'HEAD']) # Verify client is clean
|
||||
# except:
|
||||
# raise RuntimeError(
|
||||
# " !! Commit before training to get the commit hash.")
|
||||
try:
|
||||
commit = subprocess.check_output(
|
||||
['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
|
||||
# Not copying .git folder into docker container
|
||||
except subprocess.CalledProcessError:
|
||||
commit = "0000000"
|
||||
print(' > Git Hash: {}'.format(commit))
|
||||
return commit
|
||||
|
||||
|
||||
def create_experiment_folder(root_path, model_name, debug):
|
||||
""" Create a folder with the current date and time """
|
||||
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
|
||||
if debug:
|
||||
commit_hash = 'debug'
|
||||
else:
|
||||
commit_hash = get_commit_hash()
|
||||
output_folder = os.path.join(
|
||||
root_path, model_name + '-' + date_str + '-' + commit_hash)
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
print(" > Experiment folder: {}".format(output_folder))
|
||||
return output_folder
|
||||
|
||||
|
||||
def remove_experiment_folder(experiment_path):
|
||||
"""Check folder if there is a checkpoint, otherwise remove the folder"""
|
||||
|
||||
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
|
||||
if not checkpoint_files:
|
||||
if os.path.exists(experiment_path):
|
||||
shutil.rmtree(experiment_path, ignore_errors=True)
|
||||
print(" ! Run is removed from {}".format(experiment_path))
|
||||
else:
|
||||
print(" ! Run is kept in {}".format(experiment_path))
|
||||
|
||||
|
||||
def count_parameters(model):
|
||||
r"""Count number of trainable parameters in a network"""
|
||||
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
def split_dataset(items):
|
||||
is_multi_speaker = False
|
||||
speakers = [item[-1] for item in items]
|
||||
is_multi_speaker = len(set(speakers)) > 1
|
||||
eval_split_size = 500 if len(items) * 0.01 > 500 else int(
|
||||
len(items) * 0.01)
|
||||
assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
|
||||
np.random.seed(0)
|
||||
np.random.shuffle(items)
|
||||
if is_multi_speaker:
|
||||
items_eval = []
|
||||
# most stupid code ever -- Fix it !
|
||||
while len(items_eval) < eval_split_size:
|
||||
speakers = [item[-1] for item in items]
|
||||
speaker_counter = Counter(speakers)
|
||||
item_idx = np.random.randint(0, len(items))
|
||||
if speaker_counter[items[item_idx][-1]] > 1:
|
||||
items_eval.append(items[item_idx])
|
||||
del items[item_idx]
|
||||
return items_eval, items
|
||||
return items[:eval_split_size], items[eval_split_size:]
|
||||
|
||||
|
||||
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
|
||||
def sequence_mask(sequence_length, max_len=None):
|
||||
if max_len is None:
|
||||
max_len = sequence_length.data.max()
|
||||
batch_size = sequence_length.size(0)
|
||||
seq_range = torch.arange(0, max_len).long()
|
||||
seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
|
||||
if sequence_length.is_cuda:
|
||||
seq_range_expand = seq_range_expand.to(sequence_length.device)
|
||||
seq_length_expand = (
|
||||
sequence_length.unsqueeze(1).expand_as(seq_range_expand))
|
||||
# B x T_max
|
||||
return seq_range_expand < seq_length_expand
|
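A toy illustration of the mask this helper produces, assuming the function above is imported (not part of the original file):

import torch

lengths = torch.tensor([2, 4])
print(sequence_mask(lengths))
# -> [[ True,  True, False, False],
#     [ True,  True,  True,  True]]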
||||
|
||||
|
||||
def set_init_dict(model_dict, checkpoint_state, c):
|
||||
# Partial initialization: layers that do not match between the new and old models are skipped.
|
||||
for k, v in checkpoint_state.items():
|
||||
if k not in model_dict:
|
||||
print(" | > Layer missing in the model definition: {}".format(k))
|
||||
# 1. filter out unnecessary keys
|
||||
pretrained_dict = {
|
||||
k: v
|
||||
for k, v in checkpoint_state.items() if k in model_dict
|
||||
}
|
||||
# 2. filter out different size layers
|
||||
pretrained_dict = {
|
||||
k: v
|
||||
for k, v in pretrained_dict.items()
|
||||
if v.numel() == model_dict[k].numel()
|
||||
}
|
||||
# 3. skip reinit layers
|
||||
if c.reinit_layers is not None:
|
||||
for reinit_layer_name in c.reinit_layers:
|
||||
pretrained_dict = {
|
||||
k: v
|
||||
for k, v in pretrained_dict.items()
|
||||
if reinit_layer_name not in k
|
||||
}
|
||||
# 4. overwrite entries in the existing state dict
|
||||
model_dict.update(pretrained_dict)
|
||||
print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
|
||||
len(model_dict)))
|
||||
return model_dict
|
||||
|
||||
|
||||
def setup_model(num_chars, num_speakers, c):
|
||||
print(" > Using model: {}".format(c.model))
|
||||
MyModel = importlib.import_module('TTS.models.' + c.model.lower())
|
||||
MyModel = getattr(MyModel, c.model)
|
||||
if c.model.lower() in "tacotron":
|
||||
model = MyModel(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=c.r,
|
||||
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
|
||||
decoder_output_dim=c.audio['num_mels'],
|
||||
gst=c.use_gst,
|
||||
memory_size=c.memory_size,
|
||||
attn_type=c.attention_type,
|
||||
attn_win=c.windowing,
|
||||
attn_norm=c.attention_norm,
|
||||
prenet_type=c.prenet_type,
|
||||
prenet_dropout=c.prenet_dropout,
|
||||
forward_attn=c.use_forward_attn,
|
||||
trans_agent=c.transition_agent,
|
||||
forward_attn_mask=c.forward_attn_mask,
|
||||
location_attn=c.location_attn,
|
||||
attn_K=c.attention_heads,
|
||||
separate_stopnet=c.separate_stopnet,
|
||||
bidirectional_decoder=c.bidirectional_decoder,
|
||||
double_decoder_consistency=c.double_decoder_consistency,
|
||||
ddc_r=c.ddc_r)
|
||||
elif c.model.lower() == "tacotron2":
|
||||
model = MyModel(num_chars=num_chars,
|
||||
num_speakers=num_speakers,
|
||||
r=c.r,
|
||||
postnet_output_dim=c.audio['num_mels'],
|
||||
decoder_output_dim=c.audio['num_mels'],
|
||||
gst=c.use_gst,
|
||||
attn_type=c.attention_type,
|
||||
attn_win=c.windowing,
|
||||
attn_norm=c.attention_norm,
|
||||
prenet_type=c.prenet_type,
|
||||
prenet_dropout=c.prenet_dropout,
|
||||
forward_attn=c.use_forward_attn,
|
||||
trans_agent=c.transition_agent,
|
||||
forward_attn_mask=c.forward_attn_mask,
|
||||
location_attn=c.location_attn,
|
||||
attn_K=c.attention_heads,
|
||||
separate_stopnet=c.separate_stopnet,
|
||||
bidirectional_decoder=c.bidirectional_decoder,
|
||||
double_decoder_consistency=c.double_decoder_consistency,
|
||||
ddc_r=c.ddc_r)
|
||||
return model
|
||||
|
||||
class KeepAverage():
|
||||
def __init__(self):
|
||||
self.avg_values = {}
|
||||
self.iters = {}
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.avg_values[key]
|
||||
|
||||
def items(self):
|
||||
return self.avg_values.items()
|
||||
|
||||
def add_value(self, name, init_val=0, init_iter=0):
|
||||
self.avg_values[name] = init_val
|
||||
self.iters[name] = init_iter
|
||||
|
||||
def update_value(self, name, value, weighted_avg=False):
|
||||
if name not in self.avg_values:
|
||||
# add value if not exist before
|
||||
self.add_value(name, init_val=value)
|
||||
else:
|
||||
# else update existing value
|
||||
if weighted_avg:
|
||||
self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
|
||||
self.iters[name] += 1
|
||||
else:
|
||||
self.avg_values[name] = self.avg_values[name] * \
|
||||
self.iters[name] + value
|
||||
self.iters[name] += 1
|
||||
self.avg_values[name] /= self.iters[name]
|
||||
|
||||
def add_values(self, name_dict):
|
||||
for key, value in name_dict.items():
|
||||
self.add_value(key, init_val=value)
|
||||
|
||||
def update_values(self, value_dict):
|
||||
for key, value in value_dict.items():
|
||||
self.update_value(key, value)
|
||||
|
||||
|
||||
def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
    if alternative in c.keys() and c[alternative] is not None:
        return
    if restricted:
        assert name in c.keys(), f' [!] {name} not defined in config.json'
    if name in c.keys():
        if max_val:
            assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
        if min_val:
            assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
        if enum_list:
            assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
        if val_type:
            assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'

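Each check is a bare assert, so an out-of-range or mistyped field stops the run with a readable message. A short sketch with made-up values:

# Hypothetical sketch of _check_argument behaviour.
_check_argument('num_mels', {'num_mels': 80}, restricted=True, val_type=int, min_val=10, max_val=2056)  # passes silently
try:
    _check_argument('num_mels', {'num_mels': 5}, restricted=True, val_type=int, min_val=10, max_val=2056)
except AssertionError as e:
    print(e)  # " [!] num_mels is smaller than min value 10"
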
def check_config(c):
    _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
    _check_argument('run_name', c, restricted=True, val_type=str)
    _check_argument('run_description', c, val_type=str)

    # AUDIO
    _check_argument('audio', c, restricted=True, val_type=dict)

    # audio processing parameters
    _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
    _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
    _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
    _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
    _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
    _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
    _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)

    # vocabulary parameters
    _check_argument('characters', c, restricted=False, val_type=dict)
    _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)

    # normalization parameters
    _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
    _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
    _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
    _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
    _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
    _check_argument('trim_db', c['audio'], restricted=True, val_type=int)

    # training parameters
    _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
    _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
    _check_argument('r', c, restricted=True, val_type=int, min_val=1)
    _check_argument('gradual_training', c, restricted=False, val_type=list)
    _check_argument('loss_masking', c, restricted=True, val_type=bool)
    # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)

    # validation parameters
    _check_argument('run_eval', c, restricted=True, val_type=bool)
    _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
    _check_argument('test_sentences_file', c, restricted=False, val_type=str)

    # optimizer
    _check_argument('noam_schedule', c, restricted=False, val_type=bool)
    _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
    _check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
    _check_argument('lr', c, restricted=True, val_type=float, min_val=0)
    _check_argument('wd', c, restricted=True, val_type=float, min_val=0)
    _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
    _check_argument('seq_len_norm', c, restricted=True, val_type=bool)

    # tacotron prenet
    _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
    _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
    _check_argument('prenet_dropout', c, restricted=True, val_type=bool)

    # attention
    _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
    _check_argument('attention_heads', c, restricted=True, val_type=int)
    _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
    _check_argument('windowing', c, restricted=True, val_type=bool)
    _check_argument('use_forward_attn', c, restricted=True, val_type=bool)
    _check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
    _check_argument('transition_agent', c, restricted=True, val_type=bool)
    _check_argument('location_attn', c, restricted=True, val_type=bool)
    _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
    _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
    _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)

    # stopnet
    _check_argument('stopnet', c, restricted=True, val_type=bool)
    _check_argument('separate_stopnet', c, restricted=True, val_type=bool)

    # tensorboard
    _check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('checkpoint', c, restricted=True, val_type=bool)
    _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)

    # dataloading
    # pylint: disable=import-outside-toplevel
    from TTS.utils.text import cleaners
    _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
    _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
    _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
    _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
    _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
    _check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
    _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)

    # paths
    _check_argument('output_path', c, restricted=True, val_type=str)

    # multi-speaker gst
    _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
    _check_argument('style_wav_for_test', c, restricted=True, val_type=str)
    _check_argument('use_gst', c, restricted=True, val_type=bool)

    # datasets - checking all entries
    _check_argument('datasets', c, restricted=True, val_type=list)
    for dataset_entry in c['datasets']:
        _check_argument('name', dataset_entry, restricted=True, val_type=str)
        _check_argument('path', dataset_entry, restricted=True, val_type=str)
        _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str)
        _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)

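For illustration, a hypothetical sketch of how the validator is typically used at the top of a training script; the config path is an assumption:

# Hypothetical sketch: validate the config before building anything.
c = load_config("config.json")    # load_config is defined in utils/io.py below
check_config(c)                   # any missing or invalid field raises an AssertionError
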
utils/io.py
@@ -1,78 +0,0 @@
import os
import json
import re
import torch
import datetime


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config

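A small sketch, for illustration only: load_config strips escaped line breaks and //-style comments before parsing, so an annotated config.json still loads; the file content and temp-file handling below are assumptions for the example.

# Hypothetical sketch: '//' comments are removed before json.loads runs.
import tempfile

raw = '{\n"model": "Tacotron2", // model name\n"r": 7\n}\n'
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.write(raw)
c = load_config(f.name)
print(c.model, c.r)  # attribute access works because AttrDict binds __dict__ to itself
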
def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()


def load_checkpoint(model, checkpoint_path, use_cuda=False):
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    if use_cuda:
        model.cuda()
    # set model stepsize
    if 'r' in state.keys():
        model.decoder.set_r(state['r'])
    return model, state


def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
    new_state_dict = model.state_dict()
    state = {
        'model': new_state_dict,
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'step': current_step,
        'epoch': epoch,
        'date': datetime.date.today().strftime("%B %d, %Y"),
        'r': r
    }
    state.update(kwargs)
    torch.save(state, output_path)


def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs):
    file_name = 'checkpoint_{}.pth.tar'.format(current_step)
    checkpoint_path = os.path.join(output_folder, file_name)
    print(" > CHECKPOINT : {}".format(checkpoint_path))
    save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs)


def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):
    if target_loss < best_loss:
        file_name = 'best_model.pth.tar'
        checkpoint_path = os.path.join(output_folder, file_name)
        print(" > BEST MODEL : {}".format(checkpoint_path))
        save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
        best_loss = target_loss
    return best_loss
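For illustration only, a hypothetical round-trip sketch of the checkpointing helpers using a stand-in module; the output folder and hyperparameter values are assumptions, and load_checkpoint itself expects a TTS model with a decoder, so the state dict is reloaded directly here.

# Hypothetical sketch: save a checkpoint and restore its weights into a stand-in module.
import torch

net = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(net.parameters())
save_checkpoint(net, opt, current_step=100, epoch=1, r=2, output_folder="/tmp")
state = torch.load("/tmp/checkpoint_100.pth.tar", map_location="cpu")
net.load_state_dict(state['model'])  # load_checkpoint() additionally restores 'r' via model.decoder.set_r
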
@@ -1,18 +0,0 @@
import torch


def alignment_diagonal_score(alignments, binary=False):
    """
    Compute how diagonal alignment predictions are. It is useful
    to measure the alignment consistency of a model
    Args:
        alignments (torch.Tensor): batch of alignments.
        binary (bool): if True, ignore scores and consider attention
        as a binary mask.
    Shape:
        alignments : batch x decoder_steps x encoder_steps
    """
    maxs = alignments.max(dim=1)[0]
    if binary:
        maxs[maxs > 0] = 1
    return maxs.mean(dim=1).mean(dim=0).item()
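A minimal sketch of the score on random attention maps, for illustration only; the tensor shapes follow the docstring and the sizes are made up.

# Hypothetical check: random (non-diagonal) attention scores well below 1.0.
import torch

alignments = torch.softmax(torch.randn(8, 50, 120), dim=2)  # batch x decoder_steps x encoder_steps
print(alignment_diagonal_score(alignments))
print(alignment_diagonal_score(alignments, binary=True))    # counts any attended encoder step as covered
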
Some files were not shown because too many files have changed in this diff.