style update #3

Eren Gölge 2021-04-09 01:17:15 +02:00
parent 18d9ec8036
commit 87ee6ceb57
30 changed files with 717 additions and 682 deletions

View File

@@ -25,12 +25,11 @@ import subprocess
import sys
import zipfile
import pandas
import soundfile as sf
import tensorflow as tf
from absl import logging
import pandas
gfile = tf.compat.v1.gfile
SUBSETS = {

View File

@@ -1,6 +1,8 @@
import numpy as np
cimport numpy as np
cimport cython
cimport numpy as np
from cython.parallel import prange

View File

@@ -6,13 +6,12 @@ import random
from statistics import StatisticsError, mean, median, mode, stdev
import matplotlib.pyplot as plt
import seaborn as sns
from text.cmudict import CMUDict
def get_audio_seconds(frames):
return (frames*12.5)/1000
return (frames * 12.5) / 1000
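A note on the constant: 12.5 is the assumed hop duration in milliseconds per spectrogram frame, so the helper converts a frame count to seconds. A quick sanity check under that assumption:

# sketch -- assumes 12.5 ms of audio per frame, as hard-coded above
def get_audio_seconds(frames):
    return (frames * 12.5) / 1000

assert get_audio_seconds(800) == 10.0  # 800 frames * 12.5 ms = 10 s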
def append_data_statistics(meta_data):
@@ -29,9 +28,7 @@ def append_data_statistics(meta_data):
median_audio_len = median(audio_len_list)
try:
std = stdev(
d["audio_len"] for d in data
)
std = stdev(d["audio_len"] for d in data)
except StatisticsError:
std = 0
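Context for the try/except: statistics.stdev requires at least two data points and raises StatisticsError otherwise, which happens whenever a character-length bucket holds a single utterance; the code then falls back to std = 0. In isolation:

from statistics import StatisticsError, stdev

try:
    stdev([3.2])  # a bucket containing a single utterance length
except StatisticsError:
    std = 0  # the fallback taken above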
@@ -46,24 +43,22 @@ def process_meta_data(path):
meta_data = {}
# load meta data
with open(path, 'r') as f:
data = csv.reader(f, delimiter='|')
with open(path, "r") as f:
data = csv.reader(f, delimiter="|")
for row in data:
frames = int(row[2])
utt = row[3]
audio_len = get_audio_seconds(frames)
char_count = len(utt)
if not meta_data.get(char_count):
meta_data[char_count] = {
"data": []
}
meta_data[char_count] = {"data": []}
meta_data[char_count]["data"].append(
{
"utt": utt,
"frames": frames,
"audio_len": audio_len,
"row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3])
"row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]),
}
)
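For reference, process_meta_data expects LJSpeech-style pipe-delimited metadata with the frame count in column 2 and the utterance text in column 3, and buckets rows by character count. A toy row (all values invented for illustration):

# hypothetical row: id|raw_text|frames|normalized_text
row = "LJ001-0001|Printing|480|Printing".split("|")
frames = int(row[2])                # 480
audio_len = (frames * 12.5) / 1000  # 6.0 seconds
char_count = len(row[3])            # 8 -> the bucket key in meta_data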
@@ -74,30 +69,30 @@ def process_meta_data(path):
def get_data_points(meta_data):
x = meta_data
y_avg = [meta_data[d]['mean'] for d in meta_data]
y_mode = [meta_data[d]['mode'] for d in meta_data]
y_median = [meta_data[d]['median'] for d in meta_data]
y_std = [meta_data[d]['std'] for d in meta_data]
y_num_samples = [len(meta_data[d]['data']) for d in meta_data]
y_avg = [meta_data[d]["mean"] for d in meta_data]
y_mode = [meta_data[d]["mode"] for d in meta_data]
y_median = [meta_data[d]["median"] for d in meta_data]
y_std = [meta_data[d]["std"] for d in meta_data]
y_num_samples = [len(meta_data[d]["data"]) for d in meta_data]
return {
"x": x,
"y_avg": y_avg,
"y_mode": y_mode,
"y_median": y_median,
"y_std": y_std,
"y_num_samples": y_num_samples
"y_num_samples": y_num_samples,
}
def save_training(file_path, meta_data):
rows = []
for char_cnt in meta_data:
data = meta_data[char_cnt]['data']
data = meta_data[char_cnt]["data"]
for d in data:
rows.append(d['row'] + "\n")
rows.append(d["row"] + "\n")
random.shuffle(rows)
with open(file_path, 'w+') as f:
with open(file_path, "w+") as f:
for row in rows:
f.write(row)
@@ -108,15 +103,15 @@ def plot(meta_data, save_path=None):
save = True
graph_data = get_data_points(meta_data)
x = graph_data['x']
y_avg = graph_data['y_avg']
y_std = graph_data['y_std']
y_mode = graph_data['y_mode']
y_median = graph_data['y_median']
y_num_samples = graph_data['y_num_samples']
x = graph_data["x"]
y_avg = graph_data["y_avg"]
y_std = graph_data["y_std"]
y_mode = graph_data["y_mode"]
y_median = graph_data["y_median"]
y_num_samples = graph_data["y_num_samples"]
plt.figure()
plt.plot(x, y_avg, 'ro')
plt.plot(x, y_avg, "ro")
plt.xlabel("character lengths", fontsize=30)
plt.ylabel("avg seconds", fontsize=30)
if save:
@@ -124,7 +119,7 @@ def plot(meta_data, save_path=None):
plt.savefig(os.path.join(save_path, name))
plt.figure()
plt.plot(x, y_mode, 'ro')
plt.plot(x, y_mode, "ro")
plt.xlabel("character lengths", fontsize=30)
plt.ylabel("mode seconds", fontsize=30)
if save:
@@ -132,7 +127,7 @@
plt.savefig(os.path.join(save_path, name))
plt.figure()
plt.plot(x, y_median, 'ro')
plt.plot(x, y_median, "ro")
plt.xlabel("character lengths", fontsize=30)
plt.ylabel("median seconds", fontsize=30)
if save:
@@ -140,7 +135,7 @@
plt.savefig(os.path.join(save_path, name))
plt.figure()
plt.plot(x, y_std, 'ro')
plt.plot(x, y_std, "ro")
plt.xlabel("character lengths", fontsize=30)
plt.ylabel("standard deviation", fontsize=30)
if save:
@@ -148,7 +143,7 @@
plt.savefig(os.path.join(save_path, name))
plt.figure()
plt.plot(x, y_num_samples, 'ro')
plt.plot(x, y_num_samples, "ro")
plt.xlabel("character lengths", fontsize=30)
plt.ylabel("number of samples", fontsize=30)
if save:
@@ -161,8 +156,8 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
phonemes = {}
with open(train_path, 'r') as f:
data = csv.reader(f, delimiter='|')
with open(train_path, "r") as f:
data = csv.reader(f, delimiter="|")
phonemes["None"] = 0
for row in data:
words = row[3].split()
@@ -194,15 +189,12 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'--train_file_path', required=True,
help='this is the path to the train.txt file that the preprocess.py script creates'
)
parser.add_argument(
'--save_to', help='path to save charts of data to'
)
parser.add_argument(
'--cmu_dict_path', help='give cmudict-0.7b to see phoneme distribution'
"--train_file_path",
required=True,
help="this is the path to the train.txt file that the preprocess.py script creates",
)
parser.add_argument("--save_to", help="path to save charts of data to")
parser.add_argument("--cmu_dict_path", help="give cmudict-0.7b to see phoneme distribution")
args = parser.parse_args()
meta_data = process_meta_data(args.train_file_path)
plt.rcParams["figure.figsize"] = (10, 5)
@@ -213,5 +205,6 @@ def main():
plt.show()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -26,3 +26,8 @@ exclude = '''
# the root of the project
)
'''
[tool.isort]
line_length = 120
profile = "black"
multi_line_output = 3
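The new [tool.isort] block aligns isort with black: profile = "black" adopts black-compatible defaults, and multi_line_output = 3 selects the "vertical hanging indent" style, so imports that exceed line_length = 120 get wrapped like this (illustrated with stdlib names):

from collections import (
    Counter,
    OrderedDict,
    defaultdict,
)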

View File

@@ -18,10 +18,12 @@ bokeh==1.4.0
pysbd
# pyworld
soundfile
nose==1.3.7
cardboardlint==1.3.0
pylint==2.5.3
gdown
umap-learn==0.4.6
cython
pyyaml
pyyaml
# quality and style
nose
black
isort
pylint==2.7.4
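With black, isort, and pylint pinned under "quality and style", a local gate can simply run the three tools in sequence; a minimal sketch (the repo may expose its own make targets instead):

import subprocess

# --check / --check-only only verify formatting; pylint lints the TTS package
for cmd in (["black", "--check", "."],
            ["isort", "--check-only", "."],
            ["pylint", "TTS"]):
    subprocess.run(cmd, check=True)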

View File

@@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
conf = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
# pylint: disable=protected-access
@@ -20,10 +20,10 @@ class TestAudio(unittest.TestCase):
self.ap = AudioProcessor(**conf.audio)
def test_audio_synthesis(self):
""" 1. load wav
2. set normalization parameters
3. extract mel-spec
4. invert to wav and save the output
"""1. load wav
2. set normalization parameters
3. extract mel-spec
4. invert to wav and save the output
"""
print(" > Sanity check for the process wav -> mel -> wav")
@@ -35,23 +35,24 @@ class TestAudio(unittest.TestCase):
wav = self.ap.load_wav(WAV_FILE)
mel = self.ap.melspectrogram(wav)
wav_ = self.ap.inv_melspectrogram(mel)
file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
.format(max_norm, signal_norm, symmetric_norm, clip_norm)
file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
max_norm, signal_norm, symmetric_norm, clip_norm
)
print(" | > Creating wav file at : ", file_name)
self.ap.save_wav(wav_, OUT_PATH + file_name)
# maxnorm = 1.0
_test(1., False, False, False)
_test(1., True, False, False)
_test(1., True, True, False)
_test(1., True, False, True)
_test(1., True, True, True)
_test(1.0, False, False, False)
_test(1.0, True, False, False)
_test(1.0, True, True, False)
_test(1.0, True, False, True)
_test(1.0, True, True, True)
# maxnorm = 4.0
_test(4., False, False, False)
_test(4., True, False, False)
_test(4., True, True, False)
_test(4., True, False, True)
_test(4., True, True, True)
_test(4.0, False, False, False)
_test(4.0, True, False, False)
_test(4.0, True, True, False)
_test(4.0, True, False, True)
_test(4.0, True, True, True)
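The _test calls above sweep both max_norm values over the same five (signal_norm, symmetric_norm, clip_norm) combinations; restated compactly (a sketch, not how the test is written):

import itertools

flag_sets = [(False, False, False), (True, False, False),
             (True, True, False), (True, False, True), (True, True, True)]
for max_norm, flags in itertools.product((1.0, 4.0), flag_sets):
    print(max_norm, *flags)  # the ten _test(...) argument tuples above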
def test_normalize(self):
"""Check normalization and denormalization for range values and consistency """
@@ -67,7 +68,9 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
@@ -81,8 +84,9 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
# check value range
@@ -97,13 +101,14 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = False
self.ap.max_norm = 4.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap.denormalize(x_norm)
@@ -114,13 +119,14 @@ class TestAudio(unittest.TestCase):
self.ap.clip_norm = True
self.ap.max_norm = 4.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap.denormalize(x_norm)
@@ -130,8 +136,9 @@ class TestAudio(unittest.TestCase):
self.ap.symmetric_norm = False
self.ap.max_norm = 1.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
@@ -143,22 +150,23 @@ class TestAudio(unittest.TestCase):
self.ap.symmetric_norm = True
self.ap.max_norm = 1.0
x_norm = self.ap.normalize(x)
print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
print(
f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
)
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type
assert x_norm.min() < 0, x_norm.min()
x_ = self.ap.denormalize(x_norm)
assert (x - x_).sum() < 1e-3
def test_scaler(self):
scaler_stats_path = os.path.join(get_tests_input_path(), 'scale_stats.npy')
conf.audio['stats_path'] = scaler_stats_path
conf.audio['preemphasis'] = 0.0
conf.audio['do_trim_silence'] = True
conf.audio['signal_norm'] = True
scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
conf.audio["stats_path"] = scaler_stats_path
conf.audio["preemphasis"] = 0.0
conf.audio["do_trim_silence"] = True
conf.audio["signal_norm"] = True
ap = AudioProcessor(**conf.audio)
mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)

View File

@@ -9,99 +9,99 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_encoder():
input_dummy = torch.rand(8, 14, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths = torch.randint(31, 37, (8,)).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
input_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# relative positional transformer encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='relative_position_transformer',
encoder_params={
'hidden_channels_ffn': 768,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
layer = Encoder(
out_channels=11,
in_hidden_channels=14,
encoder_type="relative_position_transformer",
encoder_params={
"hidden_channels_ffn": 768,
"num_heads": 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"rel_attn_window_size": 4,
"input_length": None,
},
).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# residual conv bn encoder
layer = Encoder(out_channels=11,
in_hidden_channels=14,
encoder_type='residual_conv_bn',
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13
}).to(device)
layer = Encoder(
out_channels=11,
in_hidden_channels=14,
encoder_type="residual_conv_bn",
encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13},
).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# FFTransformer encoder
layer = Encoder(out_channels=14,
in_hidden_channels=14,
encoder_type='fftransformer',
encoder_params={
"hidden_channels_ffn": 31,
"num_heads": 2,
"num_layers": 2,
"dropout_p": 0.1
}).to(device)
layer = Encoder(
out_channels=14,
in_hidden_channels=14,
encoder_type="fftransformer",
encoder_params={"hidden_channels_ffn": 31, "num_heads": 2, "num_layers": 2, "dropout_p": 0.1},
).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 14, 37]
def test_decoder():
input_dummy = torch.rand(8, 128, 37).to(device)
input_lengths = torch.randint(31, 37, (8, )).long().to(device)
input_lengths = torch.randint(31, 37, (8,)).long().to(device)
input_lengths[-1] = 37
input_mask = torch.unsqueeze(
sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
input_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
# residual bn conv decoder
layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# transformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='relative_position_transformer',
decoder_params={
'hidden_channels_ffn': 128,
'num_heads': 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None
}).to(device)
layer = Decoder(
out_channels=11,
in_hidden_channels=128,
decoder_type="relative_position_transformer",
decoder_params={
"hidden_channels_ffn": 128,
"num_heads": 2,
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 8,
"rel_attn_window_size": 4,
"input_length": None,
},
).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
# wavenet decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='wavenet',
decoder_params={
"num_blocks": 12,
"hidden_channels": 192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05
}).to(device)
layer = Decoder(
out_channels=11,
in_hidden_channels=128,
decoder_type="wavenet",
decoder_params={
"num_blocks": 12,
"hidden_channels": 192,
"kernel_size": 5,
"dilation_rate": 1,
"num_layers": 4,
"dropout_p": 0.05,
},
).to(device)
output = layer(input_dummy, input_mask)
# FFTransformer decoder
layer = Decoder(out_channels=11,
in_hidden_channels=128,
decoder_type='fftransformer',
decoder_params={
'hidden_channels_ffn': 31,
'num_heads': 2,
"dropout_p": 0.1,
"num_layers": 2,
}).to(device)
layer = Decoder(
out_channels=11,
in_hidden_channels=128,
decoder_type="fftransformer",
decoder_params={
"hidden_channels_ffn": 31,
"num_heads": 2,
"dropout_p": 0.1,
"num_layers": 2,
},
).to(device)
output = layer(input_dummy, input_mask)
assert list(output.shape) == [8, 11, 37]
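Both tests build input_mask via sequence_mask; a minimal sketch of what that helper returns, assuming the usual arange-based implementation (True at valid positions, False at padding):

import torch

def sequence_mask(lengths, max_len=None):
    # (B, max_len) boolean mask; row i has lengths[i] leading Trues
    max_len = max_len or int(lengths.max())
    ids = torch.arange(max_len, device=lengths.device)
    return ids.unsqueeze(0) < lengths.unsqueeze(1)

print(sequence_mask(torch.tensor([2, 4]), 5))
# tensor([[ True,  True, False, False, False],
#         [ True,  True,  True,  True, False]])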

View File

@@ -11,13 +11,13 @@ from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
#pylint: disable=unused-variable
# pylint: disable=unused-variable
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -32,11 +32,11 @@ class GlowTTSTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, c.audio['num_mels'], 30).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
criterion = GlowTTSLoss()
@@ -47,27 +47,28 @@ class GlowTTSTrainTest(unittest.TestCase):
hidden_channels_dec=48,
hidden_channels_dp=32,
out_channels=80,
encoder_type='rel_pos_transformer',
encoder_type="rel_pos_transformer",
encoder_params={
'kernel_size': 3,
'dropout_p': 0.1,
'num_layers': 6,
'num_heads': 2,
'hidden_channels_ffn': 16, # 4 times the hidden_channels
'input_length': None
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 16, # 4 times the hidden_channels
"input_length": None,
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.,
dropout_p_dec=0.0,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False).to(device)
mean_only=False,
).to(device)
# reference model to compare model weights
model_ref = GlowTTS(
@@ -76,38 +77,37 @@ class GlowTTSTrainTest(unittest.TestCase):
hidden_channels_dec=48,
hidden_channels_dp=32,
out_channels=80,
encoder_type='rel_pos_transformer',
encoder_type="rel_pos_transformer",
encoder_params={
'kernel_size': 3,
'dropout_p': 0.1,
'num_layers': 6,
'num_heads': 2,
'hidden_channels_ffn': 16, # 4 times the hidden_channels
'input_length': None
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 16, # 4 times the hidden_channels
"input_length": None,
},
use_encoder_prenet=True,
num_flow_blocks_dec=12,
kernel_size_dec=5,
dilation_rate=1,
num_block_layers=4,
dropout_p_dec=0.,
dropout_p_dec=0.0,
num_speakers=0,
c_in_channels=0,
num_splits=4,
num_squeeze=1,
sigmoid_scale=False,
mean_only=False).to(device)
mean_only=False,
).to(device)
model.train()
print(" > Num parameters for GlowTTS model:%s" %
(count_parameters(model)))
print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
# pass the state to ref model
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
@@ -115,18 +115,17 @@ class GlowTTSTrainTest(unittest.TestCase):
for _ in range(5):
optimizer.zero_grad()
z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, None)
loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
o_dur_log, o_total_dur, input_lengths)
loss = loss_dict['loss']
input_dummy, input_lengths, mel_spec, mel_lengths, None
)
loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, input_lengths)
loss = loss_dict["loss"]
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
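All of these train-step tests share one idiom: deep-copy the freshly built model, run a few optimizer steps, then assert that every parameter moved relative to the copy. Reduced to a toy module (sketch only):

import copy
import torch
from torch import nn, optim

model = nn.Linear(4, 2)           # stand-in for the real model
model_ref = copy.deepcopy(model)  # frozen pre-training weights
opt = optim.Adam(model.parameters(), lr=1e-3)
for _ in range(5):
    loss = model(torch.rand(8, 4)).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
for i, (p, p_ref) in enumerate(zip(model.parameters(), model_ref.parameters())):
    assert (p != p_ref).any(), f"param {i} not updated!!"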

View File

@@ -10,7 +10,7 @@ from TTS.tts.utils.generic_utils import sequence_mask
class PrenetTests(unittest.TestCase):
def test_in_out(self): #pylint: disable=no-self-use
def test_in_out(self): # pylint: disable=no-self-use
layer = Prenet(128, out_features=[256, 128])
dummy_input = T.rand(4, 128)
@@ -22,7 +22,7 @@ class PrenetTests(unittest.TestCase):
class CBHGTests(unittest.TestCase):
def test_in_out(self):
#pylint: disable=attribute-defined-outside-init
# pylint: disable=attribute-defined-outside-init
layer = self.cbhg = CBHG(
128,
K=8,
@@ -30,7 +30,8 @@ class CBHGTests(unittest.TestCase):
conv_projections=[160, 128],
highway_features=80,
gru_features=80,
num_highways=4)
num_highways=4,
)
# B x D x T
dummy_input = T.rand(4, 128, 8)
@@ -53,26 +54,27 @@ class DecoderTests(unittest.TestCase):
attn_norm="sigmoid",
attn_K=5,
attn_type="original",
prenet_type='original',
prenet_type="original",
prenet_dropout=True,
forward_attn=True,
trans_agent=True,
forward_attn_mask=True,
location_attn=True,
separate_stopnet=True)
separate_stopnet=True,
)
dummy_input = T.rand(4, 8, 256)
dummy_memory = T.rand(4, 2, 80)
output, alignment, stop_tokens = layer(
dummy_input, dummy_memory, mask=None)
output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None)
assert output.shape[0] == 4
assert output.shape[1] == 80, "size not {}".format(output.shape[1])
assert output.shape[2] == 2, "size not {}".format(output.shape[2])
assert stop_tokens.shape[0] == 4
class EncoderTests(unittest.TestCase):
def test_in_out(self): #pylint: disable=no-self-use
def test_in_out(self): # pylint: disable=no-self-use
layer = Encoder(128)
dummy_input = T.rand(4, 8, 128)
@@ -85,7 +87,7 @@ class EncoderTests(unittest.TestCase):
class L1LossMaskedTests(unittest.TestCase):
def test_in_out(self): #pylint: disable=no-self-use
def test_in_out(self): # pylint: disable=no-self-use
# test input == target
layer = L1LossMasked(seq_len_norm=False)
dummy_input = T.ones(4, 8, 128).float()
@@ -105,16 +107,14 @@ class L1LossMaskedTests(unittest.TestCase):
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 1.0, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())
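A note on the mask idiom repeated in these tests: sequence_mask yields 1.0 at valid frames and 0.0 at padding, so (mask - 1.0) * 100.0 is 0 at valid frames and -100 at padded ones; adding it to the input corrupts only the padded region, and the assertions verify the masked loss is unchanged. In isolation:

import torch

valid = torch.tensor([[1.0, 1.0, 0.0, 0.0]])  # sequence_mask(...) as float
mask = (valid - 1.0) * 100.0
print(mask)  # tensor([[   0.,    0., -100., -100.]])
# only padded frames are perturbed, so a correct masked loss ignores them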
@@ -138,22 +138,20 @@ class L1LossMaskedTests(unittest.TestCase):
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())
class SSIMLossTests(unittest.TestCase):
def test_in_out(self): #pylint: disable=no-self-use
def test_in_out(self): # pylint: disable=no-self-use
# test input == target
layer = SSIMLoss()
dummy_input = T.ones(4, 8, 128).float()
@@ -173,16 +171,14 @@ class SSIMLossTests(unittest.TestCase):
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())
@@ -206,15 +202,13 @@ class SSIMLossTests(unittest.TestCase):
dummy_input = T.ones(4, 8, 128).float()
dummy_target = T.zeros(4, 8, 128).float()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item())
dummy_input = T.rand(4, 8, 128).float()
dummy_target = dummy_input.detach()
dummy_length = (T.arange(5, 9)).long()
mask = (
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.item() == 0, "0 vs {}".format(output.item())

View File

@@ -12,11 +12,11 @@ from TTS.tts.datasets.preprocess import ljspeech
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
#pylint: disable=unused-variable
# pylint: disable=unused-variable
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ok_ljspeech = os.path.exists(c.data_path)
DATA_EXIST = True
@@ -33,25 +33,27 @@ class TestTTSDataset(unittest.TestCase):
self.ap = AudioProcessor(**c.audio)
def _create_dataloader(self, batch_size, r, bgs):
items = ljspeech(c.data_path, 'metadata.csv')
items = ljspeech(c.data_path, "metadata.csv")
dataset = TTSDataset.MyDataset(
r,
c.text_cleaner,
compute_linear_spec=True,
ap=self.ap,
meta_data=items,
tp=c.characters if 'characters' in c.keys() else None,
tp=c.characters if "characters" in c.keys() else None,
batch_group_size=bgs,
min_seq_len=c.min_seq_len,
max_seq_len=float("inf"),
use_phonemes=False)
use_phonemes=False,
)
dataloader = DataLoader(
dataset,
batch_size=batch_size,
shuffle=False,
collate_fn=dataset.collate_fn,
drop_last=True,
num_workers=c.num_loader_workers)
num_workers=c.num_loader_workers,
)
return dataloader, dataset
def test_loader(self):
@@ -72,18 +74,17 @@ class TestTTSDataset(unittest.TestCase):
neg_values = text_input[text_input < 0]
check_count = len(neg_values)
assert check_count == 0, \
" !! Negative values in text_input: {}".format(check_count)
assert check_count == 0, " !! Negative values in text_input: {}".format(check_count)
# TODO: more assertion here
assert isinstance(speaker_name[0], str)
assert linear_input.shape[0] == c.batch_size
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
assert mel_input.shape[0] == c.batch_size
assert mel_input.shape[2] == c.audio['num_mels']
assert mel_input.shape[2] == c.audio["num_mels"]
# check normalization ranges
if self.ap.symmetric_norm:
assert mel_input.max() <= self.ap.max_norm
assert mel_input.min() >= -self.ap.max_norm #pylint: disable=invalid-unary-operand-type
assert mel_input.min() >= -self.ap.max_norm # pylint: disable=invalid-unary-operand-type
assert mel_input.min() < 0
else:
assert mel_input.max() <= self.ap.max_norm
@@ -134,7 +135,7 @@ class TestTTSDataset(unittest.TestCase):
# check mel_spec consistency
wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
mel = self.ap.melspectrogram(wav).astype('float32')
mel = self.ap.melspectrogram(wav).astype("float32")
mel = torch.FloatTensor(mel).contiguous()
mel_dl = mel_input[0]
# NOTE: Below needs to check == 0 but due to an unknown reason
@@ -145,15 +146,14 @@ class TestTTSDataset(unittest.TestCase):
# check mel-spec correctness
mel_spec = mel_input[0].cpu().numpy()
wav = self.ap.inv_melspectrogram(mel_spec.T)
self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')
self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")
# check linear-spec
linear_spec = linear_input[0].cpu().numpy()
wav = self.ap.inv_spectrogram(linear_spec.T)
self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
shutil.copy(item_idx[0],
OUTPATH + '/linear_target_dataloader.wav')
self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")
# check the last time step to be zero padded
assert linear_input[0, -1].sum() != 0
@@ -202,8 +202,8 @@ class TestTTSDataset(unittest.TestCase):
# check the second item in the batch
assert linear_input[1 - idx, -1].sum() == 0
assert mel_input[1 - idx, -1].sum() == 0
assert stop_target[1, mel_lengths[1]-1] == 1
assert stop_target[1, mel_lengths[1]:].sum() == 0
assert stop_target[1, mel_lengths[1] - 1] == 1
assert stop_target[1, mel_lengths[1] :].sum() == 0
assert len(mel_lengths.shape) == 1
# check batch zero-frame conditions (zero-frame disabled)

View File

@@ -6,12 +6,11 @@ from TTS.tts.datasets.preprocess import common_voice
class TestPreprocessors(unittest.TestCase):
def test_common_voice_preprocessor(self): #pylint: disable=no-self-use
def test_common_voice_preprocessor(self): # pylint: disable=no-self-use
root_path = get_tests_input_path()
meta_file = "common_voice.tsv"
items = common_voice(root_path, meta_file)
assert items[0][0] == 'The applicants are invited for coffee and visa is given immediately.'
assert items[0][0] == "The applicants are invited for coffee and visa is given immediately."
assert items[0][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav")
assert items[-1][0] == "Competition for limited resources has also resulted in some local conflicts."

View File

@@ -17,9 +17,7 @@ class SpeakerEncoderTests(unittest.TestCase):
def test_in_out(self):
dummy_input = T.rand(4, 20, 80) # B x T x D
dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
model = SpeakerEncoder(
input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3
)
model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
# computing d vectors
output = model.forward(dummy_input)
assert output.shape[0] == 4
@@ -36,9 +34,7 @@ class SpeakerEncoderTests(unittest.TestCase):
output_norm = T.nn.functional.normalize(output, dim=1, p=2)
assert_diff = (output_norm - output).sum().item()
assert output.type() == "torch.FloatTensor"
assert (
abs(assert_diff) < 1e-4
), f" [!] output_norm has wrong values - {assert_diff}"
assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
# compute d for a given batch
dummy_input = T.rand(1, 240, 80) # B x T x D
output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
@@ -74,6 +70,7 @@ class GE2ELossTests(unittest.TestCase):
output = loss.forward(dummy_input)
assert output.item() < 0.005
class AngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
@@ -103,6 +100,7 @@ class AngleProtoLossTests(unittest.TestCase):
output = loss.forward(dummy_input)
assert output.item() < 0.005
# class LoaderTest(unittest.TestCase):
# def test_output(self):
# items = libri_tts("/home/erogol/Data/Libri-TTS/train-clean-360/")

View File

@@ -10,11 +10,10 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test_duration_predictor():
input_dummy = torch.rand(8, 128, 27).to(device)
input_lengths = torch.randint(20, 27, (8, )).long().to(device)
input_lengths = torch.randint(20, 27, (8,)).long().to(device)
input_lengths[-1] = 27
x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)),
1).to(device)
x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
layer = DurationPredictor(hidden_channels=128).to(device)
@@ -29,7 +28,7 @@ def test_speedy_speech():
T_de = 74
x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
x_lengths = torch.randint(31, T_en, (B, )).long().to(device)
x_lengths = torch.randint(31, T_en, (B,)).long().to(device)
x_lengths[-1] = T_en
# set durations. max total duration should be equal to T_de
@@ -53,34 +52,18 @@ def test_speedy_speech():
assert list(o_dr.shape) == [B, T_en]
# with speaker embedding
model = SpeedySpeech(num_chars,
out_channels=80,
hidden_channels=128,
num_speakers=10,
c_in_channels=256).to(device)
model.forward(x_dummy,
x_lengths,
y_lengths,
durations,
g=torch.randint(0, 10, (B,)).to(device))
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device))
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]
assert list(o_dr.shape) == [B, T_en]
# with speaker external embedding
model = SpeedySpeech(num_chars,
out_channels=80,
hidden_channels=128,
num_speakers=10,
external_c=True,
c_in_channels=256).to(device)
model.forward(x_dummy,
x_lengths,
y_lengths,
durations,
g=torch.rand((B, 256)).to(device))
model = SpeedySpeech(
num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device))
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
assert list(attn.shape) == [B, T_de, T_en]

View File

@@ -4,5 +4,5 @@ from TTS.tts.utils.text import phonemes
class SymbolsTest(unittest.TestCase):
def test_uniqueness(self): #pylint: disable=no-self-use
def test_uniqueness(self): # pylint: disable=no-self-use
assert sorted(phonemes) == sorted(list(set(phonemes))), " {} vs {} ".format(len(phonemes), len(set(phonemes)))

View File

@@ -14,8 +14,8 @@ class SynthesizerTest(unittest.TestCase):
def _create_random_model(self):
# pylint: disable=global-statement
global symbols, phonemes
config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
if 'characters' in config.keys():
config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
if "characters" in config.keys():
symbols, phonemes = make_symbols(**config.characters)
num_chars = len(phonemes) if config.use_phonemes else len(symbols)
@@ -25,11 +25,11 @@ class SynthesizerTest(unittest.TestCase):
def test_in_out(self):
self._create_random_model()
config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
config = load_config(os.path.join(get_tests_input_path(), "server_config.json"))
tts_root_path = get_tests_output_path()
config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"])
config["tts_config"] = os.path.join(tts_root_path, config["tts_config"])
synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None)
synthesizer.tts("Better this test works!!")
def test_split_into_sentences(self):
@@ -38,20 +38,48 @@ class SynthesizerTest(unittest.TestCase):
# pylint: disable=attribute-defined-outside-init
self.seg = Synthesizer.get_segmenter("en")
sis = Synthesizer.split_into_sentences
assert sis(self, 'Hello. Two sentences') == ['Hello.', 'Two sentences']
assert sis(self, 'He went to meet the adviser from Scott, Waltman & Co. next morning.') == ['He went to meet the adviser from Scott, Waltman & Co. next morning.']
assert sis(self, 'Let\'s run it past Sarah and co. They\'ll want to see this.') == ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.']
assert sis(self, 'Where is Bobby Jr.\'s rabbit?') == ['Where is Bobby Jr.\'s rabbit?']
assert sis(self, 'Please inform the U.K. authorities right away.') == ['Please inform the U.K. authorities right away.']
assert sis(self, 'Were David and co. at the event?') == ['Were David and co. at the event?']
assert sis(self, 'paging dr. green, please come to theatre four immediately.') == ['paging dr. green, please come to theatre four immediately.']
assert sis(self, 'The email format is Firstname.Lastname@example.com. I think you reversed them.') == ['The email format is Firstname.Lastname@example.com.', 'I think you reversed them.']
assert sis(self, 'The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.') == ['The demo site is: https://top100.example.com/subsection/latestnews.html.', 'Please send us your feedback.']
assert sis(self, 'Scowling at him, \'You are not done yet!\' she yelled.') == ['Scowling at him, \'You are not done yet!\' she yelled.'] # with the final lowercase "she" we see it's all one sentence
assert sis(self, 'Hey!! So good to see you.') == ['Hey!!', 'So good to see you.']
assert sis(self, 'He went to Yahoo! but I don\'t know the division.') == ['He went to Yahoo! but I don\'t know the division.']
assert sis(self, 'If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."') == ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."']
assert sis(self, 'The address is not google.com.') == ['The address is not google.com.']
assert sis(self, '1.) The first item 2.) The second item') == ['1.) The first item', '2.) The second item']
assert sis(self, '1) The first item 2) The second item') == ['1) The first item', '2) The second item']
assert sis(self, 'a. The first item b. The second item c. The third list item') == ['a. The first item', 'b. The second item', 'c. The third list item']
assert sis(self, "Hello. Two sentences") == ["Hello.", "Two sentences"]
assert sis(self, "He went to meet the adviser from Scott, Waltman & Co. next morning.") == [
"He went to meet the adviser from Scott, Waltman & Co. next morning."
]
assert sis(self, "Let's run it past Sarah and co. They'll want to see this.") == [
"Let's run it past Sarah and co.",
"They'll want to see this.",
]
assert sis(self, "Where is Bobby Jr.'s rabbit?") == ["Where is Bobby Jr.'s rabbit?"]
assert sis(self, "Please inform the U.K. authorities right away.") == [
"Please inform the U.K. authorities right away."
]
assert sis(self, "Were David and co. at the event?") == ["Were David and co. at the event?"]
assert sis(self, "paging dr. green, please come to theatre four immediately.") == [
"paging dr. green, please come to theatre four immediately."
]
assert sis(self, "The email format is Firstname.Lastname@example.com. I think you reversed them.") == [
"The email format is Firstname.Lastname@example.com.",
"I think you reversed them.",
]
assert sis(
self,
"The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.",
) == [
"The demo site is: https://top100.example.com/subsection/latestnews.html.",
"Please send us your feedback.",
]
assert sis(self, "Scowling at him, 'You are not done yet!' she yelled.") == [
"Scowling at him, 'You are not done yet!' she yelled."
] # with the final lowercase "she" we see it's all one sentence
assert sis(self, "Hey!! So good to see you.") == ["Hey!!", "So good to see you."]
assert sis(self, "He went to Yahoo! but I don't know the division.") == [
"He went to Yahoo! but I don't know the division."
]
assert sis(self, "If you can't remember a quote, “at least make up a memorable one that's plausible...\"") == [
"If you can't remember a quote, “at least make up a memorable one that's plausible...\""
]
assert sis(self, "The address is not google.com.") == ["The address is not google.com."]
assert sis(self, "1.) The first item 2.) The second item") == ["1.) The first item", "2.) The second item"]
assert sis(self, "1) The first item 2) The second item") == ["1) The first item", "2) The second item"]
assert sis(self, "a. The first item b. The second item c. The third list item") == [
"a. The first item",
"b. The second item",
"c. The third list item",
]
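Synthesizer.get_segmenter("en") wraps pysbd, which handles the abbreviation, URL, and list-numbering cases asserted above; a stand-alone check (assuming pysbd's standard API):

import pysbd

seg = pysbd.Segmenter(language="en", clean=True)
print(seg.segment("Hello. Two sentences"))
# ['Hello.', 'Two sentences']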

View File

@@ -11,13 +11,13 @@ from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
#pylint: disable=unused-variable
# pylint: disable=unused-variable
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -26,20 +26,19 @@ WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
class TacotronTrainTest(unittest.TestCase):
def test_train_step(self): # pylint: disable=no-self-use
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
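The stop_targets construction above implements the reduction-factor trick: frames are grouped into chunks of c.r, and a decoder step's stop flag is 1 if any frame in its chunk lies past the end of the utterance. Worked through with toy numbers (r = 5, 30 frames, utterance length 12):

import torch

r = 5                               # reduction factor (c.r)
stop = torch.zeros(1, 30, 1)
stop[:, 12:, 0] = 1.0               # frames past the utterance end
stop = stop.view(1, 30 // r, -1)    # (B, steps, r): group frames per step
stop = (stop.sum(2) > 0.0).float()  # a step stops if any of its frames do
print(stop)  # tensor([[0., 0., 1., 1., 1., 1.]])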
@@ -48,14 +47,14 @@ class TacotronTrainTest(unittest.TestCase):
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
@@ -66,13 +65,12 @@ class TacotronTrainTest(unittest.TestCase):
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
@@ -80,20 +78,19 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
@@ -102,14 +99,14 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
@@ -120,39 +117,46 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
#pylint: disable=no-self-use
# pylint: disable=no-self-use
def test_train_step(self):
# with random gst mel style
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
model = Tacotron2(
num_chars=24,
r=c.r,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
@@ -162,7 +166,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
@@ -177,36 +182,45 @@ class TacotronGSTTrainTest(unittest.TestCase):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
#print(param.grad)
if name == "gst_layer.encoder.recurrence.weight_hh_l0":
# print(param.grad)
continue
assert (param != param_ref).any(
), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref
)
count += 1
# with file gst style
mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device)
mel_spec = (
torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device)
)
mel_spec = mel_spec.repeat(8, 1, 1)
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
model = Tacotron2(
num_chars=24,
r=c.r,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
@@ -216,7 +230,8 @@ class TacotronGSTTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(10):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
@@ -231,47 +246,57 @@ class TacotronGSTTrainTest(unittest.TestCase):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
#print(param.grad)
if name == "gst_layer.encoder.recurrence.weight_hh_l0":
# print(param.grad)
continue
assert (param != param_ref).any(
), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format(
name, count, param.shape, param, param_ref
)
count += 1
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens'], gst_use_speaker_embedding=c.gst['gst_use_speaker_embedding']).to(device)
model = Tacotron2(
num_chars=24,
r=c.r,
num_speakers=5,
speaker_embedding_dim=55,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()
@@ -282,14 +307,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer.step()
# check parameter changes
count = 0
for name_param, param_ref in zip(model.named_parameters(),
model_ref.parameters()):
for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
if name == "gst_layer.encoder.recurrence.weight_hh_l0":
continue
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
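The stop_targets plumbing above deserves a gloss: frames at or past a sequence's end are flagged 1.0, and because the decoder emits r frames per step, the frame-level flags are folded into groups of r and collapsed with an any-style reduction. A small sketch using the same batch-wide masking as the tests (B=2, T=6, r=2):

import torch

B, T, r = 2, 6, 2
mel_lengths = torch.tensor([4, 6])

stop_targets = torch.zeros(B, T, 1)
for idx in mel_lengths:  # same batch-wide masking as the tests
    stop_targets[:, int(idx.item()):, 0] = 1.0

# fold frames into decoder steps of r frames each; a step counts as a "stop"
# step if any of its frames lies past a sequence end
stop_targets = stop_targets.view(B, stop_targets.size(1) // r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).float()
print(stop_targets)  # tensor([[0., 0., 1.], [0., 0., 1.]])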

@@ -10,48 +10,51 @@ from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
from TTS.utils.io import load_config
tf.get_logger().setLevel('INFO')
tf.get_logger().setLevel("INFO")
#pylint: disable=unused-variable
# pylint: disable=unused-variable
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
class TacotronTFTrainTest(unittest.TestCase):
@staticmethod
def generate_dummy_inputs():
chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
chars_seq_lengths = torch.randint(100, 128, (8,)).long().to(device)
chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
stop_targets, speaker_ids
return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids
def test_train_step(self):
''' test forward pass '''
chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
stop_targets, speaker_ids = self.generate_dummy_inputs()
""" test forward pass """
(
chars_seq,
chars_seq_lengths,
mel_spec,
mel_postnet_spec,
mel_lengths,
stop_targets,
speaker_ids,
) = self.generate_dummy_inputs()
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(chars_seq.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
@@ -68,15 +71,23 @@ class TacotronTFTrainTest(unittest.TestCase):
# inference pass
output = model(chars_seq, training=False)
def test_forward_attention(self,):
chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
stop_targets, speaker_ids = self.generate_dummy_inputs()
def test_forward_attention(
self,
):
(
chars_seq,
chars_seq_lengths,
mel_spec,
mel_postnet_spec,
mel_lengths,
stop_targets,
speaker_ids,
) = self.generate_dummy_inputs()
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(chars_seq.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True)
@@ -93,45 +104,51 @@ class TacotronTFTrainTest(unittest.TestCase):
# inference pass
output = model(chars_seq, training=False)
def test_tflite_conversion(self, ): #pylint:disable=no-self-use
model = Tacotron2(num_chars=24,
num_speakers=0,
r=3,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type='original',
attn_win=False,
attn_norm='sigmoid',
prenet_type='original',
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=0,
separate_stopnet=True,
bidirectional_decoder=False,
enable_tflite=True)
def test_tflite_conversion(
self,
): # pylint:disable=no-self-use
model = Tacotron2(
num_chars=24,
num_speakers=0,
r=3,
postnet_output_dim=80,
decoder_output_dim=80,
attn_type="original",
attn_win=False,
attn_norm="sigmoid",
prenet_type="original",
prenet_dropout=True,
forward_attn=False,
trans_agent=False,
forward_attn_mask=False,
location_attn=True,
attn_K=0,
separate_stopnet=True,
bidirectional_decoder=False,
enable_tflite=True,
)
model.build_inference()
convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True)
convert_tacotron2_to_tflite(model, output_path="test_tacotron2.tflite", experimental_converter=True)
# init tflite model
tflite_model = load_tflite_model('test_tacotron2.tflite')
tflite_model = load_tflite_model("test_tacotron2.tflite")
# fake input
inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) #pylint:disable=unexpected-keyword-arg
inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) # pylint:disable=unexpected-keyword-arg
# run inference
# get input and output details
input_details = tflite_model.get_input_details()
output_details = tflite_model.get_output_details()
# reshape input tensor for the new input shape
tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape) #pylint:disable=unexpected-keyword-arg
tflite_model.resize_tensor_input(
input_details[0]["index"], inputs.shape
) # pylint:disable=unexpected-keyword-arg
tflite_model.allocate_tensors()
detail = input_details[0]
input_shape = detail['shape']
tflite_model.set_tensor(detail['index'], inputs)
input_shape = detail["shape"]
tflite_model.set_tensor(detail["index"], inputs)
# run the tflite_model
tflite_model.invoke()
# collect outputs
decoder_output = tflite_model.get_tensor(output_details[0]['index'])
postnet_output = tflite_model.get_tensor(output_details[1]['index'])
decoder_output = tflite_model.get_tensor(output_details[0]["index"])
postnet_output = tflite_model.get_tensor(output_details[1]["index"])
# remove tflite binary
os.remove('test_tacotron2.tflite')
os.remove("test_tacotron2.tflite")

@@ -11,13 +11,13 @@ from TTS.tts.models.tacotron import Tacotron
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
#pylint: disable=unused-variable
# pylint: disable=unused-variable
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -32,147 +32,140 @@ class TacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
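Written out, the objective these Tacotron train steps exercise is the sum of a masked L1 on the decoder mel output, a masked L1 on the postnet linear output, and a stop-token BCE:

\mathcal{L} = \mathrm{L1}_{\mathrm{masked}}(\hat{y}_{\mathrm{mel}}, y_{\mathrm{mel}}) + \mathrm{L1}_{\mathrm{masked}}(\hat{y}_{\mathrm{lin}}, y_{\mathrm{lin}}) + \mathrm{BCE}(\hat{s}, s)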
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size,
speaker_embedding_dim=55,
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths,
speaker_embeddings=speaker_embeddings)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
# with random gst mel style
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 120, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device)
mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
mel_lengths[-1] = 120
stop_targets = torch.zeros(8, 120, 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
@@ -180,65 +173,64 @@ class TacotronGSTTrainTest(unittest.TestCase):
num_chars=32,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
# print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
# with file gst style
mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device)
mel_spec = (
torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device)
)
mel_spec = mel_spec.repeat(8, 1, 1)
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
linear_spec = torch.rand(8, mel_spec.size(1), c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, mel_spec.size(1), (8, )).long().to(device)
linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device)
mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
@@ -246,113 +238,109 @@ class TacotronGSTTrainTest(unittest.TestCase):
num_chars=32,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
memory_size=c.memory_size
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
memory_size=c.memory_size,
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
# print(model)
print(" > Num parameters for Tacotron GST model:%s" %
(count_parameters(model)))
print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(10):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8, )).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device)
mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) >
0.0).unsqueeze(2).float().squeeze()
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(
num_chars=32,
num_speakers=5,
postnet_output_dim=c.audio['fft_size'],
decoder_output_dim=c.audio['num_mels'],
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
gst=True,
gst_embedding_dim=c.gst['gst_embedding_dim'],
gst_num_heads=c.gst['gst_num_heads'],
gst_style_tokens=c.gst['gst_style_tokens'],
gst_use_speaker_embedding=c.gst['gst_use_speaker_embedding'],
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
r=c.r,
memory_size=c.memory_size,
speaker_embedding_dim=55,
).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor
).to(
device
) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" %
(count_parameters(model)))
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5):
mel_out, linear_out, align, stop_tokens = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths,
speaker_embeddings=speaker_embeddings)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
)
optimizer.zero_grad()
loss = criterion(mel_out, mel_spec, mel_lengths)
stop_loss = criterion_st(stop_tokens, stop_targets)
loss = loss + criterion(linear_out, linear_spec,
mel_lengths) + stop_loss
loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for name_param, param_ref in zip(model.named_parameters(),
model_ref.parameters()):
for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
name, param = name_param
if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
if name == "gst_layer.encoder.recurrence.weight_hh_l0":
continue
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1

@@ -17,5 +17,5 @@ def test_currency() -> None:
def test_expand_numbers() -> None:
assert phoneme_cleaners("-1") == 'minus one'
assert phoneme_cleaners("1") == 'one'
assert phoneme_cleaners("-1") == "minus one"
assert phoneme_cleaners("1") == "one"

@@ -7,7 +7,8 @@ from tests import get_tests_input_path, get_tests_path
from TTS.tts.utils.text import *
from TTS.utils.io import load_config
conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
conf = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
def test_phoneme_to_sequence():
@@ -18,7 +19,7 @@ def test_phoneme_to_sequence():
text_hat = sequence_to_phoneme(sequence)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!'
gt = "ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!"
assert text_hat == text_hat_with_params == gt
# multiple punctuations
@@ -87,6 +88,7 @@ def test_phoneme_to_sequence():
print(len(sequence))
assert text_hat == text_hat_with_params == gt
def test_phoneme_to_sequence_with_blank_token():
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
@@ -105,7 +107,7 @@ def test_phoneme_to_sequence_with_blank_token():
text_hat = sequence_to_phoneme(sequence)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True)
gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?'
gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?"
print(text_hat)
print(len(sequence))
assert text_hat == text_hat_with_params == gt
@@ -116,7 +118,7 @@ def test_phoneme_to_sequence_with_blank_token():
text_hat = sequence_to_phoneme(sequence)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True)
gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ'
gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ"
print(text_hat)
print(len(sequence))
assert text_hat == text_hat_with_params == gt
@@ -127,7 +129,7 @@ def test_phoneme_to_sequence_with_blank_token():
text_hat = sequence_to_phoneme(sequence)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True)
gt = 'biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!'
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
print(text_hat)
print(len(sequence))
assert text_hat == text_hat_with_params == gt
@@ -138,7 +140,7 @@ def test_phoneme_to_sequence_with_blank_token():
text_hat = sequence_to_phoneme(sequence)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True)
gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ.'
gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ."
print(text_hat)
print(len(sequence))
assert text_hat == text_hat_with_params == gt
@@ -165,9 +167,10 @@ def test_phoneme_to_sequence_with_blank_token():
print(len(sequence))
assert text_hat == text_hat_with_params == gt
def test_text2phone():
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
gt = 'ɹ|iː|s|ə|n|t| |ɹ|ᵻ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|ŋ|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!'
gt = "ɹ|iː|s|ə|n|t| |ɹ|ᵻ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|ŋ|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
lang = "en-us"
ph = text2phone(text, lang)
assert gt == ph
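These phoneme tests all run the same round trip: raw text to phonemes, phonemes to an id sequence, and back again. A condensed sketch using the same functions the tests import (the exact phoneme output depends on the installed espeak backend):

from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone

text = "Be a voice, not an echo."
lang = "en-us"

ph = text2phone(text, lang)  # pipe-separated phoneme string, e.g. "b|iː| |ɐ| ..."
sequence = phoneme_to_sequence(text, ["phoneme_cleaners"], lang)
text_hat = sequence_to_phoneme(sequence)  # should reproduce the phoneme string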

@@ -13,31 +13,32 @@ file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
C = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
ok_ljspeech = os.path.exists(test_data_path)
def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers):
''' run dataloader with given parameters and check conditions '''
def gan_dataset_case(
batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers
):
""" run dataloader with given parameters and check conditions """
ap = AudioProcessor(**C.audio)
_, train_items = load_wav_data(test_data_path, 10)
dataset = GANDataset(ap,
train_items,
seq_len=seq_len,
hop_len=hop_len,
pad_short=2000,
conv_pad=conv_pad,
return_segments=return_segments,
use_noise_augment=use_noise_augment,
use_cache=use_cache)
loader = DataLoader(dataset=dataset,
batch_size=batch_size,
shuffle=True,
num_workers=num_workers,
pin_memory=True,
drop_last=True)
dataset = GANDataset(
ap,
train_items,
seq_len=seq_len,
hop_len=hop_len,
pad_short=2000,
conv_pad=conv_pad,
return_segments=return_segments,
use_noise_augment=use_noise_augment,
use_cache=use_cache,
)
loader = DataLoader(
dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, drop_last=True
)
max_iter = 10
count_iter = 0
@@ -61,8 +62,8 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us
mel = ap.melspectrogram(audio)
# the first 2 and the last 2 frames are skipped due to the padding
# differences in stft
max_diff = abs((feat - mel[:, :feat1.shape[-1]])[:, 2:-2]).max()
assert max_diff <= 0, f' [!] {max_diff}'
max_diff = abs((feat - mel[:, : feat1.shape[-1]])[:, 2:-2]).max()
assert max_diff <= 0, f" [!] {max_diff}"
count_iter += 1
# if count_iter == max_iter:
@@ -79,17 +80,17 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us
def test_parametrized_gan_dataset():
''' test dataloader with different parameters '''
""" test dataloader with different parameters """
params = [
[32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0],
[32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 4],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, True, 0],
[1, C.audio['hop_length'], C.audio['hop_length'], 0, True, True, True, 0],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, True, True, True, 0],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, True, True, 0],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, False, 0],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, False, False, 0],
[32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 0],
[32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 4],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, True, 0],
[1, C.audio["hop_length"], C.audio["hop_length"], 0, True, True, True, 0],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, True, True, True, 0],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, False, True, True, 0],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 0],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, False, 0],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, False, False, False, 0],
]
for param in params:
print(param)
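Each row in params maps positionally onto gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers); the driver loop presumably unpacks each row, along the lines of:

for param in params:
    print(param)
    gan_dataset_case(*param)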

@@ -14,7 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
C = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
C = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
ap = AudioProcessor(**C.audio)
@@ -45,7 +45,8 @@ def test_multiscale_stft_loss():
stft_loss = MultiScaleSTFTLoss(
[ap.fft_size // 2, ap.fft_size, ap.fft_size * 2],
[ap.hop_length // 2, ap.hop_length, ap.hop_length * 2],
[ap.win_length // 2, ap.win_length, ap.win_length * 2])
[ap.win_length // 2, ap.win_length, ap.win_length * 2],
)
wav = ap.load_wav(WAV_FILE)
wav = torch.from_numpy(wav[None, :]).float()
loss_m, loss_sc = stft_loss(wav, wav)
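For reference, the two values returned are the usual spectral-convergence and log-STFT-magnitude terms (as in Parallel WaveGAN), averaged over the M resolutions configured above:

\mathcal{L}_{sc} = \frac{\big\lVert\, |\mathrm{STFT}(y)| - |\mathrm{STFT}(\hat{y})| \,\big\rVert_F}{\big\lVert\, |\mathrm{STFT}(y)| \,\big\rVert_F}, \qquad \mathcal{L}_{mag} = \frac{1}{N} \big\lVert \log|\mathrm{STFT}(y)| - \log|\mathrm{STFT}(\hat{y})| \big\rVert_1, \qquad \mathcal{L} = \frac{1}{M} \sum_{m=1}^{M} \big( \mathcal{L}_{sc}^{(m)} + \mathcal{L}_{mag}^{(m)} \big)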

@@ -1,8 +1,10 @@
import numpy as np
import torch
from TTS.vocoder.models.parallel_wavegan_discriminator import (ParallelWaveganDiscriminator,
ResidualParallelWaveganDiscriminator)
from TTS.vocoder.models.parallel_wavegan_discriminator import (
ParallelWaveganDiscriminator,
ResidualParallelWaveganDiscriminator,
)
def test_pwgan_disciminator():
@@ -15,7 +17,8 @@ def test_pwgan_disciminator():
dilation_factor=1,
nonlinear_activation="LeakyReLU",
nonlinear_activation_params={"negative_slope": 0.2},
bias=True)
bias=True,
)
dummy_x = torch.rand((4, 1, 64 * 256))
output = model(dummy_x)
assert np.all(output.shape == (4, 1, 64 * 256))
@@ -35,7 +38,8 @@ def test_redisual_pwgan_disciminator():
dropout=0.0,
bias=True,
nonlinear_activation="LeakyReLU",
nonlinear_activation_params={"negative_slope": 0.2})
nonlinear_activation_params={"negative_slope": 0.2},
)
dummy_x = torch.rand((4, 1, 64 * 256))
output = model(dummy_x)
assert np.all(output.shape == (4, 1, 64 * 256))
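A note on why both discriminators preserve the (4, 1, 64 * 256) time axis: the stacks use stride-1 dilated convolutions with matching padding, so only the receptive field grows, not the frame rate. A toy helper for the receptive-field arithmetic (hypothetical; assumes kernel size 3 and that dilation_factor=1 makes all dilations equal to 1):

def receptive_field(kernel_size: int, dilations: list) -> int:
    """Receptive field of a stack of stride-1 dilated 1-D convolutions."""
    rf = 1
    for d in dilations:
        rf += (kernel_size - 1) * d
    return rf


print(receptive_field(3, [1] * 10))  # 21 samples for 10 plain conv layers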

@@ -18,7 +18,8 @@ def test_pwgan_generator():
dropout=0.0,
bias=True,
use_weight_norm=True,
upsample_factors=[4, 4, 4, 4])
upsample_factors=[4, 4, 4, 4],
)
dummy_c = torch.rand((2, 80, 5))
output = model(dummy_c)
assert np.all(output.shape == (2, 1, 5 * 256)), output.shape
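The expected shape is pure upsampling arithmetic: the four factors multiply to the hop size, so each of the 5 conditioning frames becomes 256 waveform samples:

import math

assert math.prod([4, 4, 4, 4]) == 256  # samples generated per mel frame
assert 5 * 256 == 1280                 # total samples for dummy_c's 5 frames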

@@ -23,5 +23,4 @@ def test_pqmf():
print(w2_.max())
print(w2_.min())
print(w2_.mean())
sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'),
w2_.flatten().detach(), sr)
sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr)

@@ -5,14 +5,12 @@ from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscrimin
def test_rwd():
layer = RandomWindowDiscriminator(cond_channels=80,
window_sizes=(512, 1024, 2048, 4096,
8192),
cond_disc_downsample_factors=[
(8, 4, 2, 2, 2), (8, 4, 2, 2),
(8, 4, 2), (8, 4), (4, 2, 2)
],
hop_length=256)
layer = RandomWindowDiscriminator(
cond_channels=80,
window_sizes=(512, 1024, 2048, 4096, 8192),
cond_disc_downsample_factors=[(8, 4, 2, 2, 2), (8, 4, 2, 2), (8, 4, 2), (8, 4), (4, 2, 2)],
hop_length=256,
)
x = torch.rand([4, 1, 22050])
c = torch.rand([4, 80, 22050 // 256])

@@ -24,5 +24,4 @@ def test_pqmf():
print(w2_.max())
print(w2_.min())
print(w2_.mean())
sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'),
w2_.flatten(), sr)
sf.write(os.path.join(get_tests_output_path(), "tf_pqmf_output.wav"), w2_.flatten(), sr)

@@ -14,8 +14,7 @@ file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
C = load_config(os.path.join(get_tests_input_path(),
"test_vocoder_wavernn_config.json"))
C = load_config(os.path.join(get_tests_input_path(), "test_vocoder_wavernn_config.json"))
test_data_path = os.path.join(get_tests_path(), "data/ljspeech/")
test_mel_feat_path = os.path.join(test_data_path, "mel")
@@ -33,25 +32,20 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor
C.data_path = test_data_path
preprocess_wav_files(test_data_path, C, ap)
_, train_items = load_wav_feat_data(
test_data_path, test_mel_feat_path, 5)
_, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5)
dataset = WaveRNNDataset(ap=ap,
items=train_items,
seq_len=seq_len,
hop_len=hop_len,
pad=pad,
mode=mode,
mulaw=mulaw
)
dataset = WaveRNNDataset(
ap=ap, items=train_items, seq_len=seq_len, hop_len=hop_len, pad=pad, mode=mode, mulaw=mulaw
)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(dataset,
shuffle=True,
collate_fn=dataset.collate,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=True,
)
loader = DataLoader(
dataset,
shuffle=True,
collate_fn=dataset.collate,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=True,
)
max_iter = 10
count_iter = 0
@@ -59,10 +53,8 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor
try:
for data in loader:
x_input, mels, _ = data
expected_feat_shape = (ap.num_mels,
(x_input.shape[-1] // hop_len) + (pad * 2))
assert np.all(
mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2))
assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}"
assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1]
count_iter += 1
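The shape check is simple bookkeeping: the conditioning mel must cover the seq_len audio samples plus pad frames of context on each side. Worked through for the first params row below (seq_len = hop_length * 10, pad = 2), with an illustrative hop size:

hop_length = 256           # illustrative hop size
seq_len = hop_length * 10  # audio samples per training window
pad = 2                    # mel context frames on each side

expected_frames = seq_len // hop_length + 2 * pad
assert expected_frames == 14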
@@ -77,15 +69,15 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor
def test_parametrized_wavernn_dataset():
''' test dataloader with different parameters '''
""" test dataloader with different parameters """
params = [
[16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, True, 0],
[16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", False, 4],
[1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, False, 0],
[1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, True, 0],
[1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", False, 0],
[1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, False, 2],
[1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", False, 0],
[16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0],
[16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4],
[1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0],
[1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0],
[1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0],
[1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2],
[1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0],
]
for param in params:
print(param)

@@ -75,12 +75,12 @@ def test_wavegrad_forward():
c = torch.rand(32, 80, 20)
noise_scale = torch.rand(32)
model = Wavegrad(in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2],
[1, 2, 4, 8], [1, 2, 4, 8],
[1, 2, 4, 8]])
model = Wavegrad(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
o = model.forward(x, c, noise_scale)
assert o.shape[0] == 32

@@ -6,7 +6,7 @@ from torch import optim
from TTS.vocoder.models.wavegrad import Wavegrad
#pylint: disable=unused-variable
# pylint: disable=unused-variable
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
@@ -20,19 +20,19 @@ class WavegradTrainTest(unittest.TestCase):
mel_spec = torch.rand(8, 80, 20).to(device)
criterion = torch.nn.L1Loss().to(device)
model = Wavegrad(in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2],
[1, 2, 4, 8], [1, 2, 4, 8],
[1, 2, 4, 8]])
model = Wavegrad(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
model_ref = Wavegrad(in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2],
[1, 2, 4, 8], [1, 2, 4, 8],
[1, 2, 4, 8]])
model_ref = Wavegrad(
in_channels=80,
out_channels=1,
upsample_factors=[5, 5, 3, 2, 2],
upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
model.train()
model.to(device)
betas = np.linspace(1e-6, 1e-2, 1000)
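betas is a linear 1000-step noise schedule from 1e-6 to 1e-2; WaveGrad-style training derives the per-step noise level from its cumulative product. A sketch of the standard computation (variable names are illustrative, not the model's API):

import numpy as np

betas = np.linspace(1e-6, 1e-2, 1000)
alphas = 1.0 - betas
alpha_hat = np.cumprod(alphas)    # product over s <= t of (1 - beta_s)
noise_level = np.sqrt(alpha_hat)  # scales the clean signal at step t
# a noisy target at step t: sqrt(alpha_hat[t]) * y + sqrt(1 - alpha_hat[t]) * eps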
@@ -40,8 +40,7 @@ class WavegradTrainTest(unittest.TestCase):
model_ref.load_state_dict(model.state_dict())
model_ref.to(device)
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=0.001)
@@ -53,11 +52,10 @@ class WavegradTrainTest(unittest.TestCase):
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(),
model_ref.parameters()):
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-highway layer since it works conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(
), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref)
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1