Update tests for the new trainer API

Eren Gölge 2021-06-18 13:27:19 +02:00
parent fcfd95669a
commit 626c9d41e6
24 changed files with 174 additions and 272 deletions
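
The recurring pattern in the new trainer API: model constructors no longer take long per-architecture keyword lists; each model is built from its config object, and the trainer reads everything else from the same config. A condensed before/after sketch (GlowTTS shown; the config import appears in the test diffs below, the model import path assumes the usual module layout):

from TTS.tts.configs import GlowTTSConfig
from TTS.tts.models.glow_tts import GlowTTS

# old API (removed): GlowTTS(num_chars=32, hidden_channels_enc=48, ..., mean_only=False)
# new API: hyperparameters live on the config; anything unspecified falls back to config defaults
config = GlowTTSConfig(num_chars=32)
model = GlowTTS(config)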

View File

@@ -3,8 +3,7 @@ import unittest
 from TTS.config import load_config
 from TTS.tts.models import setup_model
-from TTS.tts.utils.io import save_checkpoint
-from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
+from TTS.utils.io import save_checkpoint
 from TTS.utils.synthesizer import Synthesizer
 from .. import get_tests_output_path
@@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase):
     # pylint: disable=R0201
     def _create_random_model(self):
         # pylint: disable=global-statement
-        global symbols, phonemes
         config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
-        if config.has("characters") and config.characters:
-            symbols, phonemes = make_symbols(**config.characters.to_dict())
-        num_chars = len(phonemes) if config.use_phonemes else len(symbols)
-        model = setup_model(num_chars, 0, config)
+        model = setup_model(config)
         output_path = os.path.join(get_tests_output_path())
-        save_checkpoint(model, None, 10, 10, 1, output_path, None)
+        save_checkpoint(config, model, None, None, 10, 1, output_path)

     def test_in_out(self):
         self._create_random_model()
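
The checkpoint helper now lives in TTS.utils.io and takes the config first. A minimal sketch of the new call pattern as exercised above; only the argument order comes from the test, the names in the comment are assumptions:

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.utils.io import save_checkpoint

config = load_config("dummy_model_config.json")  # illustrative path
model = setup_model(config)  # character set and model size are now derived from the config
# positional order per the test (names assumed): config, model, optimizer, scaler, step, epoch, output folder
save_checkpoint(config, model, None, None, 10, 1, "output/")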

View File

@@ -6,7 +6,6 @@ import torch
 from tests import get_tests_input_path, get_tests_output_path, run_cli
 from TTS.config import load_config
 from TTS.tts.models import setup_model
-from TTS.tts.utils.text.symbols import phonemes, symbols

 torch.manual_seed(1)
@@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test
@@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test
@@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
         # load config
         c = load_config(config_path)
         # create model
-        num_chars = len(phonemes if c.use_phonemes else symbols)
-        model = setup_model(num_chars, 1, c, d_vector_dim=None)
+        model = setup_model(c)
         # save model
         torch.save({"model": model.state_dict()}, checkpoint_path)
         # run test

View File

@@ -13,7 +13,7 @@ config = AlignTTSConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",

View File

@@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase):
         criterion = GlowTTSLoss()

         # model to train
-        model = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        config = GlowTTSConfig(num_chars=32)
+        model = GlowTTS(config).to(device)

         # reference model to compare model weights
-        model_ref = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        model_ref = GlowTTS(config).to(device)

         model.train()
         print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
@@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase):
         speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
         # create model
-        model = GlowTTS(
-            num_chars=32,
-            hidden_channels_enc=48,
-            hidden_channels_dec=48,
-            hidden_channels_dp=32,
-            out_channels=80,
-            encoder_type="rel_pos_transformer",
-            encoder_params={
-                "kernel_size": 3,
-                "dropout_p": 0.1,
-                "num_layers": 6,
-                "num_heads": 2,
-                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
-                "input_length": None,
-            },
-            use_encoder_prenet=True,
-            num_flow_blocks_dec=12,
-            kernel_size_dec=5,
-            dilation_rate=1,
-            num_block_layers=4,
-            dropout_p_dec=0.0,
-            num_speakers=0,
-            c_in_channels=0,
-            num_splits=4,
-            num_squeeze=1,
-            sigmoid_scale=False,
-            mean_only=False,
-        ).to(device)
+        config = GlowTTSConfig(num_chars=32)
+        model = GlowTTS(config).to(device)

         model.eval()
         print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

View File

@@ -13,7 +13,7 @@ config = GlowTTSConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
     use_espeak_phonemes=True,

View File

@@ -1,7 +1,8 @@
 import torch

+from TTS.tts.configs import SpeedySpeechConfig
 from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
-from TTS.tts.models.speedy_speech import SpeedySpeech
+from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
 from TTS.tts.utils.data import sequence_mask

 use_cuda = torch.cuda.is_available()
@@ -40,7 +41,8 @@ def test_speedy_speech():
     y_lengths = durations.sum(1)

-    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128)
+    config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
+    model = SpeedySpeech(config)
     if use_cuda:
         model.cuda()
@@ -55,7 +57,12 @@ def test_speedy_speech():
     assert list(o_dr.shape) == [B, T_en]

     # with speaker embedding
-    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
+    config = SpeedySpeechConfig(
+        model_args=SpeedySpeechArgs(
+            num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
+        )
+    )
+    model = SpeedySpeech(config).to(device)
     model.forward(
         x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
     )
@@ -68,9 +75,17 @@ def test_speedy_speech():
     assert list(o_dr.shape) == [B, T_en]

     # with speaker external embedding
-    model = SpeedySpeech(
-        num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
-    ).to(device)
+    config = SpeedySpeechConfig(
+        model_args=SpeedySpeechArgs(
+            num_chars=num_chars,
+            out_channels=80,
+            hidden_channels=128,
+            num_speakers=10,
+            use_d_vector=True,
+            d_vector_dim=256,
+        )
+    )
+    model = SpeedySpeech(config).to(device)
     model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})

     o_de = outputs["model_outputs"]
     attn = outputs["alignments"]
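
SpeedySpeech now nests its architecture under model_args. A sketch of the external-embedding variant exercised above (values mirror the test; use_d_vector and d_vector_dim switch the model to consume precomputed speaker embeddings through aux_input["d_vectors"]):

from TTS.tts.configs import SpeedySpeechConfig
from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs

config = SpeedySpeechConfig(
    model_args=SpeedySpeechArgs(
        num_chars=50,  # illustrative
        out_channels=80,
        hidden_channels=128,
        num_speakers=10,
        use_d_vector=True,  # consume precomputed speaker embeddings
        d_vector_dim=256,   # size of the vectors passed via aux_input["d_vectors"]
    )
)
model = SpeedySpeech(config)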

View File

@@ -4,16 +4,18 @@ import shutil

 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.tts.configs import SpeedySpeechConfig
+from TTS.tts.models.speedy_speech import SpeedySpeechArgs

 config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

 config = SpeedySpeechConfig(
+    model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="en-us",

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
@@ -24,11 +24,11 @@ config = Tacotron2Config(
     print_step=1,
     print_eval=True,
     use_speaker_embedding=True,
-    use_external_speaker_embedding_file=True,
+    use_d_vector_file=True,
     test_sentences=[
         "Be a voice, not an echo.",
     ],
-    external_speaker_embedding_file="tests/data/ljspeech/speakers.json",
+    d_vector_file="tests/data/ljspeech/speakers.json",
     max_decoder_steps=50,
 )

View File

@@ -7,6 +7,7 @@ from torch import nn, optim

 from tests import get_tests_input_path
 from TTS.tts.configs import Tacotron2Config
+from TTS.tts.configs.shared_configs import GSTConfig
 from TTS.tts.layers.losses import MSELossMasked
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.utils.audio import AudioProcessor
@@ -17,19 +18,20 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = Tacotron2Config()
-ap = AudioProcessor(**c.audio)
+config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)
+ap = AudioProcessor(**config_global.audio)

 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


 class TacotronTrainTest(unittest.TestCase):
     def test_train_step(self):  # pylint: disable=no-self-use
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device)
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase):
 class MultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device)
+        config.d_vector_dim = 55
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids}
@@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
     # pylint: disable=no-self-use
     def test_train_step(self):
         # with random gst mel style
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
+        config.use_gst = True
+        config.gst = GSTConfig()
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device)
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device)
+        config.d_vector_dim = 55
+        model = Tacotron2(config).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for i in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
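
Each test now copies the shared module-level config before mutating it, so per-test settings such as d_vector_dim or use_gst cannot leak into later tests. A sketch of the pattern, assuming Coqpit's copy() returns an independent config object:

from TTS.tts.configs import Tacotron2Config
from TTS.tts.configs.shared_configs import GSTConfig
from TTS.tts.models.tacotron2 import Tacotron2

config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)

def build_gst_tacotron2():
    config = config_global.copy()  # isolate mutations to this test
    config.use_gst = True
    config.gst = GSTConfig()
    return Tacotron2(config)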

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",

View File

@@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase):
             num_chars=24,
             num_speakers=0,
             r=3,
-            postnet_output_dim=80,
+            out_channels=80,
             decoder_output_dim=80,
             attn_type="original",
             attn_win=False,

View File

@@ -13,7 +13,7 @@ config = Tacotron2Config(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",

View File

@@ -6,7 +6,7 @@ import torch
 from torch import nn, optim

 from tests import get_tests_input_path
-from TTS.tts.configs import TacotronConfig
+from TTS.tts.configs import GSTConfig, TacotronConfig
 from TTS.tts.layers.losses import L1LossMasked
 from TTS.tts.models.tacotron import Tacotron
 from TTS.utils.audio import AudioProcessor
@@ -17,9 +17,9 @@ torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

-c = TacotronConfig()
-ap = AudioProcessor(**c.audio)
+config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
+ap = AudioProcessor(**config_global.audio)

 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -31,11 +31,12 @@ def count_parameters(model):
 class TacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase):
 class MultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-            d_vector_dim=55,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.d_vector_dim = 55
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
@@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
 class TacotronGSTTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         # with random gst mel style
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
         mel_lengths[-1] = 120
         stop_targets = torch.zeros(8, 120, 1).float().to(device)
@@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            use_gst=True,
-            gst=c.gst,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.use_gst = True
+        config.gst = GSTConfig()
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         # print(model)
         print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device)
+        linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device)
@@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            use_gst=True,
-            gst=c.gst,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         # print(model)
         print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(10):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
@@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            use_gst=True,
-            gst=c.gst,
-            r=c.r,
-            memory_size=c.memory_size,
-            d_vector_dim=55,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.d_vector_dim = 55
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
             outputs = model.forward(
                 input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
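
The linear-spectrogram width also changes here from fft_size to fft_size // 2 + 1, which is the number of bins in a one-sided spectrum of an fft_size-point FFT over real input. A quick self-contained check (the fft_size value is illustrative; the tests read it from config.audio["fft_size"]):

import torch

fft_size = 1024  # illustrative
spec = torch.fft.rfft(torch.rand(fft_size))
# a one-sided (real-input) spectrum of an N-point FFT has N // 2 + 1 bins
assert spec.numel() == fft_size // 2 + 1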

View File

@@ -13,7 +13,7 @@ config = TacotronConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",

View File

@@ -12,7 +12,7 @@ config = FullbandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
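
The model-specific vocoder entry points (train_vocoder_gan.py here, and train_vocoder_wavegrad.py / train_vocoder_wavernn.py in the scripts below) collapse into a single TTS/bin/train_vocoder.py that picks the model from the config. The same one-epoch smoke-test invocation, sketched with an illustrative config path:

from tests import get_device_id, run_cli

config_path = "tests/outputs/test_vocoder_config.json"  # illustrative
run_cli(
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' "
    f"python TTS/bin/train_vocoder.py --config_path {config_path} "
)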

View File

@@ -13,7 +13,7 @@ config = HifiganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = MelganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = MultibandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -30,9 +30,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -40,7 +38,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)

View File

@@ -12,7 +12,7 @@ config = ParallelWaveganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +28,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)

View File

@@ -3,11 +3,13 @@ import random
 import numpy as np
 import torch

-from TTS.vocoder.models.wavernn import WaveRNN
+from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs


 def test_wavernn():
-    model = WaveRNN(
+    config = WavernnConfig()
+    config.model_args = WavernnArgs(
         rnn_dims=512,
         fc_dims=512,
         mode=10,
@@ -20,14 +22,30 @@ def test_wavernn():
         compute_dims=128,
         res_out_dims=128,
         num_res_blocks=10,
-        hop_length=256,
-        sample_rate=22050,
     )
+    config.audio.hop_length = 256
+    config.audio.sample_rate = 2048

     dummy_x = torch.rand((2, 1280))
     dummy_m = torch.rand((2, 80, 9))
     y_size = random.randrange(20, 60)
     dummy_y = torch.rand((80, y_size))

+    # mode: mold
+    model = Wavernn(config)
     output = model(dummy_x, dummy_m)
-    assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape
+    assert np.all(output.shape == (2, 1280, 30)), output.shape
+
+    # mode: gauss
+    config.model_params.mode = "gauss"
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2)), output.shape
+
+    # mode: quantized
+    config.model_params.mode = 4
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape

     output = model.inference(dummy_y, True, 5500, 550)
     assert np.all(output.shape == (256 * (y_size - 1),))
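
The expected widths in the new asserts follow from the WaveRNN output head: a mixture-of-logistics ("mold") head emits three parameters per mixture (the 30 above would correspond to 10 mixtures, an assumption here), a Gaussian head emits a mean and a scale, and an N-bit quantized head emits one logit per level. A self-contained restatement of that mapping:

def wavernn_output_channels(mode):
    # channel counts implied by the asserts above; "mold" assumes 10 logistic mixtures
    if mode == "mold":
        return 3 * 10  # weight, mean, log-scale per mixture
    if mode == "gauss":
        return 2  # mean and log-scale
    if isinstance(mode, int):
        return 2 ** mode  # one logit per quantization level
    raise ValueError(f"unknown mode: {mode!r}")

assert wavernn_output_channels("mold") == 30
assert wavernn_output_channels("gauss") == 2
assert wavernn_output_channels(4) == 16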

View File

@@ -4,7 +4,8 @@ import numpy as np
 import torch
 from torch import optim

-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.configs import WavegradConfig
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs

 # pylint: disable=unused-variable
@@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 80, 20).to(device)
         criterion = torch.nn.L1Loss().to(device)
-        model = Wavegrad(
+        args = WavegradArgs(
             in_channels=80,
             out_channels=1,
             upsample_factors=[5, 5, 3, 2, 2],
             upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
         )
+        config = WavegradConfig(model_params=args)
+        model = Wavegrad(config)

-        model_ref = Wavegrad(
-            in_channels=80,
-            out_channels=1,
-            upsample_factors=[5, 5, 3, 2, 2],
-            upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-        )
+        model_ref = Wavegrad(config)
         model.train()
         model.to(device)
         betas = np.linspace(1e-6, 1e-2, 1000)
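
Wavegrad follows the same config-first convention: the architecture arguments move into WavegradArgs, which travels inside WavegradConfig, and building both model and model_ref from the one config guarantees identical architectures for the weight comparison. A minimal sketch mirroring the test above:

from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs

args = WavegradArgs(
    in_channels=80,
    out_channels=1,
    upsample_factors=[5, 5, 3, 2, 2],
    upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
)
config = WavegradConfig(model_params=args)
model = Wavegrad(config)      # trainable copy
model_ref = Wavegrad(config)  # same config, so identical architecture for the weight checks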

View File

@@ -1,7 +1,8 @@
 import torch

+from TTS.vocoder.configs import WavegradConfig
 from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock
-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs


 def test_positional_encoding():
@@ -75,12 +76,14 @@ def test_wavegrad_forward():
     c = torch.rand(32, 80, 20)
     noise_scale = torch.rand(32)

-    model = Wavegrad(
+    args = WavegradArgs(
         in_channels=80,
         out_channels=1,
         upsample_factors=[5, 5, 3, 2, 2],
         upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
     )
+    config = WavegradConfig(model_params=args)
+    model = Wavegrad(config)

     o = model.forward(x, c, noise_scale)
     assert o.shape[0] == 32

View File

@@ -12,7 +12,7 @@ config = WavegradConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,15 +29,15 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
+command_train = (
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+)
 run_cli(command_train)
 shutil.rmtree(continue_path)

View File

@@ -4,15 +4,18 @@ import shutil

 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import WavernnArgs

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

 config = WavernnConfig(
+    model_params=WavernnArgs(),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +31,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm
 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)