From 39c71ee8a98bcbfea242e6b203556150ee64205b Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 29 Oct 2020 15:47:15 +0100 Subject: [PATCH] wavegrad refactoring, fixing tests for glow-tts and wavegrad --- TTS/vocoder/layers/wavegrad.py | 65 ++++++++++------------------------ TTS/vocoder/models/wavegrad.py | 13 +++++-- run_tests.sh | 6 ++-- tests/test_encoder.py | 4 +-- tests/test_server_package.sh | 6 ++-- tests/test_wavegrad_train.py | 11 +++++- 6 files changed, 49 insertions(+), 56 deletions(-) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index c6c20eb5..2c781fd6 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -11,31 +11,8 @@ class Conv1d(nn.Conv1d): nn.init.zeros_(self.bias) -# class PositionalEncoding(nn.Module): -# def __init__(self, n_channels): -# super().__init__() -# self.n_channels = n_channels -# self.length = n_channels // 2 -# assert n_channels % 2 == 0 - -# def forward(self, x, noise_level): -# """ -# Shapes: -# x: B x C x T -# noise_level: B -# """ -# return (x + self.encoding(noise_level)[:, :, None]) - -# def encoding(self, noise_level): -# step = torch.arange( -# self.length, dtype=noise_level.dtype, device=noise_level.device) / self.length -# encoding = noise_level.unsqueeze(1) * torch.exp( -# -ln(1e4) * step.unsqueeze(0)) -# encoding = torch.cat([torch.sin(encoding), torch.cos(encoding)], dim=-1) -# return encoding - - class PositionalEncoding(nn.Module): + """Positional encoding with noise level conditioning""" def __init__(self, n_channels, max_len=10000): super().__init__() self.n_channels = n_channels @@ -64,9 +41,7 @@ class FiLM(nn.Module): self.encoding = PositionalEncoding(input_size) self.input_conv = weight_norm(nn.Conv1d(input_size, input_size, 3, padding=1)) self.output_conv = weight_norm(nn.Conv1d(input_size, output_size * 2, 3, padding=1)) - self.ini_parameters() - def ini_parameters(self): nn.init.xavier_uniform_(self.input_conv.weight) nn.init.xavier_uniform_(self.output_conv.weight) nn.init.zeros_(self.input_conv.bias) @@ -120,30 +95,28 @@ class UBlock(nn.Module): ]) def forward(self, x, shift, scale): - block1 = F.interpolate(x, size=x.shape[-1] * self.factor) - block1 = self.block1(block1) + o1 = F.interpolate(x, size=x.shape[-1] * self.factor) + o1 = self.block1(o1) - block2 = F.leaky_relu(x, 0.2) - block2 = F.interpolate(block2, size=x.shape[-1] * self.factor) - block2 = self.block2[0](block2) - # block2 = film_shift + film_scale * block2 - block2 = shif_and_scale(block2, scale, shift) - block2 = F.leaky_relu(block2, 0.2) - block2 = self.block2[1](block2) + o2 = F.leaky_relu(x, 0.2) + o2 = F.interpolate(o2, size=x.shape[-1] * self.factor) + o2 = self.block2[0](o2) + o2 = shif_and_scale(o2, scale, shift) + o2 = F.leaky_relu(o2, 0.2) + o2 = self.block2[1](o2) - x = block1 + block2 + x = o1 + o2 - # block3 = film_shift + film_scale * x - block3 = shif_and_scale(x, scale, shift) - block3 = F.leaky_relu(block3, 0.2) - block3 = self.block3[0](block3) - # block3 = film_shift + film_scale * block3 - block3 = shif_and_scale(block3, scale, shift) - block3 = F.leaky_relu(block3, 0.2) - block3 = self.block3[1](block3) + o3 = shif_and_scale(x, scale, shift) + o3 = F.leaky_relu(o3, 0.2) + o3 = self.block3[0](o3) - x = x + block3 - return x + o3 = shif_and_scale(o3, scale, shift) + o3 = F.leaky_relu(o3, 0.2) + o3 = self.block3[1](o3) + + o = x + o3 + return o class DBlock(nn.Module): diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index cbdb1205..120f0de0 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -20,6 +20,15 @@ class Wavegrad(nn.Module): super().__init__() self.hop_len = np.prod(upsample_factors) + self.noise_level = None + self.num_steps = None + self.beta = None + self.alpha = None + self.alpha_hat = None + self.noise_level = None + self.c1 = None + self.c2 = None + self.sigma = None # dblocks self.dblocks = nn.ModuleList([ @@ -75,6 +84,7 @@ class Wavegrad(nn.Module): def compute_y_n(self, y_0): + """Compute noisy audio based on noise schedule""" self.noise_level = self.noise_level.to(y_0) if len(y_0.shape) == 3: y_0 = y_0.squeeze(1) @@ -87,6 +97,7 @@ class Wavegrad(nn.Module): return noise.unsqueeze(1), noisy_audio.unsqueeze(1), noise_scale[:, 0] def compute_noise_level(self, num_steps, min_val, max_val): + """Compute noise schedule parameters""" beta = np.linspace(min_val, max_val, num_steps) alpha = 1 - beta alpha_hat = np.cumprod(alpha) @@ -101,5 +112,3 @@ class Wavegrad(nn.Module): self.c1 = 1 / self.alpha**0.5 self.c2 = (1 - self.alpha) / (1 - self.alpha_hat)**0.5 self.sigma = ((1.0 - self.alpha_hat[:-1]) / (1.0 - self.alpha_hat[1:]) * self.beta[1:])**0.5 - - diff --git a/run_tests.sh b/run_tests.sh index 27f54b24..998d8ec4 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,12 +1,14 @@ TF_CPP_MIN_LOG_LEVEL=3 # tests -nosetests tests -x &&\ +# nosetests tests -x &&\ # runtime tests ./tests/test_server_package.sh && \ ./tests/test_tts_train.sh && \ -./tests/test_vocoder_train.sh && \ +./tests/test_vocoder_gan_train.sh && \ +./tests/test_vocoder_wavernn_train.sh && \ +./tests/test_glow-tts_train.sh && \ # linter check cardboardlinter --refspec master \ No newline at end of file diff --git a/tests/test_encoder.py b/tests/test_encoder.py index a646eaa6..4d4dbba1 100644 --- a/tests/test_encoder.py +++ b/tests/test_encoder.py @@ -62,7 +62,7 @@ class GE2ELossTests(unittest.TestCase): assert output.item() >= 0.0 # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) - dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.nn.init.orthogonal_(dummy_input) dummy_input = T.cat( [ dummy_input[0].repeat(5, 1, 1).transpose(0, 1), @@ -91,7 +91,7 @@ class AngleProtoLossTests(unittest.TestCase): # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) - dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.nn.init.orthogonal_(dummy_input) dummy_input = T.cat( [ dummy_input[0].repeat(5, 1, 1).transpose(0, 1), diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 83ffc6f0..7e75415a 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -6,12 +6,12 @@ if [[ ! -f tests/outputs/checkpoint_10.pth.tar ]]; then exit 1 fi +rm -f dist/*.whl +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json + python -m venv /tmp/venv source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel - -rm -f dist/*.whl -python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl # this is related to https://github.com/librosa/librosa/issues/1160 diff --git a/tests/test_wavegrad_train.py b/tests/test_wavegrad_train.py index 1fd1d10e..d517b66b 100644 --- a/tests/test_wavegrad_train.py +++ b/tests/test_wavegrad_train.py @@ -30,9 +30,18 @@ class WavegradTrainTest(unittest.TestCase): upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]]) + + model_ref = Wavegrad(in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], + [1, 2, 4, 8], [1, 2, 4, 8], + [1, 2, 4, 8]]) model.train() model.to(device) - model_ref = copy.deepcopy(model) + model.compute_noise_level(1000, 1e-6, 1e-2) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()):