From bc5307caa03efb44ca4194950fd913fe498dae75 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 26 May 2021 20:35:58 -0300 Subject: [PATCH] add unit tests for SoftmaxAngleProtoLoss and ResnetSpeakerEncoder and bugfix --- TTS/speaker_encoder/models/resnet.py | 6 +-- tests/inputs/test_speaker_encoder_config.json | 4 +- tests/test_speaker_encoder.py | 53 +++++++++++++++++-- tests/test_speaker_encoder_train.py | 2 +- tests/test_speaker_manager.py | 4 +- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 0cb7e0a8..fe89c5aa 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -126,7 +126,7 @@ class ResNetSpeakerEncoder(nn.Module): nn.init.xavier_normal_(out) return out - def forward(self, x, training=True): + def forward(self, x, l2_norm=False): x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): @@ -157,7 +157,7 @@ class ResNetSpeakerEncoder(nn.Module): x = x.view(x.size()[0], -1) x = self.fc(x) - if not training: + if l2_norm: x = torch.nn.functional.normalize(x, p=2, dim=1) return x @@ -179,7 +179,7 @@ class ResNetSpeakerEncoder(nn.Module): offset = int(offset) end_offset = int(offset+num_frames) frames = x[:, offset:end_offset] - embed = self.forward(frames, training=False) + embed = self.forward(frames, l2_norm=True) embeddings.append(embed) embeddings = torch.stack(embeddings) diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index 4f3678e1..09a2f6a4 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -46,6 +46,7 @@ "batch_size": 32, "output_path": "", // DATASET-RELATED: output path for all training outputs. "model_params": { + "model_name": "lstm", "input_dim": 40, "proj_dim": 256, "lstm_dim": 768, @@ -54,8 +55,7 @@ }, "storage": { "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + "storage_size": 15 // the size of the in-memory storage with respect to a single batch }, "datasets":null } \ No newline at end of file diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index 3b45d2e2..f56a9577 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -3,13 +3,13 @@ import unittest import torch as T from tests import get_tests_input_path -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss +from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -# from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() -class SpeakerEncoderTests(unittest.TestCase): +class LSTMSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): dummy_input = T.rand(4, 20, 80) # B x T x D @@ -39,6 +39,31 @@ class SpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 +class ResNetSpeakerEncoderTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + dummy_input = T.rand(4, 20, 80) # B x T x D + dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] + model = ResNetSpeakerEncoder(input_dim=80, proj_dim=256) + # computing d vectors + output = model.forward(dummy_input) + assert output.shape[0] == 4 + assert output.shape[1] == 256 + output = model.forward(dummy_input, l2_norm=True) + assert output.shape[0] == 4 + assert output.shape[1] == 256 + + # check normalization + output_norm = T.nn.functional.normalize(output, dim=1, p=2) + assert_diff = (output_norm - output).sum().item() + assert output.type() == "torch.FloatTensor" + assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}" + # compute d for a given batch + dummy_input = T.rand(1, 240, 80) # B x T x D + output = model.compute_embedding(dummy_input, num_frames=160, num_eval=10) + assert output.shape[0] == 1 + assert output.shape[1] == 256 + assert len(output.shape) == 2 class GE2ELossTests(unittest.TestCase): # pylint: disable=R0201 @@ -67,7 +92,6 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 - class AngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -96,3 +120,24 @@ class AngleProtoLossTests(unittest.TestCase): loss = AngleProtoLoss() output = loss.forward(dummy_input) assert output.item() < 0.005 + +class SoftmaxAngleProtoLossTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + + embedding_dim = 64 + num_speakers = 5 + batch_size = 4 + + dummy_label = T.randint(low=0, high=num_speakers, size=(batch_size, num_speakers)) + # check random input + dummy_input = T.rand(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim + loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers) + output = loss.forward(dummy_input, dummy_label) + assert output.item() >= 0.0 + + # check all zeros + dummy_input = T.ones(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim + loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers) + output = loss.forward(dummy_input, dummy_label) + assert output.item() >= 0.0 diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index 525730f2..831c48f2 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -19,7 +19,7 @@ config = SpeakerEncoderConfig( print_step=1, save_step=1, print_eval=True, - audio=BaseAudioConfig(num_mels=40), + audio=BaseAudioConfig(num_mels=80), ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index ffb98ed7..f80e56fc 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -6,7 +6,7 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.model import SpeakerEncoder +from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.speaker_encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = SpeakerEncoder(**config.model_params) + model = setup_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder