mirror of https://github.com/coqui-ai/TTS.git
Remove speedy_speech implementation
This commit is contained in:
parent 7d8f77385a
commit 1ebf9ec6bf
@@ -1,47 +0,0 @@
-import unittest
-
-import torch as T
-
-from TTS.tts.models.fast_pitch import FastPitch, FastPitchArgs, average_pitch
-# pylint: disable=unused-variable
-
-
-class AveragePitchTests(unittest.TestCase):
-    def test_in_out(self):  # pylint: disable=no-self-use
-        pitch = T.rand(1, 1, 128)
-
-        durations = T.randint(1, 5, (1, 21))
-        coeff = 128.0 / durations.sum()
-        durations = T.round(durations * coeff)
-        diff = 128.0 - durations.sum()
-        durations[0, -1] += diff
-        durations = durations.long()
-
-        pitch_avg = average_pitch(pitch, durations)
-
-        index = 0
-        for idx, dur in enumerate(durations[0]):
-            assert abs(pitch_avg[0, 0, idx] - pitch[0, 0, index : index + dur.item()].mean()) < 1e-5
-            index += dur
-
-
-def expand_encoder_outputs_test():
-    model = FastPitch(FastPitchArgs(num_chars=10))
-
-    inputs = T.rand(2, 5, 57)
-    durations = T.randint(1, 4, (2, 57))
-
-    x_mask = T.ones(2, 1, 57)
-    y_mask = T.ones(2, 1, durations.sum(1).max())
-
-    expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask)
-
-    for b in range(durations.shape[0]):
-        index = 0
-        for idx, dur in enumerate(durations[b]):
-            diff = (
-                expanded[b, :, index : index + dur.item()]
-                - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape)
-            ).sum()
-            assert abs(diff) < 1e-6, diff
-            index += dur
@@ -1,96 +0,0 @@
-import torch
-
-from TTS.tts.configs import SpeedySpeechConfig
-from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
-from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs
-from TTS.tts.utils.helpers import sequence_mask
-
-use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-
-def test_duration_predictor():
-    input_dummy = torch.rand(8, 128, 27).to(device)
-    input_lengths = torch.randint(20, 27, (8,)).long().to(device)
-    input_lengths[-1] = 27
-
-    x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
-
-    layer = DurationPredictor(hidden_channels=128).to(device)
-
-    output = layer(input_dummy, x_mask)
-    assert list(output.shape) == [8, 1, 27]
-
-
-def test_speedy_speech():
-    num_chars = 7
-    B = 8
-    T_en = 37
-    T_de = 74
-
-    x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
-    x_lengths = torch.randint(31, T_en, (B,)).long().to(device)
-    x_lengths[-1] = T_en
-
-    # set durations. max total duration should be equal to T_de
-    durations = torch.randint(1, 4, (B, T_en))
-    durations = durations * (T_de / durations.sum(1)).unsqueeze(1)
-    durations = durations.to(torch.long).to(device)
-    max_dur = durations.sum(1).max()
-    durations[:, 0] += T_de - max_dur if T_de > max_dur else 0
-
-    y_lengths = durations.sum(1)
-
-    config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128))
-    model = SpeedySpeech(config)
-    if use_cuda:
-        model.cuda()
-
-    # forward pass
-    outputs = model(x_dummy, x_lengths, y_lengths, durations)
-    o_de = outputs["model_outputs"]
-    attn = outputs["alignments"]
-    o_dr = outputs["durations_log"]
-
-    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
-    assert list(attn.shape) == [B, T_de, T_en]
-    assert list(o_dr.shape) == [B, T_en]
-
-    # with speaker embedding
-    config = SpeedySpeechConfig(
-        model_args=SpeedySpeechArgs(
-            num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256
-        )
-    )
-    model = SpeedySpeech(config).to(device)
-    model.forward(
-        x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
-    )
-    o_de = outputs["model_outputs"]
-    attn = outputs["alignments"]
-    o_dr = outputs["durations_log"]
-
-    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
-    assert list(attn.shape) == [B, T_de, T_en]
-    assert list(o_dr.shape) == [B, T_en]
-
-    # with speaker external embedding
-    config = SpeedySpeechConfig(
-        model_args=SpeedySpeechArgs(
-            num_chars=num_chars,
-            out_channels=80,
-            hidden_channels=128,
-            num_speakers=10,
-            use_d_vector=True,
-            d_vector_dim=256,
-        )
-    )
-    model = SpeedySpeech(config).to(device)
-    model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)})
-    o_de = outputs["model_outputs"]
-    attn = outputs["alignments"]
-    o_dr = outputs["durations_log"]
-
-    assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}"
-    assert list(attn.shape) == [B, T_de, T_en]
-    assert list(o_dr.shape) == [B, T_en]
@@ -4,14 +4,12 @@ import shutil
 
 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.tts.configs import SpeedySpeechConfig
-from TTS.tts.models.speedy_speech import SpeedySpeechArgs
 
 config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")
 
 
 config = SpeedySpeechConfig(
-    model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,