Fix DelightfulTTS (#2823)

* Fix tests

* Make style
This commit is contained in:
Eren Gölge 2023-07-31 13:52:45 +02:00 committed by GitHub
parent 483888b9d8
commit 69f080eb47
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 64 additions and 58 deletions

View File

@ -140,13 +140,13 @@ class DelightfulTTSConfig(BaseTTSConfig):
d_vector_dim: int = None d_vector_dim: int = None
# testing # testing
test_sentences: List[str] = field( test_sentences: List[List[str]] = field(
default_factory=lambda: [ default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
"Be a voice, not an echo.", ["Be a voice, not an echo."],
"I'm sorry Dave. I'm afraid I can't do that.", ["I'm sorry Dave. I'm afraid I can't do that."],
"This cake is great. It's so delicious and moist.", ["This cake is great. It's so delicious and moist."],
"Prior to November 22, 1963.", ["Prior to November 22, 1963."],
] ]
) )

View File

@ -602,6 +602,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items return items
def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
txt_file = os.path.join(root_path, meta_file) txt_file = os.path.join(root_path, meta_file)
items = [] items = []

View File

@ -49,7 +49,7 @@ def id_to_torch(aux_id, cuda=False):
def embedding_to_torch(d_vector, cuda=False): def embedding_to_torch(d_vector, cuda=False):
if d_vector is not None: if d_vector is not None:
d_vector = np.asarray(d_vector) d_vector = np.asarray(d_vector)
d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) d_vector = torch.from_numpy(d_vector).float()
d_vector = d_vector.squeeze().unsqueeze(0) d_vector = d_vector.squeeze().unsqueeze(0)
if cuda: if cuda:
return d_vector.cuda() return d_vector.cuda()
@ -1151,7 +1151,7 @@ class DelightfulTTS(BaseTTSE2E):
if speaker_name is None: if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id() speaker_id = self.speaker_manager.get_random_id()
else: else:
speaker_id = self.speaker_manager.ids[speaker_name] speaker_id = self.speaker_manager.name_to_id[speaker_name]
return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
@ -1208,16 +1208,15 @@ class DelightfulTTS(BaseTTSE2E):
# set speaker inputs # set speaker inputs
_speaker_id = None _speaker_id = None
if speaker_id is not None and (self.args.use_speaker_embedding or self.args.use_d_vector_file): if speaker_id is not None and self.args.use_speaker_embedding:
if isinstance(speaker_id, str) and self.args.use_speaker_embedding: if isinstance(speaker_id, str) and self.args.use_speaker_embedding:
# get the speaker id for the speaker embedding layer # get the speaker id for the speaker embedding layer
_speaker_id = self.speaker_manager.name_to_id[speaker_id] _speaker_id = self.speaker_manager.name_to_id[speaker_id]
_speaker_id = id_to_torch(_speaker_id, cuda=is_cuda) _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda)
else:
if speaker_id is not None and self.args.use_d_vector_file:
# get the average d_vector for the speaker # get the average d_vector for the speaker
d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False)
if d_vector is not None and self.args.use_d_vector_file:
d_vector = embedding_to_torch(d_vector, cuda=is_cuda) d_vector = embedding_to_torch(d_vector, cuda=is_cuda)
text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda)

View File

@ -1814,7 +1814,7 @@ class Vits(BaseTTS):
# rollback values # rollback values
_forward = self.forward _forward = self.forward
disc = None disc = None
if hasattr(self, 'disc'): if hasattr(self, "disc"):
disc = self.disc disc = self.disc
training = self.training training = self.training
@ -1916,7 +1916,7 @@ class Vits(BaseTTS):
"input_lengths": x_lengths, "input_lengths": x_lengths,
"scales": scales, "scales": scales,
"sid": None if speaker_id is None else torch.tensor([speaker_id]).cpu().numpy(), "sid": None if speaker_id is None else torch.tensor([speaker_id]).cpu().numpy(),
"langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy() "langid": None if language_id is None else torch.tensor([language_id]).cpu().numpy(),
}, },
) )
return audio[0][0] return audio[0][0]

View File

@ -164,6 +164,7 @@ def multilingual_cleaners(text):
text = collapse_whitespace(text) text = collapse_whitespace(text)
return text return text
def no_cleaners(text): def no_cleaners(text):
# remove newline characters # remove newline characters
text = text.replace("\n", "") text = text.replace("\n", "")

View File

@ -1,7 +1,8 @@
from train_glowtts import config
import json import json
import re import re
from train_glowtts import config
s = json.dumps(config, default=vars, indent=2) s = json.dumps(config, default=vars, indent=2)
s = re.sub(r'"test_sentences":\s*\[\],', '', s) s = re.sub(r'"test_sentences":\s*\[\],', "", s)
print(s) print(s)

View File

@ -3,27 +3,28 @@ import os
# Trainer: Where the ✨️ happens. # Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer. # TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseAudioConfig
# GlowTTSConfig: all model related values for training, validating and testing. # GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset. # BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig from TTS.tts.configs.shared_configs import BaseAudioConfig, BaseDatasetConfig, CharactersConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
# we use the same path as this script as our training folder. # we use the same path as this script as our training folder.
output_path = '/storage/output-glowtts/' output_path = "/storage/output-glowtts/"
# DEFINE DATASET CONFIG # DEFINE DATASET CONFIG
# Set LJSpeech as our target dataset and define its path. # Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter. # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
formatter="bel_tts_formatter", meta_file_train="ipa_final_dataset.csv", path=os.path.join(output_path, "/storage/filtered_dataset/") formatter="bel_tts_formatter",
meta_file_train="ipa_final_dataset.csv",
path=os.path.join(output_path, "/storage/filtered_dataset/"),
) )
characters = CharactersConfig( characters = CharactersConfig(
@ -71,7 +72,6 @@ config = GlowTTSConfig(
) )
if __name__ == "__main__": if __name__ == "__main__":
# INITIALIZE THE AUDIO PROCESSOR # INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O. # Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers. # It mainly serves to the dataloader and the training loggers.

View File

@ -1,15 +1,15 @@
import os import os
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseAudioConfig
from coqpit import Coqpit from coqpit import Coqpit
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs.hifigan_config import *; from TTS.vocoder.configs.hifigan_config import *
from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN from TTS.vocoder.models.gan import GAN
output_path = '/storage/output-hifigan/' output_path = "/storage/output-hifigan/"
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
mel_fmin=50, mel_fmin=50,

View File

@ -5,7 +5,7 @@ from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseDatasetConfig from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTTS, VocoderConfig from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio.processor import AudioProcessor from TTS.utils.audio.processor import AudioProcessor
@ -14,7 +14,9 @@ data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad"
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us") dataset_config = BaseDatasetConfig(
dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us"
)
audio_config = DelightfulTtsAudioConfig() audio_config = DelightfulTtsAudioConfig()
@ -73,9 +75,7 @@ speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speak
config.model_args.num_speakers = speaker_manager.num_speakers config.model_args.num_speakers = speaker_manager.num_speakers
model = DelightfulTTS( model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None)
ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None
)
trainer = Trainer( trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples

View File

@ -39,7 +39,9 @@ config = DelightfulTTSConfig(
print_eval=True, print_eval=True,
binary_align_loss_alpha=0.0, binary_align_loss_alpha=0.0,
use_attn_priors=False, use_attn_priors=False,
test_sentences=["Be a voice, not an echo."], test_sentences=[
["Be a voice, not an echo.", "ljspeech-0"],
],
output_path=output_path, output_path=output_path,
use_speaker_embedding=False, use_speaker_embedding=False,
use_d_vector_file=True, use_d_vector_file=True,

View File

@ -37,7 +37,9 @@ config = DelightfulTTSConfig(
print_eval=True, print_eval=True,
binary_align_loss_alpha=0.0, binary_align_loss_alpha=0.0,
use_attn_priors=False, use_attn_priors=False,
test_sentences=["Be a voice, not an echo."], test_sentences=[
["Be a voice, not an echo.", "ljspeech"],
],
output_path=output_path, output_path=output_path,
num_speakers=4, num_speakers=4,
use_speaker_embedding=True, use_speaker_embedding=True,

View File

@ -51,7 +51,7 @@ config = DelightfulTTSConfig(
use_attn_priors=False, use_attn_priors=False,
print_eval=True, print_eval=True,
test_sentences=[ test_sentences=[
"Be a voice, not an echo.", ["Be a voice, not an echo."],
], ],
use_speaker_embedding=False, use_speaker_embedding=False,
) )