Drop diffusion for XTTS

This commit is contained in:
Eren Gölge 2023-11-06 19:01:02 +01:00
parent 5d418bb84a
commit 46940cb64b
7 changed files with 4 additions and 2891 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -9,8 +9,6 @@ import torchaudio
from coqpit import Coqpit
from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel
from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts
from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps
from TTS.tts.layers.xtts.gpt import GPT
from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
from TTS.tts.layers.xtts.stream_generator import init_stream_support
@@ -168,12 +166,10 @@ class XttsAudioConfig(Coqpit):
Args:
sample_rate (int): The sample rate in which the GPT operates.
diffusion_sample_rate (int): The sample rate of the diffusion audio waveform.
output_sample_rate (int): The sample rate of the output audio waveform.
"""
sample_rate: int = 22050
diffusion_sample_rate: int = 24000
output_sample_rate: int = 24000
@@ -697,24 +693,11 @@ class Xtts(BaseTTS):
hasattr(self, "hifigan_decoder") and self.hifigan_decoder is not None
), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
else:
assert hasattr(
self, "diffusion_decoder"
), "You must disable hifigan decoders to use difffusion by setting `use_hifigan: false`"
mel = do_spectrogram_diffusion(
self.diffusion_decoder,
diffuser,
gpt_latents,
diffusion_conditioning,
temperature=diffusion_temperature,
)
wav = self.vocoder.inference(mel)
return {
"wav": wav.cpu().numpy().squeeze(),
"gpt_latents": gpt_latents,
"speaker_embedding": speaker_embedding,
"diffusion_conditioning": diffusion_conditioning,
}
def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):

View File

@@ -98,7 +98,7 @@ def main():
)
# define audio config
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
)
# training parameters config
config = GPTTrainerConfig(

View File

@@ -99,7 +99,7 @@ def main():
)
# define audio config
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
)
# training parameters config
config = GPTTrainerConfig(

View File

@@ -89,7 +89,7 @@ model_args = GPTArgs(
use_ne_hifigan=True,
)
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
)
config = GPTTrainerConfig(
epochs=1,

View File

@@ -89,7 +89,7 @@ model_args = GPTArgs(
use_ne_hifigan=True,
)
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
)
config = GPTTrainerConfig(
epochs=1,