mirror of https://github.com/coqui-ai/TTS.git

Drop diffusion for XTTS

parent 5d418bb84a
commit 46940cb64b
Two file diffs suppressed because they are too large.
@@ -9,8 +9,6 @@ import torchaudio
 from coqpit import Coqpit
 
 from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel
-from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts
-from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps
 from TTS.tts.layers.xtts.gpt import GPT
 from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
 from TTS.tts.layers.xtts.stream_generator import init_stream_support
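The hunk above removes the Tortoise diffusion decoder and the XTTS SpacedDiffusion schedule imports from the model module. For downstream code that previously imported these symbols and wants to keep working whether or not the diffusion module still ships, a defensive guard is one option; this is a hedged sketch and not something the commit itself adds:

# Hypothetical compatibility guard for third-party code; not part of this commit.
try:
    from TTS.tts.layers.xtts.diffusion import SpacedDiffusion  # import dropped by this commit
    HAS_XTTS_DIFFUSION = True
except ImportError:
    HAS_XTTS_DIFFUSION = False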
@@ -168,12 +166,10 @@ class XttsAudioConfig(Coqpit):
 
     Args:
         sample_rate (int): The sample rate in which the GPT operates.
-        diffusion_sample_rate (int): The sample rate of the diffusion audio waveform.
         output_sample_rate (int): The sample rate of the output audio waveform.
     """
 
     sample_rate: int = 22050
-    diffusion_sample_rate: int = 24000
     output_sample_rate: int = 24000
 
 
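With `diffusion_sample_rate` and its docstring entry gone, the audio config keeps only the GPT and output sample rates. A minimal sketch of what the remaining dataclass looks like after this hunk, using only the field names visible in the context lines (everything else, including the docstring summary, is assumed):

from dataclasses import dataclass

from coqpit import Coqpit


@dataclass
class XttsAudioConfig(Coqpit):
    """Audio config for XTTS after this change.

    Args:
        sample_rate (int): The sample rate in which the GPT operates.
        output_sample_rate (int): The sample rate of the output audio waveform.
    """

    sample_rate: int = 22050
    output_sample_rate: int = 24000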
@@ -697,24 +693,11 @@ class Xtts(BaseTTS):
                 hasattr(self, "hifigan_decoder") and self.hifigan_decoder is not None
             ), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
             wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
-        else:
-            assert hasattr(
-                self, "diffusion_decoder"
-            ), "You must disable hifigan decoders to use difffusion by setting `use_hifigan: false`"
-            mel = do_spectrogram_diffusion(
-                self.diffusion_decoder,
-                diffuser,
-                gpt_latents,
-                diffusion_conditioning,
-                temperature=diffusion_temperature,
-            )
-            wav = self.vocoder.inference(mel)
-
         return {
             "wav": wav.cpu().numpy().squeeze(),
             "gpt_latents": gpt_latents,
             "speaker_embedding": speaker_embedding,
-            "diffusion_conditioning": diffusion_conditioning,
         }
 
     def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
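With the else-branch and the `diffusion_conditioning` key removed, only the HiFi-GAN path survives in this method's tail. A condensed sketch of the remaining flow, built from the unchanged context lines above (the surrounding method body and the opening of the assert are omitted and assumed):

# Sketch of the post-change decode path; not a verbatim copy of the file.
assert (
    hasattr(self, "hifigan_decoder") and self.hifigan_decoder is not None
), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
return {
    "wav": wav.cpu().numpy().squeeze(),
    "gpt_latents": gpt_latents,
    "speaker_embedding": speaker_embedding,
}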
@@ -98,7 +98,7 @@ def main():
     )
     # define audio config
     audio_config = XttsAudioConfig(
-        sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
+        sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
     )
     # training parameters config
     config = GPTTrainerConfig(
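The same one-line change repeats in the second recipe and in the two test configs below: `diffusion_sample_rate=24000` is simply dropped from the `XttsAudioConfig` call while the other arguments stay as they are. A hedged usage sketch of the updated call (the import path is an assumption and is not shown in this diff):

# Assumed import path for the trainer-side XttsAudioConfig; verify against the repo.
from TTS.tts.layers.xtts.trainer.gpt_trainer import XttsAudioConfig

audio_config = XttsAudioConfig(
    sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
)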
@@ -99,7 +99,7 @@ def main():
     )
     # define audio config
     audio_config = XttsAudioConfig(
-        sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
+        sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
     )
     # training parameters config
     config = GPTTrainerConfig(
@@ -89,7 +89,7 @@ model_args = GPTArgs(
     use_ne_hifigan=True,
 )
 audio_config = XttsAudioConfig(
-    sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
+    sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
 )
 config = GPTTrainerConfig(
     epochs=1,
@@ -89,7 +89,7 @@ model_args = GPTArgs(
     use_ne_hifigan=True,
 )
 audio_config = XttsAudioConfig(
-    sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
+    sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000
 )
 config = GPTTrainerConfig(
     epochs=1,