mirror of https://github.com/coqui-ai/TTS.git

Add Voice conversion inference support (#1337)

* Add support for voice conversion inference
* Cache d_vectors_by_speaker for fast inference using a bigger speakers.json
* Rebase bug fix
* Use the average d-vector for inference

parent 917f417ac4
commit dbe9da7f15
TTS/bin/synthesize.py

@@ -195,11 +195,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="If true save raw spectogram for further (vocoder) processing in out_path.",
         default=False,
     )
+    parser.add_argument(
+        "--reference_wav",
+        type=str,
+        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
+        default=None,
+    )
+    parser.add_argument(
+        "--reference_speaker_idx",
+        type=str,
+        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
+        default=None,
+    )
     args = parser.parse_args()

     # print the description if either text or list_models is not set
-    if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs:
+    if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
         parser.parse_args(["-h"])

     # load model manager

@@ -281,10 +292,11 @@ If you don't specify any models, then it uses LJSpeech based English model.
         return

     # RUN THE SYNTHESIS
+    if args.text:
         print(" > Text: {}".format(args.text))

     # kick it
-    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)
+    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx)

     # save the results
     print(" > Saving output to {}".format(args.out_path))
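The two new flags plug into the existing `tts` console entry point, and with the relaxed guard a run that passes only `--reference_wav` no longer falls back to the help screen. A minimal sketch of a voice-conversion run, assuming a multi-speaker model with stored speaker embeddings; the model name, speaker ID, and wav paths are placeholders:

# Hypothetical CLI invocation (model name, speaker ID, and paths are placeholders).
import subprocess

subprocess.run(
    [
        "tts",
        "--model_name", "tts_models/multilingual/multi-dataset/your_tts",
        "--reference_wav", "source_utterance.wav",   # utterance whose voice gets converted
        "--speaker_idx", "target_speaker",           # target voice from the model's speakers file
        "--out_path", "converted.wav",
    ],
    check=True,
)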
TTS/tts/models/vits.py

@@ -994,6 +994,25 @@ class Vits(BaseTTS):

         outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
         return outputs

+    @torch.no_grad()
+    def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
+        """Inference for voice conversion
+
+        Args:
+            reference_wav (Tensor): Reference wavform. Tensor of shape [B, T]
+            speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B]
+            d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]`
+            reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B]
+            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
+        """
+        # compute spectrograms
+        y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
+        y_lengths = torch.tensor([y.size(-1)]).to(y.device)
+        speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
+        speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
+        # print(y.shape, y_lengths.shape)
+        wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
+        return wav
+
     def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt):
         """Forward pass for voice conversion

@@ -1007,12 +1026,11 @@
             speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,]
         """
         assert self.num_speakers > 0, "num_speakers have to be larger than 0."

         # speaker embedding
         if self.args.use_speaker_embedding and not self.args.use_d_vector_file:
             g_src = self.emb_g(speaker_cond_src).unsqueeze(-1)
             g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1)
-        elif self.args.use_speaker_embedding and self.args.use_d_vector_file:
+        elif not self.args.use_speaker_embedding and self.args.use_d_vector_file:
             g_src = F.normalize(speaker_cond_src).unsqueeze(-1)
             g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)
         else:

@@ -1199,7 +1217,7 @@
             if speaker_name is None:
                 d_vector = self.speaker_manager.get_random_d_vector()
             else:
-                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False)
+                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
         elif config.use_speaker_embedding:
             if speaker_name is None:
                 speaker_id = self.speaker_manager.get_random_speaker_id()
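The new method wraps the existing `voice_conversion` pass: it turns the reference waveform into a linear spectrogram, picks either speaker IDs or d-vectors as the source/target conditioning, and decodes the utterance in the target voice. A minimal sketch of calling it on an already-loaded, d-vector-based `Vits` model; the model instance and the tensors are assumptions, with shapes taken from the docstring above:

import torch
from TTS.tts.models.vits import Vits


def convert_voice(model: Vits, source_wav: torch.Tensor,
                  target_d_vector: torch.Tensor, source_d_vector: torch.Tensor) -> torch.Tensor:
    """Sketch: run the new voice-conversion inference path on a loaded multi-speaker Vits model.

    source_wav is a raw waveform of shape [B, T]; the d-vectors are [B, C].
    """
    return model.inference_voice_conversion(
        source_wav,
        d_vector=target_d_vector,            # conditioning for the target voice
        reference_d_vector=source_d_vector,  # conditioning for the source (reference) voice
    )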
TTS/tts/utils/speakers.py

@@ -65,6 +65,7 @@ class SpeakerManager:

         self.d_vectors = {}
         self.speaker_ids = {}
+        self.d_vectors_by_speakers = {}
         self.clip_ids = []
         self.speaker_encoder = None
         self.speaker_encoder_ap = None

@@ -166,6 +167,8 @@ class SpeakerManager:
         self.speaker_ids = {name: i for i, name in enumerate(speakers)}

         self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))
+        # cache d_vectors_by_speakers for fast inference using a bigger speakers.json
+        self.d_vectors_by_speakers = self.get_d_vectors_by_speakers()

     def get_d_vector_by_clip(self, clip_idx: str) -> List:
         """Get d_vector by clip ID.

@@ -187,7 +190,21 @@ class SpeakerManager:
         Returns:
             List[List]: all the d_vectors of the given speaker.
         """
-        return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]
+        return self.d_vectors_by_speakers[speaker_idx]
+
+    def get_d_vectors_by_speakers(self) -> Dict:
+        """Get all d_vectors by speaker.
+
+        Returns:
+            Dict: all the d_vectors of each speaker.
+        """
+        d_vectors_by_speakers = {}
+        for x in self.d_vectors.values():
+            if x["name"] not in d_vectors_by_speakers.keys():
+                d_vectors_by_speakers[x["name"]] = [x["embedding"]]
+            else:
+                d_vectors_by_speakers[x["name"]].append(x["embedding"])
+        return d_vectors_by_speakers

     def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
         """Get mean d_vector of a speaker ID.
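`d_vectors` maps clip names to `{"name": <speaker>, "embedding": <vector>}` entries, so the old `get_d_vectors_by_speaker` scanned every clip on every call. The new cache groups those entries by speaker once at load time, turning the lookup into a plain dict access; `get_mean_d_vector` then averages the cached vectors for the requested speaker. A toy illustration of the same grouping and averaging (the clip names and 3-dim embeddings are made up):

import numpy as np

# Toy stand-in for SpeakerManager.d_vectors: {clip_name: {"name": speaker, "embedding": [...]}}
d_vectors = {
    "spk_a/clip_001.wav": {"name": "spk_a", "embedding": [0.1, 0.2, 0.3]},
    "spk_a/clip_002.wav": {"name": "spk_a", "embedding": [0.3, 0.2, 0.1]},
    "spk_b/clip_001.wav": {"name": "spk_b", "embedding": [0.9, 0.8, 0.7]},
}

# Build the per-speaker cache once (what get_d_vectors_by_speakers() does, written with setdefault).
d_vectors_by_speakers = {}
for entry in d_vectors.values():
    d_vectors_by_speakers.setdefault(entry["name"], []).append(entry["embedding"])

# get_d_vectors_by_speaker() is now an O(1) lookup ...
spk_a_vectors = d_vectors_by_speakers["spk_a"]
# ... and the average d-vector used for inference is simply their mean.
mean_d_vector = np.stack(spk_a_vectors).mean(0)
print(mean_d_vector)  # [0.2 0.2 0.2]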
TTS/tts/utils/synthesis.py

@@ -205,3 +205,88 @@ def synthesis(
         "outputs": outputs,
     }
     return return_dict
+
+
+def transfer_voice(
+    model,
+    CONFIG,
+    use_cuda,
+    reference_wav,
+    speaker_id=None,
+    d_vector=None,
+    reference_speaker_id=None,
+    reference_d_vector=None,
+    do_trim_silence=False,
+    use_griffin_lim=False,
+):
+    """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
+    the vocoder model.
+
+    Args:
+        model (TTS.tts.models):
+            The TTS model to synthesize audio with.
+
+        CONFIG (Coqpit):
+            Model configuration.
+
+        use_cuda (bool):
+            Enable/disable CUDA.
+
+        reference_wav (str):
+            Path of reference_wav to be used to voice conversion.
+
+        speaker_id (int):
+            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        d_vector (torch.Tensor):
+            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.
+
+        reference_speaker_id (int):
+            Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        reference_d_vector (torch.Tensor):
+            Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.
+
+        enable_eos_bos_chars (bool):
+            enable special chars for end of sentence and start of sentence. Defaults to False.
+
+        do_trim_silence (bool):
+            trim silence after synthesis. Defaults to False.
+    """
+    # pass tensors to backend
+    if speaker_id is not None:
+        speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
+
+    if d_vector is not None:
+        d_vector = embedding_to_torch(d_vector, cuda=use_cuda)
+
+    if reference_d_vector is not None:
+        reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
+
+    # load reference_wav audio
+    reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
+
+    if hasattr(model, "module"):
+        _func = model.module.inference_voice_conversion
+    else:
+        _func = model.inference_voice_conversion
+    model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
+
+    # convert outputs to numpy
+    # plot results
+    wav = None
+    model_outputs = model_outputs.squeeze()
+    if model_outputs.ndim == 2:  # [T, C_spec]
+        if use_griffin_lim:
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
+            # trim silence
+            if do_trim_silence:
+                wav = trim_silence(wav, model.ap)
+    else:  # [T,]
+        wav = model_outputs
+
+    return wav
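In short, `transfer_voice` moves the IDs/embeddings and the reference audio onto the right device, calls the model's `inference_voice_conversion`, and then either Griffin-Lim-decodes a spectrogram output or returns the waveform/features as-is for an external vocoder. A minimal sketch of a direct call, assuming an already-loaded multi-speaker model and its config; the wav path and the `[1, D]` embeddings are placeholders:

import numpy as np
from TTS.tts.utils.synthesis import transfer_voice


def convert(model, config, source_wav_path: str,
            target_d_vector: np.ndarray, source_d_vector: np.ndarray):
    # model/config: an already-loaded multi-speaker TTS model (e.g. VITS) and its Coqpit config (assumed).
    return transfer_voice(
        model=model,
        CONFIG=config,
        use_cuda=False,
        reference_wav=source_wav_path,       # utterance whose voice is converted
        d_vector=target_d_vector,            # [1, D] embedding of the target speaker
        reference_d_vector=source_d_vector,  # [1, D] embedding of the source speaker
        use_griffin_lim=True,                # decode with Griffin-Lim if the model returns a spectrogram
    )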
TTS/utils/synthesizer.py

@@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model

 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.tts.utils.synthesis import synthesis, trim_silence
+from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

@@ -114,10 +114,14 @@ class Synthesizer(object):
         if not self.encoder_checkpoint:
             self._set_speaker_encoder_paths_from_tts_config()

         self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
         if use_cuda:
             self.tts_model.cuda()

+        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
+            self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config)
+
     def _set_speaker_encoder_paths_from_tts_config(self):
         """Set the encoder paths from the tts model config for models with speaker encoders."""
         if hasattr(self.tts_config, "model_args") and hasattr(

@@ -169,11 +173,13 @@ class Synthesizer(object):

     def tts(
         self,
-        text: str,
+        text: str = "",
         speaker_name: str = "",
         language_name: str = "",
         speaker_wav: Union[str, List[str]] = None,
         style_wav=None,
+        reference_wav=None,
+        reference_speaker_name=None,
     ) -> List[int]:
         """🐸 TTS magic. Run all the models and generate speech.

@@ -183,12 +189,20 @@ class Synthesizer(object):
             language_name (str, optional): language id for multi-language models. Defaults to "".
             speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
             style_wav ([type], optional): style waveform for GST. Defaults to None.
+            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
+            reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None.

         Returns:
             List[int]: [description]
         """
         start_time = time.time()
         wavs = []

+        if not text and not reference_wav:
+            raise ValueError(
+                "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
+            )
+
+        if text:
         sens = self.split_into_sentences(text)
         print(" > Text splitted to sentences.")
         print(sens)

@@ -199,8 +213,8 @@ class Synthesizer(object):
             if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
                 if speaker_name and isinstance(speaker_name, str):
                     if self.tts_config.use_d_vector_file:
-                        # get the speaker embedding from the saved d_vectors.
-                        speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0]
+                        # get the average speaker embedding from the saved d_vectors.
+                        speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
                         speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                     else:
                         # get speaker idx from the speaker name

@@ -209,7 +223,7 @@ class Synthesizer(object):
                 elif not speaker_name and not speaker_wav:
                     raise ValueError(
                         " [!] Look like you use a multi-speaker model. "
-                        "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model."
+                        "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
                     )
             else:
                 speaker_embedding = None

@@ -246,6 +260,7 @@ class Synthesizer(object):

         use_gl = self.vocoder_model is None

+        if not reference_wav:
         for sen in sens:
             # synthesize voice
             outputs = synthesis(

@@ -292,6 +307,59 @@ class Synthesizer(object):

                 wavs += list(waveform)
                 wavs += [0] * 10000
+        else:
+            # get the speaker embedding or speaker id for the reference wav file
+            reference_speaker_embedding = None
+            reference_speaker_id = None
+            if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
+                if reference_speaker_name and isinstance(reference_speaker_name, str):
+                    if self.tts_config.use_d_vector_file:
+                        # get the speaker embedding from the saved d_vectors.
+                        reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0]
+                        reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :]  # [1 x embedding_dim]
+                    else:
+                        # get speaker idx from the speaker name
+                        reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name]
+                else:
+                    reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav)
+
+            outputs = transfer_voice(
+                model=self.tts_model,
+                CONFIG=self.tts_config,
+                use_cuda=self.use_cuda,
+                reference_wav=reference_wav,
+                speaker_id=speaker_id,
+                d_vector=speaker_embedding,
+                use_griffin_lim=use_gl,
+                reference_speaker_id=reference_speaker_id,
+                reference_d_vector=reference_speaker_embedding
+            )
+            waveform = outputs
+            if not use_gl:
+                mel_postnet_spec = outputs[0].detach().cpu().numpy()
+                # denormalize tts output based on tts audio config
+                mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
+                device_type = "cuda" if self.use_cuda else "cpu"
+                # renormalize spectrogram based on vocoder config
+                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
+                # compute scale factor for possible sample rate mismatch
+                scale_factor = [
+                    1,
+                    self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
+                ]
+                if scale_factor[1] != 1:
+                    print(" > interpolating tts model output.")
+                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
+                else:
+                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
+                # run vocoder model
+                # [1, T, C]
+                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
+            if self.use_cuda:
+                waveform = waveform.cpu()
+            if not use_gl:
+                waveform = waveform.numpy()
+            wavs = waveform.squeeze()

         # compute stats
         process_time = time.time() - start_time
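Taken together, the `Synthesizer.tts` changes make voice conversion reachable from the high-level API: leave `text` empty, pass a `reference_wav`, and the reference branch above computes (or looks up) the source embedding and hands everything to `transfer_voice`. A rough sketch of such a call; the constructor keyword names, checkpoint/config paths, and speaker names are assumptions for a multi-speaker model shipped with a speakers.json:

from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; assumes a multi-speaker checkpoint with a d-vector/speakers file.
synthesizer = Synthesizer(
    tts_checkpoint="model.pth",
    tts_config_path="config.json",
    tts_speakers_file="speakers.json",
    use_cuda=False,
)

# Voice conversion: no text, only a reference utterance converted into the target speaker's voice.
wav = synthesizer.tts(
    speaker_name="target_speaker",
    reference_wav="source_utterance.wav",
    reference_speaker_name="source_speaker",  # optional; if omitted, the speaker encoder computes the embedding
)
synthesizer.save_wav(wav, "converted.wav")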