mirror of https://github.com/coqui-ai/TTS.git
parent 64fdd0ed8b
commit 275229a876
@@ -264,6 +264,7 @@ class Synthesizer(nn.Module):
         style_text=None,
         reference_wav=None,
         reference_speaker_name=None,
+        verbose: bool = True,
         split_sentences: bool = True,
         **kwargs,
     ) -> List[int]:
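The hunk above threads a new `verbose` keyword through `Synthesizer.tts()`, so a caller can silence the progress prints per call while the old behaviour stays the default. A minimal usage sketch, assuming a checkpoint and config on disk (the file names below are placeholders, not part of this commit):

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; point these at a real checkpoint and its config.
synth = Synthesizer(
    tts_checkpoint="model.pth",
    tts_config_path="config.json",
)

# Default: progress lines are printed, as before this commit.
wav = synth.tts("Hello world.")

# New in this commit: the same call can run silently.
wav = synth.tts("Hello world.", verbose=False)
```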
@@ -278,6 +279,7 @@ class Synthesizer(nn.Module):
             style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
             reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
             reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
+            verbose (bool, optional): print verbose output. Defaults to True.
             split_sentences (bool, optional): split the input text into sentences. Defaults to True.
             **kwargs: additional arguments to pass to the TTS model.
         Returns:
@@ -294,9 +296,11 @@ class Synthesizer(nn.Module):
         if text:
             sens = [text]
             if split_sentences:
-                print(" > Text splitted to sentences.")
+                if verbose:
+                    print(" > Text splitted to sentences.")
                 sens = self.split_into_sentences(text)
-            print(sens)
+            if verbose:
+                print(sens)

         # handle multi-speaker
         if "voice_dir" in kwargs:
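This gates the sentence-splitting messages. The repeated `if verbose: print(...)` pattern could also be expressed with the standard `logging` module; a sketch of that alternative, which is not what this commit does (`report_split` and `splitter` are hypothetical stand-ins):

```python
import logging

logger = logging.getLogger("TTS.synthesizer")

def report_split(text, splitter, verbose=True):
    # Map the boolean flag onto log levels instead of guarding each print.
    logger.setLevel(logging.INFO if verbose else logging.WARNING)
    sens = splitter(text)
    logger.info(" > Text split to sentences.")
    logger.info("%s", sens)
    return sens
```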
@@ -420,7 +424,8 @@ class Synthesizer(nn.Module):
                     self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                 ]
                 if scale_factor[1] != 1:
-                    print(" > interpolating tts model output.")
+                    if verbose:
+                        print(" > interpolating tts model output.")
                     vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                 else:
                     vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
@@ -484,7 +489,8 @@ class Synthesizer(nn.Module):
                     self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                 ]
                 if scale_factor[1] != 1:
-                    print(" > interpolating tts model output.")
+                    if verbose:
+                        print(" > interpolating tts model output.")
                     vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                 else:
                     vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
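The two hunks above are the same change in two code paths: when the vocoder was trained at a different sample rate than the TTS model, the spectrogram is stretched along time before vocoding, and the message announcing that is now gated. A conceptual sketch of the resampling step (assumed shapes; the repo's actual `interpolate_vocoder_input` helper may differ). Keeping the mel-axis factor at 1 preserves the number of mel bands; only the frame rate changes:

```python
import torch
import torch.nn.functional as F

def stretch_spec(scale_factor, spec):
    """Rescale a [n_mels, T] spectrogram by [1, rate_ratio] along (mels, time)."""
    spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0)  # -> [1, 1, n_mels, T]
    spec = F.interpolate(spec, scale_factor=scale_factor, mode="bilinear", align_corners=False)
    return spec.squeeze(0)  # -> [1, n_mels, T * rate_ratio]
```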
@@ -497,9 +503,10 @@ class Synthesizer(nn.Module):
             waveform = waveform.numpy()
         wavs = waveform.squeeze()

-        # compute stats
-        process_time = time.time() - start_time
-        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
-        print(f" > Processing time: {process_time}")
-        print(f" > Real-time factor: {process_time / audio_time}")
+        if verbose:
+            # compute stats
+            process_time = time.time() - start_time
+            audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
+            print(f" > Processing time: {process_time}")
+            print(f" > Real-time factor: {process_time / audio_time}")
         return wavs
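The last hunk moves the timing statistics under the same flag. The real-time factor it reports is processing time divided by the duration of the generated audio; the same numbers can be computed outside the class even with `verbose=False` (reusing the `synth` object from the sketch above):

```python
import time

start_time = time.time()
wav = synth.tts("The quick brown fox jumps over the lazy dog.", verbose=False)
process_time = time.time() - start_time

# Seconds of audio produced, from the sample count and the model's rate.
audio_time = len(wav) / synth.tts_config.audio["sample_rate"]

# RTF < 1 means synthesis is faster than real time.
print(f"RTF: {process_time / audio_time:.3f}")
```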