mirror of https://github.com/coqui-ai/TTS.git
parent: 64fdd0ed8b
commit: 275229a876

The diff below threads a new `verbose: bool = True` argument through `Synthesizer.tts()` and gates the method's console prints behind it.
@@ -264,6 +264,7 @@ class Synthesizer(nn.Module):
         style_text=None,
         reference_wav=None,
         reference_speaker_name=None,
+        verbose: bool = True,
         split_sentences: bool = True,
         **kwargs,
     ) -> List[int]:
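
This hunk adds the `verbose` flag to the `tts()` signature so callers can silence progress prints. A minimal usage sketch, assuming locally available model files (the paths below are placeholders, not files shipped with the repo):

    from TTS.utils.synthesizer import Synthesizer

    synth = Synthesizer(tts_checkpoint="model.pth", tts_config_path="config.json")
    wav = synth.tts("Hello world.", verbose=False)  # suppresses all progress prints
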
@@ -278,6 +279,7 @@ class Synthesizer(nn.Module):
             style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
             reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
             reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
+            verbose (bool, optional): print verbose output. Defaults to True.
             split_sentences (bool, optional): split the input text into sentences. Defaults to True.
             **kwargs: additional arguments to pass to the TTS model.
         Returns:
@@ -294,9 +296,11 @@ class Synthesizer(nn.Module):
         if text:
             sens = [text]
             if split_sentences:
-                print(" > Text splitted to sentences.")
+                if verbose:
+                    print(" > Text splitted to sentences.")
                 sens = self.split_into_sentences(text)
-            print(sens)
+            if verbose:
+                print(sens)
 
         # handle multi-speaker
         if "voice_dir" in kwargs:
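
For context, `split_into_sentences` segments the input text so synthesis runs per sentence. A naive illustration of that kind of splitting (the library's actual segmenter also handles abbreviations, decimals, and similar edge cases, so this is not its real implementation):

    import re

    def naive_split_into_sentences(text):
        # Split on sentence-final punctuation followed by whitespace.
        return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]

    naive_split_into_sentences("Hello there. How are you?")
    # -> ['Hello there.', 'How are you?']
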
@@ -420,7 +424,8 @@ class Synthesizer(nn.Module):
                     self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                 ]
                 if scale_factor[1] != 1:
-                    print(" > interpolating tts model output.")
+                    if verbose:
+                        print(" > interpolating tts model output.")
                     vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                 else:
                     vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
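
The gated print fires when the vocoder and TTS model sample rates differ, in which case the spectrogram is stretched along the time axis before vocoding. A sketch of the idea behind `interpolate_vocoder_input` (the library's own helper may differ in interpolation mode and edge handling):

    import torch
    import torch.nn.functional as F

    def stretch_spec(spec, tts_sr, vocoder_sr):
        scale_factor = [1.0, vocoder_sr / tts_sr]  # [mel axis, time axis]
        spec = torch.tensor(spec)[None, None]      # (1, 1, n_mels, frames)
        spec = F.interpolate(spec, scale_factor=scale_factor,
                             mode="bilinear", align_corners=False)
        return spec.squeeze(1)                     # (1, n_mels, stretched frames)
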
@@ -484,7 +489,8 @@ class Synthesizer(nn.Module):
                     self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
                 ]
                 if scale_factor[1] != 1:
-                    print(" > interpolating tts model output.")
+                    if verbose:
+                        print(" > interpolating tts model output.")
                     vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                 else:
                     vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
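
This second, identical change covers the voice-conversion path. The unchanged `else` branch only adds the batch axis the vocoder expects; a toy shape check (an 80-band mel spectrogram is assumed here):

    import torch

    mel = torch.zeros(80, 240)        # (n_mels, frames) from the TTS model
    vocoder_input = mel.unsqueeze(0)  # (1, n_mels, frames)
    print(vocoder_input.shape)        # torch.Size([1, 80, 240])
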
@@ -497,9 +503,10 @@ class Synthesizer(nn.Module):
             waveform = waveform.numpy()
         wavs = waveform.squeeze()
 
-        # compute stats
-        process_time = time.time() - start_time
-        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
-        print(f" > Processing time: {process_time}")
-        print(f" > Real-time factor: {process_time / audio_time}")
+        if verbose:
+            # compute stats
+            process_time = time.time() - start_time
+            audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
+            print(f" > Processing time: {process_time}")
+            print(f" > Real-time factor: {process_time / audio_time}")
         return wavs
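
The stats block that moves under `if verbose:` reports the real-time factor: wall-clock synthesis time divided by the duration of the generated audio, so values below 1.0 mean faster than real time. A worked example of the same arithmetic with assumed numbers:

    sample_rate = 22050                     # Hz, assumed
    wav_samples = 48000                     # length of the generated waveform, assumed
    process_time = 0.87                     # seconds of wall-clock time, assumed

    audio_time = wav_samples / sample_rate  # ~2.18 s of audio
    rtf = process_time / audio_time         # ~0.40 -> faster than real time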