diff --git a/TTS/api.py b/TTS/api.py index 7376cfa4..460e9374 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -85,7 +85,7 @@ class CS_API: self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} if not self.api_token: raise ValueError( - "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n" + "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n" "Visit 🔗https://app.coqui.ai/account to get one.\n" "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n" "" @@ -274,7 +274,10 @@ class TTS: self.model_name = None if model_name: - self.load_tts_model_by_name(model_name, gpu) + if "tts_models" in model_name: + self.load_tts_model_by_name(model_name, gpu) + elif "voice_conversion_models" in model_name: + self.load_vc_model_by_name(model_name, gpu) if model_path: self.load_tts_model_by_path( @@ -565,19 +568,39 @@ class TTS: def voice_conversion( self, - sourve_wav: str, + source_wav: str, target_wav: str, ): """Voice conversion with FreeVC. Convert source wav to target speaker. + Args: + source_wav (str): + Path to the source wav file. + target_wav (str): + Path to the target wav file. + """ + wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) + return wav + + def voice_conversion_to_file( + self, + source_wav: str, + target_wav: str, + file_path: str = "output.wav", + ): + """Voice conversion with FreeVC. Convert source wav to target speaker. + Args: source_wav (str): Path to the source wav file. target_wav (str): Path to the target wav file. + file_path (str, optional): + Output file path. Defaults to "output.wav". 
""" - wav = self.synthesizer.voice_conversion(source_wav=sourve_wav, target_wav=target_wav) - return wav + wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav) + save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + return file_path def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): """Convert text to speech with voice conversion.