diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 509b3da6..fe31c510 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -195,11 +195,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="If true save raw spectogram for further (vocoder) processing in out_path.",
         default=False,
     )
-
+    parser.add_argument(
+        "--reference_wav",
+        type=str,
+        help="Reference wav file to convert into the voice of the speaker_idx or speaker_wav",
+        default=None,
+    )
+    parser.add_argument(
+        "--reference_speaker_idx",
+        type=str,
+        help="Speaker ID of the reference_wav speaker (if not provided, the embedding is computed using the Speaker Encoder).",
+        default=None,
+    )
     args = parser.parse_args()
 
     # print the description if either text or list_models is not set
-    if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs:
+    if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
         parser.parse_args(["-h"])
 
     # load model manager
@@ -281,10 +292,11 @@ If you don't specify any models, then it uses LJSpeech based English model.
         return
 
     # RUN THE SYNTHESIS
-    print(" > Text: {}".format(args.text))
+    if args.text:
+        print(" > Text: {}".format(args.text))
 
     # kick it
-    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)
+    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx)
 
     # save the results
     print(" > Saving output to {}".format(args.out_path))
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 6aa30dfe..818b9a54 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -994,6 +994,25 @@ class Vits(BaseTTS):
         outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
         return outputs
 
+    @torch.no_grad()
+    def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
+        """Inference for voice conversion.
+
+        Args:
+            reference_wav (Tensor): Reference waveform. Tensor of shape [B, T]
+            speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B]
+            d_vector (Tensor): d_vector embedding of the target speaker. Tensor of shape `[B, C]`
+            reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B]
+            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
+        """
+        # compute spectrograms
+        y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
+        y_lengths = torch.tensor([y.size(-1)]).to(y.device)
+        speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
+        speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
+        # print(y.shape, y_lengths.shape)
+        wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
+        return wav
 
     def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt):
         """Forward pass for voice conversion
@@ -1007,12 +1026,11 @@ class Vits(BaseTTS):
             speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,]
         """
         assert self.num_speakers > 0, "num_speakers have to be larger than 0."
-
         # speaker embedding
         if self.args.use_speaker_embedding and not self.args.use_d_vector_file:
             g_src = self.emb_g(speaker_cond_src).unsqueeze(-1)
             g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1)
-        elif self.args.use_speaker_embedding and self.args.use_d_vector_file:
+        elif not self.args.use_speaker_embedding and self.args.use_d_vector_file:
             g_src = F.normalize(speaker_cond_src).unsqueeze(-1)
             g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)
         else:
@@ -1199,7 +1217,7 @@ class Vits(BaseTTS):
             if speaker_name is None:
                 d_vector = self.speaker_manager.get_random_d_vector()
             else:
-                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False)
+                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
         elif config.use_speaker_embedding:
             if speaker_name is None:
                 speaker_id = self.speaker_manager.get_random_speaker_id()
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 078ce3f1..c15a3abf 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -65,6 +65,7 @@ class SpeakerManager:
         self.d_vectors = {}
         self.speaker_ids = {}
+        self.d_vectors_by_speakers = {}
         self.clip_ids = []
         self.speaker_encoder = None
         self.speaker_encoder_ap = None
@@ -166,6 +167,8 @@ class SpeakerManager:
         self.speaker_ids = {name: i for i, name in enumerate(speakers)}
 
         self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))
+        # cache d_vectors_by_speakers for fast inference with a large speakers.json
+        self.d_vectors_by_speakers = self.get_d_vectors_by_speakers()
 
     def get_d_vector_by_clip(self, clip_idx: str) -> List:
         """Get d_vector by clip ID.
@@ -187,7 +190,21 @@ class SpeakerManager:
         Returns:
             List[List]: all the d_vectors of the given speaker.
         """
-        return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]
+        return self.d_vectors_by_speakers[speaker_idx]
+
+    def get_d_vectors_by_speakers(self) -> Dict:
+        """Get all d_vectors grouped by speaker.
+
+        Returns:
+            Dict: all the d_vectors of each speaker.
+        """
+        d_vectors_by_speakers = {}
+        for x in self.d_vectors.values():
+            if x["name"] not in d_vectors_by_speakers:
+                d_vectors_by_speakers[x["name"]] = [x["embedding"]]
+            else:
+                d_vectors_by_speakers[x["name"]].append(x["embedding"])
+        return d_vectors_by_speakers
 
     def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
         """Get mean d_vector of a speaker ID.
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index b6e19ab4..582fb4f1 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -205,3 +205,88 @@ def synthesis(
         "outputs": outputs,
     }
     return return_dict
+
+
+def transfer_voice(
+    model,
+    CONFIG,
+    use_cuda,
+    reference_wav,
+    speaker_id=None,
+    d_vector=None,
+    reference_speaker_id=None,
+    reference_d_vector=None,
+    do_trim_silence=False,
+    use_griffin_lim=False,
+):
+    """Convert the voice of `reference_wav` into the voice of the target speaker using the Griffin-Lim vocoder
+    or just compute output features to be passed to the vocoder model.
+
+    Args:
+        model (TTS.tts.models):
+            The TTS model to synthesize audio with.
+
+        CONFIG (Coqpit):
+            Model configuration.
+
+        use_cuda (bool):
+            Enable/disable CUDA.
+
+        reference_wav (str):
+            Path of the reference_wav file to be used for voice conversion.
+
+        speaker_id (int):
+            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        d_vector (torch.Tensor):
+            d-vector of the target speaker for multi-speaker models, of shape :math:`[1, D]`. Defaults to None.
+
+        reference_speaker_id (int):
+            Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        reference_d_vector (torch.Tensor):
+            Reference d-vector for multi-speaker models, of shape :math:`[1, D]`. Defaults to None.
+
+        use_griffin_lim (bool):
+            Use the Griffin-Lim vocoder to convert the model output to a waveform. Defaults to False.
+
+        do_trim_silence (bool):
+            Trim silence after synthesis. Defaults to False.
+    """
+    # pass tensors to backend
+    if speaker_id is not None:
+        speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
+
+    if d_vector is not None:
+        d_vector = embedding_to_torch(d_vector, cuda=use_cuda)
+
+    if reference_d_vector is not None:
+        reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
+
+    # load reference_wav audio
+    reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
+
+    if hasattr(model, "module"):
+        _func = model.module.inference_voice_conversion
+    else:
+        _func = model.inference_voice_conversion
+    model_outputs = _func(
+        reference_wav,
+        speaker_id,
+        d_vector,
+        reference_speaker_id,
+        reference_d_vector)
+
+    # convert outputs to numpy
+    # plot results
+    wav = None
+    model_outputs = model_outputs.squeeze()
+    if model_outputs.ndim == 2:  # [T, C_spec]
+        if use_griffin_lim:
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
+            # trim silence
+            if do_trim_silence:
+                wav = trim_silence(wav, model.ap)
+    else:  # [T,]
+        wav = model_outputs
+
+    return wav
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index d1abc907..687794b4 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model
 
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.tts.utils.synthesis import synthesis, trim_silence
+from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
@@ -114,10 +114,14 @@ class Synthesizer(object):
 
         if not self.encoder_checkpoint:
             self._set_speaker_encoder_paths_from_tts_config()
+
         self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
         if use_cuda:
             self.tts_model.cuda()
 
+        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
+            self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config)
+
     def _set_speaker_encoder_paths_from_tts_config(self):
         """Set the encoder paths from the tts model config for models with speaker encoders."""
         if hasattr(self.tts_config, "model_args") and hasattr(
@@ -169,11 +173,13 @@ class Synthesizer(object):
 
     def tts(
         self,
-        text: str,
+        text: str = "",
         speaker_name: str = "",
         language_name: str = "",
         speaker_wav: Union[str, List[str]] = None,
         style_wav=None,
+        reference_wav=None,
+        reference_speaker_name=None,
     ) -> List[int]:
         """🐸 TTS magic. Run all the models and generate speech.
 
@@ -183,15 +189,23 @@
             language_name (str, optional): language id for multi-language models. Defaults to "".
             speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
             style_wav ([type], optional): style waveform for GST. Defaults to None.
-
+            reference_wav (str, optional): reference waveform for voice conversion. Defaults to None.
+            reference_speaker_name (str, optional): speaker name of the reference waveform. Defaults to None.
         Returns:
             List[int]: [description]
         """
         start_time = time.time()
         wavs = []
-        sens = self.split_into_sentences(text)
-        print(" > Text splitted to sentences.")
-        print(sens)
+
+        if not text and not reference_wav:
+            raise ValueError(
+                "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
+            )
+
+        if text:
+            sens = self.split_into_sentences(text)
+            print(" > Text split into sentences.")
+            print(sens)
 
         # handle multi-speaker
         speaker_embedding = None
@@ -199,8 +213,8 @@
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
             if speaker_name and isinstance(speaker_name, str):
                 if self.tts_config.use_d_vector_file:
-                    # get the speaker embedding from the saved d_vectors.
-                    speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0]
+                    # get the average speaker embedding from the saved d_vectors.
+                    speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
                     speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
@@ -209,7 +223,7 @@
             elif not speaker_name and not speaker_wav:
                 raise ValueError(
                     " [!] Look like you use a multi-speaker model. "
-                    "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model."
+                    "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
                 )
         else:
             speaker_embedding = None
@@ -246,22 +260,83 @@
 
         use_gl = self.vocoder_model is None
 
-        for sen in sens:
-            # synthesize voice
-            outputs = synthesis(
-                model=self.tts_model,
-                text=sen,
-                CONFIG=self.tts_config,
-                use_cuda=self.use_cuda,
-                speaker_id=speaker_id,
-                language_id=language_id,
-                style_wav=style_wav,
-                use_griffin_lim=use_gl,
-                d_vector=speaker_embedding,
-            )
-            waveform = outputs["wav"]
-            mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
+        if not reference_wav:
+            for sen in sens:
+                # synthesize voice
+                outputs = synthesis(
+                    model=self.tts_model,
+                    text=sen,
+                    CONFIG=self.tts_config,
+                    use_cuda=self.use_cuda,
+                    speaker_id=speaker_id,
+                    language_id=language_id,
+                    style_wav=style_wav,
+                    use_griffin_lim=use_gl,
+                    d_vector=speaker_embedding,
+                )
+                waveform = outputs["wav"]
+                mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
+                if not use_gl:
+                    # denormalize tts output based on tts audio config
+                    mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
+                    device_type = "cuda" if self.use_cuda else "cpu"
+                    # renormalize spectrogram based on vocoder config
+                    vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
+                    # compute scale factor for possible sample rate mismatch
+                    scale_factor = [
+                        1,
+                        self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
+                    ]
+                    if scale_factor[1] != 1:
+                        print(" > interpolating tts model output.")
+                        vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
+                    else:
+                        vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
+                    # run vocoder model
+                    # [1, T, C]
+                    waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
+                if self.use_cuda and not use_gl:
+                    waveform = waveform.cpu()
+                if not use_gl:
+                    waveform = waveform.numpy()
+                waveform = waveform.squeeze()
+
+                # trim silence
+                if self.tts_config.audio["do_trim_silence"] is True:
+                    waveform = trim_silence(waveform, self.tts_model.ap)
+
+                wavs += list(waveform)
+                wavs += [0] * 10000
+        else:
+            # get the speaker embedding or speaker id for the reference wav file
+            reference_speaker_embedding = None
+            reference_speaker_id = None
+            if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
+                if reference_speaker_name and isinstance(reference_speaker_name, str):
+                    if self.tts_config.use_d_vector_file:
+                        # get the speaker embedding from the saved d_vectors.
+                        reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0]
+                        reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :]  # [1 x embedding_dim]
+                    else:
+                        # get speaker idx from the speaker name
+                        reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name]
+                else:
+                    reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav)
+            outputs = transfer_voice(
+                model=self.tts_model,
+                CONFIG=self.tts_config,
+                use_cuda=self.use_cuda,
+                reference_wav=reference_wav,
+                speaker_id=speaker_id,
+                d_vector=speaker_embedding,
+                use_griffin_lim=use_gl,
+                reference_speaker_id=reference_speaker_id,
+                reference_d_vector=reference_speaker_embedding,
+            )
+            waveform = outputs
             if not use_gl:
+                mel_postnet_spec = outputs[0].detach().cpu().numpy()
                 # denormalize tts output based on tts audio config
                 mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
                 device_type = "cuda" if self.use_cuda else "cpu"
@@ -280,18 +355,11 @@
                 # run vocoder model
                 # [1, T, C]
                 waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
-            if self.use_cuda and not use_gl:
+            if self.use_cuda:
                 waveform = waveform.cpu()
             if not use_gl:
                 waveform = waveform.numpy()
-            waveform = waveform.squeeze()
-
-            # trim silence
-            if self.tts_config.audio["do_trim_silence"] is True:
-                waveform = trim_silence(waveform, self.tts_model.ap)
-
-            wavs += list(waveform)
-            wavs += [0] * 10000
+            wavs = waveform.squeeze()
 
         # compute stats
         process_time = time.time() - start_time
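
Not part of the patch itself: a minimal usage sketch of the voice-conversion path added above, written only against the API surface visible in this diff (Synthesizer.tts() with reference_wav/reference_speaker_name, backed by transfer_voice() and Vits.inference_voice_conversion()). The checkpoint/config paths, speaker names, and the Synthesizer constructor keywords are placeholder assumptions, not values taken from the repository.

# Sketch only: exercises the new voice-conversion branch of Synthesizer.tts().
# All paths, speaker names, and constructor keywords below are assumptions.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="model.pth",          # hypothetical multi-speaker VITS checkpoint
    tts_config_path="config.json",
    tts_speakers_file="speakers.json",   # d-vector / speaker-id file, if the model uses one
    encoder_checkpoint="se_model.pth",   # speaker encoder; only needed when the reference
    encoder_config="se_config.json",     # speaker is not given via reference_speaker_name
    use_cuda=False,
)

# No `text` is passed, so tts() takes the `else` branch added above and calls
# transfer_voice() -> Vits.inference_voice_conversion() to convert the reference audio
# into the voice of the target speaker.
wav = synthesizer.tts(
    speaker_name="target_speaker",            # target voice (speaker id or d-vector lookup)
    reference_wav="reference.wav",            # source audio to convert
    reference_speaker_name="source_speaker",  # optional; else the speaker encoder is used
)
synthesizer.save_wav(wav, "converted.wav")

# Roughly equivalent CLI call via the new flags added to TTS/bin/synthesize.py:
#   --speaker_idx "target_speaker" --reference_wav reference.wav --reference_speaker_idx "source_speaker"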