mirror of https://github.com/coqui-ai/TTS.git
Add Voice conversion inference support (#1337)
* Add support for voice conversion inference
* Cache d_vectors_by_speaker for fast inference using a bigger speakers.json
* Rebase bug fix
* Use the average d-vector for inference
parent 917f417ac4
commit dbe9da7f15
@@ -195,11 +195,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="If true save raw spectogram for further (vocoder) processing in out_path.",
         default=False,
     )
+    parser.add_argument(
+        "--reference_wav",
+        type=str,
+        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
+        default=None,
+    )
+    parser.add_argument(
+        "--reference_speaker_idx",
+        type=str,
+        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
+        default=None,
+    )
     args = parser.parse_args()

     # print the description if either text or list_models is not set
-    if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs:
+    if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
         parser.parse_args(["-h"])

     # load model manager
@@ -281,10 +292,11 @@ If you don't specify any models, then it uses LJSpeech based English model.
         return

     # RUN THE SYNTHESIS
-    print(" > Text: {}".format(args.text))
+    if args.text:
+        print(" > Text: {}".format(args.text))

     # kick it
-    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)
+    wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx)

     # save the results
     print(" > Saving output to {}".format(args.out_path))
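The hunk above is the CLI entry point (TTS/bin/synthesize.py): the two new flags are passed straight through to `Synthesizer.tts()` as `reference_wav` and `reference_speaker_name`. As a rough sketch of the equivalent Python call, with checkpoint paths, speaker names, and the output file as purely hypothetical placeholders:

```python
# Sketch only: every path and speaker name below is a placeholder, not a value from this commit.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="/path/to/multispeaker_vits.pth",  # placeholder checkpoint
    tts_config_path="/path/to/config.json",           # placeholder config
    use_cuda=False,
)

# Roughly: --speaker_idx "target_speaker" --reference_wav source.wav --reference_speaker_idx "source_speaker"
wav = synthesizer.tts(
    text="",                                  # no text: run voice conversion instead of synthesis
    speaker_name="target_speaker",            # voice to convert into (placeholder name)
    reference_wav="source.wav",               # utterance whose content is kept (placeholder path)
    reference_speaker_name="source_speaker",  # speaker of the reference wav (placeholder name)
)
synthesizer.save_wav(wav, "converted.wav")
```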
@@ -994,6 +994,25 @@ class Vits(BaseTTS):

         outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
         return outputs
+    @torch.no_grad()
+    def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
+        """Inference for voice conversion
+
+        Args:
+            reference_wav (Tensor): Reference wavform. Tensor of shape [B, T]
+            speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B]
+            d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]`
+            reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B]
+            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
+        """
+        # compute spectrograms
+        y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
+        y_lengths = torch.tensor([y.size(-1)]).to(y.device)
+        speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
+        speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
+        # print(y.shape, y_lengths.shape)
+        wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
+        return wav

     def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt):
         """Forward pass for voice conversion
@@ -1007,12 +1026,11 @@ class Vits(BaseTTS):
            speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,]
         """
         assert self.num_speakers > 0, "num_speakers have to be larger than 0."

         # speaker embedding
         if self.args.use_speaker_embedding and not self.args.use_d_vector_file:
             g_src = self.emb_g(speaker_cond_src).unsqueeze(-1)
             g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1)
-        elif self.args.use_speaker_embedding and self.args.use_d_vector_file:
+        elif not self.args.use_speaker_embedding and self.args.use_d_vector_file:
             g_src = F.normalize(speaker_cond_src).unsqueeze(-1)
             g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)
         else:
@@ -1199,7 +1217,7 @@ class Vits(BaseTTS):
             if speaker_name is None:
                 d_vector = self.speaker_manager.get_random_d_vector()
             else:
-                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False)
+                d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
         elif config.use_speaker_embedding:
             if speaker_name is None:
                 speaker_id = self.speaker_manager.get_random_speaker_id()
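The hunks above are in the VITS model (TTS/tts/models/vits.py). A minimal sketch of calling the new `inference_voice_conversion()` directly on a loaded model; it assumes a multi-speaker checkpoint trained with d-vectors (`use_d_vector_file=True`), the paths are placeholders, and the random 512-dim vectors merely stand in for real speaker embeddings:

```python
# Sketch only: placeholder paths; random tensors stand in for real d-vectors.
import torch

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.utils.audio import AudioProcessor

config = load_config("/path/to/config.json")                          # placeholder
model = setup_model(config=config)                                    # builds the Vits model for a VITS config
model.load_checkpoint(config, "/path/to/model_file.pth", eval=True)   # placeholder checkpoint
ap = AudioProcessor(**config.audio)

# reference utterance to convert, as a [1, T] float tensor
ref_wav = torch.from_numpy(ap.load_wav("/path/to/reference.wav"))[None, :]  # placeholder path

# conditioning embeddings of shape [1, C]; in practice they come from the SpeakerManager
target_d_vector = torch.rand(1, 512)     # e.g. speaker_manager.get_mean_d_vector("target_speaker")
reference_d_vector = torch.rand(1, 512)  # e.g. an encoder embedding of the reference clip

converted = model.inference_voice_conversion(
    reference_wav=ref_wav,
    d_vector=target_d_vector,
    reference_d_vector=reference_d_vector,
)  # waveform tensor in the target speaker's voice
```

In practice the `transfer_voice()` helper added below wraps this call and also takes care of loading the wav and moving the conditioning tensors to the right device.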
@@ -65,6 +65,7 @@ class SpeakerManager:

         self.d_vectors = {}
         self.speaker_ids = {}
+        self.d_vectors_by_speakers = {}
         self.clip_ids = []
         self.speaker_encoder = None
         self.speaker_encoder_ap = None
@@ -166,6 +167,8 @@ class SpeakerManager:
         self.speaker_ids = {name: i for i, name in enumerate(speakers)}

         self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))
+        # cache d_vectors_by_speakers for fast inference using a bigger speakers.json
+        self.d_vectors_by_speakers = self.get_d_vectors_by_speakers()

     def get_d_vector_by_clip(self, clip_idx: str) -> List:
         """Get d_vector by clip ID.
@@ -187,7 +190,21 @@ class SpeakerManager:
         Returns:
             List[List]: all the d_vectors of the given speaker.
         """
-        return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]
+        return self.d_vectors_by_speakers[speaker_idx]
+
+    def get_d_vectors_by_speakers(self) -> Dict:
+        """Get all d_vectors by speaker.
+
+        Returns:
+            Dict: all the d_vectors of each speaker.
+        """
+        d_vectors_by_speakers = {}
+        for x in self.d_vectors.values():
+            if x["name"] not in d_vectors_by_speakers.keys():
+                d_vectors_by_speakers[x["name"]] = [x["embedding"]]
+            else:
+                d_vectors_by_speakers[x["name"]].append(x["embedding"])
+        return d_vectors_by_speakers

     def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
         """Get mean d_vector of a speaker ID.
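These hunks are in the `SpeakerManager` (TTS/tts/utils/speakers.py). The cache replaces a per-call scan over the whole `d_vectors` dict (one entry per clip in speakers.json) with a single grouping pass at load time. An illustrative, self-contained sketch with invented entries:

```python
# Illustrative sketch: the three clip entries below are invented examples.
import numpy as np

# SpeakerManager.d_vectors maps clip names to {"name": speaker, "embedding": vector}.
d_vectors = {
    "spk_a_001.wav": {"name": "spk_a", "embedding": [0.1, 0.2, 0.3]},
    "spk_a_002.wav": {"name": "spk_a", "embedding": [0.3, 0.2, 0.1]},
    "spk_b_001.wav": {"name": "spk_b", "embedding": [0.9, 0.8, 0.7]},
}

# same grouping that get_d_vectors_by_speakers() builds once and caches
d_vectors_by_speakers = {}
for x in d_vectors.values():
    d_vectors_by_speakers.setdefault(x["name"], []).append(x["embedding"])

# the average d-vector now used for inference (get_mean_d_vector with num_samples=None)
mean_spk_a = np.stack(d_vectors_by_speakers["spk_a"]).mean(axis=0)
print(mean_spk_a)  # -> [0.2 0.2 0.2]
```

With `num_samples=None`, `get_mean_d_vector()` averages over all of a speaker's embeddings, which is what the changed call sites in the VITS model above and in the Synthesizer below switch to in this commit.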
@@ -205,3 +205,88 @@ def synthesis(
         "outputs": outputs,
     }
     return return_dict
+
+
+def transfer_voice(
+    model,
+    CONFIG,
+    use_cuda,
+    reference_wav,
+    speaker_id=None,
+    d_vector=None,
+    reference_speaker_id=None,
+    reference_d_vector=None,
+    do_trim_silence=False,
+    use_griffin_lim=False,
+):
+    """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
+    the vocoder model.
+
+    Args:
+        model (TTS.tts.models):
+            The TTS model to synthesize audio with.
+
+        CONFIG (Coqpit):
+            Model configuration.
+
+        use_cuda (bool):
+            Enable/disable CUDA.
+
+        reference_wav (str):
+            Path of reference_wav to be used to voice conversion.
+
+        speaker_id (int):
+            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        d_vector (torch.Tensor):
+            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.
+
+        reference_speaker_id (int):
+            Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
+
+        reference_d_vector (torch.Tensor):
+            Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.
+
+        enable_eos_bos_chars (bool):
+            enable special chars for end of sentence and start of sentence. Defaults to False.
+
+        do_trim_silence (bool):
+            trim silence after synthesis. Defaults to False.
+    """
+    # pass tensors to backend
+    if speaker_id is not None:
+        speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
+
+    if d_vector is not None:
+        d_vector = embedding_to_torch(d_vector, cuda=use_cuda)
+
+    if reference_d_vector is not None:
+        reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
+
+    # load reference_wav audio
+    reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
+
+    if hasattr(model, "module"):
+        _func = model.module.inference_voice_conversion
+    else:
+        _func = model.inference_voice_conversion
+    model_outputs = _func(
+        reference_wav,
+        speaker_id,
+        d_vector,
+        reference_speaker_id,
+        reference_d_vector)
+
+    # convert outputs to numpy
+    # plot results
+    wav = None
+    model_outputs = model_outputs.squeeze()
+    if model_outputs.ndim == 2:  # [T, C_spec]
+        if use_griffin_lim:
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
+            # trim silence
+            if do_trim_silence:
+                wav = trim_silence(wav, model.ap)
+    else:  # [T,]
+        wav = model_outputs
+
+    return wav
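The new module-level helper above lives in TTS/tts/utils/synthesis.py. A sketch of calling it directly, mirroring how `Synthesizer.tts()` uses it further below; the paths and speaker names are placeholders, and it assumes the loaded model's `speaker_manager` is already populated from the config's d-vector file:

```python
# Sketch only: placeholder paths/names; assumes a d-vector based multi-speaker VITS model
# whose speaker_manager is populated from the config.
import numpy as np

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.synthesis import transfer_voice

config = load_config("/path/to/config.json")                          # placeholder
model = setup_model(config=config)
model.load_checkpoint(config, "/path/to/model_file.pth", eval=True)   # placeholder

# target voice: the average d-vector of a known speaker (names are placeholders)
d_vector = np.array(model.speaker_manager.get_mean_d_vector("target_speaker"))[None, :]
# reference voice: here another known speaker; with an initialized speaker encoder one could
# instead use speaker_manager.compute_d_vector_from_clip("reference.wav")
reference_d_vector = np.array(model.speaker_manager.get_mean_d_vector("source_speaker"))[None, :]

wav = transfer_voice(
    model=model,
    CONFIG=config,
    use_cuda=False,
    reference_wav="/path/to/reference.wav",  # placeholder: utterance to convert
    d_vector=d_vector,
    reference_d_vector=reference_d_vector,
)
# for VITS, `wav` is a 1-D torch tensor; convert with wav.numpy() before saving it with the AudioProcessor
```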
@@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model

 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.tts.utils.synthesis import synthesis, trim_silence
+from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
@@ -114,10 +114,14 @@ class Synthesizer(object):

         if not self.encoder_checkpoint:
             self._set_speaker_encoder_paths_from_tts_config()
+
         self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
         if use_cuda:
             self.tts_model.cuda()

+        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
+            self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config)
+
     def _set_speaker_encoder_paths_from_tts_config(self):
         """Set the encoder paths from the tts model config for models with speaker encoders."""
         if hasattr(self.tts_config, "model_args") and hasattr(
@@ -169,11 +173,13 @@ class Synthesizer(object):

     def tts(
         self,
-        text: str,
+        text: str = "",
         speaker_name: str = "",
         language_name: str = "",
         speaker_wav: Union[str, List[str]] = None,
         style_wav=None,
+        reference_wav=None,
+        reference_speaker_name=None,
     ) -> List[int]:
         """🐸 TTS magic. Run all the models and generate speech.

@@ -183,15 +189,23 @@ class Synthesizer(object):
             language_name (str, optional): language id for multi-language models. Defaults to "".
             speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
             style_wav ([type], optional): style waveform for GST. Defaults to None.
+
+            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
+            reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None.
         Returns:
             List[int]: [description]
         """
         start_time = time.time()
         wavs = []
-        sens = self.split_into_sentences(text)
-        print(" > Text splitted to sentences.")
-        print(sens)
+
+        if not text and not reference_wav:
+            raise ValueError(
+                "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
+            )
+
+        if text:
+            sens = self.split_into_sentences(text)
+            print(" > Text splitted to sentences.")
+            print(sens)

         # handle multi-speaker
         speaker_embedding = None
@@ -199,8 +213,8 @@ class Synthesizer(object):
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
             if speaker_name and isinstance(speaker_name, str):
                 if self.tts_config.use_d_vector_file:
-                    # get the speaker embedding from the saved d_vectors.
-                    speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0]
+                    # get the average speaker embedding from the saved d_vectors.
+                    speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False)
                     speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
@@ -209,7 +223,7 @@ class Synthesizer(object):
             elif not speaker_name and not speaker_wav:
                 raise ValueError(
                     " [!] Look like you use a multi-speaker model. "
-                    "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model."
+                    "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
                 )
         else:
             speaker_embedding = None
@@ -246,22 +260,83 @@ class Synthesizer(object):

         use_gl = self.vocoder_model is None

-        for sen in sens:
-            # synthesize voice
-            outputs = synthesis(
-                model=self.tts_model,
-                text=sen,
-                CONFIG=self.tts_config,
-                use_cuda=self.use_cuda,
-                speaker_id=speaker_id,
-                language_id=language_id,
-                style_wav=style_wav,
-                use_griffin_lim=use_gl,
-                d_vector=speaker_embedding,
-            )
-            waveform = outputs["wav"]
-            mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
+        if not reference_wav:
+            for sen in sens:
+                # synthesize voice
+                outputs = synthesis(
+                    model=self.tts_model,
+                    text=sen,
+                    CONFIG=self.tts_config,
+                    use_cuda=self.use_cuda,
+                    speaker_id=speaker_id,
+                    language_id=language_id,
+                    style_wav=style_wav,
+                    use_griffin_lim=use_gl,
+                    d_vector=speaker_embedding,
+                )
+                waveform = outputs["wav"]
+                mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
+                if not use_gl:
+                    # denormalize tts output based on tts audio config
+                    mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
+                    device_type = "cuda" if self.use_cuda else "cpu"
+                    # renormalize spectrogram based on vocoder config
+                    vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
+                    # compute scale factor for possible sample rate mismatch
+                    scale_factor = [
+                        1,
+                        self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
+                    ]
+                    if scale_factor[1] != 1:
+                        print(" > interpolating tts model output.")
+                        vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
+                    else:
+                        vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
+                    # run vocoder model
+                    # [1, T, C]
+                    waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
+                if self.use_cuda and not use_gl:
+                    waveform = waveform.cpu()
+                if not use_gl:
+                    waveform = waveform.numpy()
+                waveform = waveform.squeeze()
+
+                # trim silence
+                if self.tts_config.audio["do_trim_silence"] is True:
+                    waveform = trim_silence(waveform, self.tts_model.ap)
+
+                wavs += list(waveform)
+                wavs += [0] * 10000
+        else:
+            # get the speaker embedding or speaker id for the reference wav file
+            reference_speaker_embedding = None
+            reference_speaker_id = None
+            if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
+                if reference_speaker_name and isinstance(reference_speaker_name, str):
+                    if self.tts_config.use_d_vector_file:
+                        # get the speaker embedding from the saved d_vectors.
+                        reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0]
+                        reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :]  # [1 x embedding_dim]
+                    else:
+                        # get speaker idx from the speaker name
+                        reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name]
+                else:
+                    reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav)
+
+            outputs = transfer_voice(
+                model=self.tts_model,
+                CONFIG=self.tts_config,
+                use_cuda=self.use_cuda,
+                reference_wav=reference_wav,
+                speaker_id=speaker_id,
+                d_vector=speaker_embedding,
+                use_griffin_lim=use_gl,
+                reference_speaker_id=reference_speaker_id,
+                reference_d_vector=reference_speaker_embedding
+            )
+            waveform = outputs
+            if not use_gl:
+                mel_postnet_spec = outputs[0].detach().cpu().numpy()
+                # denormalize tts output based on tts audio config
+                mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
+                device_type = "cuda" if self.use_cuda else "cpu"
@@ -280,18 +355,11 @@ class Synthesizer(object):
                 # run vocoder model
                 # [1, T, C]
                 waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
-            if self.use_cuda and not use_gl:
+            if self.use_cuda:
                 waveform = waveform.cpu()
             if not use_gl:
                 waveform = waveform.numpy()
-            waveform = waveform.squeeze()
-
-            # trim silence
-            if self.tts_config.audio["do_trim_silence"] is True:
-                waveform = trim_silence(waveform, self.tts_model.ap)
-
-            wavs += list(waveform)
-            wavs += [0] * 10000
+            wavs = waveform.squeeze()

         # compute stats
         process_time = time.time() - start_time
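In the new `else` branch above, omitting `reference_speaker_name` makes the `Synthesizer` fall back to `speaker_manager.compute_d_vector_from_clip()`, so a speaker-encoder checkpoint has to be available. A sketch of that path, with every path and name below a placeholder:

```python
# Sketch only: all paths/names are placeholders; a speaker-encoder checkpoint is required
# because the reference embedding is computed from the wav itself.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="/path/to/vits_d_vector_model.pth",      # placeholder
    tts_config_path="/path/to/config.json",                 # placeholder
    encoder_checkpoint="/path/to/speaker_encoder.pth",      # placeholder
    encoder_config="/path/to/speaker_encoder_config.json",  # placeholder
    use_cuda=False,
)

wav = synthesizer.tts(
    speaker_name="target_speaker",        # placeholder: voice to convert into
    reference_wav="unknown_speaker.wav",  # placeholder: reference_speaker_name deliberately omitted
)
synthesizer.save_wav(wav, "converted.wav")
```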