In [1]:
import os

import IPython
import torch

from IPython.display import Audio

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.utils.audio import AudioProcessor

In [2]:
# Directory holding the trained VITS run (checkpoint, config, id tables).
# NOTE: this is a machine-specific absolute path -- edit it to point at your own run.
GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'

# os.path.join keeps these paths valid whether or not GENERAL_PATH ends with a slash.
MODEL_PATH = os.path.join(GENERAL_PATH, 'best_model.pth.tar')
CONFIG_PATH = os.path.join(GENERAL_PATH, 'config.json')
TTS_LANGUAGES = os.path.join(GENERAL_PATH, "language_ids.json")
TTS_SPEAKERS = os.path.join(GENERAL_PATH, "speakers.json")

# Use the GPU only when PyTorch reports one as available.
USE_CUDA = torch.cuda.is_available()

In [3]:
# Notebook-level flags; nothing in this cell reads them.
speaker_embedding = None
use_griffin_lim = True

# Parse the training configuration and build the audio processor from its
# `audio` section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

# Point the config at the speakers file, then build the model from it and
# load the language-id table.
C.model_args['d_vector_file'] = TTS_SPEAKERS
model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Restore the trained weights; tensors are mapped to the CPU while loading.
cp = torch.load(MODEL_PATH, map_location="cpu")
model.load_state_dict(cp['model'])

# Switch to evaluation mode and move to the GPU when one is available.
model.eval()
if USE_CUDA:
    model = model.cuda()

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, ML

In [4]:
# Select the target voice: ask the speaker manager for the d-vector returned
# by get_mean_d_vector for this speaker name.
speaker_name = 'VCTK_p260'
d_vector = model.speaker_manager.get_mean_d_vector(speaker_name)

In [5]:
# Display the language-name -> integer-id mapping held by the model's
# language manager (used to choose `language_id` for synthesis).
model.language_manager.language_id_mapping

{'af': 0,
 'en': 1,
 'fr-fr': 2,
 'jv': 3,
 'pt-br': 4,
 'st': 5,
 'su': 6,
 'tn': 7,
 'xh': 8}

In [6]:
# Inference-time scales, assigned to the model in the order listed.
# NOTE(review): `noise_scale` / `inference_noise_scale` and `noise_scale_w` /
# `inference_noise_scale_dp` are described identically but carry different
# values -- confirm which attribute names the model actually reads.
scales = {
    "noise_scale": 0.0,  # noise variance applied to the random z vector at inference
    "length_scale": 1.0,  # duration-predictor scaler; larger values give slower speech
    "noise_scale_w": 0.0,  # noise variance applied to the duration predictor z vector at inference
    "inference_noise_scale": 0.5,  # noise variance applied to the random z vector at inference
    "inference_noise_scale_dp": 0.6,  # noise variance applied to the duration predictor z vector at inference
}
for attr_name, attr_value in scales.items():
    setattr(model, attr_name, attr_value)

In [7]:
# French test sentence to synthesize (accented characters restored; the
# previous literal contained mojibake in "développer").
text = "Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire."

# Resolve the numeric id from the language name instead of hard-coding it;
# 'fr-fr' maps to 2 in this model's language_id_mapping.
language_id = model.language_manager.language_id_mapping['fr-fr']

# The four values of the mapping returned by synthesis() are unpacked in
# order; only `wav` is used in this cell.
wav, alignment, _, _ = synthesis(
    model,
    text,
    C,
    "cuda" in str(next(model.parameters()).device),  # follow whichever device the model is on
    ap,
    speaker_id=None,
    d_vector=d_vector,
    style_wav=None,
    language_id=language_id,
    enable_eos_bos_chars=C.enable_eos_bos_chars,
    use_griffin_lim=True,
    do_trim_silence=False,
).values()

# Render an inline audio player at the audio processor's sample rate.
IPython.display.display(Audio(wav, rate=ap.sample_rate))