In [ ]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt
%pylab inline
rcParams["figure.figsize"] = (16, 5)

sys.path.append('/home/erogol/projects/')
import librosa
import librosa.display

from TTS.models.tacotron import Tacotron
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio

from utils import *
In [ ]:
ls /data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7
In [ ]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    # synthesize a single sentence and return only the waveform
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform


def text2audio(text, model, CONFIG, use_cuda, ap):
    # synthesize the text sentence by sentence and collect the waveforms
    wavs = []
    for sen in text.split('.'):
        if len(sen) < 3:
            continue
        sen += '.'
        sen = sen.strip()
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        wavs.append(np.zeros(10000))  # insert a short silence between sentences
    # audio = np.stack(wavs)
    # IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))
    return wavs
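`create_speech` comes from the local `utils` module imported in the first cell (`from utils import *`) and is not shown in this notebook. For readers without that file, below is a minimal sketch of what it is assumed to do, built only from the `text_to_sequence` and `AudioProcessor` helpers imported above; the name `create_speech_sketch` and the `CONFIG.text_cleaner` field are illustrative, and the real helper may differ in its internals.

In [ ]:
def create_speech_sketch(model, text, CONFIG, use_cuda, ap):
    # NOTE: illustrative re-implementation; the actual utils.create_speech may differ.
    # text -> character id sequence with a batch dimension
    seq = np.asarray(text_to_sequence(text, [CONFIG.text_cleaner]), dtype=np.int64)
    chars = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars = chars.cuda()
    # forward pass: mel spectrogram, linear spectrogram, attention alignment, stop tokens
    with torch.no_grad():
        mel_spec, linear_spec, alignments, stop_tokens = model(chars)
    linear_spec = linear_spec[0].data.cpu().numpy()
    alignment = alignments[0].data.cpu().numpy()
    # invert the predicted linear spectrogram to a waveform with Griffin-Lim
    waveform = ap.inv_spectrogram(linear_spec.T)
    return waveform, alignment, linear_spec, stop_tokens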
In [ ]:
# Set constants
ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7'
MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [ ]:
# check_idxs = [50008, 100016, 200032, 266208]
check_idxs = [274480]
In [ ]:
# build the model
model = Tacotron(CONFIG.embedding_size,
                 CONFIG.num_freq,
                 CONFIG.num_mels,
                 CONFIG.r)

# load the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,
                    griffin_lim_iters=30)

for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)

    # load the checkpoint
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the model weights
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 400  # cap decoder steps to avoid run-away inference

    text = "Voice is natural, voice is human. That’s why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn’t available to the majority of people. We think that stifles innovation. So we’ve launched Project Common Voice, a project to help make voice recognition open to everyone."
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))
    ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')
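The cell above saves the concatenated audio; each sentence can also be written to its own file for closer listening. The naming below is only an example and assumes the `benchmark_samples/` folder used in the previous cell already exists.

In [ ]:
# every second entry in `wavs` is a silence buffer inserted by text2audio, so take every other one
for i, wav in enumerate(wavs[::2]):
    ap.save_wav(wav, 'benchmark_samples/CommonVoice_sentence_{}.wav'.format(i))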