mirror of https://github.com/coqui-ai/TTS.git
commit
fcc394ba3d
|
@ -6,3 +6,5 @@ tensorboardX
|
||||||
torch
|
torch
|
||||||
matplotlib
|
matplotlib
|
||||||
Pillow
|
Pillow
|
||||||
|
flask
|
||||||
|
scipy
|
|
@ -0,0 +1,9 @@
|
||||||
|
## TTS example web-server
|
||||||
|
Steps to run:
|
||||||
|
1. Download one of the models given on the main page.

2. Check out the corresponding commit history.

3. Set paths and other options in the file ```server/conf.json```.

4. Run the server ```python server/server.py -c conf.json```. (Requires Flask)

5. Go to ```localhost:[given_port]``` and enjoy.
|
||||||
|
|
||||||
|
Note that the audio quality on browser is slightly worse due to the encoder quantization.
|
|
@ -0,0 +1,7 @@
|
||||||
|
{
|
||||||
|
"model_path":"/home/egolge/projects/models/May-22-2018_03_24PM-e6112f7",
|
||||||
|
"model_name":"checkpoint_272976.pth.tar",
|
||||||
|
"model_config":"config.json",
|
||||||
|
"port": 5000,
|
||||||
|
"use_cuda": true
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!flask/bin/python
"""Minimal Flask web server exposing the TTS synthesizer.

Run as: python server.py -c conf.json
Serves an HTML demo page at / and a GET endpoint /api/tts?text=... that
returns the synthesized speech as a WAV stream.
"""
import argparse

from synthesizer import Synthesizer
from TTS.utils.generic_utils import load_config
from flask import Flask, request, render_template, send_file

parser = argparse.ArgumentParser()
# required=True gives a clear argparse error instead of an opaque failure
# inside load_config(None) when -c is omitted.
parser.add_argument('-c', '--config_path', type=str, required=True,
                    help='path to config file for training')
args = parser.parse_args()

config = load_config(args.config_path)
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    """Serve the demo page."""
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    """Synthesize the `text` query parameter and stream the result as WAV."""
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows arbitrary code execution — do not expose 0.0.0.0 with
    # debug mode on in production.
    app.run(debug=True, host='0.0.0.0', port=config.port)
|
|
@ -0,0 +1,71 @@
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import librosa
|
||||||
|
import torch
|
||||||
|
import scipy
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
from TTS.utils.text import text_to_sequence
|
||||||
|
from TTS.utils.generic_utils import load_config
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.models.tacotron import Tacotron
|
||||||
|
from matplotlib import pylab as plt
|
||||||
|
|
||||||
|
|
||||||
|
class Synthesizer(object):
    """Loads a trained Tacotron checkpoint and synthesizes speech from text."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load model config, build the Tacotron network and audio processor,
        and restore weights from the checkpoint.

        Args:
            model_path: directory containing the checkpoint and its config.
            model_name: checkpoint filename inside model_path.
            model_config: config filename inside model_path.
            use_cuda: if True, load and run the model on GPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load checkpoint; map to CPU when CUDA is not requested so a
        # GPU-trained checkpoint still loads on a CPU-only host.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        # Inference mode: disables dropout / batch-norm updates.
        self.model.eval()

    def save_wav(self, wav, path):
        """Peak-normalize a float waveform to int16 range and write it as WAV.

        Args:
            wav: 1-D numpy float array (modified in place by the scaling).
            path: filename or file-like object accepted by scipy's writer.
        """
        # 1e-8 guards against division by zero on all-silent audio.
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               wav.astype(np.int16))

    def tts(self, text):
        """Synthesize `text` sentence by sentence.

        Returns:
            io.BytesIO positioned at 0, containing the WAV-encoded audio of
            all sentences joined with short silences.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            # Skip fragments too short to be a real sentence.
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            # BUG FIX: synthesize the current sentence (`sen`), not the
            # whole input `text` on every iteration.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = \
                self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            # wav = wav[:self.ap.find_endpoint(wav)]
            wavs.append(wav)
            wavs.append(np.zeros(10000))  # short silence between sentences
        out = io.BytesIO()
        # BUG FIX: write all accumulated sentences, not just the last `wav`.
        self.save_wav(np.concatenate(wavs), out)
        # Rewind so Flask's send_file streams from the beginning.
        out.seek(0)
        return out
|
|
@ -0,0 +1,104 @@
|
||||||
|
<!DOCTYPE html>
<html lang="en">

<head>

  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <meta name="description" content="">
  <meta name="author" content="">

  <!-- FIX: "Mozillia" typo -->
  <title>Mozilla - Text2Speech engine</title>

  <!-- Bootstrap core CSS -->
  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
    integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">

  <!-- Custom styles for this template -->
  <style>
    body {
      padding-top: 54px;
    }
    @media (min-width: 992px) {
      body {
        padding-top: 56px;
      }
    }
  </style>
</head>

<body>

  <!-- Navigation -->
  <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
    <div class="container">
      <a class="navbar-brand" href="#">Mozilla TTS</a>
      <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
        <span class="navbar-toggler-icon"></span>
      </button>
      <div class="collapse navbar-collapse" id="navbarResponsive">
        <ul class="navbar-nav ml-auto">
          <li class="nav-item active">
            <a class="nav-link" href="#">Home
              <span class="sr-only">(current)</span>
            </a>
          </li>
        </ul>
      </div>
    </div>
  </nav>

  <!-- Page Content -->
  <div class="container">
    <div class="row">
      <div class="col-lg-12 text-center">
        <h1 class="mt-5">Mozilla TTS server example.</h1>
        <p class="lead">It is a "work-in-progress" with a "far-from-alpha" release.</p>
        <ul class="list-unstyled">
        </ul>
        <input id="text" placeholder="Enter text" size=45 type="text" name="text">
        <button id="speak-button" name="speak">Speak</button><br/><br/>
        <audio id="audio" controls autoplay hidden></audio>
        <p id="message"></p>
      </div>
    </div>
  </div>

  <!-- Bootstrap core JavaScript -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  <!-- FIX: previous tag loaded the Bootstrap 3.3.7 *CSS* file in a <script>
       element; load the matching 4.1.1 JS bundle instead. -->
  <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"
    integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
  <script>
    function q(selector) {return document.querySelector(selector)}
    q('#text').focus()
    q('#speak-button').addEventListener('click', function(e) {
      text = q('#text').value
      if (text) {
        q('#message').textContent = 'Synthesizing...'
        q('#speak-button').disabled = true
        q('#audio').hidden = true
        synthesize(text)
      }
      e.preventDefault()
      return false
    })
    function synthesize(text) {
      fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
        .then(function(res) {
          if (!res.ok) throw Error(res.statusText)
          return res.blob()
        }).then(function(blob) {
          q('#message').textContent = ''
          q('#speak-button').disabled = false
          q('#audio').src = URL.createObjectURL(blob)
          q('#audio').hidden = false
        }).catch(function(err) {
          q('#message').textContent = 'Error: ' + err.message
          q('#speak-button').disabled = false
        })
    }
  </script>

</body>

</html>
|
Loading…
Reference in New Issue