model path changes for server and string strip

This commit is contained in:
Eren Golge 2018-06-06 07:13:17 -07:00
parent 83b1c7d1cb
commit 0afb14ed5e
5 changed files with 224 additions and 0 deletions

9
server/README.md Normal file
View File

@ -0,0 +1,9 @@
## TTS example web-server
Steps to run:
1. Download one of the models given on the main page.
2. Check out the commit that corresponds to the model you downloaded.
3. Set paths and other options in the file ```server/conf.json```.
4. Run the server ```python server/server.py -c conf.json```. (Requires Flask)
5. Go to ```localhost:[given_port]``` and enjoy.
Note that the audio quality in the browser is slightly worse due to the encoder quantization.

7
server/conf.json Normal file
View File

@ -0,0 +1,7 @@
{
"model_path":"/data/shared/erogol_models/May-25-2018_03:01PM-loc-sens-attention-ad94312",
"model_name":"checkpoint_291024.pth.tar",
"model_config":"config.json",
"port": 5000,
"use_cuda": true
}

32
server/server.py Normal file
View File

@ -0,0 +1,32 @@
#!flask/bin/python
import argparse
from synthesizer import Synthesizer
from TTS.utils.generic_utils import load_config
from flask import (Flask, Response, request,
render_template, send_file)
# Command-line handling. The config path is mandatory: without it
# ``args.config_path`` would be None and that None would be handed to
# load_config, so let argparse report the missing option up front.
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config_path', type=str, required=True,
                    help='path to the server config file')
args = parser.parse_args()
config = load_config(args.config_path)

app = Flask(__name__)

# Build the synthesizer once at start-up so every request reuses the same
# loaded model; all four values come from the server config file.
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)
@app.route('/')
def index():
    """Serve the demo front page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/api/tts', methods=['GET'])
def tts():
    """Synthesize the ``text`` query parameter and return it as WAV audio.

    Returns:
        The audio produced by ``synthesizer.tts`` sent with the
        ``audio/wav`` mimetype, or a plain-text HTTP 400 response when the
        ``text`` parameter is absent or contains only whitespace.
    """
    text = request.args.get('text')
    # request.args.get returns None for a missing parameter; reject that (and
    # blank input) here instead of letting it fail inside the synthesizer.
    if text is None or not text.strip():
        return Response("Missing required query parameter 'text'.",
                        status=400, mimetype='text/plain')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')
if __name__ == '__main__':
    # The server listens on every interface (0.0.0.0), so debug mode must stay
    # off: Flask's debug mode enables the interactive debugger, which allows
    # arbitrary code execution by anyone who can reach the port. It also
    # turns on the auto-reloader, which re-imports this module and would load
    # the model a second time.
    app.run(debug=False, host='0.0.0.0', port=config.port)

72
server/synthesizer.py Normal file
View File

@ -0,0 +1,72 @@
import io
import os
import librosa
import torch
import scipy
import numpy as np
import soundfile as sf
from TTS.utils.text import text_to_sequence
from TTS.utils.generic_utils import load_config
from TTS.utils.audio import AudioProcessor
from TTS.models.tacotron import Tacotron
from matplotlib import pylab as plt
class Synthesizer(object):
    """Load a trained Tacotron checkpoint and turn text into WAV audio.

    ``load_model`` must be called before ``tts``: it sets ``config``,
    ``use_cuda``, ``model`` and ``ap`` on the instance, all of which ``tts``
    and ``save_wav`` read.
    """

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Build the model and audio processor and restore the checkpoint.

        Args:
            model_path: Directory containing the checkpoint and its config.
            model_name: Checkpoint file name, joined onto ``model_path``.
            model_config: Config file name, joined onto ``model_path``.
            use_cuda: When true the model is moved to the GPU; otherwise the
                checkpoint is loaded with its storages mapped to the CPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load the checkpoint; on a CPU-only run keep every storage on the
        # CPU instead of the device it was saved from.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Peak-normalize ``wav`` to the int16 range and write it as WAV.

        Args:
            wav: One-dimensional sequence of audio samples. It is not
                modified; scaling happens on a copy.
            path: File name or writable binary file object passed to
                ``scipy.io.wavfile.write``.
        """
        # Imported explicitly: ``import scipy`` alone does not guarantee that
        # the ``scipy.io.wavfile`` submodule has been loaded.
        from scipy.io import wavfile

        samples = np.asarray(wav)
        # Guard against empty input (np.max would raise) and against an
        # all-zero signal (division by zero).
        peak = np.max(np.abs(samples)) if samples.size else 0.0
        scaled = samples * (32767 / max(1e-8, peak))
        wavfile.write(path, self.config.sample_rate, scaled.astype(np.int16))

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence into one WAV stream.

        The text is split on ``'.'``; fragments shorter than three characters
        after stripping whitespace are skipped. Each remaining sentence is
        synthesized on its own and followed by 10000 zero samples of silence,
        and all pieces are concatenated into a single waveform.

        Args:
            text: The text to speak.

        Returns:
            An ``io.BytesIO`` that ``save_wav`` has written the WAV data to.

        Raises:
            ValueError: If ``text`` contains no sentence long enough to
                synthesize.
        """
        text_cleaner = [self.config.text_cleaner]

        # Strip before measuring so whitespace-only fragments are dropped
        # instead of being synthesized as a lone '.'.
        sentences = []
        for fragment in (text or '').split('.'):
            fragment = fragment.strip()
            if len(fragment) < 3:
                continue
            sentences.append(fragment + '.')
        if not sentences:
            raise ValueError("no synthesizable sentence found in input text")

        wavs = []
        for sen in sentences:
            print(sen)
            # Encode the current sentence only, not the whole input text.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            _, linear_out, _, _ = self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wavs.append(self.ap.inv_spectrogram(linear_out.T))
            wavs.append(np.zeros(10000))

        # Write every sentence (not just the last one) into a single buffer.
        out = io.BytesIO()
        self.save_wav(np.concatenate(wavs), out)
        return out

104
server/templates/index.html Normal file
View File

@ -0,0 +1,104 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="">
<title>Mozilla - Text2Speech engine</title>
<!-- Bootstrap core CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">
<!-- Custom styles for this template -->
<style>
body {
padding-top: 54px;
}
@media (min-width: 992px) {
body {
padding-top: 56px;
}
}
</style>
</head>
<body>
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
<div class="container">
<a class="navbar-brand" href="#">Mozilla TTS</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav ml-auto">
<li class="nav-item active">
<a class="nav-link" href="#">Home
<span class="sr-only">(current)</span>
</a>
</li>
</ul>
</div>
</div>
</nav>
<!-- Page Content -->
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<h1 class="mt-5">Mozilla TTS server example.</h1>
<p class="lead">It is a "work-in-progress" with a "far-to-be-alpha" release.</p>
<ul class="list-unstyled">
</ul>
<input id="text" placeholder="Enter text" size=45 type="text" name="text">
<button id="speak-button" name="speak">Speak</button><br/><br/>
<audio id="audio" controls autoplay hidden></audio>
<p id="message"></p>
</div>
</div>
</div>
<!-- Bootstrap core JavaScript -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.bundle.min.js"></script>
<script>
// Shorthand for document.querySelector: returns the first element matching
// the given CSS selector.
function q(selector) {
    var element = document.querySelector(selector)
    return element
}
// Put the cursor in the text box as soon as the page loads.
q('#text').focus()

// On "Speak": if there is text, show a progress message, disable the button
// and hide the player, then start the synthesis request.
q('#speak-button').addEventListener('click', function(e) {
    // Declared locally; a bare assignment here would create a global `text`.
    var text = q('#text').value
    if (text) {
        q('#message').textContent = 'Synthesizing...'
        q('#speak-button').disabled = true
        q('#audio').hidden = true
        synthesize(text)
    }
    e.preventDefault()
    return false
})
// Request speech for `text` from /api/tts and load the returned audio into
// the #audio player. On failure the error is shown in #message. The speak
// button is re-enabled in both cases.
function synthesize(text) {
    fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
        .then(function(res) {
            // statusText can be empty, so fall back to the numeric status
            // to keep the error message meaningful.
            if (!res.ok) throw Error(res.statusText || ('HTTP ' + res.status))
            return res.blob()
        }).then(function(blob) {
            var audio = q('#audio')
            var previousUrl = audio.src
            q('#message').textContent = ''
            q('#speak-button').disabled = false
            audio.src = URL.createObjectURL(blob)
            audio.hidden = false
            // Release the blob held by the previous synthesis; without this
            // every request keeps its audio data alive until page unload.
            if (previousUrl && previousUrl.indexOf('blob:') === 0) {
                URL.revokeObjectURL(previousUrl)
            }
        }).catch(function(err) {
            q('#message').textContent = 'Error: ' + err.message
            q('#speak-button').disabled = false
        })
}
</script>
</body>
</html>