mirror of https://github.com/coqui-ai/TTS.git
commit
5b4e1b48d9
|
@ -5,4 +5,6 @@ tensorboard
|
|||
tensorboardX
|
||||
torch
|
||||
matplotlib
|
||||
Pillow
|
||||
flask
|
||||
scipy
|
|
@ -0,0 +1,9 @@
|
|||
## TTS example web-server
|
||||
Steps to run:
|
||||
1. Download one of the models given on the main page.
2. Check out the corresponding commit history.
3. Set paths and other options in the file ```server/conf.json```.
4. Run the server ```python server/server.py -c conf.json```. (Requires Flask)
5. Go to ```localhost:[given_port]``` and enjoy.
|
||||
|
||||
Note that the audio quality on browser is slightly worse due to the encoder quantization.
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"model_path":"/home/egolge/projects/models/May-22-2018_03_24PM-e6112f7",
|
||||
"model_name":"checkpoint_272976.pth.tar",
|
||||
"model_config":"config.json",
|
||||
"port": 5000,
|
||||
"use_cuda": true
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
#!flask/bin/python
"""Minimal Flask server exposing a TTS demo page and a synthesis endpoint.

Run as: ``python server.py -c conf.json``
"""
import argparse

from flask import Flask, render_template, request, send_file

from synthesizer import Synthesizer
from TTS.utils.generic_utils import load_config

parser = argparse.ArgumentParser()
# FIX: help text said "for training" — this is the server config (conf.json).
parser.add_argument('-c', '--config_path', type=str,
                    help='path to the server config file (e.g. conf.json)')
args = parser.parse_args()

# Load the config and the model once, at startup.
config = load_config(args.config_path)
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    """Serve the demo page."""
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    """Synthesize the ``text`` query parameter and return a WAV response."""
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger —
    # do not expose this server publicly with debug enabled.
    app.run(debug=True, host='0.0.0.0', port=config.port)
|
|
@ -0,0 +1,71 @@
|
|||
import io
|
||||
import os
|
||||
import librosa
|
||||
import torch
|
||||
import scipy
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from TTS.utils.text import text_to_sequence
|
||||
from TTS.utils.generic_utils import load_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.models.tacotron import Tacotron
|
||||
from matplotlib import pylab as plt
|
||||
|
||||
|
||||
class Synthesizer(object):
    """Wraps a trained Tacotron model for end-to-end text-to-speech."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load a Tacotron checkpoint and build the matching audio processor.

        Args:
            model_path (str): directory holding the checkpoint and its config.
            model_name (str): checkpoint file name inside ``model_path``.
            model_config (str): config file name inside ``model_path``.
            use_cuda (bool): if True, run the model on GPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load the checkpoint; map tensors to CPU when CUDA is not used.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Scale ``wav`` to the int16 range and write it as a WAV file.

        Args:
            wav (np.ndarray): float waveform; scaled in place.
            path: file path or file-like object accepted by scipy.
        """
        # Normalize to full int16 range; max(1e-8, ...) guards against
        # division by zero on an all-silence waveform.
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               wav.astype(np.int16))

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text (str): input text, split into sentences on '.'.

        Returns:
            io.BytesIO: WAV-encoded audio of all sentences separated by short
            silences, rewound to the start of the buffer.

        Raises:
            ValueError: if ``text`` contains no synthesizable sentence.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            # Skip fragments too short to be a sentence.
            if len(sen) < 3:
                continue
            sen += '.'
            print(sen)
            sen = sen.strip()
            # BUG FIX: encode the current sentence — the original passed
            # ``text``, re-synthesizing the whole input for every sentence.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = \
                self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs.append(wav)
            # Short silence between sentences.
            wavs.append(np.zeros(10000))
        if not wavs:
            raise ValueError("No sentence to synthesize in: {!r}".format(text))
        # BUG FIX: write the concatenation of all sentences — the original
        # saved only the waveform of the last loop iteration.
        out = io.BytesIO()
        self.save_wav(np.concatenate(wavs), out)
        # Flask's send_file reads from the current position; rewind first.
        out.seek(0)
        return out
|
|
@ -0,0 +1,104 @@
|
|||
<!DOCTYPE html>
<html lang="en">

<head>

  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <meta name="description" content="">
  <meta name="author" content="">

  <!-- FIX: typo "Mozillia" -> "Mozilla" -->
  <title>Mozilla - Text2Speech engine</title>

  <!-- Bootstrap core CSS -->
  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
    integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">

  <!-- Custom styles for this template -->
  <style>
    body {
      padding-top: 54px;
    }
    @media (min-width: 992px) {
      body {
        padding-top: 56px;
      }
    }
  </style>
</head>

<body>

  <!-- Navigation -->
  <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
    <div class="container">
      <a class="navbar-brand" href="#">Mozilla TTS</a>
      <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
        <span class="navbar-toggler-icon"></span>
      </button>
      <div class="collapse navbar-collapse" id="navbarResponsive">
        <ul class="navbar-nav ml-auto">
          <li class="nav-item active">
            <a class="nav-link" href="#">Home
              <span class="sr-only">(current)</span>
            </a>
          </li>
        </ul>
      </div>
    </div>
  </nav>

  <!-- Page Content -->
  <div class="container">
    <div class="row">
      <div class="col-lg-12 text-center">
        <h1 class="mt-5">Mozilla TTS server example.</h1>
        <!-- FIX: grammar — "an far-to-be-alpha" -> "a far-to-be-alpha" -->
        <p class="lead">It is "work-in-progress" with a "far-to-be-alpha" release.</p>
        <ul class="list-unstyled">
        </ul>
        <input id="text" placeholder="Enter text" size=45 type="text" name="text">
        <button id="speak-button" name="speak">Speak</button><br/><br/>
        <audio id="audio" controls autoplay hidden></audio>
        <p id="message"></p>
      </div>
    </div>
  </div>

  <!-- Bootstrap core JavaScript -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  <!-- FIX: previously a Bootstrap 3.3.7 *CSS* file was referenced in a
       <script> tag; load the matching Bootstrap 4.1.1 JS bundle instead
       (required for the collapsing navbar toggler above). -->
  <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" crossorigin="anonymous"></script>
  <script>
    function q(selector) {return document.querySelector(selector)}
    q('#text').focus()
    q('#speak-button').addEventListener('click', function(e) {
      text = q('#text').value
      if (text) {
        q('#message').textContent = 'Synthesizing...'
        q('#speak-button').disabled = true
        q('#audio').hidden = true
        synthesize(text)
      }
      e.preventDefault()
      return false
    })
    function synthesize(text) {
      fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
        .then(function(res) {
          if (!res.ok) throw Error(res.statusText)
          return res.blob()
        }).then(function(blob) {
          q('#message').textContent = ''
          q('#speak-button').disabled = false
          q('#audio').src = URL.createObjectURL(blob)
          q('#audio').hidden = false
        }).catch(function(err) {
          q('#message').textContent = 'Error: ' + err.message
          q('#speak-button').disabled = false
        })
    }
  </script>

</body>

</html>
|
Loading…
Reference in New Issue