mirror of https://github.com/coqui-ai/TTS.git
commit
fcc394ba3d
|
@ -6,3 +6,5 @@ tensorboardX
|
||||||
torch
|
torch
|
||||||
matplotlib
|
matplotlib
|
||||||
Pillow
|
Pillow
|
||||||
|
flask
|
||||||
|
scipy
|
|
@ -0,0 +1,9 @@
|
||||||
|
## TTS example web-server
|
||||||
|
Steps to run:
|
||||||
|
1. Download one of the models given on the main page.

2. Check out the corresponding commit history.

3. Set paths and other options in the file ```server/conf.json```.

4. Run the server ```python server/server.py -c conf.json```. (Requires Flask)

5. Go to ```localhost:[given_port]``` and enjoy.
|
||||||
|
|
||||||
|
Note that the audio quality on browser is slightly worse due to the encoder quantization.
|
|
@ -0,0 +1,7 @@
|
||||||
|
{
|
||||||
|
"model_path":"/home/egolge/projects/models/May-22-2018_03_24PM-e6112f7",
|
||||||
|
"model_name":"checkpoint_272976.pth.tar",
|
||||||
|
"model_config":"config.json",
|
||||||
|
"port": 5000,
|
||||||
|
"use_cuda": true
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!flask/bin/python
"""Minimal Flask web server exposing the TTS synthesizer.

Run as: python server.py -c conf.json
Serves an HTML demo page at / and a GET endpoint /api/tts?text=... that
returns the synthesized speech as a WAV stream.
"""
import argparse

from synthesizer import Synthesizer
from TTS.utils.generic_utils import load_config
from flask import Flask, request, render_template, send_file

parser = argparse.ArgumentParser()
# required=True gives a clear argparse error instead of an opaque failure
# inside load_config(None) when -c is omitted.
parser.add_argument('-c', '--config_path', type=str, required=True,
                    help='path to config file for training')
args = parser.parse_args()

config = load_config(args.config_path)
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    """Serve the demo page."""
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    """Synthesize the `text` query parameter and stream the result as WAV."""
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows arbitrary code execution — do not expose 0.0.0.0 with
    # debug mode on in production.
    app.run(debug=True, host='0.0.0.0', port=config.port)
|
|
@ -0,0 +1,71 @@
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import librosa
|
||||||
|
import torch
|
||||||
|
import scipy
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
from TTS.utils.text import text_to_sequence
|
||||||
|
from TTS.utils.generic_utils import load_config
|
||||||
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.models.tacotron import Tacotron
|
||||||
|
from matplotlib import pylab as plt
|
||||||
|
|
||||||
|
|
||||||
|
class Synthesizer(object):
    """Loads a trained Tacotron checkpoint and synthesizes speech from text."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load model config, build the Tacotron network and audio processor,
        and restore weights from the checkpoint.

        Args:
            model_path: directory containing the checkpoint and its config.
            model_name: checkpoint filename inside model_path.
            model_config: config filename inside model_path.
            use_cuda: if True, load and run the model on GPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load checkpoint; map to CPU when CUDA is not requested so a
        # GPU-trained checkpoint still loads on a CPU-only host.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        # Inference mode: disables dropout / batch-norm updates.
        self.model.eval()

    def save_wav(self, wav, path):
        """Peak-normalize a float waveform to int16 range and write it as WAV.

        Args:
            wav: 1-D numpy float array (modified in place by the scaling).
            path: filename or file-like object accepted by scipy's writer.
        """
        # 1e-8 guards against division by zero on all-silent audio.
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               wav.astype(np.int16))

    def tts(self, text):
        """Synthesize `text` sentence by sentence.

        Returns:
            io.BytesIO positioned at 0, containing the WAV-encoded audio of
            all sentences joined with short silences.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            # Skip fragments too short to be a real sentence.
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            # BUG FIX: synthesize the current sentence (`sen`), not the
            # whole input `text` on every iteration.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = \
                self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            # wav = wav[:self.ap.find_endpoint(wav)]
            wavs.append(wav)
            wavs.append(np.zeros(10000))  # short silence between sentences
        out = io.BytesIO()
        # BUG FIX: write all accumulated sentences, not just the last `wav`.
        self.save_wav(np.concatenate(wavs), out)
        # Rewind so Flask's send_file streams from the beginning.
        out.seek(0)
        return out
|
|
@ -0,0 +1,104 @@
|
||||||
|
<!DOCTYPE html>
<html lang="en">

<head>

  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <meta name="description" content="">
  <meta name="author" content="">

  <!-- FIX: "Mozillia" typo -->
  <title>Mozilla - Text2Speech engine</title>

  <!-- Bootstrap core CSS -->
  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
    integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">

  <!-- Custom styles for this template -->
  <style>
    body {
      padding-top: 54px;
    }
    @media (min-width: 992px) {
      body {
        padding-top: 56px;
      }
    }
  </style>
</head>

<body>

  <!-- Navigation -->
  <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
    <div class="container">
      <a class="navbar-brand" href="#">Mozilla TTS</a>
      <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
        <span class="navbar-toggler-icon"></span>
      </button>
      <div class="collapse navbar-collapse" id="navbarResponsive">
        <ul class="navbar-nav ml-auto">
          <li class="nav-item active">
            <a class="nav-link" href="#">Home
              <span class="sr-only">(current)</span>
            </a>
          </li>
        </ul>
      </div>
    </div>
  </nav>

  <!-- Page Content -->
  <div class="container">
    <div class="row">
      <div class="col-lg-12 text-center">
        <h1 class="mt-5">Mozilla TTS server example.</h1>
        <p class="lead">It is a "work-in-progress" with a "far-from-alpha" release.</p>
        <ul class="list-unstyled">
        </ul>
        <input id="text" placeholder="Enter text" size=45 type="text" name="text">
        <button id="speak-button" name="speak">Speak</button><br/><br/>
        <audio id="audio" controls autoplay hidden></audio>
        <p id="message"></p>
      </div>
    </div>
  </div>

  <!-- Bootstrap core JavaScript -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  <!-- FIX: previous tag loaded the Bootstrap 3.3.7 *CSS* file in a <script>
       element; load the matching 4.1.1 JS bundle instead. -->
  <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"
    integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
  <script>
    function q(selector) {return document.querySelector(selector)}
    q('#text').focus()
    q('#speak-button').addEventListener('click', function(e) {
      text = q('#text').value
      if (text) {
        q('#message').textContent = 'Synthesizing...'
        q('#speak-button').disabled = true
        q('#audio').hidden = true
        synthesize(text)
      }
      e.preventDefault()
      return false
    })
    function synthesize(text) {
      fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
        .then(function(res) {
          if (!res.ok) throw Error(res.statusText)
          return res.blob()
        }).then(function(blob) {
          q('#message').textContent = ''
          q('#speak-button').disabled = false
          q('#audio').src = URL.createObjectURL(blob)
          q('#audio').hidden = false
        }).catch(function(err) {
          q('#message').textContent = 'Error: ' + err.message
          q('#speak-button').disabled = false
        })
    }
  </script>

</body>

</html>
|
Loading…
Reference in New Issue