mirror of https://github.com/coqui-ai/TTS.git
commit
5b4e1b48d9
|
@ -5,4 +5,6 @@ tensorboard
|
|||
tensorboardX
|
||||
torch
|
||||
matplotlib
|
||||
Pillow
|
||||
flask
|
||||
scipy
|
|
@ -0,0 +1,9 @@
|
|||
## TTS example web-server
|
||||
Steps to run:
|
||||
1. Download one of the models given on the main page.
2. Check out the corresponding commit history.
3. Set paths and other options in the file ```server/conf.json```.
4. Run the server ```python server/server.py -c conf.json```. (Requires Flask)
5. Go to ```localhost:[given_port]``` and enjoy.
|
||||
|
||||
Note that the audio quality on browser is slightly worse due to the encoder quantization.
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"model_path":"/home/egolge/projects/models/May-22-2018_03_24PM-e6112f7",
|
||||
"model_name":"checkpoint_272976.pth.tar",
|
||||
"model_config":"config.json",
|
||||
"port": 5000,
|
||||
"use_cuda": true
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
#!flask/bin/python
"""Minimal Flask server exposing a TTS demo page and a synthesis endpoint.

Run as: ``python server.py -c conf.json``
"""
import argparse

from flask import Flask, render_template, request, send_file

from synthesizer import Synthesizer
from TTS.utils.generic_utils import load_config

parser = argparse.ArgumentParser()
# FIX: help text said "for training" — this is the server config (conf.json).
parser.add_argument('-c', '--config_path', type=str,
                    help='path to the server config file (e.g. conf.json)')
args = parser.parse_args()

# Load the config and the model once, at startup.
config = load_config(args.config_path)
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    """Serve the demo page."""
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    """Synthesize the ``text`` query parameter and return a WAV response."""
    text = request.args.get('text')
    print(" > Model input: {}".format(text))
    data = synthesizer.tts(text)
    return send_file(data, mimetype='audio/wav')


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger —
    # do not expose this server publicly with debug enabled.
    app.run(debug=True, host='0.0.0.0', port=config.port)
|
|
@ -0,0 +1,71 @@
|
|||
import io
|
||||
import os
|
||||
import librosa
|
||||
import torch
|
||||
import scipy
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from TTS.utils.text import text_to_sequence
|
||||
from TTS.utils.generic_utils import load_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.models.tacotron import Tacotron
|
||||
from matplotlib import pylab as plt
|
||||
|
||||
|
||||
class Synthesizer(object):
    """Wraps a trained Tacotron model for end-to-end text-to-speech."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load a Tacotron checkpoint and build the matching audio processor.

        Args:
            model_path (str): directory holding the checkpoint and its config.
            model_name (str): checkpoint file name inside ``model_path``.
            model_config (str): config file name inside ``model_path``.
            use_cuda (bool): if True, run the model on GPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load the checkpoint; map tensors to CPU when CUDA is not used.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Scale ``wav`` to the int16 range and write it as a WAV file.

        Args:
            wav (np.ndarray): float waveform; scaled in place.
            path: file path or file-like object accepted by scipy.
        """
        # Normalize to full int16 range; max(1e-8, ...) guards against
        # division by zero on an all-silence waveform.
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               wav.astype(np.int16))

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text (str): input text, split into sentences on '.'.

        Returns:
            io.BytesIO: WAV-encoded audio of all sentences separated by short
            silences, rewound to the start of the buffer.

        Raises:
            ValueError: if ``text`` contains no synthesizable sentence.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            # Skip fragments too short to be a sentence.
            if len(sen) < 3:
                continue
            sen += '.'
            print(sen)
            sen = sen.strip()
            # BUG FIX: encode the current sentence — the original passed
            # ``text``, re-synthesizing the whole input for every sentence.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = \
                self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs.append(wav)
            # Short silence between sentences.
            wavs.append(np.zeros(10000))
        if not wavs:
            raise ValueError("No sentence to synthesize in: {!r}".format(text))
        # BUG FIX: write the concatenation of all sentences — the original
        # saved only the waveform of the last loop iteration.
        out = io.BytesIO()
        self.save_wav(np.concatenate(wavs), out)
        # Flask's send_file reads from the current position; rewind first.
        out.seek(0)
        return out
|
|
@ -0,0 +1,104 @@
|
|||
<!DOCTYPE html>
<html lang="en">

<head>

  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <meta name="description" content="">
  <meta name="author" content="">

  <!-- FIX: typo "Mozillia" -> "Mozilla" -->
  <title>Mozilla - Text2Speech engine</title>

  <!-- Bootstrap core CSS -->
  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
    integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet">

  <!-- Custom styles for this template -->
  <style>
    body {
      padding-top: 54px;
    }
    @media (min-width: 992px) {
      body {
        padding-top: 56px;
      }
    }
  </style>
</head>

<body>

  <!-- Navigation -->
  <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
    <div class="container">
      <a class="navbar-brand" href="#">Mozilla TTS</a>
      <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
        <span class="navbar-toggler-icon"></span>
      </button>
      <div class="collapse navbar-collapse" id="navbarResponsive">
        <ul class="navbar-nav ml-auto">
          <li class="nav-item active">
            <a class="nav-link" href="#">Home
              <span class="sr-only">(current)</span>
            </a>
          </li>
        </ul>
      </div>
    </div>
  </nav>

  <!-- Page Content -->
  <div class="container">
    <div class="row">
      <div class="col-lg-12 text-center">
        <h1 class="mt-5">Mozilla TTS server example.</h1>
        <!-- FIX: grammar — "an far-to-be-alpha" -> "a far-to-be-alpha" -->
        <p class="lead">It is "work-in-progress" with a "far-to-be-alpha" release.</p>
        <ul class="list-unstyled">
        </ul>
        <input id="text" placeholder="Enter text" size=45 type="text" name="text">
        <button id="speak-button" name="speak">Speak</button><br/><br/>
        <audio id="audio" controls autoplay hidden></audio>
        <p id="message"></p>
      </div>
    </div>
  </div>

  <!-- Bootstrap core JavaScript -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  <!-- FIX: previously a Bootstrap 3.3.7 *CSS* file was referenced in a
       <script> tag; load the matching Bootstrap 4.1.1 JS bundle instead
       (required for the collapsing navbar toggler above). -->
  <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" crossorigin="anonymous"></script>
  <script>
    function q(selector) {return document.querySelector(selector)}
    q('#text').focus()
    q('#speak-button').addEventListener('click', function(e) {
      text = q('#text').value
      if (text) {
        q('#message').textContent = 'Synthesizing...'
        q('#speak-button').disabled = true
        q('#audio').hidden = true
        synthesize(text)
      }
      e.preventDefault()
      return false
    })
    function synthesize(text) {
      fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})
        .then(function(res) {
          if (!res.ok) throw Error(res.statusText)
          return res.blob()
        }).then(function(blob) {
          q('#message').textContent = ''
          q('#speak-button').disabled = false
          q('#audio').src = URL.createObjectURL(blob)
          q('#audio').hidden = false
        }).catch(function(err) {
          q('#message').textContent = 'Error: ' + err.message
          q('#speak-button').disabled = false
        })
    }
  </script>

</body>

</html>
|
Loading…
Reference in New Issue