diff --git a/server/README.md b/server/README.md
new file mode 100644
index 00000000..974b695b
--- /dev/null
+++ b/server/README.md
@@ -0,0 +1,9 @@
+## TTS example web-server
+Steps to run:
+1. Download one of the models listed on the main page.
+2. Check out the git commit that corresponds to that model.
+3. Set paths and other options in the file ```server/conf.json```.
+4. Run the server: ```python server/server.py -c conf.json``` (requires Flask).
+5. Go to ```localhost:[given_port]``` and enjoy.
+
+Note that the audio quality in the browser is slightly worse due to the encoder quantization.
\ No newline at end of file
diff --git a/server/conf.json b/server/conf.json
new file mode 100644
index 00000000..a4a33a0a
--- /dev/null
+++ b/server/conf.json
@@ -0,0 +1,7 @@
+{
+    "model_path": "/data/shared/erogol_models/May-25-2018_03:01PM-loc-sens-attention-ad94312",
+    "model_name": "checkpoint_291024.pth.tar",
+    "model_config": "config.json",
+    "port": 5000,
+    "use_cuda": true
+}
diff --git a/server/server.py b/server/server.py
new file mode 100644
index 00000000..01267447
--- /dev/null
+++ b/server/server.py
@@ -0,0 +1,32 @@
+#!flask/bin/python
+import argparse
+
+from flask import Flask, request, render_template, send_file
+
+from synthesizer import Synthesizer
+from TTS.utils.generic_utils import load_config
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-c', '--config_path', type=str,
+                    help='path to the server config file')
+args = parser.parse_args()
+
+config = load_config(args.config_path)
+app = Flask(__name__)
+synthesizer = Synthesizer()
+synthesizer.load_model(config.model_path, config.model_name,
+                       config.model_config, config.use_cuda)
+
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+
+@app.route('/api/tts', methods=['GET'])
+def tts():
+    text = request.args.get('text')
+    print(" > Model input: {}".format(text))
+    data = synthesizer.tts(text)
+    return send_file(data, mimetype='audio/wav')
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port=config.port)
\ No newline at end of file
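For reference, once the server is running (step 4 in the README), the `/api/tts` endpoint can be exercised directly, without the browser page. A minimal client sketch, assuming the default port 5000 from `conf.json` and the third-party `requests` package; the example text and output filename are arbitrary:

```python
# Minimal client sketch for the /api/tts endpoint in server.py.
# Assumes the server is listening on localhost:5000 (the port set in conf.json).
import requests

resp = requests.get("http://localhost:5000/api/tts",
                    params={"text": "Hello, this is a test."})
resp.raise_for_status()
with open("tts_output.wav", "wb") as f:
    f.write(resp.content)  # the endpoint returns a WAV file via send_file
```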
diff --git a/server/synthesizer.py b/server/synthesizer.py
new file mode 100644
index 00000000..808107fe
--- /dev/null
+++ b/server/synthesizer.py
@@ -0,0 +1,72 @@
+import io
+import os
+
+import numpy as np
+import scipy.io.wavfile
+import torch
+
+from TTS.models.tacotron import Tacotron
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import load_config
+from TTS.utils.text import text_to_sequence
+
+
+class Synthesizer(object):
+
+    def load_model(self, model_path, model_name, model_config, use_cuda):
+        model_config = os.path.join(model_path, model_config)
+        self.model_file = os.path.join(model_path, model_name)
+        print(" > Loading model ...")
+        print(" | > model config: ", model_config)
+        print(" | > model file: ", self.model_file)
+        config = load_config(model_config)
+        self.config = config
+        self.use_cuda = use_cuda
+        self.model = Tacotron(config.embedding_size, config.num_freq,
+                              config.num_mels, config.r)
+        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
+                                 config.min_level_db, config.frame_shift_ms,
+                                 config.frame_length_ms, config.preemphasis,
+                                 config.ref_level_db, config.num_freq,
+                                 config.power, griffin_lim_iters=60)
+        # load the model state
+        if use_cuda:
+            cp = torch.load(self.model_file)
+        else:
+            cp = torch.load(self.model_file,
+                            map_location=lambda storage, loc: storage)
+        self.model.load_state_dict(cp['model'])
+        if use_cuda:
+            self.model.cuda()
+        self.model.eval()
+
+    def save_wav(self, wav, path):
+        # scale to the 16-bit PCM range before writing
+        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
+        scipy.io.wavfile.write(path, self.config.sample_rate,
+                               wav.astype(np.int16))
+
+    def tts(self, text):
+        text_cleaner = [self.config.text_cleaner]
+        wavs = []
+        # synthesize sentence by sentence, separated by short silences
+        for sen in text.split('.'):
+            if len(sen) < 3:
+                continue
+            sen = sen.strip() + '.'
+            print(sen)
+            seq = np.array(text_to_sequence(sen, text_cleaner))
+            chars_var = torch.from_numpy(seq).unsqueeze(0)
+            if self.use_cuda:
+                chars_var = chars_var.cuda()
+            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
+                chars_var)
+            linear_out = linear_out[0].data.cpu().numpy()
+            wav = self.ap.inv_spectrogram(linear_out.T)
+            wavs.append(wav)
+            wavs.append(np.zeros(10000))
+        # write the concatenated sentences into a single in-memory WAV file
+        out = io.BytesIO()
+        self.save_wav(np.concatenate(wavs), out)
+        return out
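`Synthesizer` can also be driven directly, which is convenient for debugging outside Flask. A minimal sketch, assuming a model directory laid out as referenced in `conf.json`; the directory path below is a placeholder, not a released model:

```python
# Standalone usage sketch for Synthesizer; the model directory path is a placeholder.
from synthesizer import Synthesizer

synth = Synthesizer()
synth.load_model("/path/to/model_dir",         # holds the checkpoint and config.json
                 "checkpoint_291024.pth.tar",
                 "config.json",
                 use_cuda=False)
buf = synth.tts("This is a test sentence.")    # returns an io.BytesIO with WAV data
with open("test.wav", "wb") as f:
    f.write(buf.getvalue())
```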
diff --git a/server/templates/index.html b/server/templates/index.html
new file mode 100644
index 00000000..b120d83a
--- /dev/null
+++ b/server/templates/index.html
@@ -0,0 +1,104 @@
+<!-- 104-line demo page; markup omitted. Its visible notice reads: -->
+<!-- It is "work-in-progress" with a "far-to-be-alpha" release. -->