Remove the noise in the code

This commit is contained in:
Eren 2018-06-05 16:15:57 +02:00
parent 021017a465
commit 020833b438
3 changed files with 22 additions and 16 deletions

View File

@@ -25,9 +25,7 @@ def tts():
text = request.args.get('text') text = request.args.get('text')
print(" > Model input: {}".format(text)) print(" > Model input: {}".format(text))
data = synthesizer.tts(text) data = synthesizer.tts(text)
return send_file(data, return send_file(data,
attachment_filename="testing.wav",
as_attachment=True,
mimetype='audio/wav') mimetype='audio/wav')
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -2,7 +2,9 @@ import io
import os import os
import librosa import librosa
import torch import torch
import scipy
import numpy as np import numpy as np
import soundfile as sf
from TTS.utils.text import text_to_sequence from TTS.utils.text import text_to_sequence
from TTS.utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
@@ -24,7 +26,7 @@ class Synthesizer(object):
self.model = Tacotron(config.embedding_size, config.num_freq, config.num_mels, config.r) self.model = Tacotron(config.embedding_size, config.num_freq, config.num_mels, config.r)
self.ap = AudioProcessor(config.sample_rate, config.num_mels, config.min_level_db, self.ap = AudioProcessor(config.sample_rate, config.num_mels, config.min_level_db,
config.frame_shift_ms, config.frame_length_ms, config.preemphasis, config.frame_shift_ms, config.frame_length_ms, config.preemphasis,
config.ref_level_db, config.num_freq, config.power, griffin_lim_iters=30) config.ref_level_db, config.num_freq, config.power, griffin_lim_iters=60)
# load model state # load model state
if use_cuda: if use_cuda:
cp = torch.load(self.model_file) cp = torch.load(self.model_file)
@@ -37,8 +39,13 @@ class Synthesizer(object):
self.model.eval() self.model.eval()
def save_wav(self, wav, path): def save_wav(self, wav, path):
wav *= 32767 / max(0.01, np.max(np.abs(wav))) wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
librosa.output.write_wav(path, wav.astype(np.float), self.config.sample_rate, norm=True) # sf.write(path, wav.astype(np.int32), self.config.sample_rate, format='wav')
# wav = librosa.util.normalize(wav.astype(np.float), norm=np.inf, axis=None)
# wav = wav / wav.max()
# sf.write(path, wav.astype('float'), self.config.sample_rate, format='ogg')
scipy.io.wavfile.write(path, self.config.sample_rate, wav.astype(np.int16))
# librosa.output.write_wav(path, wav.astype(np.int16), self.config.sample_rate, norm=True)
def tts(self, text): def tts(self, text):
text_cleaner = [self.config.text_cleaner] text_cleaner = [self.config.text_cleaner]
@@ -47,6 +54,7 @@ class Synthesizer(object):
if len(sen) < 3: if len(sen) < 3:
continue continue
sen +='.' sen +='.'
print(sen)
sen = sen.strip() sen = sen.strip()
seq = np.array(text_to_sequence(text, text_cleaner)) seq = np.array(text_to_sequence(text, text_cleaner))
chars_var = torch.from_numpy(seq).unsqueeze(0) chars_var = torch.from_numpy(seq).unsqueeze(0)
@@ -55,7 +63,7 @@ class Synthesizer(object):
mel_out, linear_out, alignments, stop_tokens = self.model.forward(chars_var) mel_out, linear_out, alignments, stop_tokens = self.model.forward(chars_var)
linear_out = linear_out[0].data.cpu().numpy() linear_out = linear_out[0].data.cpu().numpy()
wav = self.ap.inv_spectrogram(linear_out.T) wav = self.ap.inv_spectrogram(linear_out.T)
wav = wav[:self.ap.find_endpoint(wav)] # wav = wav[:self.ap.find_endpoint(wav)]
out = io.BytesIO() out = io.BytesIO()
wavs.append(wav) wavs.append(wav)
wavs.append(np.zeros(10000)) wavs.append(np.zeros(10000))

View File

@@ -72,15 +72,15 @@
function q(selector) {return document.querySelector(selector)} function q(selector) {return document.querySelector(selector)}
q('#text').focus() q('#text').focus()
q('#speak-button').addEventListener('click', function(e) { q('#speak-button').addEventListener('click', function(e) {
text = q('#text').value.trim() text = q('#text').value
if (text) { if (text) {
q('#message').textContent = 'Synthesizing...' q('#message').textContent = 'Synthesizing...'
q('#speak-button').disabled = true q('#speak-button').disabled = true
q('#audio').hidden = true q('#audio').hidden = true
synthesize(text) synthesize(text)
} }
e.preventDefault() e.preventDefault()
return false return false
}) })
function synthesize(text) { function synthesize(text) {
fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'}) fetch('/api/tts?text=' + encodeURIComponent(text), {cache: 'no-cache'})