HTML formatting; enable the multi-speaker model on the server, with a dropdown menu to select the speaker

This commit is contained in:
Eren Gölge 2021-04-22 15:22:36 +02:00
parent f9f3d04d14
commit ad047c8195
2 changed files with 137 additions and 94 deletions

View File

@ -32,12 +32,19 @@ def create_argparser():
"--model_name", "--model_name",
type=str, type=str,
default="tts_models/en/ljspeech/tacotron2-DDC", default="tts_models/en/ljspeech/tacotron2-DDC",
help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>", help=
"Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
) )
parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.") parser.add_argument("--vocoder_name",
type=str,
default=None,
help="name of one of the released vocoder models.")
# Args for running custom models # Args for running custom models
parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.") parser.add_argument("--config_path",
default=None,
type=str,
help="Path to model config file.")
parser.add_argument( parser.add_argument(
"--model_path", "--model_path",
type=str, type=str,
@ -47,15 +54,34 @@ def create_argparser():
parser.add_argument( parser.add_argument(
"--vocoder_path", "--vocoder_path",
type=str, type=str,
help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).", help=
"Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
default=None, default=None,
) )
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--vocoder_config_path",
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) type=str,
parser.add_argument("--port", type=int, default=5002, help="port to listen on.") help="Path to vocoder model config file.",
parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.") default=None)
parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.") parser.add_argument("--speakers_file_path",
parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.") type=str,
help="JSON file for multi-speaker model.",
default=None)
parser.add_argument("--port",
type=int,
default=5002,
help="port to listen on.")
parser.add_argument("--use_cuda",
type=convert_boolean,
default=False,
help="true to use CUDA.")
parser.add_argument("--debug",
type=convert_boolean,
default=False,
help="true to enable Flask debug mode.")
parser.add_argument("--show_details",
type=convert_boolean,
default=False,
help="Generate model detail page.")
return parser return parser
@ -83,11 +109,14 @@ if args.list_models:
# CASE2: load pre-trained model paths # CASE2: load pre-trained model paths
if args.model_name is not None and not args.model_path: if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name) model_path, config_path, model_item = manager.download_model(
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name args.model_name)
args.vocoder_name = model_item[
"default_vocoder"] if args.vocoder_name is None else args.vocoder_name
if args.vocoder_name is not None and not args.vocoder_path: if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) vocoder_path, vocoder_config_path, _ = manager.download_model(
args.vocoder_name)
# CASE3: set custom model paths # CASE3: set custom model paths
if args.model_path is not None: if args.model_path is not None:
@ -100,11 +129,11 @@ if args.vocoder_path is not None:
vocoder_config_path = args.vocoder_config_path vocoder_config_path = args.vocoder_config_path
# load models # load models
synthesizer = Synthesizer( synthesizer = Synthesizer(model_path, config_path, speakers_file_path,
model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda vocoder_path, vocoder_config_path, args.use_cuda)
)
use_speaker_embedding = synthesizer.tts_config.get("use_external_speaker_embedding_file", False) use_speaker_embedding = synthesizer.tts_config.get(
"use_external_speaker_embedding_file", False)
use_gst = synthesizer.tts_config.get("use_gst", False) use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__) app = Flask(__name__)
@ -131,9 +160,11 @@ def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
@app.route("/") @app.route("/")
def index(): def index():
return render_template( return render_template("index.html",
"index.html", show_details=args.show_details, use_speaker_embedding=use_speaker_embedding, use_gst=use_gst show_details=args.show_details,
) use_speaker_embedding=use_speaker_embedding,
speaker_ids=synthesizer.speaker_manager.speaker_ids,
use_gst=use_gst)
@app.route("/details") @app.route("/details")
@ -156,8 +187,8 @@ def details():
@app.route("/api/tts", methods=["GET"]) @app.route("/api/tts", methods=["GET"])
def tts(): def tts():
text = request.args.get("text") text = request.args.get("text")
speaker_idx = request.args.get("speaker", "") speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style-wav", "") style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav) style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text)) print(" > Model input: {}".format(text))

View File

@ -1,7 +1,7 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
@ -12,24 +12,26 @@
<!-- Bootstrap core CSS --> <!-- Bootstrap core CSS -->
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous" rel="stylesheet"> integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
rel="stylesheet">
<!-- Custom styles for this template --> <!-- Custom styles for this template -->
<style> <style>
body {
padding-top: 54px;
}
@media (min-width: 992px) {
body { body {
padding-top: 56px; padding-top: 54px;
} }
}
@media (min-width: 992px) {
body {
padding-top: 56px;
}
}
</style> </style>
</head> </head>
<body> <body>
<a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a> <a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
<!-- Navigation --> <!-- Navigation -->
<!-- <!--
@ -54,78 +56,88 @@
<!-- Page Content --> <!-- Page Content -->
<div class="container"> <div class="container">
<div class="row"> <div class="row">
<div class="col-lg-12 text-center"> <div class="col-lg-12 text-center">
<img class="mt-5" src="{{url_for('static', filename='coqui-log-green-TTS.png')}}" align="middle" width="512"/> <img class="mt-5" src="{{url_for('static', filename='coqui-log-green-TTS.png')}}" align="middle"
width="512" />
<ul class="list-unstyled"> <ul class="list-unstyled">
</ul> </ul>
{%if use_speaker_embedding%}
<input id="speaker-json-key" placeholder="speaker json key.." size=45 type="text" name="speaker-json-key">
{%endif%}
{%if use_gst%} {%if use_gst%}
<input value='{"0": 0.1}' id="style-wav" placeholder="style wav (dict or path ot wav).." size=45 type="text" name="style-wav"> <input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path ot wav).." size=45
{%endif%} type="text" name="style_wav">
{%endif%}
<input id="text" placeholder="Type here..." size=45 type="text" name="text"> <input id="text" placeholder="Type here..." size=45 type="text" name="text">
<button id="speak-button" name="speak">Speak</button><br/><br/> <button id="speak-button" name="speak">Speak</button><br /><br />
{%if show_details%}
<button id="details-button" onclick="location.href = 'details'" name="model-details">Model Details</button><br/><br/> {%if use_speaker_embedding%}
{%endif%} Choose a speaker:
<audio id="audio" controls autoplay hidden></audio> <select id="speaker_id" name=speaker_id method="GET" action="/">
<p id="message"></p> {% for speaker_id in speaker_ids %}
<option value="{{speaker_id}}" SELECTED>{{speaker_id}}</option>"
{% endfor %}
</select><br /><br />
{%endif%}
{%if show_details%}
<button id="details-button" onclick="location.href = 'details'" name="model-details">Model
Details</button><br /><br />
{%endif%}
<audio id="audio" controls autoplay hidden></audio>
<p id="message"></p>
</div>
</div> </div>
</div>
</div> </div>
<!-- Bootstrap core JavaScript --> <!-- Bootstrap core JavaScript -->
<script> <script>
function getTextValue(textId) { function getTextValue(textId) {
const container = q(textId) const container = q(textId)
if (container) { if (container) {
return container.value return container.value
}
return ""
} }
function q(selector) {return document.querySelector(selector)} return ""
q('#text').focus() }
function do_tts(e) { function q(selector) { return document.querySelector(selector) }
const text = q('#text').value q('#text').focus()
const speakerJsonKey = getTextValue('#speaker-json-key') function do_tts(e) {
const styleWav = getTextValue('#style-wav') const text = q('#text').value
if (text) { const speaker_id = getTextValue('#speaker_id')
q('#message').textContent = 'Synthesizing...' const style_wav = getTextValue('#style_wav')
q('#speak-button').disabled = true if (text) {
q('#audio').hidden = true q('#message').textContent = 'Synthesizing...'
synthesize(text, speakerJsonKey, styleWav) q('#speak-button').disabled = true
} q('#audio').hidden = true
e.preventDefault() synthesize(text, speaker_id, style_wav)
return false
} }
q('#speak-button').addEventListener('click', do_tts) e.preventDefault()
q('#text').addEventListener('keyup', function(e) { return false
if (e.keyCode == 13) { // enter }
q('#speak-button').addEventListener('click', do_tts)
q('#text').addEventListener('keyup', function (e) {
if (e.keyCode == 13) { // enter
do_tts(e) do_tts(e)
}
})
function synthesize(text, speakerJsonKey="", styleWav="") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker=${encodeURIComponent(speakerJsonKey)}&style-wav=${encodeURIComponent(styleWav)}` , {cache: 'no-cache'})
.then(function(res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()
}).then(function(blob) {
q('#message').textContent = ''
q('#speak-button').disabled = false
q('#audio').src = URL.createObjectURL(blob)
q('#audio').hidden = false
}).catch(function(err) {
q('#message').textContent = 'Error: ' + err.message
q('#speak-button').disabled = false
})
} }
</script> })
function synthesize(text, speaker_id = "", style_wav = "") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' })
.then(function (res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()
}).then(function (blob) {
q('#message').textContent = ''
q('#speak-button').disabled = false
q('#audio').src = URL.createObjectURL(blob)
q('#audio').hidden = false
}).catch(function (err) {
q('#message').textContent = 'Error: ' + err.message
q('#speak-button').disabled = false
})
}
</script>
</body> </body>
</html> </html>