diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index f06aca80..13e31ed9 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -20,9 +20,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "TTS_PATH = \"/home/erogol/projects/\"\n", @@ -33,7 +31,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], @@ -45,6 +42,7 @@ "import io\n", "import torch \n", "import time\n", + "import json\n", "import numpy as np\n", "from collections import OrderedDict\n", "from matplotlib import pylab as plt\n", @@ -72,23 +70,23 @@ "from IPython.display import Audio\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "os.environ['OMP_NUM_THREADS']='1'\n" + "os.environ['CUDA_VISIBLE_DEVICES']='1'" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n", + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n", "\n", @@ -106,19 +104,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '/media/erogol/data_ssd/Data/models/mozilla_models/4845/'\n", - "MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", + "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n", + "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/best_model_16K.pth.tar\"\n", + "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/config_16K.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", "use_cuda = False\n", "\n", @@ -126,6 +122,8 @@ "# CONFIG.windowing = False\n", "# CONFIG.prenet_dropout = False\n", "# CONFIG.separate_stopnet = True\n", + "# CONFIG.use_forward_attn = True\n", + "# CONFIG.forward_attn_mask = True\n", "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", @@ -136,17 +134,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# LOAD TTS MODEL\n", "from utils.text.symbols import symbols, phonemes\n", "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, CONFIG)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", "\n", "# load the audio processor\n", "ap = AudioProcessor(**CONFIG.audio) \n", @@ -163,39 +167,45 @@ "if use_cuda:\n", " model.cuda()\n", "model.eval()\n", - "print(cp['step'])" + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize \n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# LOAD WAVERNN\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", + " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", " bits = 10\n", - "\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", " wavernn = Model(\n", " rnn_dims=512,\n", " fc_dims=512,\n", - " mode=\"mold\",\n", - " pad=2,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n", + " mode=VOCODER_CONFIG.mode,\n", + " mulaw=VOCODER_CONFIG.mulaw,\n", + " pad=VOCODER_CONFIG.pad,\n", + " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", " compute_dims=128,\n", " res_out_dims=128,\n", " res_blocks=10,\n", - " hop_length=ap.hop_length,\n", - " sample_rate=ap.sample_rate,\n", + " hop_length=ap_vocoder.hop_length,\n", + " sample_rate=ap_vocoder.sample_rate,\n", + " use_upsample_net = True,\n", + " use_aux_net = True\n", " ).cuda()\n", "\n", - "\n", " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'])\n", + " wavernn.load_state_dict(check['model'], strict=False)\n", " if use_cuda:\n", " wavernn.cuda()\n", " wavernn.eval();\n", @@ -209,70 +219,75 @@ "### Comparision with https://mycroft.ai/blog/available-voices/" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "speaker_id = 500\n", + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ "model.eval()\n", "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = 0\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Seine Fuerenden Berater hatten Donald Trump seit Wochen beschworen, berichteten US-Medien: Lassen Sie das mit den Zoellen bleiben.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Der Klimawandel bedroht die Gletscher im Himalaya.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Zwei Unternehmen verlieren einem Medienbericht zufolge ihre Verträge als Maut-Inkasso-Manager.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Eine Ausländermaut nach dem Geschmack der CSU wird es nicht geben - das bedauert außerhalb der Partei fast niemand.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Angela Merkel ist als Klimakanzlerin gestartet.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -285,61 +300,51 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Dann vernachlässigte sie das Thema.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Nun, kurz vor dem Ende, will sie damit noch einmal neu anfangen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Nun ist der Spieltempel pleite, und manchen Dorfbewohnern fehlt das Geld zum Essen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Andrea Nahles will in der Fraktion die Vertrauensfrage stellen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Die Erfolge der Grünen bringen eine Reihe Unerfahrener in die Parlamente.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -352,212 +357,27 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, "outputs": [], "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Die Luftfahrtbranche arbeitet daran, CO2-neutral zu werden.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Michael Kretschmer versucht seit Monaten, die Bürger zu umgarnen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# !zip benchmark_samples/samples.zip benchmark_samples/*" @@ -566,9 +386,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3(mztts)", + "display_name": "Python 3", "language": "python", - "name": "mztts" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -580,9 +400,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }