diff --git a/.cardboardlint.yml b/.cardboardlint.yml index 464ea733..4a115a37 100644 --- a/.cardboardlint.yml +++ b/.cardboardlint.yml @@ -1,2 +1,5 @@ linters: - pylint: + # pylintrc: pylintrc + filefilter: ['- test_*.py', '+ *.py', '- *.npy'] + # exclude: \ No newline at end of file diff --git a/.github/PR_TEMPLATE.md b/.github/PR_TEMPLATE.md new file mode 100644 index 00000000..7bfb8c60 --- /dev/null +++ b/.github/PR_TEMPLATE.md @@ -0,0 +1,18 @@ +--- +name: 'Contribution Guideline ' +about: Refer to Contribution Guideline +title: '' +labels: '' +assignees: '' + +--- +### Contribution Guideline + +Please send your PRs to `dev` branch if it is not directly related to a specific branch. +Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter. +We have cardboardlinter setup in this repository, so for example, if you've made some changes and would like to run the linter on just the changed code, you can use the following command: + +```bash +pip install pylint cardboardlint +cardboardlinter --refspec master +``` \ No newline at end of file diff --git a/.gitignore b/.gitignore index e1e9fbd4..b6fee485 100644 --- a/.gitignore +++ b/.gitignore @@ -128,3 +128,4 @@ tests/outputs/* TODO.txt .vscode/* data/* +notebooks/data/* diff --git a/.pylintrc b/.pylintrc index b6e04944..a78b521e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -157,7 +157,8 @@ disable=missing-docstring, xreadlines-attribute, deprecated-sys-function, exception-escape, - comprehension-escape + comprehension-escape, + duplicate-code # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option diff --git a/.travis.yml b/.travis.yml index 645f9861..5210bef2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,12 @@ language: python git: quiet: true +before_install: + - sudo apt-get update + - sudo apt-get -y install espeak + - python -m pip install --upgrade pip + - pip install six==1.12.0 + matrix: include: - name: "Lint check" @@ -11,7 +17,15 @@ matrix: env: TEST_SUITE="lint" - name: "Unit tests" python: "3.6" - install: pip install --quiet -r requirements_tests.txt + install: + - python setup.py egg_info + - pip install -e . env: TEST_SUITE="unittest" + - name: "Unit tests" + python: "3.6" + install: + - python setup.py egg_info + - pip install -e . + env: TEST_SUITE="testscripts" script: ./.travis/script diff --git a/.travis/script b/.travis/script index ca6f4cd3..0c24a221 100755 --- a/.travis/script +++ b/.travis/script @@ -10,10 +10,12 @@ if [[ ( "$TRAVIS_PULL_REQUEST" != "false" ) && ( "$TEST_SUITE" == "lint" ) ]]; t fi if [[ "$TEST_SUITE" == "unittest" ]]; then - # Run tests on all pushes - pushd tts_namespace - python -m unittest - popd - # Test server package + nosetests tests --nocapture ./tests/test_server_package.sh fi + +if [[ "$TEST_SUITE" == "testscripts" ]]; then + # test model training scripts + ./tests/test_tts_train.sh + ./tests/test_vocoder_train.sh +fi diff --git a/README.md b/README.md index 7d9884b0..136b2ac5 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,73 @@

- +
-This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample generated voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn). +

+ + + +

-TTS includes two different model implementations which are based on [Tacotron](https://arxiv.org/abs/1703.10135) and [Tacotron2](https://arxiv.org/abs/1712.05884). Tacotron is smaller, efficient and easier to train but Tacotron2 provides better results, especially when it is combined with a Neural vocoder. Therefore, choose depending on your project requirements. +
-If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons. +This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). + +Mozilla TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. + +You can check some of synthesized voice samples from [here](https://erogol.github.io/ddc-samples/). + +If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about some of TTS architectures and [here](https://github.com/erogol/TTS-papers) list of up-to-date research papers. [![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7) -## TTS Performance +## TTS Performance

[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results) +## Provided Models and Methods +Text-to-Spectrogram: +- Tacotron: [paper](https://arxiv.org/abs/1703.10135) +- Tacotron2: [paper](https://arxiv.org/abs/1712.05884) + +Attention Methods: +- Guided Attention: [paper](https://arxiv.org/abs/1710.08969) +- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006) +- Graves Attention: [paper](https://arxiv.org/abs/1910.10288) +- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) + +Speaker Encoder: +- GE2E: [paper](https://arxiv.org/abs/1710.10467) + +Vocoders: +- MelGAN: [paper](https://arxiv.org/abs/1910.06711) +- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) +- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) + +You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers). + ## Features -- High performance Text2Speech models on Torch and Tensorflow 2.0. -- High performance Speaker Encoder to compute speaker embeddings efficiently. -- Integration with various Neural Vocoders (PWGAN, MelGAN, WaveRNN) -- Released trained models. -- Efficient training codes for PyTorch. (soon for Tensorflow 2.0) -- Codes to convert Torch models to Tensorflow 2.0. -- Detailed training anlaysis on console and Tensorboard. +- High performance Deep Learning models for Text2Speech tasks. + - Text2Spec models (Tacotron, Tacotron2). + - Speaker Encoder to compute speaker embeddings efficiently. + - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN) +- Fast and efficient model training. +- Detailed training logs on console and Tensorboard. +- Support for multi-speaker TTS. +- Efficient Multi-GPUs training. +- Ability to convert PyTorch models to Tensorflow 2.0 and TFLite for inference. +- Released models in PyTorch, Tensorflow and TFLite. 
- Tools to curate Text2Speech datasets under```dataset_analysis```. - Demo server for model testing. - Notebooks for extensive model benchmarking. - Modular (but not too much) code base enabling easy testing for new ideas. -## Requirements and Installation +## Main Requirements and Installation Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation. * python>=3.6 - * pytorch>=0.4.1 + * pytorch>=1.4.1 + * tensorflow>=2.2 * librosa * tensorboard * tensorboardX @@ -47,18 +82,34 @@ Or you can use ```requirements.txt``` to install the requirements only. ```pip install -r requirements.txt``` +### Directory Structure +``` +|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) +|- utils/ (common utilities.) +|- TTS + |- bin/ (folder for all the executables.) + |- train*.py (train your target model.) + |- distribute.py (train your TTS model using Multiple GPUs.) + |- compute_statistics.py (compute dataset statistics for normalization.) + |- convert*.py (convert target torch model to TF.) + |- tts/ (text to speech models) + |- layers/ (model layer definitions) + |- models/ (model definitions) + |- tf/ (Tensorflow 2 utilities and model implementations) + |- utils/ (model specific utilities.) + |- speaker_encoder/ (Speaker Encoder models.) + |- (same) + |- vocoder/ (Vocoder models.) + |- (same) +``` + ### Docker -A barebone `Dockerfile` exists at the root of the project, which should let you quickly setup the environment. By default, it will start the server and let you query it. Make sure to use `nvidia-docker` to use your GPUs. Make sure you follow the instructions in the [`server README`](server/README.md) before you build your image so that the server can find the model within the image. +A docker image is created by [@synesthesiam](https://github.com/synesthesiam) and shared in a separate [repository](https://github.com/synesthesiam/docker-mozillatts) with the latest LJSpeech models. 
-``` -docker build -t mozilla-tts . -nvidia-docker run -it --rm -p 5002:5002 mozilla-tts -``` - -## Checkpoints and Audio Samples +## Release Models Please visit [our wiki.](https://github.com/mozilla/TTS/wiki/Released-Models) -## Example Model Outputs +## Sample Model Output Below you see Tacotron model state after 16K iterations with batch-size 32 with LJSpeech dataset. > "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning." @@ -67,26 +118,14 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-articl example_output -## Runtime -The most time-consuming part is the vocoder algorithm (Griffin-Lim) which runs on CPU. By setting its number of iterations lower, you might have faster execution with a small loss of quality. Some of the experimental values are below. - -Sentence: "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." - -Audio length is approximately 6 secs. - -| Time (secs) | System | # GL iters | Model -| ---- |:-------|:-----------| ---- | -|2.00|GTX1080Ti|30|Tacotron| -|3.01|GTX1080Ti|60|Tacotron| -|3.57|CPU|60|Tacotron| -|5.27|GTX1080Ti|60|Tacotron2| -|6.50|CPU|60|Tacotron2| - +## [Mozilla TTS Tutorials and Notebooks](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials) ## Datasets and Data-Loading -TTS provides a generic dataloder easy to use for new datasets. You need to write an preprocessor function to integrate your own dataset.Check ```datasets/preprocess.py``` to see some examples. After the function, you need to set ```dataset``` field in ```config.json```. Do not forget other data related fields too. +TTS provides a generic dataloader easy to use for your custom dataset. +You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples. 
+After that, you need to set ```dataset``` fields in ```config.json```. -Some of the open-sourced datasets that we successfully applied TTS, are linked below. +Some of the public datasets that we successfully applied TTS: - [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) - [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) @@ -96,9 +135,9 @@ Some of the open-sourced datasets that we successfully applied TTS, are linked b - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 ## Training and Fine-tuning LJ-Speech -Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below. +Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below. -To start with, split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go. +To start with, split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go. 
``` shuf metadata.csv > metadata_shuf.csv @@ -108,15 +147,19 @@ tail -n 1100 metadata_shuf.csv > metadata_val.csv To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in ```config.json```. -```train.py --config_path config.json``` +```python TTS/bin/train.py --config_path TTS/tts/configs/config.json``` To fine-tune a model, use ```--restore_path```. -```train.py --config_path config.json --restore_path /path/to/your/model.pth.tar``` +```python TTS/bin/train.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar``` + +To continue an old training run, use ```--continue_path```. + +```python TTS/bin/train.py --continue_path /path/to/your/run_folder/``` For multi-GPU training use ```distribute.py```. It enables process based multi-GPU training where each process uses a single GPU. -```CUDA_VISIBLE_DEVICES="0,1,4" distribute.py --config_path config.json``` +```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --config_path TTS/tts/configs/config.json``` Each run creates a new output folder and ```config.json``` is copied under this folder. @@ -124,8 +167,6 @@ In case of any error or intercepted execution, if there is no checkpoint yet und You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. -## [Testing and Examples](https://github.com/mozilla/TTS/wiki/Examples-using-TTS) - ## Contribution guidelines This repository is governed by Mozilla's code of conduct and etiquette guidelines. 
For more details, please read the [Mozilla Community Participation Guidelines.](https://www.mozilla.org/about/governance/policies/participation/) @@ -137,10 +178,10 @@ cardboardlinter --refspec master ``` ## Collaborative Experimentation Guide -If you like to use TTS to try a new idea and like to share your experiments with the community, we urge you to use the following guideline for a better collaboration. +If you like to use TTS to try a new idea and like to share your experiments with the community, we urge you to use the following guideline for a better collaboration. (If you have an idea for better collaboration, let us know) - Create a new branch. -- Open an issue pointing your branch. +- Open an issue pointing your branch. - Explain your experiment. - Share your results as you proceed. (Tensorboard log files, audio results, visuals etc.) - Use LJSpeech dataset (for English) if you like to compare results with the released models. (It is the most open scalable dataset for quick experimentation) @@ -155,7 +196,7 @@ If you like to use TTS to try a new idea and like to share your experiments with - [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/). - [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN) - [ ] Multi-speaker embedding. -- [ ] Model optimization (model export, model pruning etc.) +- [x] Model optimization (model export, model pruning etc.) - @@ -27,7 +27,7 @@ - + Fork me on GitHub @@ -60,7 +60,7 @@

Mozilla TTS

- +

diff --git a/speaker_encoder/README.md b/mozilla_voice_tts/speaker_encoder/README.md similarity index 88% rename from speaker_encoder/README.md rename to mozilla_voice_tts/speaker_encoder/README.md index 38b4bb1b..7706c7d7 100644 --- a/speaker_encoder/README.md +++ b/mozilla_voice_tts/speaker_encoder/README.md @@ -1,16 +1,16 @@ -### Speaker embedding (Experimental) +### Speaker Encoder This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. -Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). +Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). ![](umap.png) Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. -To run the code, you need to follow the same flow as in TTS. +To run the code, you need to follow the same flow as in mozilla_voice_tts. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. 
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` diff --git a/layers/__init__.py b/mozilla_voice_tts/speaker_encoder/__init__.py similarity index 100% rename from layers/__init__.py rename to mozilla_voice_tts/speaker_encoder/__init__.py diff --git a/speaker_encoder/compute_embeddings.py b/mozilla_voice_tts/speaker_encoder/compute_embeddings.py similarity index 92% rename from speaker_encoder/compute_embeddings.py rename to mozilla_voice_tts/speaker_encoder/compute_embeddings.py index bfa377e3..027d3381 100644 --- a/speaker_encoder/compute_embeddings.py +++ b/mozilla_voice_tts/speaker_encoder/compute_embeddings.py @@ -6,9 +6,9 @@ import numpy as np from tqdm import tqdm import torch -from TTS.speaker_encoder.model import SpeakerEncoder -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import load_config +from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder +from mozilla_voice_tts.tts.utils.audio import AudioProcessor +from mozilla_voice_tts.tts.utils.generic_utils import load_config parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset. ') diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json new file mode 100644 index 00000000..11da0cf6 --- /dev/null +++ b/mozilla_voice_tts/speaker_encoder/config.json @@ -0,0 +1,61 @@ + +{ + "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning", + "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", + "audio":{ + // Audio processing parameters + "num_mels": 40, // size of the mel spec frame. + "fft_size": 400, // number of stft frequency levels. Size of the linear spectrogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 400, // stft window length in samples. 
+ "hop_length": 160, // stft window hop-length in samples. + "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": false, // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for trimming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. 
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save training stats and checkpoints. + "print_step": 1, // Number of steps to log training on console. + "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 40, + "proj_dim": 256, + "lstm_dim": 256, + "num_lstm_layers": 3, + "use_lstm_with_projection": false + }, + "datasets": + [ + { + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/speaker_encoder/dataset.py b/mozilla_voice_tts/speaker_encoder/dataset.py similarity index 96% rename from speaker_encoder/dataset.py rename to mozilla_voice_tts/speaker_encoder/dataset.py index 913b7a6d..d3243c13 100644 --- a/speaker_encoder/dataset.py +++ b/mozilla_voice_tts/speaker_encoder/dataset.py @@ -9,7 +9,7 @@ class MyDataset(Dataset): num_utter_per_speaker=10, skip_speakers=False, verbose=False): """ Args: - ap (TTS.utils.AudioProcessor): audio processor object. + ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. seq_len (int): voice segment length in seconds. 
verbose (bool): print diagnostic information. @@ -31,7 +31,7 @@ class MyDataset(Dataset): print(f" | > Num speakers: {len(self.speakers)}") def load_wav(self, filename): - audio = self.ap.load_wav(filename) + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio def load_data(self, idx): diff --git a/speaker_encoder/generic_utils.py b/mozilla_voice_tts/speaker_encoder/generic_utils.py similarity index 93% rename from speaker_encoder/generic_utils.py rename to mozilla_voice_tts/speaker_encoder/generic_utils.py index c568d129..bc72c91c 100644 --- a/speaker_encoder/generic_utils.py +++ b/mozilla_voice_tts/speaker_encoder/generic_utils.py @@ -15,7 +15,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path, 'optimizer': optimizer.state_dict() if optimizer is not None else None, 'step': current_step, 'epoch': epoch, - 'GE2Eloss': model_loss, + 'loss': model_loss, 'date': datetime.date.today().strftime("%B %d, %Y"), } torch.save(state, checkpoint_path) @@ -29,7 +29,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, 'model': new_state_dict, 'optimizer': optimizer.state_dict(), 'step': current_step, - 'GE2Eloss': model_loss, + 'loss': model_loss, 'date': datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss @@ -38,4 +38,4 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, print("\n > BEST MODEL ({0:.5f}) : {1:}".format( model_loss, bestmodel_path)) torch.save(state, bestmodel_path) - return best_loss \ No newline at end of file + return best_loss diff --git a/speaker_encoder/loss.py b/mozilla_voice_tts/speaker_encoder/losses.py similarity index 72% rename from speaker_encoder/loss.py rename to mozilla_voice_tts/speaker_encoder/losses.py index ab290547..35ff73fa 100644 --- a/speaker_encoder/loss.py +++ b/mozilla_voice_tts/speaker_encoder/losses.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F - +import numpy as np # adapted from 
https://github.com/cvqluu/GE2E-Loss class GE2ELoss(nn.Module): @@ -23,6 +23,8 @@ class GE2ELoss(nn.Module): self.b = nn.Parameter(torch.tensor(init_b)) self.loss_method = loss_method + print(' > Initialised Generalized End-to-End loss') + assert self.loss_method in ["softmax", "contrast"] if self.loss_method == "softmax": @@ -119,3 +121,40 @@ class GE2ELoss(nn.Module): cos_sim_matrix = self.w * cos_sim_matrix + self.b L = self.embed_loss(dvecs, cos_sim_matrix) return L.mean() + +# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py +class AngleProtoLoss(nn.Module): + """ + Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of the embedding vector + Args: + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + def __init__(self, init_w=10.0, init_b=-5.0): + super(AngleProtoLoss, self).__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.criterion = torch.nn.CrossEntropyLoss() + + print(' > Initialised Angular Prototypical loss') + + def forward(self, x): + """ + Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + out_anchor = torch.mean(x[:, 1:, :], 1) + out_positive = x[:, 0, :] + num_speakers = out_anchor.size()[0] + + cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2)) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = cos_sim_matrix * self.w + self.b + label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device) + L = self.criterion(cos_sim_matrix, label) + return L diff 
--git a/speaker_encoder/model.py b/mozilla_voice_tts/speaker_encoder/model.py similarity index 64% rename from speaker_encoder/model.py rename to mozilla_voice_tts/speaker_encoder/model.py index b3bd71ff..df0527bc 100644 --- a/speaker_encoder/model.py +++ b/mozilla_voice_tts/speaker_encoder/model.py @@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module): o, (_, _) = self.lstm(x) return self.linear(o) +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, + hidden_size=lstm_dim, + num_layers=num_lstm_layers, + batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) class SpeakerEncoder(nn.Module): - def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection layers = [] - layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + self._init_layers() def _init_layers(self): @@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module): def forward(self, x): # TODO: implement state passing for lstms d = self.layers(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + if self.use_lstm_with_projection: + d 
= torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) return d def inference(self, x): d = self.layers.forward(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) return d def compute_embedding(self, x, num_frames=160, overlap=0.5): @@ -85,4 +109,3 @@ class SpeakerEncoder(nn.Module): frames[cur_iter <= num_iters, :, :] ) return embed / num_iters - diff --git a/speaker_encoder/requirements.txt b/mozilla_voice_tts/speaker_encoder/requirements.txt similarity index 100% rename from speaker_encoder/requirements.txt rename to mozilla_voice_tts/speaker_encoder/requirements.txt diff --git a/speaker_encoder/umap.png b/mozilla_voice_tts/speaker_encoder/umap.png similarity index 100% rename from speaker_encoder/umap.png rename to mozilla_voice_tts/speaker_encoder/umap.png diff --git a/speaker_encoder/visual.py b/mozilla_voice_tts/speaker_encoder/visual.py similarity index 100% rename from speaker_encoder/visual.py rename to mozilla_voice_tts/speaker_encoder/visual.py diff --git a/models/__init__.py b/mozilla_voice_tts/tts/__init__.py similarity index 100% rename from models/__init__.py rename to mozilla_voice_tts/tts/__init__.py diff --git a/config.json b/mozilla_voice_tts/tts/configs/config.json similarity index 70% rename from config.json rename to mozilla_voice_tts/tts/configs/config.json index 32debf86..2a61ba03 100644 --- a/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -1,24 +1,24 @@ { "model": "Tacotron2", - "run_name": "ljspeech", - "run_description": "tacotron2", + "run_name": "ljspeech-ddc-bn", + "run_description": "tacotron2 with ddc and batch-normalization", // AUDIO PARAMETERS "audio":{ // stft parameters - "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. 
+ "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. "win_length": 1024, // stft window length in ms. "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // Griffin-Lim @@ -29,6 +29,7 @@ "num_mels": 80, // size of the mel spec frame. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // Normalization parameters "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. @@ -66,6 +67,7 @@ "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. 
"ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. // VALIDATION "run_eval": true, @@ -83,26 +85,29 @@ // TACOTRON PRENET "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "original", // "original" or "bn". - "prenet_dropout": true, // enable/disable dropout at prenet. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. - // ATTENTION + // TACOTRON ATTENTION "attention_type": "original", // 'original' or 'graves' "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. + "attention_norm": "sigmoid", // softmax or sigmoid. "windowing": false, // Enables attention windowing. Used only in eval mode. "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. "transition_agent": false, // enable/disable transition agent of forward attention. "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. // STOPNET "stopnet": true, // Train stopnet predicting the end of synthesis. 
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. "print_eval": false, // If True, it prints intermediate loss values in evalulation. "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" @@ -118,28 +123,37 @@ "max_seq_len": 153, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/home/erogol/Models/LJSpeech/", + "output_path": "../../Mozilla-TTS/vctk-test/", // PHONEMES - "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_cache_path": "../../Mozilla-TTS/vctk-test/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. - "use_gst": false, // TACOTRON ONLY: use global style tokens + "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. 
+ "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558 + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, // DATASETS "datasets": // List of datasets. They all merged and they get different speaker_ids.
[ { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": ["p225", "p234", "p238", "p245", "p248", "p261", "p294", "p302", "p326", "p335", "p347"], // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers "meta_file_val": null } ] - } diff --git a/datasets/TTSDataset.py b/mozilla_voice_tts/tts/datasets/TTSDataset.py similarity index 90% rename from datasets/TTSDataset.py rename to mozilla_voice_tts/tts/datasets/TTSDataset.py index 0d884c00..1ecca75f 100644 --- a/datasets/TTSDataset.py +++ b/mozilla_voice_tts/tts/datasets/TTSDataset.py @@ -5,8 +5,8 @@ import torch import random from torch.utils.data import Dataset -from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos -from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target +from mozilla_voice_tts.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos +from mozilla_voice_tts.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target class MyDataset(Dataset): @@ -24,13 +24,14 @@ class MyDataset(Dataset): phoneme_cache_path=None, phoneme_language="en-us", enable_eos_bos=False, + speaker_mapping=None, verbose=False): """ Args: outputs_per_step (int): number of time frames predicted per step. text_cleaner (str): text cleaner used for the dataset. compute_linear_spec (bool): compute linear spectrogram if True. - ap (TTS.utils.AudioProcessor): audio processor object. + ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. batch_group_size (int): (0) range of batch randomization after sorting sequences by length. 
@@ -58,6 +59,7 @@ class MyDataset(Dataset): self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos + self.speaker_mapping = speaker_mapping self.verbose = verbose if use_phonemes and not os.path.isdir(phoneme_cache_path): os.makedirs(phoneme_cache_path, exist_ok=True) @@ -92,7 +94,7 @@ class MyDataset(Dataset): return phonemes def _load_or_generate_phoneme_sequence(self, wav_file, text): - file_name = os.path.basename(wav_file).split('.')[0] + file_name = os.path.splitext(os.path.basename(wav_file))[0] cache_path = os.path.join(self.phoneme_cache_path, file_name + '_phoneme.npy') try: @@ -127,7 +129,8 @@ class MyDataset(Dataset): 'text': text, 'wav': wav, 'item_idx': self.items[idx][1], - 'speaker_name': speaker_name + 'speaker_name': speaker_name, + 'wav_file_name': os.path.basename(wav_file) } return sample @@ -191,9 +194,15 @@ class MyDataset(Dataset): batch[idx]['item_idx'] for idx in ids_sorted_decreasing ] text = [batch[idx]['text'] for idx in ids_sorted_decreasing] + speaker_name = [batch[idx]['speaker_name'] for idx in ids_sorted_decreasing] - + # get speaker embeddings + if self.speaker_mapping is not None: + wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing] + speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names] + else: + speaker_embedding = None # compute features mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] @@ -224,6 +233,9 @@ class MyDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) + if speaker_embedding is not None: + speaker_embedding = torch.FloatTensor(speaker_embedding) + # compute linear spectrogram if self.compute_linear_spec: linear = [self.ap.spectrogram(w).astype('float32') for w in wav] @@ -234,7 +246,7 @@ class MyDataset(Dataset): else: linear = None return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ - stop_targets, 
item_idxs + stop_targets, item_idxs, speaker_embedding raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}".format(type(batch[0])))) diff --git a/server/__init__.py b/mozilla_voice_tts/tts/datasets/__init__.py similarity index 100% rename from server/__init__.py rename to mozilla_voice_tts/tts/datasets/__init__.py diff --git a/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py similarity index 75% rename from datasets/preprocess.py rename to mozilla_voice_tts/tts/datasets/preprocess.py index e8700c6b..ece3bcb6 100644 --- a/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -2,7 +2,7 @@ import os from glob import glob import re import sys -from TTS.utils.generic_utils import split_dataset +from mozilla_voice_tts.tts.utils.generic_utils import split_dataset def load_meta_data(datasets): @@ -93,9 +93,10 @@ def mozilla_de(root_path, meta_file): def mailabs(root_path, meta_files=None): """Normalizes M-AI-Labs meta data files to TTS format""" - speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/") + speaker_regex = re.compile( + "by_book/(male|female)/(?P<speaker_name>[^/]+)/") if meta_files is None: - csv_files = glob(root_path+"/**/metadata.csv", recursive=True) + csv_files = glob(root_path + "/**/metadata.csv", recursive=True) else: csv_files = meta_files # meta_files = [f.strip() for f in meta_files.split(",")] @@ -115,12 +116,15 @@ def mailabs(root_path, meta_files=None): if meta_files is None: wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav') else: - wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav') + wav_file = os.path.join(root_path, + folder.replace("metadata.csv", ""), + 'wavs', cols[0] + '.wav') if os.path.isfile(wav_file): text = cols[1].strip() items.append([text, wav_file, speaker_name]) else: - raise RuntimeError("> File %s does not exist!"%(wav_file)) + raise RuntimeError("> File %s does not exist!"
% + (wav_file)) return items @@ -185,7 +189,8 @@ def libri_tts(root_path, meta_files=None): text = cols[1] items.append([text, wav_file, speaker_name]) for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" + assert os.path.exists( + item[1]), f" [!] wav files don't exist - {item[1]}" return items @@ -197,7 +202,8 @@ def custom_turkish(root_path, meta_file): with open(txt_file, 'r', encoding='utf-8') as ttf: for line in ttf: cols = line.split('|') - wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav') + wav_file = os.path.join(root_path, 'wavs', + cols[0].strip() + '.wav') if not os.path.exists(wav_file): skipped_files.append(wav_file) continue @@ -205,3 +211,44 @@ def custom_turkish(root_path, meta_file): items.append([text, wav_file, speaker_name]) print(f" [!] {len(skipped_files)} files skipped. They don't exist...") return items + + +# ToDo: add the dataset link when the dataset is released publicly +def brspeech(root_path, meta_file): + '''BRSpeech 3.0 beta''' + txt_file = os.path.join(root_path, meta_file) + items = [] + with open(txt_file, 'r') as ttf: + for line in ttf: + if line.startswith("wav_filename"): + continue + cols = line.split('|') + #print(cols) + wav_file = os.path.join(root_path, cols[0]) + text = cols[2] + speaker_name = cols[3] + items.append([text, wav_file, speaker_name]) + return items + + +def vctk(root_path, meta_files=None, wavs_path='wav48'): + """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" + test_speakers = meta_files + items = [] + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", + recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, + root_path).split(os.sep) + file_id = txt_file.split('.')[0] + if isinstance(test_speakers, + list): # if is list ignore this speakers ids + if speaker_id in test_speakers: + continue + with open(meta_file) as file_text: + text = file_text.readlines()[0] + wav_file = 
os.path.join(root_path, wavs_path, speaker_id, + file_id + '.wav') + items.append([text, wav_file, speaker_id]) + + return items \ No newline at end of file diff --git a/speaker_encoder/__init__.py b/mozilla_voice_tts/tts/layers/__init__.py similarity index 100% rename from speaker_encoder/__init__.py rename to mozilla_voice_tts/tts/layers/__init__.py diff --git a/layers/common_layers.py b/mozilla_voice_tts/tts/layers/common_layers.py similarity index 96% rename from layers/common_layers.py rename to mozilla_voice_tts/tts/layers/common_layers.py index b7d02c2d..d197bb86 100644 --- a/layers/common_layers.py +++ b/mozilla_voice_tts/tts/layers/common_layers.py @@ -1,6 +1,5 @@ import torch from torch import nn -from torch.autograd import Variable from torch.nn import functional as F @@ -52,6 +51,7 @@ class LinearBN(nn.Module): class Prenet(nn.Module): + # pylint: disable=dangerous-default-value def __init__(self, in_features, prenet_type="original", @@ -244,14 +244,14 @@ class OriginalAttention(nn.Module): self.u = (0.5 * torch.ones([B, 1])).to(inputs.device) def init_location_attention(self, inputs): - B = inputs.shape[0] - T = inputs.shape[1] - self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_()) + B = inputs.size(0) + T = inputs.size(1) + self.attention_weights_cum = torch.zeros([B, T], device=inputs.device) def init_states(self, inputs): - B = inputs.shape[0] - T = inputs.shape[1] - self.attention_weights = Variable(inputs.data.new(B, T).zero_()) + B = inputs.size(0) + T = inputs.size(1) + self.attention_weights = torch.zeros([B, T], device=inputs.device) if self.location_attention: self.init_location_attention(inputs) if self.forward_attn: @@ -300,8 +300,8 @@ class OriginalAttention(nn.Module): def apply_forward_attention(self, alignment): # forward attention - fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device), - (1, 0, 0, 0)) + fwd_shifted_alpha = F.pad( + self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0)) # 
compute transition potentials alpha = ((1 - self.u) * self.alpha + self.u * fwd_shifted_alpha @@ -309,7 +309,7 @@ class OriginalAttention(nn.Module): # force incremental alignment if not self.training and self.forward_attn_mask: _, n = fwd_shifted_alpha.max(1) - val, n2 = alpha.max(1) + val, _ = alpha.max(1) for b in range(alignment.shape[0]): alpha[b, n[b] + 3:] = 0 alpha[b, :( diff --git a/layers/gst_layers.py b/mozilla_voice_tts/tts/layers/gst_layers.py similarity index 98% rename from layers/gst_layers.py rename to mozilla_voice_tts/tts/layers/gst_layers.py index 8058d5ed..a49b14a2 100644 --- a/layers/gst_layers.py +++ b/mozilla_voice_tts/tts/layers/gst_layers.py @@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module): # x: 3D tensor [batch_size, post_conv_width, # num_channels*post_conv_height] self.recurrence.flatten_parameters() - memory, out = self.recurrence(x) + _, out = self.recurrence(x) # out: 3D tensor [seq_len==1, batch_size, encoding_size=128] return out.squeeze(0) @@ -96,7 +96,7 @@ class StyleTokenLayer(nn.Module): self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter( torch.FloatTensor(num_style_tokens, self.key_dim)) - nn.init.orthogonal_(self.style_tokens) + nn.init.normal_(self.style_tokens, mean=0, std=0.5) self.attention = MultiHeadAttention( query_dim=self.query_dim, key_dim=self.key_dim, diff --git a/layers/losses.py b/mozilla_voice_tts/tts/layers/losses.py similarity index 91% rename from layers/losses.py rename to mozilla_voice_tts/tts/layers/losses.py index 608e247d..ac80ddbf 100644 --- a/layers/losses.py +++ b/mozilla_voice_tts/tts/layers/losses.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch import nn from torch.nn import functional -from TTS.utils.generic_utils import sequence_mask +from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): @@ -150,7 +150,7 @@ class GuidedAttentionLoss(torch.nn.Module): @staticmethod def _make_ga_mask(ilen, olen, sigma): - grid_x, 
grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen)) + grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device), torch.arange(ilen, device=ilen.device)) grid_x, grid_y = grid_x.float(), grid_y.float() return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2))) @@ -184,7 +184,7 @@ class TacotronLoss(torch.nn.Module): def forward(self, postnet_output, decoder_output, mel_input, linear_input, stopnet_output, stopnet_target, output_lens, decoder_b_output, - alignments, alignment_lens, input_lens): + alignments, alignment_lens, alignments_backwards, input_lens): return_dict = {} # decoder and postnet losses @@ -226,6 +226,15 @@ class TacotronLoss(torch.nn.Module): return_dict['decoder_b_loss'] = decoder_b_loss return_dict['decoder_c_loss'] = decoder_c_loss + # double decoder consistency loss (if enabled) + if self.config.double_decoder_consistency: + decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens) + # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output) + attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards) + loss += decoder_b_loss + attention_c_loss + return_dict['decoder_coarse_loss'] = decoder_b_loss + return_dict['decoder_ddc_loss'] = attention_c_loss + # guided attention loss (if enabled) if self.config.ga_alpha > 0: ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens) @@ -234,4 +243,3 @@ class TacotronLoss(torch.nn.Module): return_dict['loss'] = loss return return_dict - diff --git a/layers/tacotron.py b/mozilla_voice_tts/tts/layers/tacotron.py similarity index 80% rename from layers/tacotron.py rename to mozilla_voice_tts/tts/layers/tacotron.py index 20fd1e52..807282b3 100644 --- a/layers/tacotron.py +++ b/mozilla_voice_tts/tts/layers/tacotron.py @@ -1,7 +1,7 @@ # coding: utf-8 import torch from torch import nn -from .common_layers import Prenet, init_attn, Linear +from .common_layers import Prenet, init_attn class 
BatchNormConv1d(nn.Module): @@ -18,8 +18,8 @@ class BatchNormConv1d(nn.Module): activation: activation function set b/w Conv1d and BatchNorm Shapes: - - input: batch x dims - - output: batch x dims + - input: (B, D) + - output: (B, D) """ def __init__(self, @@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module): # self.init_layers() def init_layers(self): - if type(self.activation) == torch.nn.ReLU: + if isinstance(self.activation, torch.nn.ReLU): w_gain = 'relu' - elif type(self.activation) == torch.nn.Tanh: + elif isinstance(self.activation, torch.nn.Tanh): w_gain = 'tanh' elif self.activation is None: w_gain = 'linear' @@ -67,12 +67,23 @@ class BatchNormConv1d(nn.Module): class Highway(nn.Module): + r"""Highway layers as explained in https://arxiv.org/abs/1505.00387 + + Args: + in_features (int): size of each input sample + out_feature (int): size of each output sample + + Shapes: + - input: (B, *, H_in) + - output: (B, *, H_out) + """ + # TODO: Try GLU layer - def __init__(self, in_size, out_size): + def __init__(self, in_features, out_feature): super(Highway, self).__init__() - self.H = nn.Linear(in_size, out_size) + self.H = nn.Linear(in_features, out_feature) self.H.bias.data.zero_() - self.T = nn.Linear(in_size, out_size) + self.T = nn.Linear(in_features, out_feature) self.T.bias.data.fill_(-1) self.relu = nn.ReLU() self.sigmoid = nn.Sigmoid() @@ -103,10 +114,10 @@ class CBHG(nn.Module): num_highways (int): number of highways layers Shapes: - - input: B x D x T_in - - output: B x T_in x D*2 + - input: (B, C, T_in) + - output: (B, T_in, C*2) """ - + #pylint: disable=dangerous-default-value def __init__(self, in_features, K=16, @@ -195,6 +206,8 @@ class CBHG(nn.Module): class EncoderCBHG(nn.Module): + r"""CBHG module with Encoder specific arguments""" + def __init__(self): super(EncoderCBHG, self).__init__() self.cbhg = CBHG( @@ -211,7 +224,14 @@ class EncoderCBHG(nn.Module): class Encoder(nn.Module): - r"""Encapsulate Prenet and CBHG modules for encoder""" + 
r"""Stack Prenet and CBHG module for encoder + Args: + inputs (FloatTensor): embedding features + + Shapes: + - inputs: (B, T, D_in) + - outputs: (B, T, 128 * 2) + """ def __init__(self, in_features): super(Encoder, self).__init__() @@ -219,14 +239,6 @@ class Encoder(nn.Module): self.cbhg = EncoderCBHG() def forward(self, inputs): - r""" - Args: - inputs (FloatTensor): embedding features - - Shapes: - - inputs: batch x time x in_features - - outputs: batch x time x 128*2 - """ # B x T x prenet_dim outputs = self.prenet(inputs) outputs = self.cbhg(outputs.transpose(1, 2)) @@ -250,35 +262,48 @@ class PostCBHG(nn.Module): class Decoder(nn.Module): - """Decoder module. + """Tacotron decoder. Args: - in_features (int): input vector (encoder output) sample size. - memory_dim (int): memory vector (prev. time-step output) sample size. - r (int): number of outputs per time step. + in_channels (int): number of input channels. + frame_channels (int): number of feature frame channels. + r (int): number of outputs per time step (reduction rate). memory_size (int): size of the past window. if <= 0 memory_size = r - TODO: arguments + attn_type (string): type of attention used in decoder. + attn_windowing (bool): if true, define an attention window centered to maximum + attention response. It provides more robust attention alignment especially + at interence time. + attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'. + prenet_type (string): 'original' or 'bn'. + prenet_dropout (float): prenet dropout rate. + forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736 + trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736 + forward_attn_mask (bool): if true, mask attention values smaller than a threshold. + location_attn (bool): if true, use location sensitive attention. + attn_K (int): number of attention heads for GravesAttention. 
+ separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. + speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. """ # Pylint gets confused by PyTorch conventions here - #pylint: disable=attribute-defined-outside-init + # pylint: disable=attribute-defined-outside-init - def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing, + def __init__(self, in_channels, frame_channels, r, memory_size, attn_type, attn_windowing, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, speaker_embedding_dim): + separate_stopnet): super(Decoder, self).__init__() self.r_init = r self.r = r - self.in_features = in_features + self.in_channels = in_channels self.max_decoder_steps = 500 self.use_memory_queue = memory_size > 0 self.memory_size = memory_size if memory_size > 0 else r - self.memory_dim = memory_dim + self.frame_channels = frame_channels self.separate_stopnet = separate_stopnet self.query_dim = 256 # memory -> |Prenet| -> processed_memory - prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim + prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels self.prenet = Prenet( prenet_dim, prenet_type, @@ -286,11 +311,11 @@ class Decoder(nn.Module): out_features=[256, 128]) # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State # attention_rnn generates queries for the attention mechanism - self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim) + self.attention_rnn = nn.GRUCell(in_channels + 128, self.query_dim) self.attention = init_attn(attn_type=attn_type, query_dim=self.query_dim, - embedding_dim=in_features, + embedding_dim=in_channels, attention_dim=128, location_attention=location_attn, attention_location_n_filters=32, @@ -302,14 +327,14 @@ class 
Decoder(nn.Module): forward_attn_mask=forward_attn_mask, attn_K=attn_K) # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input - self.project_to_decoder_in = nn.Linear(256 + in_features, 256) + self.project_to_decoder_in = nn.Linear(256 + in_channels, 256) # decoder_RNN_input -> |RNN| -> RNN_state self.decoder_rnns = nn.ModuleList( [nn.GRUCell(256, 256) for _ in range(2)]) # RNN_state -> |Linear| -> mel_spec - self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init) + self.proj_to_mel = nn.Linear(256, frame_channels * self.r_init) # learn init values instead of zero init. - self.stopnet = StopNet(256 + memory_dim * self.r_init) + self.stopnet = StopNet(256 + frame_channels * self.r_init) def set_r(self, new_r): self.r = new_r @@ -319,9 +344,9 @@ class Decoder(nn.Module): Reshape the spectrograms for given 'r' """ # Grouping multiple frames if necessary - if memory.size(-1) == self.memory_dim: + if memory.size(-1) == self.frame_channels: memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1) - # Time first (T_decoder, B, memory_dim) + # Time first (T_decoder, B, frame_channels) memory = memory.transpose(0, 1) return memory @@ -330,19 +355,18 @@ class Decoder(nn.Module): Initialization of decoder states """ B = inputs.size(0) - T = inputs.size(1) # go frame as zeros matrix if self.use_memory_queue: - self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size) + self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size) else: - self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim) + self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels) # decoder states self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256) self.decoder_rnn_hiddens = [ torch.zeros(1, device=inputs.device).repeat(B, 256) for idx in range(len(self.decoder_rnns)) ] - self.context_vec = 
inputs.data.new(B, self.in_features).zero_() + self.context_vec = inputs.data.new(B, self.in_channels).zero_() # cache attention inputs self.processed_inputs = self.attention.preprocess_inputs(inputs) @@ -352,7 +376,7 @@ class Decoder(nn.Module): stop_tokens = torch.stack(stop_tokens).transpose(0, 1) outputs = torch.stack(outputs).transpose(0, 1).contiguous() outputs = outputs.view( - outputs.size(0), -1, self.memory_dim) + outputs.size(0), -1, self.frame_channels) outputs = outputs.transpose(1, 2) return outputs, attentions, stop_tokens @@ -386,7 +410,7 @@ class Decoder(nn.Module): stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) - output = output[:, : self.r * self.memory_dim] + output = output[:, : self.r * self.frame_channels] return output, stop_token, self.attention.attention_weights def _update_memory_input(self, new_memory): @@ -395,17 +419,17 @@ class Decoder(nn.Module): # memory queue size is larger than number of frames per decoder iter self.memory_input = torch.cat([ new_memory, self.memory_input[:, :( - self.memory_size - self.r) * self.memory_dim].clone() + self.memory_size - self.r) * self.frame_channels].clone() ], dim=-1) else: # memory queue size smaller than number of frames per decoder iter - self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] + self.memory_input = new_memory[:, :self.memory_size * self.frame_channels] else: # use only the last frame prediction - # assert new_memory.shape[-1] == self.r * self.memory_dim - self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):] + # assert new_memory.shape[-1] == self.r * self.frame_channels + self.memory_input = new_memory[:, self.frame_channels * (self.r - 1):] - def forward(self, inputs, memory, mask, speaker_embeddings=None): + def forward(self, inputs, memory, mask): """ Args: inputs: Encoder outputs. @@ -415,8 +439,8 @@ class Decoder(nn.Module): mask: Attention mask for sequence padding. 
Shapes: - - inputs: batch x time x encoder_out_dim - - memory: batch x #mel_specs x mel_spec_dim + - inputs: (B, T, D_out_enc) + - memory: (B, T_mel, D_mel) """ # Run greedy decoding if memory is None memory = self._reshape_memory(memory) @@ -430,8 +454,7 @@ class Decoder(nn.Module): if t > 0: new_memory = memory[t - 1] self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) + output, stop_token, attention = self.decode(inputs, mask) outputs += [output] attentions += [attention] @@ -439,15 +462,12 @@ class Decoder(nn.Module): t += 1 return self._parse_outputs(outputs, attentions, stop_tokens) - def inference(self, inputs, speaker_embeddings=None): + def inference(self, inputs): """ Args: inputs: encoder outputs. - speaker_embeddings: speaker vectors. - Shapes: - inputs: batch x time x encoder_out_dim - - speaker_embeddings: batch x embed_dim """ outputs = [] attentions = [] @@ -460,8 +480,6 @@ class Decoder(nn.Module): if t > 0: new_memory = outputs[-1] self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) output, stop_token, attention = self.decode(inputs, None) stop_token = torch.sigmoid(stop_token.data) outputs += [output] @@ -471,14 +489,14 @@ class Decoder(nn.Module): if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6): break - elif t > self.max_decoder_steps: + if t > self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") break return self._parse_outputs(outputs, attentions, stop_tokens) class StopNet(nn.Module): - r""" + r"""Stopnet signalling decoder to stop inference. Args: in_features (int): feature dimension of input. 
""" diff --git a/layers/tacotron2.py b/mozilla_voice_tts/tts/layers/tacotron2.py similarity index 71% rename from layers/tacotron2.py rename to mozilla_voice_tts/tts/layers/tacotron2.py index f11aee65..490f3728 100644 --- a/layers/tacotron2.py +++ b/mozilla_voice_tts/tts/layers/tacotron2.py @@ -1,11 +1,24 @@ import torch -from torch.autograd import Variable from torch import nn from torch.nn import functional as F from .common_layers import init_attn, Prenet, Linear - +# NOTE: linter has a problem with the current TF release +#pylint: disable=no-value-for-parameter +#pylint: disable=unexpected-keyword-arg class ConvBNBlock(nn.Module): + r"""Convolutions with Batch Normalization and non-linear activation. + + Args: + in_channels (int): number of input channels. + out_channels (int): number of output channels. + kernel_size (int): convolution kernel size. + activation (str): 'relu', 'tanh', None (linear). + + Shapes: + - input: (B, C_in, T) + - output: (B, C_out, T) + """ def __init__(self, in_channels, out_channels, kernel_size, activation=None): super(ConvBNBlock, self).__init__() assert (kernel_size - 1) % 2 == 0 @@ -32,16 +45,25 @@ class ConvBNBlock(nn.Module): class Postnet(nn.Module): - def __init__(self, output_dim, num_convs=5): + r"""Tacotron2 Postnet + + Args: + in_out_channels (int): number of output channels. 
+ + Shapes: + - input: (B, C_in, T) + - output: (B, C_in, T) + """ + def __init__(self, in_out_channels, num_convs=5): super(Postnet, self).__init__() self.convolutions = nn.ModuleList() self.convolutions.append( - ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh')) + ConvBNBlock(in_out_channels, 512, kernel_size=5, activation='tanh')) for _ in range(1, num_convs - 1): self.convolutions.append( ConvBNBlock(512, 512, kernel_size=5, activation='tanh')) self.convolutions.append( - ConvBNBlock(512, output_dim, kernel_size=5, activation=None)) + ConvBNBlock(512, in_out_channels, kernel_size=5, activation=None)) def forward(self, x): o = x @@ -51,14 +73,23 @@ class Postnet(nn.Module): class Encoder(nn.Module): - def __init__(self, output_input_dim=512): + r"""Tacotron2 Encoder + + Args: + in_out_channels (int): number of input and output channels. + + Shapes: + - input: (B, C_in, T) + - output: (B, C_in, T) + """ + def __init__(self, in_out_channels=512): super(Encoder, self).__init__() self.convolutions = nn.ModuleList() for _ in range(3): self.convolutions.append( - ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu')) - self.lstm = nn.LSTM(output_input_dim, - int(output_input_dim / 2), + ConvBNBlock(in_out_channels, in_out_channels, 5, 'relu')) + self.lstm = nn.LSTM(in_out_channels, + int(in_out_channels / 2), num_layers=1, batch_first=True, bias=True, @@ -90,20 +121,40 @@ class Encoder(nn.Module): # adapted from https://github.com/NVIDIA/tacotron2/ class Decoder(nn.Module): + """Tacotron2 decoder. We don't use Zoneout but Dropout between RNN layers. + + Args: + in_channels (int): number of input channels. + frame_channels (int): number of feature frame channels. + r (int): number of outputs per time step (reduction rate). + memory_size (int): size of the past window. if <= 0 memory_size = r + attn_type (string): type of attention used in decoder. + attn_win (bool): if true, define an attention window centered to maximum + attention response. 
It provides more robust attention alignment especially + at interence time. + attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'. + prenet_type (string): 'original' or 'bn'. + prenet_dropout (float): prenet dropout rate. + forward_attn (bool): if true, use forward attention method. https://arxiv.org/abs/1807.06736 + trans_agent (bool): if true, use transition agent. https://arxiv.org/abs/1807.06736 + forward_attn_mask (bool): if true, mask attention values smaller than a threshold. + location_attn (bool): if true, use location sensitive attention. + attn_K (int): number of attention heads for GravesAttention. + separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. + """ # Pylint gets confused by PyTorch conventions here #pylint: disable=attribute-defined-outside-init - def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm, + def __init__(self, in_channels, frame_channels, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, - forward_attn_mask, location_attn, attn_K, separate_stopnet, - speaker_embedding_dim): + forward_attn_mask, location_attn, attn_K, separate_stopnet): super(Decoder, self).__init__() - self.frame_dim = frame_dim + self.frame_channels = frame_channels self.r_init = r self.r = r - self.encoder_embedding_dim = input_dim + self.encoder_embedding_dim = in_channels self.separate_stopnet = separate_stopnet self.max_decoder_steps = 1000 - self.gate_threshold = 0.5 + self.stop_threshold = 0.5 # model dimensions self.query_dim = 1024 @@ -114,20 +165,20 @@ class Decoder(nn.Module): self.p_decoder_dropout = 0.1 # memory -> |Prenet| -> processed_memory - prenet_dim = self.frame_dim + prenet_dim = self.frame_channels self.prenet = Prenet(prenet_dim, prenet_type, prenet_dropout, out_features=[self.prenet_dim, self.prenet_dim], bias=False) - self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim, + self.attention_rnn = 
nn.LSTMCell(self.prenet_dim + in_channels, self.query_dim, bias=True) self.attention = init_attn(attn_type=attn_type, query_dim=self.query_dim, - embedding_dim=input_dim, + embedding_dim=in_channels, attention_dim=128, location_attention=location_attn, attention_location_n_filters=32, @@ -139,16 +190,16 @@ class Decoder(nn.Module): forward_attn_mask=forward_attn_mask, attn_K=attn_K) - self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim, + self.decoder_rnn = nn.LSTMCell(self.query_dim + in_channels, self.decoder_rnn_dim, bias=True) - self.linear_projection = Linear(self.decoder_rnn_dim + input_dim, - self.frame_dim * self.r_init) + self.linear_projection = Linear(self.decoder_rnn_dim + in_channels, + self.frame_channels * self.r_init) self.stopnet = nn.Sequential( nn.Dropout(0.1), - Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init, + Linear(self.decoder_rnn_dim + self.frame_channels * self.r_init, 1, bias=True, init_gain='sigmoid')) @@ -159,8 +210,8 @@ class Decoder(nn.Module): def get_go_frame(self, inputs): B = inputs.size(0) - memory = torch.zeros(1, device=inputs.device).repeat(B, - self.frame_dim * self.r) + memory = torch.zeros(1, device=inputs.device).repeat( + B, self.frame_channels * self.r) return memory def _init_states(self, inputs, mask, keep_states=False): @@ -186,9 +237,9 @@ class Decoder(nn.Module): Reshape the spectrograms for given 'r' """ # Grouping multiple frames if necessary - if memory.size(-1) == self.frame_dim: + if memory.size(-1) == self.frame_channels: memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1) - # Time first (T_decoder, B, frame_dim) + # Time first (T_decoder, B, frame_channels) memory = memory.transpose(0, 1) return memory @@ -196,22 +247,22 @@ class Decoder(nn.Module): alignments = torch.stack(alignments).transpose(0, 1) stop_tokens = torch.stack(stop_tokens).transpose(0, 1) outputs = torch.stack(outputs).transpose(0, 1).contiguous() - outputs = outputs.view(outputs.size(0), -1, self.frame_dim) 
+ outputs = outputs.view(outputs.size(0), -1, self.frame_channels) outputs = outputs.transpose(1, 2) return outputs, stop_tokens, alignments def _update_memory(self, memory): if len(memory.shape) == 2: - return memory[:, self.frame_dim * (self.r - 1):] - return memory[:, :, self.frame_dim * (self.r - 1):] + return memory[:, self.frame_channels * (self.r - 1):] + return memory[:, :, self.frame_channels * (self.r - 1):] def decode(self, memory): ''' shapes: - - memory: B x r * self.frame_dim + - memory: B x r * self.frame_channels ''' # self.context: B x D_en - # query_input: B x D_en + (r * self.frame_dim) + # query_input: B x D_en + (r * self.frame_channels) query_input = torch.cat((memory, self.context), -1) # self.query and self.attention_rnn_cell_state : B x D_attn_rnn self.query, self.attention_rnn_cell_state = self.attention_rnn( @@ -234,25 +285,36 @@ class Decoder(nn.Module): # B x (D_decoder_rnn + D_en) decoder_hidden_context = torch.cat((self.decoder_hidden, self.context), dim=1) - # B x (self.r * self.frame_dim) + # B x (self.r * self.frame_channels) decoder_output = self.linear_projection(decoder_hidden_context) - # B x (D_decoder_rnn + (self.r * self.frame_dim)) + # B x (D_decoder_rnn + (self.r * self.frame_channels)) stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1) if self.separate_stopnet: stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) # select outputs for the reduction rate self.r - decoder_output = decoder_output[:, :self.r * self.frame_dim] + decoder_output = decoder_output[:, :self.r * self.frame_channels] return decoder_output, self.attention.attention_weights, stop_token - def forward(self, inputs, memories, mask, speaker_embeddings=None): + def forward(self, inputs, memories, mask): + r"""Train Decoder with teacher forcing. + Args: + inputs: Encoder outputs. + memories: Feature frames for teacher-forcing. + mask: Attention mask for sequence padding. 
+ + Shapes: + - inputs: (B, T, D_out_enc) + - memory: (B, T_mel, D_mel) + - outputs: (B, T_mel, D_mel) + - alignments: (B, T_in, T_out) + - stop_tokens: (B, T_out) + """ memory = self.get_go_frame(inputs).unsqueeze(0) memories = self._reshape_memory(memories) memories = torch.cat((memory, memories), dim=0) memories = self._update_memory(memories) - if speaker_embeddings is not None: - memories = torch.cat([memories, speaker_embeddings], dim=-1) memories = self.prenet(memories) self._init_states(inputs, mask=mask) @@ -270,7 +332,18 @@ class Decoder(nn.Module): outputs, stop_tokens, alignments) return outputs, alignments, stop_tokens - def inference(self, inputs, speaker_embeddings=None): + def inference(self, inputs): + r"""Decoder inference without teacher forcing and use + Stopnet to stop decoder. + Args: + inputs: Encoder outputs. + + Shapes: + - inputs: (B, T, D_out_enc) + - outputs: (B, T_mel, D_mel) + - alignments: (B, T_in, T_out) + - stop_tokens: (B, T_out) + """ memory = self.get_go_frame(inputs) memory = self._update_memory(memory) @@ -280,15 +353,13 @@ class Decoder(nn.Module): outputs, stop_tokens, alignments, t = [], [], [], 0 while True: memory = self.prenet(memory) - if speaker_embeddings is not None: - memory = torch.cat([memory, speaker_embeddings], dim=-1) decoder_output, alignment, stop_token = self.decode(memory) stop_token = torch.sigmoid(stop_token.data) outputs += [decoder_output.squeeze(1)] stop_tokens += [stop_token] alignments += [alignment] - if stop_token > 0.7 and t > inputs.shape[0] / 2: + if stop_token > self.stop_threshold and t > inputs.shape[0] // 2: break if len(outputs) == self.max_decoder_steps: print(" | > Decoder stopped with 'max_decoder_steps") @@ -315,7 +386,6 @@ class Decoder(nn.Module): self.attention.init_win_idx() self.attention.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 - stop_flags = [True, False, False] while True: memory = self.prenet(self.memory_truncated) decoder_output, alignment, 
stop_token = self.decode(memory) diff --git a/utils/__init__.py b/mozilla_voice_tts/tts/models/__init__.py similarity index 100% rename from utils/__init__.py rename to mozilla_voice_tts/tts/models/__init__.py diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py new file mode 100644 index 00000000..1dcf2fc8 --- /dev/null +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -0,0 +1,166 @@ +# coding: utf-8 +import torch +from torch import nn + +from mozilla_voice_tts.tts.layers.gst_layers import GST +from mozilla_voice_tts.tts.layers.tacotron import Decoder, Encoder, PostCBHG +from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract + + +class Tacotron(TacotronAbstract): + def __init__(self, + num_chars, + num_speakers, + r=5, + postnet_output_dim=1025, + decoder_output_dim=80, + attn_type='original', + attn_win=False, + attn_norm="sigmoid", + prenet_type="original", + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=256, + decoder_in_features=256, + speaker_embedding_dim=None, + gst=False, + gst_embedding_dim=256, + gst_num_heads=4, + gst_style_tokens=10, + memory_size=5): + super(Tacotron, + self).__init__(num_chars, num_speakers, r, postnet_output_dim, + decoder_output_dim, attn_type, attn_win, + attn_norm, prenet_type, prenet_dropout, + forward_attn, trans_agent, forward_attn_mask, + location_attn, attn_K, separate_stopnet, + bidirectional_decoder, double_decoder_consistency, + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, + gst_num_heads, gst_style_tokens) + + # speaker embedding layers + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embedding_dim = 256 + self.speaker_embedding = nn.Embedding(self.num_speakers, 
speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + # speaker and gst embeddings is concat in decoder input + if self.num_speakers > 1: + self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + + # embedding layer + self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) + self.embedding.weight.data.normal_(0, 0.3) + + # base model layers + self.encoder = Encoder(self.encoder_in_features) + self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r, + memory_size, attn_type, attn_win, attn_norm, + prenet_type, prenet_dropout, forward_attn, + trans_agent, forward_attn_mask, location_attn, + attn_K, separate_stopnet) + self.postnet = PostCBHG(decoder_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, + postnet_output_dim) + + # global style token layers + if self.gst: + self.gst_layer = GST(num_mel=80, + num_heads=gst_num_heads, + num_style_tokens=gst_style_tokens, + embedding_dim=gst_embedding_dim) + # backward pass decoder + if self.bidirectional_decoder: + self._init_backward_decoder() + # setup DDC + if self.double_decoder_consistency: + self.coarse_decoder = Decoder( + self.decoder_in_features, decoder_output_dim, ddc_r, memory_size, + attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, + forward_attn, trans_agent, forward_attn_mask, location_attn, + attn_K, separate_stopnet) + + def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + """ + Shapes: + - characters: B x T_in + - text_lengths: B + - mel_specs: B x T_out x D + - speaker_ids: B x 1 + """ + input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) + # B x T_in x embed_dim + inputs = self.embedding(characters) + # B x T_in x encoder_in_features + encoder_outputs = self.encoder(inputs) + # sequence masking + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + # global style token + if 
self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + # speaker embedding + if self.num_speakers > 1: + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + # decoder_outputs: B x decoder_in_features x T_out + # alignments: B x T_in x encoder_in_features + # stop_tokens: B x T_in + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) + # sequence masking + if output_mask is not None: + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + # B x T_out x decoder_in_features + postnet_outputs = self.postnet(decoder_outputs) + # sequence masking + if output_mask is not None: + postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) + # B x T_out x posnet_dim + postnet_outputs = self.last_linear(postnet_outputs) + # B x T_out x decoder_in_features + decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() + if self.bidirectional_decoder: + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward + if self.double_decoder_consistency: + decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask) + return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward + return decoder_outputs, postnet_outputs, alignments, stop_tokens + + @torch.no_grad() + def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): + inputs = self.embedding(characters) + 
encoder_outputs = self.encoder(inputs) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if self.num_speakers > 1: + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) + postnet_outputs = self.postnet(decoder_outputs) + postnet_outputs = self.last_linear(postnet_outputs) + decoder_outputs = decoder_outputs.transpose(1, 2) + return decoder_outputs, postnet_outputs, alignments, stop_tokens diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py new file mode 100644 index 00000000..a9ba442c --- /dev/null +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -0,0 +1,184 @@ +import torch +from torch import nn + +from mozilla_voice_tts.tts.layers.gst_layers import GST +from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet +from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract + +# TODO: match function arguments with tacotron +class Tacotron2(TacotronAbstract): + def __init__(self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type='original', + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + gst=False, + gst_embedding_dim=512, + gst_num_heads=4, + gst_style_tokens=10): + super(Tacotron2, + self).__init__(num_chars, 
num_speakers, r, postnet_output_dim, + decoder_output_dim, attn_type, attn_win, + attn_norm, prenet_type, prenet_dropout, + forward_attn, trans_agent, forward_attn_mask, + location_attn, attn_K, separate_stopnet, + bidirectional_decoder, double_decoder_consistency, + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, + gst_num_heads, gst_style_tokens) + + # speaker embedding layer + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embedding_dim = 512 + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + # speaker and gst embeddings is concat in decoder input + if self.num_speakers > 1: + self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + + # embedding layer + self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) + + # base model layers + self.encoder = Encoder(self.encoder_in_features) + self.decoder = Decoder(self.decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, + attn_norm, prenet_type, prenet_dropout, + forward_attn, trans_agent, forward_attn_mask, + location_attn, attn_K, separate_stopnet) + self.postnet = Postnet(self.postnet_output_dim) + + # global style token layers + if self.gst: + self.gst_layer = GST(num_mel=80, + num_heads=self.gst_num_heads, + num_style_tokens=self.gst_style_tokens, + embedding_dim=self.gst_embedding_dim) + # backward pass decoder + if self.bidirectional_decoder: + self._init_backward_decoder() + # setup DDC + if self.double_decoder_consistency: + self.coarse_decoder = Decoder( + self.decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, + attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, + trans_agent, forward_attn_mask, location_attn, attn_K, + separate_stopnet) + + @staticmethod + def shape_outputs(mel_outputs, mel_outputs_postnet, alignments): + mel_outputs = mel_outputs.transpose(1, 2) + 
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) + return mel_outputs, mel_outputs_postnet, alignments + + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + # compute mask for padding + # B x T_in_max (boolean) + input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) + # B x D_embed x T_in_max + embedded_inputs = self.embedding(text).transpose(1, 2) + # B x T_in_max x D_en + encoder_outputs = self.encoder(embedded_inputs, text_lengths) + + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + + if self.num_speakers > 1: + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + + # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) + # sequence masking + if mel_lengths is not None: + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + # B x mel_dim x T_out + postnet_outputs = self.postnet(decoder_outputs) + postnet_outputs = decoder_outputs + postnet_outputs + # sequence masking + if output_mask is not None: + postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) + # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) + if self.bidirectional_decoder: + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + return 
decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward + if self.double_decoder_consistency: + decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask) + return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward + return decoder_outputs, postnet_outputs, alignments, stop_tokens + + @torch.no_grad() + def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + embedded_inputs = self.embedding(text).transpose(1, 2) + encoder_outputs = self.encoder.inference(embedded_inputs) + + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) + postnet_outputs = self.postnet(decoder_outputs) + postnet_outputs = decoder_outputs + postnet_outputs + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) + return decoder_outputs, postnet_outputs, alignments, stop_tokens + + def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + """ + Preserve model states for continuous inference + """ + embedded_inputs = self.embedding(text).transpose(1, 2) + encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + + 
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( + encoder_outputs) + mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs_postnet = mel_outputs + mel_outputs_postnet + mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs( + mel_outputs, mel_outputs_postnet, alignments) + return mel_outputs, mel_outputs_postnet, alignments, stop_tokens diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py new file mode 100644 index 00000000..d98d03b7 --- /dev/null +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -0,0 +1,212 @@ +import copy +from abc import ABC, abstractmethod + +import torch +from torch import nn + +from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask + + +class TacotronAbstract(ABC, nn.Module): + def __init__(self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type='original', + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + gst=False, + gst_embedding_dim=512, + gst_num_heads=4, + gst_style_tokens=10): + """ Abstract Tacotron class """ + super().__init__() + self.num_chars = num_chars + self.r = r + self.decoder_output_dim = decoder_output_dim + self.postnet_output_dim = postnet_output_dim + self.gst = gst + self.gst_embedding_dim = gst_embedding_dim + self.gst_num_heads = gst_num_heads + self.gst_style_tokens = gst_style_tokens + self.num_speakers = num_speakers + self.bidirectional_decoder = bidirectional_decoder + self.double_decoder_consistency = double_decoder_consistency + self.ddc_r = ddc_r + self.attn_type = attn_type + self.attn_win = attn_win + 
self.attn_norm = attn_norm + self.prenet_type = prenet_type + self.prenet_dropout = prenet_dropout + self.forward_attn = forward_attn + self.trans_agent = trans_agent + self.forward_attn_mask = forward_attn_mask + self.location_attn = location_attn + self.attn_K = attn_K + self.separate_stopnet = separate_stopnet + self.encoder_in_features = encoder_in_features + self.decoder_in_features = decoder_in_features + self.speaker_embedding_dim = speaker_embedding_dim + + # layers + self.embedding = None + self.encoder = None + self.decoder = None + self.postnet = None + + # multispeaker + if self.speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + + # global style token + if self.gst: + self.decoder_in_features += gst_embedding_dim # add gst embedding dim + self.gst_layer = None + + # model states + self.speaker_embeddings = None + self.speaker_embeddings_projected = None + + # additional layers + self.decoder_backward = None + self.coarse_decoder = None + + ############################# + # INIT FUNCTIONS + ############################# + + def _init_states(self): + self.speaker_embeddings = None + self.speaker_embeddings_projected = None + + def _init_backward_decoder(self): + self.decoder_backward = copy.deepcopy(self.decoder) + + def _init_coarse_decoder(self): + self.coarse_decoder = copy.deepcopy(self.decoder) + self.coarse_decoder.r_init = self.ddc_r + self.coarse_decoder.set_r(self.ddc_r) + + ############################# + # CORE FUNCTIONS + ############################# + + @abstractmethod + def forward(self): + pass + + @abstractmethod + def inference(self): + pass + + ############################# + # COMMON COMPUTE FUNCTIONS + ############################# + + def compute_masks(self, text_lengths, mel_lengths): + 
"""Compute masks against sequence paddings.""" + # B x T_in_max (boolean) + device = text_lengths.device + input_mask = sequence_mask(text_lengths).to(device) + output_mask = None + if mel_lengths is not None: + max_len = mel_lengths.max() + r = self.decoder.r + max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len + output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device) + return input_mask, output_mask + + def _backward_pass(self, mel_specs, encoder_outputs, mask): + """ Run backwards decoder """ + decoder_outputs_b, alignments_b, _ = self.decoder_backward( + encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask, + self.speaker_embeddings_projected) + decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() + return decoder_outputs_b, alignments_b + + def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, + input_mask): + """ Double Decoder Consistency """ + T = mel_specs.shape[1] + if T % self.coarse_decoder.r > 0: + padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) + mel_specs = torch.nn.functional.pad(mel_specs, + (0, 0, 0, padding_size, 0, 0)) + decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( + encoder_outputs.detach(), mel_specs, input_mask) + # scale_factor = self.decoder.r_init / self.decoder.r + alignments_backward = torch.nn.functional.interpolate( + alignments_backward.transpose(1, 2), + size=alignments.shape[1], + mode='nearest').transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward[:, :T, :] + return decoder_outputs_backward, alignments_backward + + ############################# + # EMBEDDING FUNCTIONS + ############################# + + def compute_speaker_embedding(self, speaker_ids): + """ Compute speaker embedding vectors """ + if hasattr(self, "speaker_embedding") and speaker_ids is None: + raise RuntimeError( + " [!] 
Model has speaker embedding layer but speaker_id is not provided" + ) + if hasattr(self, "speaker_embedding") and speaker_ids is not None: + self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) + if hasattr(self, "speaker_project_mel") and speaker_ids is not None: + self.speaker_embeddings_projected = self.speaker_project_mel( + self.speaker_embeddings).squeeze(1) + + def compute_gst(self, inputs, style_input): + """ Compute global style token """ + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + else: + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) + return inputs + + @staticmethod + def _add_speaker_embedding(outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand( + outputs.size(0), outputs.size(1), -1) + outputs = outputs + speaker_embeddings_ + return outputs + + @staticmethod + def _concat_speaker_embedding(outputs, speaker_embeddings): + speaker_embeddings_ = speaker_embeddings.expand( + outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + return outputs diff --git a/tf/README.md b/mozilla_voice_tts/tts/tf/README.md similarity index 100% rename from tf/README.md rename to mozilla_voice_tts/tts/tf/README.md diff --git a/mozilla_voice_tts/tts/tf/__init__.py b/mozilla_voice_tts/tts/tf/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/tf/layers/common_layers.py b/mozilla_voice_tts/tts/tf/layers/common_layers.py similarity index 85% rename from tf/layers/common_layers.py rename to mozilla_voice_tts/tts/tf/layers/common_layers.py index 995b5490..ad18b9fc 100644 --- a/tf/layers/common_layers.py +++ b/mozilla_voice_tts/tts/tf/layers/common_layers.py @@ -3,6 +3,9 @@ from tensorflow import keras from tensorflow.python.ops import math_ops # from tensorflow_addons.seq2seq import BahdanauAttention +# NOTE: linter has a problem with the current TF release +#pylint: disable=no-value-for-parameter +#pylint: disable=unexpected-keyword-arg class Linear(keras.layers.Layer): def __init__(self, units, use_bias, **kwargs): @@ -109,12 +112,18 @@ class Attention(keras.layers.Layer): raise ValueError("Unknown value for attention norm type") def init_states(self, batch_size, value_length): - states = () + states = [] if self.use_loc_attn: attention_cum = tf.zeros([batch_size, value_length]) attention_old = tf.zeros([batch_size, value_length]) - states = (attention_cum, attention_old) - return states + states = [attention_cum, attention_old] + if self.use_forward_attn: + alpha = tf.concat([ + tf.ones([batch_size, 1]), + tf.zeros([batch_size, value_length])[:, :-1] + 1e-7 + ], 1) + states.append(alpha) + return tuple(states) def process_values(self, values): """ cache values for decoder iterations """ @@ -125,7 +134,7 @@ class Attention(keras.layers.Layer): def get_loc_attn(self, query, states): """ compute location attention, query layer and unnorm. 
attention weights""" - attention_cum, attention_old = states + attention_cum, attention_old = states[:2] attn_cat = tf.stack([attention_old, attention_cum], axis=2) processed_query = self.query_layer(tf.expand_dims(query, 1)) @@ -150,6 +159,23 @@ class Attention(keras.layers.Layer): score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) return score + def apply_forward_attention(self, alignment, alpha): #pylint: disable=no-self-use + # forward attention + fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0) + # compute transition potentials + new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment + # renormalize attention weights + new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True) + return new_alpha + + def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None): + states = [] + if self.use_loc_attn: + states = [old_states[0] + scores_norm, attn_weights] + if self.use_forward_attn: + states.append(new_alpha) + return tuple(states) + def call(self, query, states): """ shapes: @@ -165,13 +191,19 @@ class Attention(keras.layers.Layer): # self.apply_score_masking(score, mask) # attn_weights shape == (batch_size, max_length, 1) - attn_weights = self.norm_func(score) + # normalize attention scores + scores_norm = self.norm_func(score) + attn_weights = scores_norm - # update attention states - if self.use_loc_attn: - states = (states[0] + attn_weights, attn_weights) - else: - states = () + # apply forward attention + new_alpha = None + if self.use_forward_attn: + new_alpha = self.apply_forward_attention(attn_weights, states[-1]) + attn_weights = new_alpha + + # update states tuple + # states = (cum_attn_weights, attn_weights, new_alpha) + states = self.update_states(states, scores_norm, attn_weights, new_alpha) # context_vector shape after sum == (batch_size, hidden_size) context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, 
transpose_b=False) diff --git a/tf/layers/tacotron2.py b/mozilla_voice_tts/tts/tf/layers/tacotron2.py similarity index 76% rename from tf/layers/tacotron2.py rename to mozilla_voice_tts/tts/tf/layers/tacotron2.py index c6f1a2cd..0dd0593e 100644 --- a/tf/layers/tacotron2.py +++ b/mozilla_voice_tts/tts/tf/layers/tacotron2.py @@ -1,11 +1,12 @@ - import tensorflow as tf from tensorflow import keras -from TTS.tf.utils.tf_utils import shape_list -from TTS.tf.layers.common_layers import Prenet, Attention +from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list +from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention # from tensorflow_addons.seq2seq import AttentionWrapper - +# NOTE: linter has a problem with the current TF release +#pylint: disable=no-value-for-parameter +#pylint: disable=unexpected-keyword-arg class ConvBNBlock(keras.layers.Layer): def __init__(self, filters, kernel_size, activation, **kwargs): super(ConvBNBlock, self).__init__(**kwargs) @@ -58,12 +59,16 @@ class Decoder(keras.layers.Layer): #pylint: disable=unused-argument def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type, prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask, - use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, **kwargs): + use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, enable_tflite, **kwargs): super(Decoder, self).__init__(**kwargs) self.frame_dim = frame_dim self.r_init = tf.constant(r, dtype=tf.int32) self.r = tf.constant(r, dtype=tf.int32) + self.output_dim = r * self.frame_dim self.separate_stopnet = separate_stopnet + self.enable_tflite = enable_tflite + + # layer constants self.max_decoder_steps = tf.constant(1000, dtype=tf.int32) self.stop_thresh = tf.constant(0.5, dtype=tf.float32) @@ -80,7 +85,7 @@ class Decoder(keras.layers.Layer): [self.prenet_dim, self.prenet_dim], bias=False, name='prenet') - self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, 
name=f'{self.name}/attention_rnn', ) + self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name='attention_rnn', ) self.attention_rnn_dropout = keras.layers.Dropout(0.5) # TODO: implement other attn options @@ -94,10 +99,10 @@ class Decoder(keras.layers.Layer): use_trans_agent=use_trans_agent, use_forward_attn_mask=use_forward_attn_mask, name='attention') - self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name=f'{self.name}/decoder_rnn') + self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name='decoder_rnn') self.decoder_rnn_dropout = keras.layers.Dropout(0.5) - self.linear_projection = keras.layers.Dense(self.frame_dim * r, name=f'{self.name}/linear_projection/linear_layer') - self.stopnet = keras.layers.Dense(1, name=f'{self.name}/stopnet/linear_layer') + self.linear_projection = keras.layers.Dense(self.frame_dim * r, name='linear_projection/linear_layer') + self.stopnet = keras.layers.Dense(1, name='stopnet/linear_layer') def set_max_decoder_steps(self, new_max_steps): @@ -105,6 +110,7 @@ class Decoder(keras.layers.Layer): def set_r(self, new_r): self.r = tf.constant(new_r, dtype=tf.int32) + self.output_dim = self.frame_dim * new_r def build_decoder_initial_states(self, batch_size, memory_dim, memory_length): zero_frame = tf.zeros([batch_size, self.frame_dim]) @@ -183,6 +189,7 @@ class Decoder(keras.layers.Layer): outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) + # pre-computes self.attention.process_values(memory) @@ -226,7 +233,70 @@ class Decoder(keras.layers.Layer): outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) return outputs, stop_tokens, attentions + def decode_inference_tflite(self, memory, states): + """Inference with TF-Lite 
compatibility. It assumes + batch_size is 1""" + # init states + # dynamic_shape is not supported in TFLite + outputs = tf.TensorArray(dtype=tf.float32, + size=self.max_decoder_steps, + element_shape=tf.TensorShape( + [self.output_dim]), + clear_after_read=False, + dynamic_size=False) + # stop_flags = tf.TensorArray(dtype=tf.bool, + # size=self.max_decoder_steps, + # element_shape=tf.TensorShape( + # []), + # clear_after_read=False, + # dynamic_size=False) + attentions = () + stop_tokens = () + + # pre-computes + self.attention.process_values(memory) + + # iter vars + stop_flag = tf.constant(False, dtype=tf.bool) + step_count = tf.constant(0, dtype=tf.int32) + + def _body(step, memory, states, outputs, stop_flag): + frame_next = states[0] + prenet_next = self.prenet(frame_next, training=False) + output, stop_token, states, _ = self.step(prenet_next, + states, + None, + training=False) + stop_token = tf.math.sigmoid(stop_token) + stop_flag = tf.greater(stop_token, self.stop_thresh) + stop_flag = tf.reduce_all(stop_flag) + # stop_flags = stop_flags.write(step, tf.logical_not(stop_flag)) + + outputs = outputs.write(step, tf.reshape(output, [-1])) + return step + 1, memory, states, outputs, stop_flag + + cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) + step_count, memory, states, outputs, stop_flag = \ + tf.while_loop(cond, + _body, + loop_vars=(step_count, memory, states, outputs, + stop_flag), + parallel_iterations=32, + swap_memory=True, + maximum_iterations=self.max_decoder_steps) + + + outputs = outputs.stack() + outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter + outputs = tf.expand_dims(outputs, axis=[0]) + outputs = tf.transpose(outputs, [1, 0, 2]) + outputs = tf.reshape(outputs, [1, -1, self.frame_dim]) + return outputs, stop_tokens, attentions + + def call(self, memory, states, frames=None, memory_seq_length=None, training=False): if training: return self.decode(memory, 
states, frames, memory_seq_length) + if self.enable_tflite: + return self.decode_inference_tflite(memory, states) return self.decode_inference(memory, states) diff --git a/tf/models/tacotron2.py b/mozilla_voice_tts/tts/tf/models/tacotron2.py similarity index 70% rename from tf/models/tacotron2.py rename to mozilla_voice_tts/tts/tf/models/tacotron2.py index 101291cf..812fc634 100644 --- a/tf/models/tacotron2.py +++ b/mozilla_voice_tts/tts/tf/models/tacotron2.py @@ -1,10 +1,11 @@ +import tensorflow as tf from tensorflow import keras -from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet -from TTS.tf.utils.tf_utils import shape_list +from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet +from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list -#pylint: disable=too-many-ancestors +#pylint: disable=too-many-ancestors, abstract-method class Tacotron2(keras.models.Model): def __init__(self, num_chars, @@ -23,7 +24,8 @@ class Tacotron2(keras.models.Model): forward_attn_mask=False, location_attn=True, separate_stopnet=True, - bidirectional_decoder=False): + bidirectional_decoder=False, + enable_tflite=False): super(Tacotron2, self).__init__() self.r = r self.decoder_output_dim = decoder_output_dim @@ -31,6 +33,7 @@ class Tacotron2(keras.models.Model): self.bidirectional_decoder = bidirectional_decoder self.num_speakers = num_speakers self.speaker_embed_dim = 256 + self.enable_tflite = enable_tflite self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding') self.encoder = Encoder(512, name='encoder') @@ -48,9 +51,12 @@ class Tacotron2(keras.models.Model): use_location_attn=location_attn, attn_K=attn_K, separate_stopnet=separate_stopnet, - speaker_emb_dim=self.speaker_embed_dim) + speaker_emb_dim=self.speaker_embed_dim, + name='decoder', + enable_tflite=enable_tflite) self.postnet = Postnet(postnet_output_dim, 5, name='postnet') + @tf.function(experimental_relax_shapes=True) def call(self, characters, text_lengths=None, 
frames=None, training=None): if training: return self.training(characters, text_lengths, frames) @@ -79,3 +85,23 @@ class Tacotron2(keras.models.Model): print(output_frames.shape) return decoder_frames, output_frames, attentions, stop_tokens + @tf.function( + experimental_relax_shapes=True, + input_signature=[ + tf.TensorSpec([1, None], dtype=tf.int32), + ],) + def inference_tflite(self, characters): + B, T = shape_list(characters) + embedding_vectors = self.embedding(characters, training=False) + encoder_output = self.encoder(embedding_vectors, training=False) + decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) + decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False) + postnet_frames = self.postnet(decoder_frames, training=False) + output_frames = decoder_frames + postnet_frames + print(output_frames.shape) + return decoder_frames, output_frames, attentions, stop_tokens + + def build_inference(self, ): + # TODO: issue https://github.com/PyCQA/pylint/issues/3613 + input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg + self(input_ids) diff --git a/tf/utils/convert_torch_to_tf_utils.py b/mozilla_voice_tts/tts/tf/utils/convert_torch_to_tf_utils.py similarity index 96% rename from tf/utils/convert_torch_to_tf_utils.py rename to mozilla_voice_tts/tts/tf/utils/convert_torch_to_tf_utils.py index e9e1e8a3..03b41803 100644 --- a/tf/utils/convert_torch_to_tf_utils.py +++ b/mozilla_voice_tts/tts/tf/utils/convert_torch_to_tf_utils.py @@ -1,6 +1,9 @@ import numpy as np import tensorflow as tf +# NOTE: linter has a problem with the current TF release +#pylint: disable=no-value-for-parameter +#pylint: disable=unexpected-keyword-arg def tf_create_dummy_inputs(): """ Create dummy inputs for TF Tacotron2 model """ diff --git a/tf/utils/generic_utils.py b/mozilla_voice_tts/tts/tf/utils/generic_utils.py similarity index 85% rename from tf/utils/generic_utils.py 
rename to mozilla_voice_tts/tts/tf/utils/generic_utils.py index 6368658d..f8131abd 100644 --- a/tf/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/tf/utils/generic_utils.py @@ -1,4 +1,3 @@ -import os import datetime import importlib import pickle @@ -6,9 +5,7 @@ import numpy as np import tensorflow as tf -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): - checkpoint_path = 'tts_tf_checkpoint_{}.pkl'.format(current_step) - checkpoint_path = os.path.join(output_folder, checkpoint_path) +def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): state = { 'model': model.weights, 'optimizer': optimizer, @@ -18,7 +15,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k 'r': r } state.update(kwargs) - pickle.dump(state, open(checkpoint_path, 'wb')) + pickle.dump(state, open(output_path, 'wb')) def load_checkpoint(model, checkpoint_path): @@ -27,7 +24,13 @@ def load_checkpoint(model, checkpoint_path): tf_vars = model.weights for tf_var in tf_vars: layer_name = tf_var.name - chkp_var_value = chkp_var_dict[layer_name] + try: + chkp_var_value = chkp_var_dict[layer_name] + except KeyError: + class_name = list(chkp_var_dict.keys())[0].split("/")[0] + layer_name = f"{class_name}/{layer_name}" + chkp_var_value = chkp_var_dict[layer_name] + tf.keras.backend.set_value(tf_var, chkp_var_value) if 'r' in checkpoint.keys(): model.decoder.set_r(checkpoint['r']) @@ -72,9 +75,9 @@ def count_parameters(model, c): return model.count_params() -def setup_model(num_chars, num_speakers, c): +def setup_model(num_chars, num_speakers, c, enable_tflite=False): print(" > Using model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower()) + MyModel = importlib.import_module('mozilla_voice_tts.tts.tf.models.' + c.model.lower()) MyModel = getattr(MyModel, c.model) if c.model.lower() in "tacotron": raise NotImplementedError(' [!] 
Tacotron model is not ready.') @@ -95,5 +98,6 @@ def setup_model(num_chars, num_speakers, c): location_attn=c.location_attn, attn_K=c.attention_heads, separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder) + bidirectional_decoder=c.bidirectional_decoder, + enable_tflite=enable_tflite) return model diff --git a/mozilla_voice_tts/tts/tf/utils/io.py b/mozilla_voice_tts/tts/tf/utils/io.py new file mode 100644 index 00000000..143422d2 --- /dev/null +++ b/mozilla_voice_tts/tts/tf/utils/io.py @@ -0,0 +1,41 @@ +import pickle +import datetime +import tensorflow as tf + + +def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): + state = { + 'model': model.weights, + 'optimizer': optimizer, + 'step': current_step, + 'epoch': epoch, + 'date': datetime.date.today().strftime("%B %d, %Y"), + 'r': r + } + state.update(kwargs) + pickle.dump(state, open(output_path, 'wb')) + + +def load_checkpoint(model, checkpoint_path): + checkpoint = pickle.load(open(checkpoint_path, 'rb')) + chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']} + tf_vars = model.weights + for tf_var in tf_vars: + layer_name = tf_var.name + try: + chkp_var_value = chkp_var_dict[layer_name] + except KeyError: + class_name = list(chkp_var_dict.keys())[0].split("/")[0] + layer_name = f"{class_name}/{layer_name}" + chkp_var_value = chkp_var_dict[layer_name] + + tf.keras.backend.set_value(tf_var, chkp_var_value) + if 'r' in checkpoint.keys(): + model.decoder.set_r(checkpoint['r']) + return model + + +def load_tflite_model(tflite_path): + tflite_model = tf.lite.Interpreter(model_path=tflite_path) + tflite_model.allocate_tensors() + return tflite_model diff --git a/tf/utils/tf_utils.py b/mozilla_voice_tts/tts/tf/utils/tf_utils.py similarity index 100% rename from tf/utils/tf_utils.py rename to mozilla_voice_tts/tts/tf/utils/tf_utils.py diff --git a/mozilla_voice_tts/tts/tf/utils/tflite.py b/mozilla_voice_tts/tts/tf/utils/tflite.py new file 
mode 100644 index 00000000..b8daf254 --- /dev/null +++ b/mozilla_voice_tts/tts/tf/utils/tflite.py @@ -0,0 +1,31 @@ +import tensorflow as tf + + +def convert_tacotron2_to_tflite(model, + output_path=None, + experimental_converter=True): + """Convert Tensorflow Tacotron2 model to TFLite. Save a binary file if output_path is + provided, else return TFLite model.""" + + concrete_function = model.inference_tflite.get_concrete_function() + converter = tf.lite.TFLiteConverter.from_concrete_functions( + [concrete_function]) + converter.experimental_new_converter = experimental_converter + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [ + tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS + ] + tflite_model = converter.convert() + print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.') + if output_path is not None: + # same model binary if outputpath is provided + with open(output_path, 'wb') as f: + f.write(tflite_model) + return None + return tflite_model + + +def load_tflite_model(tflite_path): + tflite_model = tf.lite.Interpreter(model_path=tflite_path) + tflite_model.allocate_tensors() + return tflite_model diff --git a/mozilla_voice_tts/tts/utils/__init__.py b/mozilla_voice_tts/tts/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/data.py b/mozilla_voice_tts/tts/utils/data.py similarity index 99% rename from utils/data.py rename to mozilla_voice_tts/tts/utils/data.py index a83325cb..a75410b4 100644 --- a/utils/data.py +++ b/mozilla_voice_tts/tts/utils/data.py @@ -74,4 +74,3 @@ class StandardScaler(): X *= self.scale_ X += self.mean_ return X - diff --git a/distribute.py b/mozilla_voice_tts/tts/utils/distribute.py similarity index 65% rename from distribute.py rename to mozilla_voice_tts/tts/utils/distribute.py index b0fc8b07..89d4efec 100644 --- a/distribute.py +++ b/mozilla_voice_tts/tts/utils/distribute.py @@ -1,15 +1,11 @@ # edited from 
https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py -import os, sys import math -import time -import subprocess -import argparse + import torch import torch.distributed as dist -from torch.utils.data.sampler import Sampler -from torch.autograd import Variable from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from TTS.utils.generic_utils import create_experiment_folder +from torch.autograd import Variable +from torch.utils.data.sampler import Sampler class DistributedSampler(Sampler): @@ -108,7 +104,7 @@ def apply_gradient_allreduce(module): for param in list(module.parameters()): def allreduce_hook(*_): - Variable._execution_engine.queue_callback(allreduce_params) + Variable._execution_engine.queue_callback(allreduce_params) #pylint: disable=protected-access if param.requires_grad: param.register_hook(allreduce_hook) @@ -118,61 +114,3 @@ def apply_gradient_allreduce(module): module.register_forward_hook(set_needs_reduction) return module - - -def main(): - """ - Call train.py as a new process and pass command arguments - """ - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - args = parser.parse_args() - - # OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name, - # True) - # stdout_path = os.path.join(OUT_PATH, "process_stdout/") - - num_gpus = torch.cuda.device_count() - group_id = time.strftime("%Y_%m_%d-%H%M%S") - - # set arguments for train.py - command = ['train.py'] - command.append('--continue_path={}'.format(args.continue_path)) - command.append('--restore_path={}'.format(args.restore_path)) - command.append('--config_path={}'.format(args.config_path)) - command.append('--group_id=group_{}'.format(group_id)) - command.append('') - - # run processes - processes = [] - for i in range(num_gpus): - my_env = os.environ.copy() - my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) - command[-1] = '--rank={}'.format(i) - stdout = None if i == 0 else open(os.devnull, 'w') - p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env) - processes.append(p) - print(command) - - for p in processes: - p.wait() - - -if __name__ == '__main__': - main() diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py new file mode 100644 index 00000000..d7dca0ac --- /dev/null +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -0,0 +1,229 @@ +import torch +import importlib +import numpy as np +from collections import Counter + +from mozilla_voice_tts.utils.generic_utils import check_argument + + +def split_dataset(items): + is_multi_speaker = False + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = 500 if len(items) * 0.01 > 500 else int( + len(items) * 0.01) + assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." 
+ np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + # most stupid code ever -- Fix it ! + while len(items_eval) < eval_split_size: + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + item_idx = np.random.randint(0, len(items)) + if speaker_counter[items[item_idx][-1]] > 1: + items_eval.append(items[item_idx]) + del items[item_idx] + return items_eval, items + return items[:eval_split_size], items[eval_split_size:] + + +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + batch_size = sequence_length.size(0) + seq_range = torch.arange(0, max_len).long() + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + if sequence_length.is_cuda: + seq_range_expand = seq_range_expand.to(sequence_length.device) + seq_length_expand = ( + sequence_length.unsqueeze(1).expand_as(seq_range_expand)) + # B x T_max + return seq_range_expand < seq_length_expand + + +def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): + print(" > Using model: {}".format(c.model)) + MyModel = importlib.import_module('mozilla_voice_tts.tts.models.' 
+ c.model.lower()) + MyModel = getattr(MyModel, c.model) + if c.model.lower() in "tacotron": + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), + decoder_output_dim=c.audio['num_mels'], + gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + memory_size=c.memory_size, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim) + elif c.model.lower() == "tacotron2": + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=c.audio['num_mels'], + decoder_output_dim=c.audio['num_mels'], + gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim) + return model + + +def check_config(c): + check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + 
check_argument('run_name', c, restricted=True, val_type=str) + check_argument('run_description', c, val_type=str) + + # AUDIO + check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') + check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') + check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # vocabulary parameters + check_argument('characters', c, restricted=False, val_type=dict) + check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, 
restricted='characters' in c.keys(), val_type=str) + check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + + # normalization parameters + check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) + check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100) + check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) + check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + check_argument('r', c, restricted=True, val_type=int, min_val=1) + check_argument('gradual_training', c, restricted=False, val_type=list) + check_argument('loss_masking', c, restricted=True, val_type=bool) + check_argument('apex_amp_level', c, restricted=False, val_type=str) + # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + check_argument('run_eval', c, restricted=True, val_type=bool) + check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + check_argument('noam_schedule', c, restricted=False, val_type=bool) + check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + check_argument('epochs', c, 
restricted=True, val_type=int, min_val=1) + check_argument('lr', c, restricted=True, val_type=float, min_val=0) + check_argument('wd', c, restricted=True, val_type=float, min_val=0) + check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + check_argument('attention_heads', c, restricted=True, val_type=int) + check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + check_argument('windowing', c, restricted=True, val_type=bool) + check_argument('use_forward_attn', c, restricted=True, val_type=bool) + check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + check_argument('transition_agent', c, restricted=True, val_type=bool) + check_argument('transition_agent', c, restricted=True, val_type=bool) + check_argument('location_attn', c, restricted=True, val_type=bool) + check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + check_argument('double_decoder_consistency', c, restricted=True, val_type=bool) + check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) + + # stopnet + check_argument('stopnet', c, restricted=True, val_type=bool) + check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) + check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + 
check_argument('checkpoint', c, restricted=True, val_type=bool) + check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + # pylint: disable=import-outside-toplevel + from mozilla_voice_tts.tts.utils.text import cleaners + check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) + check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker and gst + check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool) + check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str) + check_argument('use_gst', c, restricted=True, val_type=bool) + check_argument('gst', c, restricted=True, val_type=dict) + check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) + check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000) + check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) + check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) + + # datasets - checking only the first entry + check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + check_argument('name', dataset_entry, restricted=True, val_type=str) + check_argument('path', dataset_entry, restricted=True, val_type=str) + 
check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) + check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) diff --git a/utils/io.py b/mozilla_voice_tts/tts/utils/io.py similarity index 58% rename from utils/io.py rename to mozilla_voice_tts/tts/utils/io.py index c1067726..da5c8b27 100644 --- a/utils/io.py +++ b/mozilla_voice_tts/tts/utils/io.py @@ -1,44 +1,13 @@ import os -import json -import re import torch import datetime -class AttrDict(dict): - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - -def load_config(config_path): - config = AttrDict() - with open(config_path, "r", encoding = "utf-8") as f: - input_str = f.read() - input_str = re.sub(r'\\\n', '', input_str) - input_str = re.sub(r'//.*\n', '\n', input_str) - data = json.loads(input_str) - config.update(data) - return config - - -def copy_config_file(config_file, out_path, new_fields): - config_lines = open(config_file, "r", encoding = "utf-8").readlines() - # add extra information fields - for key, value in new_fields.items(): - if isinstance(value, str): - new_line = '"{}":"{}",\n'.format(key, value) - else: - new_line = '"{}":{},\n'.format(key, value) - config_lines.insert(1, new_line) - config_out_file = open(out_path, "w") - config_out_file.writelines(config_lines) - config_out_file.close() - - -def load_checkpoint(model, checkpoint_path, use_cuda=False): +def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False): state = torch.load(checkpoint_path, map_location=torch.device('cpu')) model.load_state_dict(state['model']) + if amp and 'amp' in state: + amp.load_state_dict(state['amp']) if use_cuda: model.cuda() # set model stepsize @@ -47,7 +16,7 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False): return model, state -def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs): +def save_model(model, optimizer, current_step, epoch, r, 
output_path, amp_state_dict=None, **kwargs): new_state_dict = model.state_dict() state = { 'model': new_state_dict, @@ -57,6 +26,8 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs): 'date': datetime.date.today().strftime("%B %d, %Y"), 'r': r } + if amp_state_dict: + state['amp'] = amp_state_dict state.update(kwargs) torch.save(state, output_path) diff --git a/utils/measures.py b/mozilla_voice_tts/tts/utils/measures.py similarity index 97% rename from utils/measures.py rename to mozilla_voice_tts/tts/utils/measures.py index 01d25695..fdd31242 100644 --- a/utils/measures.py +++ b/mozilla_voice_tts/tts/utils/measures.py @@ -1,6 +1,3 @@ -import torch - - def alignment_diagonal_score(alignments, binary=False): """ Compute how diagonal alignment predictions are. It is useful diff --git a/utils/speakers.py b/mozilla_voice_tts/tts/utils/speakers.py similarity index 79% rename from utils/speakers.py rename to mozilla_voice_tts/tts/utils/speakers.py index 8aa612a8..156e42af 100644 --- a/utils/speakers.py +++ b/mozilla_voice_tts/tts/utils/speakers.py @@ -1,8 +1,6 @@ import os import json -from TTS.datasets.preprocess import get_preprocessor_by_name - def make_speakers_json_path(out_path): """Returns conventional speakers.json location.""" @@ -12,12 +10,15 @@ def make_speakers_json_path(out_path): def load_speaker_mapping(out_path): """Loads speaker mapping if already present.""" try: - with open(make_speakers_json_path(out_path)) as f: + if os.path.splitext(out_path)[1] == '.json': + json_file = out_path + else: + json_file = make_speakers_json_path(out_path) + with open(json_file) as f: return json.load(f) except FileNotFoundError: return {} - def save_speaker_mapping(out_path, speaker_mapping): """Saves speaker mapping if not yet present.""" speakers_json_path = make_speakers_json_path(out_path) diff --git a/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py similarity index 65% rename from utils/synthesis.py rename to 
mozilla_voice_tts/tts/utils/synthesis.py index a53c12dc..0952c936 100644 --- a/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -37,23 +37,25 @@ def numpy_to_tf(np_array, dtype): return tensor -def compute_style_mel(style_wav, ap): - style_mel = ap.melspectrogram( - ap.load_wav(style_wav)).expand_dims(0) +def compute_style_mel(style_wav, ap, cuda=False): + style_mel = torch.FloatTensor(ap.melspectrogram( + ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) + if cuda: + return style_mel.cuda() return style_mel -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None): if CONFIG.use_gst: decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id) + inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) else: if truncated: decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id) + inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) else: decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id) + inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) return decoder_output, postnet_output, alignments, stop_tokens @@ -70,6 +72,31 @@ def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=No return decoder_output, postnet_output, alignments, stop_tokens +def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): + if CONFIG.use_gst and style_mel is not None: + raise NotImplementedError(' [!] GST inference not implemented for TfLite') + if truncated: + raise NotImplementedError(' [!] Truncated inference not implemented for TfLite') + if speaker_id is not None: + raise NotImplementedError(' [!] 
Multi-Speaker not implemented for TfLite') + # get input and output details + input_details = model.get_input_details() + output_details = model.get_output_details() + # reshape input tensor for the new input shape + model.resize_tensor_input(input_details[0]['index'], inputs.shape) + model.allocate_tensors() + detail = input_details[0] + # input_shape = detail['shape'] + model.set_tensor(detail['index'], inputs) + # run the model + model.invoke() + # collect outputs + decoder_output = model.get_tensor(output_details[0]['index']) + postnet_output = model.get_tensor(output_details[1]['index']) + # tflite model only returns feature frames + return decoder_output, postnet_output, None, None + + def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): postnet_output = postnet_output[0].data.cpu().numpy() decoder_output = decoder_output[0].data.cpu().numpy() @@ -86,25 +113,42 @@ def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): return postnet_output, decoder_output, alignment, stop_tokens +def parse_outputs_tflite(postnet_output, decoder_output): + postnet_output = postnet_output[0] + decoder_output = decoder_output[0] + return postnet_output, decoder_output + + def trim_silence(wav, ap): return wav[:ap.find_endpoint(wav)] def inv_spectrogram(postnet_output, ap, CONFIG): - if CONFIG.model in ["Tacotron", "TacotronGST"]: + if CONFIG.model.lower() in ["tacotron"]: wav = ap.inv_spectrogram(postnet_output.T) else: wav = ap.inv_melspectrogram(postnet_output.T) return wav -def id_to_torch(speaker_id): +def id_to_torch(speaker_id, cuda=False): if speaker_id is not None: speaker_id = np.asarray(speaker_id) speaker_id = torch.from_numpy(speaker_id).unsqueeze(0) + if cuda: + return speaker_id.cuda() return speaker_id +def embedding_to_torch(speaker_embedding, cuda=False): + if speaker_embedding is not None: + speaker_embedding = np.asarray(speaker_embedding) + speaker_embedding = 
torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor) + if cuda: + return speaker_embedding.cuda() + return speaker_embedding + + # TODO: perform GL with pytorch for batching def apply_griffin_lim(inputs, input_lens, CONFIG, ap): '''Apply griffin-lim to each sample iterating throught the first dimension. @@ -134,15 +178,16 @@ def synthesis(model, enable_eos_bos_chars=False, #pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, + speaker_embedding=None, backend='torch'): """Synthesize voice for the given text. Args: - model (TTS.models): model to synthesize. + model (mozilla_voice_tts.tts.models): model to synthesize. text (str): target text CONFIG (dict): config dictionary to be loaded from config.json. use_cuda (bool): enable cuda. - ap (TTS.utils.audio.AudioProcessor): audio processor to process + ap (mozilla_voice_tts.tts.utils.audio.AudioProcessor): audio processor to process model outputs. speaker_id (int): id of speaker style_wav (str): Uses for style embedding of GST. 
@@ -154,32 +199,50 @@ def synthesis(model, """ # GST processing style_mel = None - if CONFIG.model == "TacotronGST" and style_wav is not None: - style_mel = compute_style_mel(style_wav, ap) + if CONFIG.use_gst and style_wav is not None: + if isinstance(style_wav, dict): + style_mel = style_wav + else: + style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text inputs = text_to_seqvec(text, CONFIG) # pass tensors to backend if backend == 'torch': - speaker_id = id_to_torch(speaker_id) - style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + + if speaker_embedding is not None: + speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) + + if not isinstance(style_mel, dict): + style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) inputs = inputs.unsqueeze(0) - else: + elif backend == 'tf': # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) inputs = numpy_to_tf(inputs, tf.int32) inputs = tf.expand_dims(inputs, 0) + elif backend == 'tflite': + style_mel = numpy_to_tf(style_mel, tf.float32) + inputs = numpy_to_tf(inputs, tf.int32) + inputs = tf.expand_dims(inputs, 0) # synthesize voice if backend == 'torch': decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel) + model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding) postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( postnet_output, decoder_output, alignments, stop_tokens) - else: + elif backend == 'tf': decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( model, inputs, CONFIG, truncated, speaker_id, style_mel) postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( postnet_output, 
decoder_output, alignments, stop_tokens) + elif backend == 'tflite': + decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( + model, inputs, CONFIG, truncated, speaker_id, style_mel) + postnet_output, decoder_output = parse_outputs_tflite( + postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None diff --git a/utils/text/__init__.py b/mozilla_voice_tts/tts/utils/text/__init__.py similarity index 96% rename from utils/text/__init__.py rename to mozilla_voice_tts/tts/utils/text/__init__.py index 79069192..9301d545 100644 --- a/utils/text/__init__.py +++ b/mozilla_voice_tts/tts/utils/text/__init__.py @@ -4,10 +4,11 @@ import re from packaging import version import phonemizer from phonemizer.phonemize import phonemize -from TTS.utils.text import cleaners -from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ +from mozilla_voice_tts.tts.utils.text import cleaners +from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ _eos +# pylint: disable=unnecessary-comprehension # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} @@ -44,7 +45,7 @@ def text2phone(text, language): for punct in punctuations: ph = ph.replace('| |\n', '|'+punct+'| |', 1) elif version.parse(phonemizer.__version__) >= version.parse('2.1'): - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True) + ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True, language_switch='remove-flags') # this is a simple fix for phonemizer. 
# https://github.com/bootphon/phonemizer/issues/32 if punctuations: @@ -77,7 +78,6 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} sequence = [] - text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) to_phonemes = text2phone(clean_text, language) if to_phonemes is None: diff --git a/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py similarity index 85% rename from utils/text/cleaners.py rename to mozilla_voice_tts/tts/utils/text/cleaners.py index 35da8aef..a36ebe67 100644 --- a/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -67,15 +67,16 @@ def remove_aux_symbols(text): text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text) return text - -def replace_symbols(text): +def replace_symbols(text, lang='en'): text = text.replace(';', ',') text = text.replace('-', ' ') text = text.replace(':', ' ') - text = text.replace('&', 'and') + if lang == 'en': + text = text.replace('&', 'and') + elif lang == 'pt': + text = text.replace('&', ' e ') return text - def basic_cleaners(text): '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' text = lowercase(text) @@ -91,6 +92,13 @@ def transliteration_cleaners(text): return text +def basic_german_cleaners(text): + '''Pipeline for German text''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + # TODO: elaborate it def basic_turkish_cleaners(text): '''Pipeline for Turkish text''' @@ -99,7 +107,6 @@ def basic_turkish_cleaners(text): text = collapse_whitespace(text) return text - def english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text) @@ -111,6 +118,14 @@ def english_cleaners(text): text = collapse_whitespace(text) return text +def portuguese_cleaners(text): + '''Basic pipeline for Portuguese text. 
There is no need to expand abbreviation and + numbers, phonemizer already does that''' + text = lowercase(text) + text = replace_symbols(text, lang='pt') + text = remove_aux_symbols(text) + text = collapse_whitespace(text) + return text def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' diff --git a/utils/text/cmudict.py b/mozilla_voice_tts/tts/utils/text/cmudict.py similarity index 100% rename from utils/text/cmudict.py rename to mozilla_voice_tts/tts/utils/text/cmudict.py diff --git a/mozilla_voice_tts/tts/utils/text/number_norm.py b/mozilla_voice_tts/tts/utils/text/number_norm.py new file mode 100644 index 00000000..50de8d5c --- /dev/null +++ b/mozilla_voice_tts/tts/utils/text/number_norm.py @@ -0,0 +1,70 @@ +""" from https://github.com/keithito/tacotron """ + +import inflect +import re + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + if dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + if cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + return 'zero 
dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if 1000 < num < 3000: + if num == 2000: + return 'two thousand' + if 2000 < num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + if num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + return _inflect.number_to_words(num, + andword='', + zero='oh', + group=2).replace(', ', ' ') + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/utils/text/symbols.py b/mozilla_voice_tts/tts/utils/text/symbols.py similarity index 100% rename from utils/text/symbols.py rename to mozilla_voice_tts/tts/utils/text/symbols.py diff --git a/utils/visual.py b/mozilla_voice_tts/tts/utils/visual.py similarity index 81% rename from utils/visual.py rename to mozilla_voice_tts/tts/utils/visual.py index 87fbc8e4..17be49c7 100644 --- a/utils/visual.py +++ b/mozilla_voice_tts/tts/utils/visual.py @@ -3,10 +3,10 @@ import librosa import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt -from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme +from mozilla_voice_tts.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme -def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None): +def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False): if isinstance(alignment, torch.Tensor): alignment_ = alignment.detach().cpu().numpy().squeeze() else: @@ -24,23 +24,28 @@ def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None): plt.tight_layout() if title is not 
None: plt.title(title) + if not output_fig: + plt.close() return fig -def plot_spectrogram(linear_output, audio, fig_size=(16, 10)): - if isinstance(linear_output, torch.Tensor): - linear_output_ = linear_output.detach().cpu().numpy().squeeze() +def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False): + if isinstance(spectrogram, torch.Tensor): + spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T else: - linear_output_ = linear_output - spectrogram = audio._denormalize(linear_output_.T) # pylint: disable=protected-access + spectrogram_ = spectrogram.T + if ap is not None: + spectrogram_ = ap._denormalize(spectrogram_) # pylint: disable=protected-access fig = plt.figure(figsize=fig_size) - plt.imshow(spectrogram, aspect="auto", origin="lower") + plt.imshow(spectrogram_, aspect="auto", origin="lower") plt.colorbar() plt.tight_layout() + if not output_fig: + plt.close() return fig -def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)): +def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24), output_fig=False): if decoder_output is not None: num_plot = 4 else: @@ -90,3 +95,6 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, print(output_path) fig.savefig(output_path) plt.close() + + if not output_fig: + plt.close() diff --git a/mozilla_voice_tts/utils/__init__.py b/mozilla_voice_tts/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/audio.py b/mozilla_voice_tts/utils/audio.py similarity index 90% rename from utils/audio.py rename to mozilla_voice_tts/utils/audio.py index 040e50d8..46c459f9 100644 --- a/utils/audio.py +++ b/mozilla_voice_tts/utils/audio.py @@ -3,8 +3,9 @@ import soundfile as sf import numpy as np import scipy.io.wavfile import scipy.signal +import pyworld as pw -from TTS.utils.data import 
StandardScaler +from mozilla_voice_tts.tts.utils.data import StandardScaler class AudioProcessor(object): @@ -17,7 +18,7 @@ class AudioProcessor(object): hop_length=None, win_length=None, ref_level_db=None, - num_freq=None, + fft_size=1024, power=None, preemphasis=0.0, signal_norm=None, @@ -25,6 +26,8 @@ class AudioProcessor(object): max_norm=None, mel_fmin=None, mel_fmax=None, + spec_gain=20, + stft_pad_mode='reflect', clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, @@ -41,7 +44,7 @@ class AudioProcessor(object): self.frame_shift_ms = frame_shift_ms self.frame_length_ms = frame_length_ms self.ref_level_db = ref_level_db - self.num_freq = num_freq + self.fft_size = fft_size self.power = power self.preemphasis = preemphasis self.griffin_lim_iters = griffin_lim_iters @@ -49,6 +52,8 @@ class AudioProcessor(object): self.symmetric_norm = symmetric_norm self.mel_fmin = mel_fmin or 0 self.mel_fmax = mel_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence @@ -57,12 +62,14 @@ class AudioProcessor(object): self.stats_path = stats_path # setup stft parameters if hop_length is None: - self.n_fft, self.hop_length, self.win_length = self._stft_parameters() + # compute stft parameters from given time values + self.hop_length, self.win_length = self._stft_parameters() else: + # use stft parameters from config file self.hop_length = hop_length self.win_length = win_length - self.n_fft = (self.num_freq - 1) * 2 assert min_level_db != 0.0, " [!] min_level_db is 0" + assert self.win_length <= self.fft_size, " [!] 
win_length cannot be larger than fft_size" members = vars(self) for key, value in members.items(): print(" | > {}:{}".format(key, value)) @@ -84,19 +91,18 @@ class AudioProcessor(object): assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( self.sample_rate, - self.n_fft, + self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax) def _stft_parameters(self, ): """Compute necessary stft parameters with given time values""" - n_fft = (self.num_freq - 1) * 2 factor = self.frame_length_ms / self.frame_shift_ms assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) win_length = int(hop_length * factor) - return n_fft, hop_length, win_length + return hop_length, win_length ### normalization ### def _normalize(self, S): @@ -108,7 +114,7 @@ class AudioProcessor(object): if hasattr(self, 'mel_scaler'): if S.shape[0] == self.num_mels: return self.mel_scaler.transform(S.T).T - elif S.shape[0] == self.n_fft / 2: + elif S.shape[0] == self.fft_size / 2: return self.linear_scaler.transform(S.T).T else: raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') @@ -118,7 +124,7 @@ class AudioProcessor(object): if self.symmetric_norm: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: - S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) + S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) # pylint: disable=invalid-unary-operand-type return S_norm else: S_norm = self.max_norm * S_norm @@ -137,13 +143,13 @@ class AudioProcessor(object): if hasattr(self, 'mel_scaler'): if S_denorm.shape[0] == self.num_mels: return self.mel_scaler.inverse_transform(S_denorm.T).T - elif S_denorm.shape[0] == self.n_fft / 2: + elif S_denorm.shape[0] == self.fft_size / 2: return self.linear_scaler.inverse_transform(S_denorm.T).T else: raise RuntimeError(' [!] 
Mean-Var stats does not match the given feature dimensions.') if self.symmetric_norm: if self.clip_norm: - S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) + S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) #pylint: disable=invalid-unary-operand-type S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db else: @@ -182,11 +188,11 @@ class AudioProcessor(object): ### DB and AMP conversion ### # pylint: disable=no-self-use def _amp_to_db(self, x): - return 20 * np.log10(np.maximum(1e-5, x)) + return self.spec_gain * np.log10(np.maximum(1e-5, x)) # pylint: disable=no-self-use def _db_to_amp(self, x): - return np.power(10.0, x * 0.05) + return np.power(10.0, x / self.spec_gain) ### Preemphasis ### def apply_preemphasis(self, x): @@ -252,10 +258,10 @@ class AudioProcessor(object): def _stft(self, y): return librosa.stft( y=y, - n_fft=self.n_fft, + n_fft=self.fft_size, hop_length=self.hop_length, win_length=self.win_length, - pad_mode='constant' + pad_mode=self.stft_pad_mode, ) def _istft(self, y): @@ -280,6 +286,17 @@ class AudioProcessor(object): return 0, pad return pad // 2, pad // 2 + pad % 2 + ### Compute F0 ### + def compute_f0(self, x): + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) + return f0 + ### Audio Processing ### def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(self.sample_rate * min_silence_sec) diff --git a/utils/console_logger.py b/mozilla_voice_tts/utils/console_logger.py similarity index 81% rename from utils/console_logger.py rename to mozilla_voice_tts/utils/console_logger.py index 0b361bb8..0311e0ca 100644 --- a/utils/console_logger.py +++ b/mozilla_voice_tts/utils/console_logger.py @@ -1,5 +1,5 @@ import datetime -from TTS.utils.io 
import AttrDict +from mozilla_voice_tts.utils.io import AttrDict tcolors = AttrDict({ @@ -35,8 +35,7 @@ class ConsoleLogger(): def print_train_start(self): print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}") - def print_train_step(self, batch_steps, step, global_step, avg_spec_length, - avg_text_length, step_time, loader_time, lr, + def print_train_step(self, batch_steps, step, global_step, log_dict, loss_dict, avg_loss_dict): indent = " | > " print() @@ -48,15 +47,20 @@ class ConsoleLogger(): log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) else: log_text += "{}{}: {:.5f} \n".format(indent, key, value) - log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\ - f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}" + for idx, (key, value) in enumerate(log_dict.items()): + if isinstance(value, list): + log_text += f"{indent}{key}: {value[0]:.{value[1]}f}" + else: + log_text += f"{indent}{key}: {value}" + if idx < len(log_dict)-1: + log_text += "\n" print(log_text, flush=True) # pylint: disable=unused-argument def print_train_epoch_end(self, global_step, epoch, epoch_time, print_dict): indent = " | > " - log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n" + log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n" for key, value in print_dict.items(): log_text += "{}{}: {:.5f}\n".format(indent, key, value) print(log_text, flush=True) @@ -82,14 +86,17 @@ class ConsoleLogger(): tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given - color = tcolors.FAIL + color = '' sign = '+' diff = 0 - if self.old_eval_loss_dict is not None: + if self.old_eval_loss_dict is not None and key in self.old_eval_loss_dict: diff = value - 
self.old_eval_loss_dict[key] - if diff <= 0: + if diff < 0: color = tcolors.OKGREEN sign = '' + elif diff > 0: + color = tcolors.FAIL + sign = '+' log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff) self.old_eval_loss_dict = avg_loss_dict print(log_text, flush=True) diff --git a/mozilla_voice_tts/utils/generic_utils.py b/mozilla_voice_tts/utils/generic_utils.py new file mode 100644 index 00000000..dcfbbdc3 --- /dev/null +++ b/mozilla_voice_tts/utils/generic_utils.py @@ -0,0 +1,156 @@ +import os +import glob +import shutil +import datetime +import subprocess + + +def get_git_branch(): + try: + out = subprocess.check_output(["git", "branch"]).decode("utf8") + current = next(line for line in out.split("\n") + if line.startswith("*")) + current.replace("* ", "") + except subprocess.CalledProcessError: + current = "inside_docker" + return current + + +def get_commit_hash(): + """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" + # try: + # subprocess.check_output(['git', 'diff-index', '--quiet', + # 'HEAD']) # Verify client is clean + # except: + # raise RuntimeError( + # " !! 
Commit before training to get the commit hash.") + try: + commit = subprocess.check_output( + ['git', 'rev-parse', '--short', 'HEAD']).decode().strip() + # Not copying .git folder into docker container + except subprocess.CalledProcessError: + commit = "0000000" + print(' > Git Hash: {}'.format(commit)) + return commit + + +def create_experiment_folder(root_path, model_name, debug): + """ Create a folder with the current date and time """ + date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") + if debug: + commit_hash = 'debug' + else: + commit_hash = get_commit_hash() + output_folder = os.path.join( + root_path, model_name + '-' + date_str + '-' + commit_hash) + os.makedirs(output_folder, exist_ok=True) + print(" > Experiment folder: {}".format(output_folder)) + return output_folder + + +def remove_experiment_folder(experiment_path): + """Check folder if there is a checkpoint, otherwise remove the folder""" + + checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") + if not checkpoint_files: + if os.path.exists(experiment_path): + shutil.rmtree(experiment_path, ignore_errors=True) + print(" ! Run is removed from {}".format(experiment_path)) + else: + print(" ! Run is kept in {}".format(experiment_path)) + + +def count_parameters(model): + r"""Count number of trainable parameters in a network""" + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def set_init_dict(model_dict, checkpoint_state, c): + # Partial initialization: if there is a mismatch with new and old layer, it is skipped. + for k, v in checkpoint_state.items(): + if k not in model_dict: + print(" | > Layer missing in the model definition: {}".format(k)) + # 1. filter out unnecessary keys + pretrained_dict = { + k: v + for k, v in checkpoint_state.items() if k in model_dict + } + # 2. filter out different size layers + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if v.numel() == model_dict[k].numel() + } + # 3. 
skip reinit layers + if c.reinit_layers is not None: + for reinit_layer_name in c.reinit_layers: + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if reinit_layer_name not in k + } + # 4. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + print(" | > {} / {} layers are restored.".format(len(pretrained_dict), + len(model_dict))) + return model_dict + +class KeepAverage(): + def __init__(self): + self.avg_values = {} + self.iters = {} + + def __getitem__(self, key): + return self.avg_values[key] + + def items(self): + return self.avg_values.items() + + def add_value(self, name, init_val=0, init_iter=0): + self.avg_values[name] = init_val + self.iters[name] = init_iter + + def update_value(self, name, value, weighted_avg=False): + if name not in self.avg_values: + # add value if not exist before + self.add_value(name, init_val=value) + else: + # else update existing value + if weighted_avg: + self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value + self.iters[name] += 1 + else: + self.avg_values[name] = self.avg_values[name] * \ + self.iters[name] + value + self.iters[name] += 1 + self.avg_values[name] /= self.iters[name] + + def add_values(self, name_dict): + for key, value in name_dict.items(): + self.add_value(key, init_val=value) + + def update_values(self, value_dict): + for key, value in value_dict.items(): + self.update_value(key, value) + + +def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None): + if alternative in c.keys() and c[alternative] is not None: + return + if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] 
{name} is not a valid value' + if isinstance(val_type, list): + is_valid = False + for typ in val_type: + if isinstance(c[name], typ): + is_valid = True + assert is_valid or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + elif val_type: + assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' diff --git a/mozilla_voice_tts/utils/io.py b/mozilla_voice_tts/utils/io.py new file mode 100644 index 00000000..434c3a03 --- /dev/null +++ b/mozilla_voice_tts/utils/io.py @@ -0,0 +1,50 @@ +import re +import json +from shutil import copyfile + +class AttrDict(dict): + """A custom dict which converts dict keys + to class attributes""" + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def load_config(config_path): + """Load config files and discard comments + + Args: + config_path (str): path to config file. + """ + config = AttrDict() + with open(config_path, "r") as f: + input_str = f.read() + # handle comments + input_str = re.sub(r'\\\n', '', input_str) + input_str = re.sub(r'//.*\n', '\n', input_str) + data = json.loads(input_str) + config.update(data) + return config + + +def copy_config_file(config_file, out_path, new_fields): + """Copy config.json to training folder and add + new fields. + + Args: + config_file (str): path to config file. + out_path (str): output path to copy the file. + new_fields (dict): new fileds to be added or edited + in the config file. 
+ """ + config_lines = open(config_file, "r").readlines() + # add extra information fields + for key, value in new_fields.items(): + if isinstance(value, str): + new_line = '"{}":"{}",\n'.format(key, value) + else: + new_line = '"{}":{},\n'.format(key, value) + config_lines.insert(1, new_line) + config_out_file = open(out_path, "w") + config_out_file.writelines(config_lines) + config_out_file.close() diff --git a/utils/radam.py b/mozilla_voice_tts/utils/radam.py similarity index 97% rename from utils/radam.py rename to mozilla_voice_tts/utils/radam.py index 4724b705..58cec920 100644 --- a/utils/radam.py +++ b/mozilla_voice_tts/utils/radam.py @@ -2,7 +2,7 @@ import math import torch -from torch.optim.optimizer import Optimizer, required +from torch.optim.optimizer import Optimizer class RAdam(Optimizer): @@ -25,7 +25,7 @@ class RAdam(Optimizer): defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) super(RAdam, self).__init__(params, defaults) - def __setstate__(self, state): + def __setstate__(self, state): # pylint: disable=useless-super-delegation super(RAdam, self).__setstate__(state) def step(self, closure=None): diff --git a/utils/tensorboard_logger.py b/mozilla_voice_tts/utils/tensorboard_logger.py similarity index 74% rename from utils/tensorboard_logger.py rename to mozilla_voice_tts/utils/tensorboard_logger.py index 15fe04e4..4ee12d74 100644 --- a/utils/tensorboard_logger.py +++ b/mozilla_voice_tts/utils/tensorboard_logger.py @@ -3,7 +3,8 @@ from tensorboardX import SummaryWriter class TensorboardLogger(object): - def __init__(self, log_dir): + def __init__(self, log_dir, model_name): + self.model_name = model_name self.writer = SummaryWriter(log_dir) self.train_stats = {} self.eval_stats = {} @@ -46,35 +47,35 @@ class TensorboardLogger(object): for key, value in audios.items(): try: self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate) - except: + 
except RuntimeError: traceback.print_exc() def tb_train_iter_stats(self, step, stats): - self.dict_to_tb_scalar("TrainIterStats", stats, step) + self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) def tb_train_epoch_stats(self, step, stats): - self.dict_to_tb_scalar("TrainEpochStats", stats, step) + self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step) def tb_train_figures(self, step, figures): - self.dict_to_tb_figure("TrainFigures", figures, step) + self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step) def tb_train_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios("TrainAudios", audios, step, sample_rate) + self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate) def tb_eval_stats(self, step, stats): - self.dict_to_tb_scalar("EvalStats", stats, step) + self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step) def tb_eval_figures(self, step, figures): - self.dict_to_tb_figure("EvalFigures", figures, step) + self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step) def tb_eval_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate) + self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate) def tb_test_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios("TestAudios", audios, step, sample_rate) + self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate) def tb_test_figures(self, step, figures): - self.dict_to_tb_figure("TestFigures", figures, step) + self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step) def tb_add_text(self, title, text, step): self.writer.add_text(title, text, step) diff --git a/utils/training.py b/mozilla_voice_tts/utils/training.py similarity index 75% rename from utils/training.py rename to mozilla_voice_tts/utils/training.py index 5ce7948b..8166562c 100644 --- a/utils/training.py +++ 
b/mozilla_voice_tts/utils/training.py @@ -2,13 +2,32 @@ import torch import numpy as np -def check_update(model, grad_clip, ignore_stopnet=False): +def setup_torch_training_env(cudnn_enable, cudnn_benchmark): + torch.backends.cudnn.enabled = cudnn_enable + torch.backends.cudnn.benchmark = cudnn_benchmark + torch.manual_seed(54321) + use_cuda = torch.cuda.is_available() + num_gpus = torch.cuda.device_count() + print(" > Using CUDA: ", use_cuda) + print(" > Number of GPUs: ", num_gpus) + return use_cuda, num_gpus + + +def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): r'''Check model gradient against unexpected jumps and failures''' skip_flag = False if ignore_stopnet: - grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) + if not amp_opt_params: + grad_norm = torch.nn.utils.clip_grad_norm_( + [param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) + else: + grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) else: - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + if not amp_opt_params: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + else: + grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) + # compatibility with different torch versions if isinstance(grad_norm, float): if np.isinf(grad_norm): diff --git a/mozilla_voice_tts/vocoder/README.md b/mozilla_voice_tts/vocoder/README.md new file mode 100644 index 00000000..e0ae8f21 --- /dev/null +++ b/mozilla_voice_tts/vocoder/README.md @@ -0,0 +1,39 @@ +# Mozilla TTS Vocoders (Experimental) + +Here there are vocoder model implementations which can be combined with the other TTS models. 
+ +Currently, the following models are implemented: + +- Melgan +- MultiBand-Melgan +- ParallelWaveGAN +- GAN-TTS (Discriminator Only) + +It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework. + +## Training a model + +You can see an example Colab Notebook (coming soon) that trains MelGAN with the LJSpeech dataset. + +In order to train a new model, you need to gather all wav files into a folder and give this folder to `data_path` in `config.json`. + +You need to define the other relevant parameters in your `config.json` and then start training with the following command. + +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json``` + +Example config files can be found under the `mozilla_voice_tts/vocoder/configs/` folder. + +You can continue a previous training run with the following command. + +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder``` + +You can fine-tune a pre-trained model with the following command. + +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar``` + +Restoring a model starts a new training run in a different folder. It only restores the model weights from the given checkpoint file. Continuing a training run, however, resumes in the same directory where the previous run left off. + +You can also follow your training runs on Tensorboard as you do with our TTS models. + +## Acknowledgement +Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work. 
diff --git a/mozilla_voice_tts/vocoder/__init__.py b/mozilla_voice_tts/vocoder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mozilla_voice_tts/vocoder/configs/multiband-melgan_and_rwd_config.json b/mozilla_voice_tts/vocoder/configs/multiband-melgan_and_rwd_config.json new file mode 100644 index 00000000..0b751854 --- /dev/null +++ b/mozilla_voice_tts/vocoder/configs/multiband-melgan_and_rwd_config.json @@ -0,0 +1,151 @@ +{ + "run_name": "multiband-melgan-rwd", + "run_description": "multiband melgan with random window discriminator from https://arxiv.org/pdf/1909.11646.pdf", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. 
~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_pqmf": true, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": true, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + // subband multiscale stft loss parameters + "subband_stft_loss_params":{ + "n_ffts": [384, 683, 171], + "hop_lengths": [30, 60, 10], + "win_lengths": [150, 300, 60] + }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "random_window_discriminator", + "discriminator_model_params":{ + "uncond_disc_donwsample_factors": [8, 4], + "cond_disc_downsample_factors": [[8, 4, 
2, 2, 2], [8, 4, 2, 2], [8, 4, 2], [8, 4], [4, 2, 2]], + "cond_disc_out_channels": [[128, 128, 256, 256], [128, 256, 256], [128, 256], [256], [128, 256]], + "window_sizes": [512, 1024, 2048, 4096, 8192] + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 + + // GENERATOR + "generator_model": "multiband_melgan_generator", + "generator_model_params": { + "upsample_factors":[8, 4, 2], + "num_res_blocks": 4 + }, + + // DATASET + "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", + "seq_len": 16384, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "warmup_steps_gen": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "warmup_steps_disc": 4000, + "epochs": 10000, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. 
+ "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 1e-4, + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 10, + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/" +} + diff --git a/mozilla_voice_tts/vocoder/configs/multiband_melgan_config.json b/mozilla_voice_tts/vocoder/configs/multiband_melgan_config.json new file mode 100644 index 00000000..a89d43bb --- /dev/null +++ b/mozilla_voice_tts/vocoder/configs/multiband_melgan_config.json @@ -0,0 +1,144 @@ +{ + "run_name": "multiband-melgan", + "run_description": "multiband melgan mean-var scaling", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. 
+ "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_pqmf": true, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": true, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + // subband multiscale stft loss parameters + "subband_stft_loss_params":{ + "n_ffts": [384, 683, 171], + "hop_lengths": [30, 60, 10], + "win_lengths": [150, 300, 60] + }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "melgan_multiscale_discriminator", + "discriminator_model_params":{ + "base_channels": 16, + "max_channels":512, + "downsample_factors":[4, 4, 4] + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 + + // GENERATOR + "generator_model": "multiband_melgan_generator", + "generator_model_params": { + "upsample_factors":[8, 4, 2], + "num_res_blocks": 4 + }, + + // DATASET + "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", + "feature_path": null, + "seq_len": 16384, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 64, // Batch size for training. 
Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "epochs": 10000, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. + "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 1e-4, + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 10, + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/" +} + diff --git a/mozilla_voice_tts/vocoder/configs/multiband_melgan_config_mozilla.json b/mozilla_voice_tts/vocoder/configs/multiband_melgan_config_mozilla.json new file mode 100644 index 00000000..35f1642a --- /dev/null +++ b/mozilla_voice_tts/vocoder/configs/multiband_melgan_config_mozilla.json @@ -0,0 +1,144 @@ +{ + "run_name": "multiband-melgan", + "run_description": "multiband melgan mean-var scaling", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. 
Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/home/erogol/Data/MozillaMerged22050/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_pqmf": true, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": true, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + // subband multiscale stft loss parameters + "subband_stft_loss_params":{ + "n_ffts": [384, 683, 171], + "hop_lengths": [30, 60, 10], + "win_lengths": [150, 300, 60] + }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "melgan_multiscale_discriminator", + "discriminator_model_params":{ + "base_channels": 16, + "max_channels":512, + "downsample_factors":[4, 4, 4] + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 + + // GENERATOR + "generator_model": "multiband_melgan_generator", + 
"generator_model_params": { + "upsample_factors":[8, 4, 2], + "num_res_blocks": 4 + }, + + // DATASET + "data_path": "/home/erogol/Data/MozillaMerged22050/wavs/", + "feature_path": null, + "seq_len": 16384, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "epochs": 10000, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. + "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 1e-4, + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. 
+ "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 10, + + // PATHS + "output_path": "/home/erogol/Models/Mozilla/" +} + diff --git a/mozilla_voice_tts/vocoder/configs/parallel_wavegan_config.json b/mozilla_voice_tts/vocoder/configs/parallel_wavegan_config.json new file mode 100644 index 00000000..fcd765bd --- /dev/null +++ b/mozilla_voice_tts/vocoder/configs/parallel_wavegan_config.json @@ -0,0 +1,143 @@ +{ + "run_name": "pwgan", + "run_description": "parallel-wavegan training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. 
+ + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_pqmf": true, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": false, // USE ONLY WITH MULTIBAND MODELS + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + // subband multiscale stft loss parameters + "subband_stft_loss_params":{ + "n_ffts": [384, 683, 171], + "hop_lengths": [30, 60, 10], + "win_lengths": [150, 300, 60] 
+ }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "parallel_wavegan_discriminator", + "discriminator_model_params":{ + "num_layers": 10 + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 + + // GENERATOR + "generator_model": "parallel_wavegan_generator", + "generator_model_params": { + "upsample_factors":[4, 4, 4, 4], + "stacks": 3, + "num_res_blocks": 30 + }, + + // DATASET + "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", + "feature_path": null, + "seq_len": 25600, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 6, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "epochs": 10000, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. 
+ "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 1e-4, + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
import glob
import os
import random
from multiprocessing import Manager
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset


class GANDataset(Dataset):
    """Dataset serving (mel, audio) pairs for GAN vocoder training.

    Every entry of ``items`` is either a wav path (features are computed on
    the fly with ``ap.melspectrogram``) or a ``(wav_path, feat_path)`` pair
    (features are read with ``np.load``). When ``return_segments`` is True,
    each sample is a random window of ``seq_len`` audio samples together with
    the matching feature frames, and ``__getitem__`` returns two samples: one
    for the generator and one, picked through a shuffled index mapping, for
    the discriminator.
    """
    def __init__(self,
                 ap,
                 items,
                 seq_len,
                 hop_len,
                 pad_short,
                 conv_pad=2,
                 is_training=True,
                 return_segments=True,
                 use_noise_augment=False,
                 use_cache=False,
                 verbose=False):
        """
        Args:
            ap: audio processor; must provide ``load_wav(path)`` and
                ``melspectrogram(audio)``.
            items: list of wav paths or of ``(wav_path, feat_path)`` pairs.
            seq_len: number of audio samples per returned segment.
            hop_len: audio samples per feature frame; must divide ``seq_len``.
            pad_short: extra samples a clip must have beyond ``seq_len``;
                shorter clips are zero padded (on-the-fly features only).
            conv_pad: extra feature frames added on each side of a segment.
            is_training: noise augmentation is only applied when True.
            return_segments: return random segments instead of full clips.
            use_noise_augment: add low-amplitude Gaussian noise to the audio.
            use_cache: keep loaded (audio, mel) pairs in a shared list.
            verbose: stored on the instance; not used in this class.
        """
        self.ap = ap
        self.item_list = items
        # An empty item list is treated as "compute features" instead of
        # failing on ``items[0]``.
        self.compute_feat = len(items) == 0 or not isinstance(items[0], (tuple, list))
        self.seq_len = seq_len
        self.hop_len = hop_len
        self.pad_short = pad_short
        self.conv_pad = conv_pad
        self.is_training = is_training
        self.return_segments = return_segments
        self.use_cache = use_cache
        self.use_noise_augment = use_noise_augment
        self.verbose = verbose

        assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len."
        self.feat_frame_len = seq_len // hop_len + (2 * conv_pad)

        # map G and D instances
        self.G_to_D_mappings = list(range(len(self.item_list)))
        self.shuffle_mapping()

        # cache acoustic features
        if use_cache:
            self.create_feature_cache()

    def create_feature_cache(self):
        """Allocate one empty cache slot per item in a Manager-backed list."""
        self.manager = Manager()
        self.cache = self.manager.list()
        self.cache += [None for _ in range(len(self.item_list))]

    @staticmethod
    def find_wav_files(path):
        """Return all ``*.wav`` files found recursively under ``path``."""
        return glob.glob(os.path.join(path, '**', '*.wav'), recursive=True)

    def __len__(self):
        return len(self.item_list)

    def __getitem__(self, idx):
        """ Return different items for Generator and Discriminator and
        cache acoustic features """
        if self.return_segments:
            idx2 = self.G_to_D_mappings[idx]
            item1 = self.load_item(idx)
            item2 = self.load_item(idx2)
            return item1, item2
        item1 = self.load_item(idx)
        return item1

    def shuffle_mapping(self):
        """Re-draw the generator-index -> discriminator-index mapping."""
        random.shuffle(self.G_to_D_mappings)

    def load_item(self, idx):
        """Load one item and return it as a ``(mel, audio)`` tensor pair."""
        cached = self.cache[idx] if self.use_cache else None
        if cached is not None:
            audio, mel = cached
        else:
            if self.compute_feat:
                # compute features from wav
                wavpath = self.item_list[idx]
                audio = self.ap.load_wav(wavpath)
                min_len = self.seq_len + self.pad_short
                if len(audio) < min_len:
                    audio = np.pad(audio, (0, min_len - len(audio)),
                                   mode='constant', constant_values=0.0)
                mel = self.ap.melspectrogram(audio)
            else:
                # load precomputed features
                wavpath, feat_path = self.item_list[idx]
                audio = self.ap.load_wav(wavpath)
                mel = np.load(feat_path)
            # Store the freshly loaded pair; previously the cache was only
            # ever read, so it stayed empty and nothing was cached.
            if self.use_cache:
                self.cache[idx] = (audio, mel)

        # correct the audio length wrt padding applied in stft
        audio = np.pad(audio, (0, self.hop_len), mode="edge")
        audio = audio[:mel.shape[-1] * self.hop_len]
        assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}'

        audio = torch.from_numpy(audio).float().unsqueeze(0)
        mel = torch.from_numpy(mel).float().squeeze(0)

        if self.return_segments:
            # pick a random feature window and the audio samples aligned to it
            max_mel_start = mel.shape[1] - self.feat_frame_len
            mel_start = random.randint(0, max_mel_start)
            mel_end = mel_start + self.feat_frame_len
            mel = mel[:, mel_start:mel_end]

            audio_start = mel_start * self.hop_len
            audio = audio[:, audio_start:audio_start + self.seq_len]

        if self.use_noise_augment and self.is_training and self.return_segments:
            audio = audio + (1 / 32768) * torch.randn_like(audio)
        return (mel, audio)


def find_wav_files(data_path):
    """Return all ``*.wav`` files found recursively under ``data_path``."""
    wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True)
    return wav_paths


def find_feat_files(data_path):
    """Return all ``*.npy`` files found recursively under ``data_path``."""
    feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True)
    return feat_paths


def _seeded_shuffle(items):
    """Shuffle ``items`` in place with a private RNG seeded with 0.

    Uses the same generator and seed as ``np.random.seed(0)`` followed by
    ``np.random.shuffle`` but leaves the global NumPy RNG state untouched.
    """
    np.random.RandomState(0).shuffle(items)


def load_wav_data(data_path, eval_split_size):
    """Split the wav files under ``data_path`` into (eval, train) path lists.

    The first ``eval_split_size`` paths after a seeded shuffle form the
    evaluation split; the remaining paths form the training split.
    """
    # glob returns files in arbitrary order; sort first so the seeded
    # shuffle yields the same split on every machine.
    wav_paths = sorted(find_wav_files(data_path))
    _seeded_shuffle(wav_paths)
    return wav_paths[:eval_split_size], wav_paths[eval_split_size:]


def load_wav_feat_data(data_path, feat_path, eval_split_size):
    """Split paired (wav, feature) files into (eval, train) item lists.

    Wav and feature files are paired by sorted order and must have matching
    file stems; an ``AssertionError`` is raised otherwise.
    """
    wav_paths = sorted(find_wav_files(data_path))
    feat_paths = sorted(find_feat_files(feat_path))
    assert len(wav_paths) == len(feat_paths)
    for wav, feat in zip(wav_paths, feat_paths):
        wav_name = Path(wav).stem
        feat_name = Path(feat).stem
        assert wav_name == feat_name

    items = list(zip(wav_paths, feat_paths))
    _seeded_shuffle(items)
    return items[:eval_split_size], items[eval_split_size:]
b/mozilla_voice_tts/vocoder/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mozilla_voice_tts/vocoder/layers/losses.py b/mozilla_voice_tts/vocoder/layers/losses.py new file mode 100644 index 00000000..e705b1e0 --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/losses.py @@ -0,0 +1,309 @@ +import torch + +from torch import nn +from torch.nn import functional as F + + +class TorchSTFT(): + def __init__(self, n_fft, hop_length, win_length, window='hann_window'): + """ Torch based STFT operation """ + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.window = getattr(torch, window)(win_length) + + def __call__(self, x): + # B x D x T x 2 + o = torch.stft(x, + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + + +################################# +# GENERATOR LOSSES +################################# + + +class STFTLoss(nn.Module): + """ Single scale STFT Loss """ + def __init__(self, n_fft, hop_length, win_length): + super(STFTLoss, self).__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.stft = TorchSTFT(n_fft, hop_length, win_length) + + def forward(self, y_hat, y): + y_hat_M = self.stft(y_hat) + y_M = self.stft(y) + # magnitude loss + loss_mag = F.l1_loss(torch.log(y_M), torch.log(y_hat_M)) + # spectral convergence loss + loss_sc = torch.norm(y_M - y_hat_M, p="fro") / torch.norm(y_M, p="fro") + return loss_mag, loss_sc + +class MultiScaleSTFTLoss(torch.nn.Module): + """ Multi scale STFT loss """ + def __init__(self, + n_ffts=(1024, 2048, 512), + hop_lengths=(120, 240, 50), + win_lengths=(600, 1200, 240)): + super(MultiScaleSTFTLoss, self).__init__() + self.loss_funcs = torch.nn.ModuleList() + for n_fft, hop_length, win_length in 
zip(n_ffts, hop_lengths, win_lengths): + self.loss_funcs.append(STFTLoss(n_fft, hop_length, win_length)) + + def forward(self, y_hat, y): + N = len(self.loss_funcs) + loss_sc = 0 + loss_mag = 0 + for f in self.loss_funcs: + lm, lsc = f(y_hat, y) + loss_mag += lm + loss_sc += lsc + loss_sc /= N + loss_mag /= N + return loss_mag, loss_sc + + +class MultiScaleSubbandSTFTLoss(MultiScaleSTFTLoss): + """ Multiscale STFT loss for multi band model outputs """ + # pylint: disable=no-self-use + def forward(self, y_hat, y): + y_hat = y_hat.view(-1, 1, y_hat.shape[2]) + y = y.view(-1, 1, y.shape[2]) + return super().forward(y_hat.squeeze(1), y.squeeze(1)) + + +class MSEGLoss(nn.Module): + """ Mean Squared Generator Loss """ + # pylint: disable=no-self-use + def forward(self, score_real): + loss_fake = F.mse_loss(score_real, score_real.new_ones(score_real.shape)) + return loss_fake + + +class HingeGLoss(nn.Module): + """ Hinge Discriminator Loss """ + # pylint: disable=no-self-use + def forward(self, score_real): + # TODO: this might be wrong + loss_fake = torch.mean(F.relu(1. - score_real)) + return loss_fake + + +################################## +# DISCRIMINATOR LOSSES +################################## + + +class MSEDLoss(nn.Module): + """ Mean Squared Discriminator Loss """ + def __init__(self,): + super(MSEDLoss, self).__init__() + self.loss_func = nn.MSELoss() + + # pylint: disable=no-self-use + def forward(self, score_fake, score_real): + loss_real = self.loss_func(score_real, score_real.new_ones(score_real.shape)) + loss_fake = self.loss_func(score_fake, score_fake.new_zeros(score_fake.shape)) + loss_d = loss_real + loss_fake + return loss_d, loss_real, loss_fake + + +class HingeDLoss(nn.Module): + """ Hinge Discriminator Loss """ + # pylint: disable=no-self-use + def forward(self, score_fake, score_real): + loss_real = torch.mean(F.relu(1. - score_real)) + loss_fake = torch.mean(F.relu(1. 
+ score_fake)) + loss_d = loss_real + loss_fake + return loss_d, loss_real, loss_fake + + +class MelganFeatureLoss(nn.Module): + def __init__(self,): + super(MelganFeatureLoss, self).__init__() + self.loss_func = nn.L1Loss() + + # pylint: disable=no-self-use + def forward(self, fake_feats, real_feats): + loss_feats = 0 + for fake_feat, real_feat in zip(fake_feats, real_feats): + loss_feats += self.loss_func(fake_feat, real_feat) + loss_feats /= len(fake_feats) + len(real_feats) + return loss_feats + + +##################################### +# LOSS WRAPPERS +##################################### + + +def _apply_G_adv_loss(scores_fake, loss_func): + """ Compute G adversarial loss function + and normalize values """ + adv_loss = 0 + if isinstance(scores_fake, list): + for score_fake in scores_fake: + fake_loss = loss_func(score_fake) + adv_loss += fake_loss + adv_loss /= len(scores_fake) + else: + fake_loss = loss_func(scores_fake) + adv_loss = fake_loss + return adv_loss + + +def _apply_D_loss(scores_fake, scores_real, loss_func): + """ Compute D loss func and normalize loss values """ + loss = 0 + real_loss = 0 + fake_loss = 0 + if isinstance(scores_fake, list): + # multi-scale loss + for score_fake, score_real in zip(scores_fake, scores_real): + total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) + loss += total_loss + real_loss += real_loss + fake_loss += fake_loss + # normalize loss values with number of scales + loss /= len(scores_fake) + real_loss /= len(scores_real) + fake_loss /= len(scores_fake) + else: + # single scale loss + total_loss, real_loss, fake_loss = loss_func(scores_fake, scores_real) + loss = total_loss + return loss, real_loss, fake_loss + + +################################## +# MODEL LOSSES +################################## + + +class GeneratorLoss(nn.Module): + def __init__(self, C): + """ Compute Generator Loss values depending on training + configuration """ + super(GeneratorLoss, self).__init__() + 
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\ + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + + self.use_stft_loss = C.use_stft_loss + self.use_subband_stft_loss = C.use_subband_stft_loss + self.use_mse_gan_loss = C.use_mse_gan_loss + self.use_hinge_gan_loss = C.use_hinge_gan_loss + self.use_feat_match_loss = C.use_feat_match_loss + + self.stft_loss_weight = C.stft_loss_weight + self.subband_stft_loss_weight = C.subband_stft_loss_weight + self.mse_gan_loss_weight = C.mse_G_loss_weight + self.hinge_gan_loss_weight = C.hinge_G_loss_weight + self.feat_match_loss_weight = C.feat_match_loss_weight + + if C.use_stft_loss: + self.stft_loss = MultiScaleSTFTLoss(**C.stft_loss_params) + if C.use_subband_stft_loss: + self.subband_stft_loss = MultiScaleSubbandSTFTLoss(**C.subband_stft_loss_params) + if C.use_mse_gan_loss: + self.mse_loss = MSEGLoss() + if C.use_hinge_gan_loss: + self.hinge_loss = HingeGLoss() + if C.use_feat_match_loss: + self.feat_match_loss = MelganFeatureLoss() + + def forward(self, y_hat=None, y=None, scores_fake=None, feats_fake=None, feats_real=None, y_hat_sub=None, y_sub=None): + gen_loss = 0 + adv_loss = 0 + return_dict = {} + + # STFT Loss + if self.use_stft_loss: + stft_loss_mg, stft_loss_sc = self.stft_loss(y_hat.squeeze(1), y.squeeze(1)) + return_dict['G_stft_loss_mg'] = stft_loss_mg + return_dict['G_stft_loss_sc'] = stft_loss_sc + gen_loss += self.stft_loss_weight * (stft_loss_mg + stft_loss_sc) + + # subband STFT Loss + if self.use_subband_stft_loss: + subband_stft_loss_mg, subband_stft_loss_sc = self.subband_stft_loss(y_hat_sub, y_sub) + return_dict['G_subband_stft_loss_mg'] = subband_stft_loss_mg + return_dict['G_subband_stft_loss_sc'] = subband_stft_loss_sc + gen_loss += self.subband_stft_loss_weight * (subband_stft_loss_mg + subband_stft_loss_sc) + + # multiscale MSE adversarial loss + if self.use_mse_gan_loss and scores_fake is not None: + mse_fake_loss = _apply_G_adv_loss(scores_fake, self.mse_loss) + 
return_dict['G_mse_fake_loss'] = mse_fake_loss + adv_loss += self.mse_gan_loss_weight * mse_fake_loss + + # multiscale Hinge adversarial loss + if self.use_hinge_gan_loss and not scores_fake is not None: + hinge_fake_loss = _apply_G_adv_loss(scores_fake, self.hinge_loss) + return_dict['G_hinge_fake_loss'] = hinge_fake_loss + adv_loss += self.hinge_gan_loss_weight * hinge_fake_loss + + # Feature Matching Loss + if self.use_feat_match_loss and not feats_fake: + feat_match_loss = self.feat_match_loss(feats_fake, feats_real) + return_dict['G_feat_match_loss'] = feat_match_loss + adv_loss += self.feat_match_loss_weight * feat_match_loss + return_dict['G_loss'] = gen_loss + adv_loss + return_dict['G_gen_loss'] = gen_loss + return_dict['G_adv_loss'] = adv_loss + return return_dict + + +class DiscriminatorLoss(nn.Module): + """ Compute Discriminator Loss values depending on training + configuration """ + def __init__(self, C): + super(DiscriminatorLoss, self).__init__() + assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\ + " [!] Cannot use HingeGANLoss and MSEGANLoss together." 
+ + self.use_mse_gan_loss = C.use_mse_gan_loss + self.use_hinge_gan_loss = C.use_hinge_gan_loss + + if C.use_mse_gan_loss: + self.mse_loss = MSEDLoss() + if C.use_hinge_gan_loss: + self.hinge_loss = HingeDLoss() + + def forward(self, scores_fake, scores_real): + loss = 0 + return_dict = {} + + if self.use_mse_gan_loss: + mse_D_loss, mse_D_real_loss, mse_D_fake_loss = _apply_D_loss( + scores_fake=scores_fake, + scores_real=scores_real, + loss_func=self.mse_loss) + return_dict['D_mse_gan_loss'] = mse_D_loss + return_dict['D_mse_gan_real_loss'] = mse_D_real_loss + return_dict['D_mse_gan_fake_loss'] = mse_D_fake_loss + loss += mse_D_loss + + if self.use_hinge_gan_loss: + hinge_D_loss, hinge_D_real_loss, hinge_D_fake_loss = _apply_D_loss( + scores_fake=scores_fake, + scores_real=scores_real, + loss_func=self.hinge_loss) + return_dict['D_hinge_gan_loss'] = hinge_D_loss + return_dict['D_hinge_gan_real_loss'] = hinge_D_real_loss + return_dict['D_hinge_gan_fake_loss'] = hinge_D_fake_loss + loss += hinge_D_loss + + return_dict['D_loss'] = loss + return return_dict diff --git a/mozilla_voice_tts/vocoder/layers/melgan.py b/mozilla_voice_tts/vocoder/layers/melgan.py new file mode 100644 index 00000000..58c12a2e --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/melgan.py @@ -0,0 +1,45 @@ +from torch import nn +from torch.nn.utils import weight_norm + + +class ResidualStack(nn.Module): + def __init__(self, channels, num_res_blocks, kernel_size): + super(ResidualStack, self).__init__() + + assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 
+ base_padding = (kernel_size - 1) // 2 + + self.blocks = nn.ModuleList() + for idx in range(num_res_blocks): + layer_kernel_size = kernel_size + layer_dilation = layer_kernel_size**idx + layer_padding = base_padding * layer_dilation + self.blocks += [nn.Sequential( + nn.LeakyReLU(0.2), + nn.ReflectionPad1d(layer_padding), + weight_norm( + nn.Conv1d(channels, + channels, + kernel_size=kernel_size, + dilation=layer_dilation, + bias=True)), + nn.LeakyReLU(0.2), + weight_norm( + nn.Conv1d(channels, channels, kernel_size=1, bias=True)), + )] + + self.shortcuts = nn.ModuleList([ + weight_norm(nn.Conv1d(channels, channels, kernel_size=1, + bias=True)) for i in range(num_res_blocks) + ]) + + def forward(self, x): + for block, shortcut in zip(self.blocks, self.shortcuts): + x = shortcut(x) + block(x) + return x + + def remove_weight_norm(self): + for block, shortcut in zip(self.blocks, self.shortcuts): + nn.utils.remove_weight_norm(block[2]) + nn.utils.remove_weight_norm(block[4]) + nn.utils.remove_weight_norm(shortcut) diff --git a/mozilla_voice_tts/vocoder/layers/parallel_wavegan.py b/mozilla_voice_tts/vocoder/layers/parallel_wavegan.py new file mode 100644 index 00000000..bedfe551 --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/parallel_wavegan.py @@ -0,0 +1,87 @@ +import torch +from torch.nn import functional as F + + +class ResidualBlock(torch.nn.Module): + """Residual block module in WaveNet.""" + def __init__(self, + kernel_size=3, + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + dropout=0.0, + dilation=1, + bias=True, + use_causal_conv=False): + super(ResidualBlock, self).__init__() + self.dropout = dropout + # no future time stamps available + if use_causal_conv: + padding = (kernel_size - 1) * dilation + else: + assert (kernel_size - + 1) % 2 == 0, "Not support even number kernel size." 
+ padding = (kernel_size - 1) // 2 * dilation + self.use_causal_conv = use_causal_conv + + # dilation conv + self.conv = torch.nn.Conv1d(res_channels, + gate_channels, + kernel_size, + padding=padding, + dilation=dilation, + bias=bias) + + # local conditioning + if aux_channels > 0: + self.conv1x1_aux = torch.nn.Conv1d(aux_channels, + gate_channels, + 1, + bias=False) + else: + self.conv1x1_aux = None + + # conv output is split into two groups + gate_out_channels = gate_channels // 2 + self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, + res_channels, + 1, + bias=bias) + self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, + skip_channels, + 1, + bias=bias) + + def forward(self, x, c): + """ + x: B x D_res x T + c: B x D_aux x T + """ + residual = x + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.conv(x) + + # remove future time steps if use_causal_conv conv + x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x + + # split into two part for gated activation + splitdim = 1 + xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) + + # local conditioning + if c is not None: + assert self.conv1x1_aux is not None + c = self.conv1x1_aux(c) + ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) + xa, xb = xa + ca, xb + cb + + x = torch.tanh(xa) * torch.sigmoid(xb) + + # for skip connection + s = self.conv1x1_skip(x) + + # for residual connection + x = (self.conv1x1_out(x) + residual) * (0.5**2) + + return x, s diff --git a/mozilla_voice_tts/vocoder/layers/pqmf.py b/mozilla_voice_tts/vocoder/layers/pqmf.py new file mode 100644 index 00000000..ef5a3507 --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/pqmf.py @@ -0,0 +1,56 @@ +import numpy as np +import torch +import torch.nn.functional as F + +from scipy import signal as sig + + +# adapted from +# https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan +class PQMF(torch.nn.Module): + def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): + super(PQMF, 
self).__init__() + + self.N = N + self.taps = taps + self.cutoff = cutoff + self.beta = beta + + QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta)) + H = np.zeros((N, len(QMF))) + G = np.zeros((N, len(QMF))) + for k in range(N): + constant_factor = (2 * k + 1) * (np.pi / + (2 * N)) * (np.arange(taps + 1) - + ((taps - 1) / 2)) + phase = (-1)**k * np.pi / 4 + H[k] = 2 * QMF * np.cos(constant_factor + phase) + + G[k] = 2 * QMF * np.cos(constant_factor - phase) + + H = torch.from_numpy(H[:, None, :]).float() + G = torch.from_numpy(G[None, :, :]).float() + + self.register_buffer("H", H) + self.register_buffer("G", G) + + updown_filter = torch.zeros((N, N, N)).float() + for k in range(N): + updown_filter[k, k, 0] = 1.0 + self.register_buffer("updown_filter", updown_filter) + self.N = N + + self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) + + def forward(self, x): + return self.analysis(x) + + def analysis(self, x): + return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N) + + def synthesis(self, x): + x = F.conv_transpose1d(x, + self.updown_filter * self.N, + stride=self.N) + x = F.conv1d(x, self.G, padding=self.taps // 2) + return x diff --git a/mozilla_voice_tts/vocoder/layers/qmf.dat b/mozilla_voice_tts/vocoder/layers/qmf.dat new file mode 100644 index 00000000..17eab137 --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/qmf.dat @@ -0,0 +1,640 @@ + 0.0000000e+000 + -5.5252865e-004 + -5.6176926e-004 + -4.9475181e-004 + -4.8752280e-004 + -4.8937912e-004 + -5.0407143e-004 + -5.2265643e-004 + -5.4665656e-004 + -5.6778026e-004 + -5.8709305e-004 + -6.1327474e-004 + -6.3124935e-004 + -6.5403334e-004 + -6.7776908e-004 + -6.9416146e-004 + -7.1577365e-004 + -7.2550431e-004 + -7.4409419e-004 + -7.4905981e-004 + -7.6813719e-004 + -7.7248486e-004 + -7.8343323e-004 + -7.7798695e-004 + -7.8036647e-004 + -7.8014496e-004 + -7.7579773e-004 + -7.6307936e-004 + -7.5300014e-004 + -7.3193572e-004 + -7.2153920e-004 + -6.9179375e-004 + -6.6504151e-004 + 
-6.3415949e-004 + -5.9461189e-004 + -5.5645764e-004 + -5.1455722e-004 + -4.6063255e-004 + -4.0951215e-004 + -3.5011759e-004 + -2.8969812e-004 + -2.0983373e-004 + -1.4463809e-004 + -6.1733441e-005 + 1.3494974e-005 + 1.0943831e-004 + 2.0430171e-004 + 2.9495311e-004 + 4.0265402e-004 + 5.1073885e-004 + 6.2393761e-004 + 7.4580259e-004 + 8.6084433e-004 + 9.8859883e-004 + 1.1250155e-003 + 1.2577885e-003 + 1.3902495e-003 + 1.5443220e-003 + 1.6868083e-003 + 1.8348265e-003 + 1.9841141e-003 + 2.1461584e-003 + 2.3017255e-003 + 2.4625617e-003 + 2.6201759e-003 + 2.7870464e-003 + 2.9469448e-003 + 3.1125421e-003 + 3.2739613e-003 + 3.4418874e-003 + 3.6008268e-003 + 3.7603923e-003 + 3.9207432e-003 + 4.0819753e-003 + 4.2264269e-003 + 4.3730720e-003 + 4.5209853e-003 + 4.6606461e-003 + 4.7932561e-003 + 4.9137604e-003 + 5.0393023e-003 + 5.1407354e-003 + 5.2461166e-003 + 5.3471681e-003 + 5.4196776e-003 + 5.4876040e-003 + 5.5475715e-003 + 5.5938023e-003 + 5.6220643e-003 + 5.6455197e-003 + 5.6389200e-003 + 5.6266114e-003 + 5.5917129e-003 + 5.5404364e-003 + 5.4753783e-003 + 5.3838976e-003 + 5.2715759e-003 + 5.1382275e-003 + 4.9839688e-003 + 4.8109469e-003 + 4.6039530e-003 + 4.3801862e-003 + 4.1251642e-003 + 3.8456408e-003 + 3.5401247e-003 + 3.2091886e-003 + 2.8446758e-003 + 2.4508540e-003 + 2.0274176e-003 + 1.5784683e-003 + 1.0902329e-003 + 5.8322642e-004 + 2.7604519e-005 + -5.4642809e-004 + -1.1568136e-003 + -1.8039473e-003 + -2.4826724e-003 + -3.1933778e-003 + -3.9401124e-003 + -4.7222596e-003 + -5.5337211e-003 + -6.3792293e-003 + -7.2615817e-003 + -8.1798233e-003 + -9.1325330e-003 + -1.0115022e-002 + -1.1131555e-002 + -1.2185000e-002 + -1.3271822e-002 + -1.4390467e-002 + -1.5540555e-002 + -1.6732471e-002 + -1.7943338e-002 + -1.9187243e-002 + -2.0453179e-002 + -2.1746755e-002 + -2.3068017e-002 + -2.4416099e-002 + -2.5787585e-002 + -2.7185943e-002 + -2.8607217e-002 + -3.0050266e-002 + -3.1501761e-002 + -3.2975408e-002 + -3.4462095e-002 + -3.5969756e-002 + -3.7481285e-002 + -3.9005368e-002 
+ -4.0534917e-002 + -4.2064909e-002 + -4.3609754e-002 + -4.5148841e-002 + -4.6684303e-002 + -4.8216572e-002 + -4.9738576e-002 + -5.1255616e-002 + -5.2763075e-002 + -5.4245277e-002 + -5.5717365e-002 + -5.7161645e-002 + -5.8591568e-002 + -5.9983748e-002 + -6.1345517e-002 + -6.2685781e-002 + -6.3971590e-002 + -6.5224711e-002 + -6.6436751e-002 + -6.7607599e-002 + -6.8704383e-002 + -6.9763024e-002 + -7.0762871e-002 + -7.1700267e-002 + -7.2568258e-002 + -7.3362026e-002 + -7.4100364e-002 + -7.4745256e-002 + -7.5313734e-002 + -7.5800836e-002 + -7.6199248e-002 + -7.6499217e-002 + -7.6709349e-002 + -7.6817398e-002 + -7.6823001e-002 + -7.6720492e-002 + -7.6505072e-002 + -7.6174832e-002 + -7.5730576e-002 + -7.5157626e-002 + -7.4466439e-002 + -7.3640601e-002 + -7.2677464e-002 + -7.1582636e-002 + -7.0353307e-002 + -6.8966401e-002 + -6.7452502e-002 + -6.5769067e-002 + -6.3944481e-002 + -6.1960278e-002 + -5.9816657e-002 + -5.7515269e-002 + -5.5046003e-002 + -5.2409382e-002 + -4.9597868e-002 + -4.6630331e-002 + -4.3476878e-002 + -4.0145828e-002 + -3.6641812e-002 + -3.2958393e-002 + -2.9082401e-002 + -2.5030756e-002 + -2.0799707e-002 + -1.6370126e-002 + -1.1762383e-002 + -6.9636862e-003 + -1.9765601e-003 + 3.2086897e-003 + 8.5711749e-003 + 1.4128883e-002 + 1.9883413e-002 + 2.5822729e-002 + 3.1953127e-002 + 3.8277657e-002 + 4.4780682e-002 + 5.1480418e-002 + 5.8370533e-002 + 6.5440985e-002 + 7.2694330e-002 + 8.0137293e-002 + 8.7754754e-002 + 9.5553335e-002 + 1.0353295e-001 + 1.1168269e-001 + 1.2000780e-001 + 1.2850029e-001 + 1.3715518e-001 + 1.4597665e-001 + 1.5496071e-001 + 1.6409589e-001 + 1.7338082e-001 + 1.8281725e-001 + 1.9239667e-001 + 2.0212502e-001 + 2.1197359e-001 + 2.2196527e-001 + 2.3206909e-001 + 2.4230169e-001 + 2.5264803e-001 + 2.6310533e-001 + 2.7366340e-001 + 2.8432142e-001 + 2.9507167e-001 + 3.0590986e-001 + 3.1682789e-001 + 3.2781137e-001 + 3.3887227e-001 + 3.4999141e-001 + 3.6115899e-001 + 3.7237955e-001 + 3.8363500e-001 + 3.9492118e-001 + 4.0623177e-001 + 
4.1756969e-001 + 4.2891199e-001 + 4.4025538e-001 + 4.5159965e-001 + 4.6293081e-001 + 4.7424532e-001 + 4.8552531e-001 + 4.9677083e-001 + 5.0798175e-001 + 5.1912350e-001 + 5.3022409e-001 + 5.4125534e-001 + 5.5220513e-001 + 5.6307891e-001 + 5.7385241e-001 + 5.8454032e-001 + 5.9511231e-001 + 6.0557835e-001 + 6.1591099e-001 + 6.2612427e-001 + 6.3619801e-001 + 6.4612697e-001 + 6.5590163e-001 + 6.6551399e-001 + 6.7496632e-001 + 6.8423533e-001 + 6.9332824e-001 + 7.0223887e-001 + 7.1094104e-001 + 7.1944626e-001 + 7.2774489e-001 + 7.3582118e-001 + 7.4368279e-001 + 7.5131375e-001 + 7.5870808e-001 + 7.6586749e-001 + 7.7277809e-001 + 7.7942875e-001 + 7.8583531e-001 + 7.9197358e-001 + 7.9784664e-001 + 8.0344858e-001 + 8.0876950e-001 + 8.1381913e-001 + 8.1857760e-001 + 8.2304199e-001 + 8.2722753e-001 + 8.3110385e-001 + 8.3469374e-001 + 8.3797173e-001 + 8.4095414e-001 + 8.4362383e-001 + 8.4598185e-001 + 8.4803158e-001 + 8.4978052e-001 + 8.5119715e-001 + 8.5230470e-001 + 8.5310209e-001 + 8.5357206e-001 + 8.5373856e-001 + 8.5357206e-001 + 8.5310209e-001 + 8.5230470e-001 + 8.5119715e-001 + 8.4978052e-001 + 8.4803158e-001 + 8.4598185e-001 + 8.4362383e-001 + 8.4095414e-001 + 8.3797173e-001 + 8.3469374e-001 + 8.3110385e-001 + 8.2722753e-001 + 8.2304199e-001 + 8.1857760e-001 + 8.1381913e-001 + 8.0876950e-001 + 8.0344858e-001 + 7.9784664e-001 + 7.9197358e-001 + 7.8583531e-001 + 7.7942875e-001 + 7.7277809e-001 + 7.6586749e-001 + 7.5870808e-001 + 7.5131375e-001 + 7.4368279e-001 + 7.3582118e-001 + 7.2774489e-001 + 7.1944626e-001 + 7.1094104e-001 + 7.0223887e-001 + 6.9332824e-001 + 6.8423533e-001 + 6.7496632e-001 + 6.6551399e-001 + 6.5590163e-001 + 6.4612697e-001 + 6.3619801e-001 + 6.2612427e-001 + 6.1591099e-001 + 6.0557835e-001 + 5.9511231e-001 + 5.8454032e-001 + 5.7385241e-001 + 5.6307891e-001 + 5.5220513e-001 + 5.4125534e-001 + 5.3022409e-001 + 5.1912350e-001 + 5.0798175e-001 + 4.9677083e-001 + 4.8552531e-001 + 4.7424532e-001 + 4.6293081e-001 + 4.5159965e-001 + 4.4025538e-001 + 
4.2891199e-001 + 4.1756969e-001 + 4.0623177e-001 + 3.9492118e-001 + 3.8363500e-001 + 3.7237955e-001 + 3.6115899e-001 + 3.4999141e-001 + 3.3887227e-001 + 3.2781137e-001 + 3.1682789e-001 + 3.0590986e-001 + 2.9507167e-001 + 2.8432142e-001 + 2.7366340e-001 + 2.6310533e-001 + 2.5264803e-001 + 2.4230169e-001 + 2.3206909e-001 + 2.2196527e-001 + 2.1197359e-001 + 2.0212502e-001 + 1.9239667e-001 + 1.8281725e-001 + 1.7338082e-001 + 1.6409589e-001 + 1.5496071e-001 + 1.4597665e-001 + 1.3715518e-001 + 1.2850029e-001 + 1.2000780e-001 + 1.1168269e-001 + 1.0353295e-001 + 9.5553335e-002 + 8.7754754e-002 + 8.0137293e-002 + 7.2694330e-002 + 6.5440985e-002 + 5.8370533e-002 + 5.1480418e-002 + 4.4780682e-002 + 3.8277657e-002 + 3.1953127e-002 + 2.5822729e-002 + 1.9883413e-002 + 1.4128883e-002 + 8.5711749e-003 + 3.2086897e-003 + -1.9765601e-003 + -6.9636862e-003 + -1.1762383e-002 + -1.6370126e-002 + -2.0799707e-002 + -2.5030756e-002 + -2.9082401e-002 + -3.2958393e-002 + -3.6641812e-002 + -4.0145828e-002 + -4.3476878e-002 + -4.6630331e-002 + -4.9597868e-002 + -5.2409382e-002 + -5.5046003e-002 + -5.7515269e-002 + -5.9816657e-002 + -6.1960278e-002 + -6.3944481e-002 + -6.5769067e-002 + -6.7452502e-002 + -6.8966401e-002 + -7.0353307e-002 + -7.1582636e-002 + -7.2677464e-002 + -7.3640601e-002 + -7.4466439e-002 + -7.5157626e-002 + -7.5730576e-002 + -7.6174832e-002 + -7.6505072e-002 + -7.6720492e-002 + -7.6823001e-002 + -7.6817398e-002 + -7.6709349e-002 + -7.6499217e-002 + -7.6199248e-002 + -7.5800836e-002 + -7.5313734e-002 + -7.4745256e-002 + -7.4100364e-002 + -7.3362026e-002 + -7.2568258e-002 + -7.1700267e-002 + -7.0762871e-002 + -6.9763024e-002 + -6.8704383e-002 + -6.7607599e-002 + -6.6436751e-002 + -6.5224711e-002 + -6.3971590e-002 + -6.2685781e-002 + -6.1345517e-002 + -5.9983748e-002 + -5.8591568e-002 + -5.7161645e-002 + -5.5717365e-002 + -5.4245277e-002 + -5.2763075e-002 + -5.1255616e-002 + -4.9738576e-002 + -4.8216572e-002 + -4.6684303e-002 + -4.5148841e-002 + -4.3609754e-002 + 
-4.2064909e-002 + -4.0534917e-002 + -3.9005368e-002 + -3.7481285e-002 + -3.5969756e-002 + -3.4462095e-002 + -3.2975408e-002 + -3.1501761e-002 + -3.0050266e-002 + -2.8607217e-002 + -2.7185943e-002 + -2.5787585e-002 + -2.4416099e-002 + -2.3068017e-002 + -2.1746755e-002 + -2.0453179e-002 + -1.9187243e-002 + -1.7943338e-002 + -1.6732471e-002 + -1.5540555e-002 + -1.4390467e-002 + -1.3271822e-002 + -1.2185000e-002 + -1.1131555e-002 + -1.0115022e-002 + -9.1325330e-003 + -8.1798233e-003 + -7.2615817e-003 + -6.3792293e-003 + -5.5337211e-003 + -4.7222596e-003 + -3.9401124e-003 + -3.1933778e-003 + -2.4826724e-003 + -1.8039473e-003 + -1.1568136e-003 + -5.4642809e-004 + 2.7604519e-005 + 5.8322642e-004 + 1.0902329e-003 + 1.5784683e-003 + 2.0274176e-003 + 2.4508540e-003 + 2.8446758e-003 + 3.2091886e-003 + 3.5401247e-003 + 3.8456408e-003 + 4.1251642e-003 + 4.3801862e-003 + 4.6039530e-003 + 4.8109469e-003 + 4.9839688e-003 + 5.1382275e-003 + 5.2715759e-003 + 5.3838976e-003 + 5.4753783e-003 + 5.5404364e-003 + 5.5917129e-003 + 5.6266114e-003 + 5.6389200e-003 + 5.6455197e-003 + 5.6220643e-003 + 5.5938023e-003 + 5.5475715e-003 + 5.4876040e-003 + 5.4196776e-003 + 5.3471681e-003 + 5.2461166e-003 + 5.1407354e-003 + 5.0393023e-003 + 4.9137604e-003 + 4.7932561e-003 + 4.6606461e-003 + 4.5209853e-003 + 4.3730720e-003 + 4.2264269e-003 + 4.0819753e-003 + 3.9207432e-003 + 3.7603923e-003 + 3.6008268e-003 + 3.4418874e-003 + 3.2739613e-003 + 3.1125421e-003 + 2.9469448e-003 + 2.7870464e-003 + 2.6201759e-003 + 2.4625617e-003 + 2.3017255e-003 + 2.1461584e-003 + 1.9841141e-003 + 1.8348265e-003 + 1.6868083e-003 + 1.5443220e-003 + 1.3902495e-003 + 1.2577885e-003 + 1.1250155e-003 + 9.8859883e-004 + 8.6084433e-004 + 7.4580259e-004 + 6.2393761e-004 + 5.1073885e-004 + 4.0265402e-004 + 2.9495311e-004 + 2.0430171e-004 + 1.0943831e-004 + 1.3494974e-005 + -6.1733441e-005 + -1.4463809e-004 + -2.0983373e-004 + -2.8969812e-004 + -3.5011759e-004 + -4.0951215e-004 + -4.6063255e-004 + -5.1455722e-004 + -5.5645764e-004 
+ -5.9461189e-004 + -6.3415949e-004 + -6.6504151e-004 + -6.9179375e-004 + -7.2153920e-004 + -7.3193572e-004 + -7.5300014e-004 + -7.6307936e-004 + -7.7579773e-004 + -7.8014496e-004 + -7.8036647e-004 + -7.7798695e-004 + -7.8343323e-004 + -7.7248486e-004 + -7.6813719e-004 + -7.4905981e-004 + -7.4409419e-004 + -7.2550431e-004 + -7.1577365e-004 + -6.9416146e-004 + -6.7776908e-004 + -6.5403334e-004 + -6.3124935e-004 + -6.1327474e-004 + -5.8709305e-004 + -5.6778026e-004 + -5.4665656e-004 + -5.2265643e-004 + -5.0407143e-004 + -4.8937912e-004 + -4.8752280e-004 + -4.9475181e-004 + -5.6176926e-004 + -5.5252865e-004 diff --git a/mozilla_voice_tts/vocoder/layers/upsample.py b/mozilla_voice_tts/vocoder/layers/upsample.py new file mode 100644 index 00000000..13406875 --- /dev/null +++ b/mozilla_voice_tts/vocoder/layers/upsample.py @@ -0,0 +1,101 @@ +import torch +from torch.nn import functional as F + + +class Stretch2d(torch.nn.Module): + def __init__(self, x_scale, y_scale, mode="nearest"): + super(Stretch2d, self).__init__() + self.x_scale = x_scale + self.y_scale = y_scale + self.mode = mode + + def forward(self, x): + """ + x (Tensor): Input tensor (B, C, F, T). + Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), + """ + return F.interpolate( + x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) + + +class UpsampleNetwork(torch.nn.Module): + # pylint: disable=dangerous-default-value + def __init__(self, + upsample_factors, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + use_causal_conv=False, + ): + super(UpsampleNetwork, self).__init__() + self.use_causal_conv = use_causal_conv + self.up_layers = torch.nn.ModuleList() + for scale in upsample_factors: + # interpolation layer + stretch = Stretch2d(scale, 1, interpolate_mode) + self.up_layers += [stretch] + + # conv layer + assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." 
+ freq_axis_padding = (freq_axis_kernel_size - 1) // 2 + kernel_size = (freq_axis_kernel_size, scale * 2 + 1) + if use_causal_conv: + padding = (freq_axis_padding, scale * 2) + else: + padding = (freq_axis_padding, scale) + conv = torch.nn.Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) + self.up_layers += [conv] + + # nonlinear + if nonlinear_activation is not None: + nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) + self.up_layers += [nonlinear] + + def forward(self, c): + """ + c : (B, C, T_in). + Tensor: (B, C, T_upsample) + """ + c = c.unsqueeze(1) # (B, 1, C, T) + for f in self.up_layers: + c = f(c) + return c.squeeze(1) # (B, C, T') + + +class ConvUpsample(torch.nn.Module): + # pylint: disable=dangerous-default-value + def __init__(self, + upsample_factors, + nonlinear_activation=None, + nonlinear_activation_params={}, + interpolate_mode="nearest", + freq_axis_kernel_size=1, + aux_channels=80, + aux_context_window=0, + use_causal_conv=False + ): + super(ConvUpsample, self).__init__() + self.aux_context_window = aux_context_window + self.use_causal_conv = use_causal_conv and aux_context_window > 0 + # To capture wide-context information in conditional features + kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 + # NOTE(kan-bayashi): Here do not use padding because the input is already padded + self.conv_in = torch.nn.Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) + self.upsample = UpsampleNetwork( + upsample_factors=upsample_factors, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, + interpolate_mode=interpolate_mode, + freq_axis_kernel_size=freq_axis_kernel_size, + use_causal_conv=use_causal_conv, + ) + + def forward(self, c): + """ + c : (B, C, T_in). 
+ Tensor: (B, C, T_upsampled), + """ + c_ = self.conv_in(c) + c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ + return self.upsample(c) diff --git a/mozilla_voice_tts/vocoder/models/__init__.py b/mozilla_voice_tts/vocoder/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mozilla_voice_tts/vocoder/models/melgan_discriminator.py b/mozilla_voice_tts/vocoder/models/melgan_discriminator.py new file mode 100644 index 00000000..3847babb --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/melgan_discriminator.py @@ -0,0 +1,78 @@ +import numpy as np +from torch import nn +from torch.nn.utils import weight_norm + + +class MelganDiscriminator(nn.Module): + def __init__(self, + in_channels=1, + out_channels=1, + kernel_sizes=(5, 3), + base_channels=16, + max_channels=1024, + downsample_factors=(4, 4, 4, 4)): + super(MelganDiscriminator, self).__init__() + self.layers = nn.ModuleList() + + layer_kernel_size = np.prod(kernel_sizes) + layer_padding = (layer_kernel_size - 1) // 2 + + # initial layer + self.layers += [ + nn.Sequential( + nn.ReflectionPad1d(layer_padding), + weight_norm( + nn.Conv1d(in_channels, + base_channels, + layer_kernel_size, + stride=1)), nn.LeakyReLU(0.2, inplace=True)) + ] + + # downsampling layers + layer_in_channels = base_channels + for downsample_factor in downsample_factors: + layer_out_channels = min(layer_in_channels * downsample_factor, + max_channels) + layer_kernel_size = downsample_factor * 10 + 1 + layer_padding = (layer_kernel_size - 1) // 2 + layer_groups = layer_in_channels // 4 + self.layers += [ + nn.Sequential( + weight_norm( + nn.Conv1d(layer_in_channels, + layer_out_channels, + kernel_size=layer_kernel_size, + stride=downsample_factor, + padding=layer_padding, + groups=layer_groups)), + nn.LeakyReLU(0.2, inplace=True)) + ] + layer_in_channels = layer_out_channels + + # last 2 layers + layer_padding1 = (kernel_sizes[0] - 1) // 2 + layer_padding2 = (kernel_sizes[1] - 1) // 2 + 
self.layers += [ + nn.Sequential( + weight_norm( + nn.Conv1d(layer_out_channels, + layer_out_channels, + kernel_size=kernel_sizes[0], + stride=1, + padding=layer_padding1)), + nn.LeakyReLU(0.2, inplace=True), + ), + weight_norm( + nn.Conv1d(layer_out_channels, + out_channels, + kernel_size=kernel_sizes[1], + stride=1, + padding=layer_padding2)), + ] + + def forward(self, x): + feats = [] + for layer in self.layers: + x = layer(x) + feats.append(x) + return x, feats diff --git a/mozilla_voice_tts/vocoder/models/melgan_generator.py b/mozilla_voice_tts/vocoder/models/melgan_generator.py new file mode 100644 index 00000000..4c35b1eb --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/melgan_generator.py @@ -0,0 +1,97 @@ +import torch +from torch import nn +from torch.nn.utils import weight_norm + +from mozilla_voice_tts.vocoder.layers.melgan import ResidualStack + + +class MelganGenerator(nn.Module): + def __init__(self, + in_channels=80, + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=(8, 8, 2, 2), + res_kernel=3, + num_res_blocks=3): + super(MelganGenerator, self).__init__() + + # assert model parameters + assert (proj_kernel - + 1) % 2 == 0, " [!] proj_kernel should be an odd number." 
+ + # setup additional model parameters + base_padding = (proj_kernel - 1) // 2 + act_slope = 0.2 + self.inference_padding = 2 + + # initial layer + layers = [] + layers += [ + nn.ReflectionPad1d(base_padding), + weight_norm( + nn.Conv1d(in_channels, + base_channels, + kernel_size=proj_kernel, + stride=1, + bias=True)) + ] + + # upsampling layers and residual stacks + for idx, upsample_factor in enumerate(upsample_factors): + layer_in_channels = base_channels // (2**idx) + layer_out_channels = base_channels // (2**(idx + 1)) + layer_filter_size = upsample_factor * 2 + layer_stride = upsample_factor + layer_output_padding = upsample_factor % 2 + layer_padding = upsample_factor // 2 + layer_output_padding + layers += [ + nn.LeakyReLU(act_slope), + weight_norm( + nn.ConvTranspose1d(layer_in_channels, + layer_out_channels, + layer_filter_size, + stride=layer_stride, + padding=layer_padding, + output_padding=layer_output_padding, + bias=True)), + ResidualStack( + channels=layer_out_channels, + num_res_blocks=num_res_blocks, + kernel_size=res_kernel + ) + ] + + layers += [nn.LeakyReLU(act_slope)] + + # final layer + layers += [ + nn.ReflectionPad1d(base_padding), + weight_norm( + nn.Conv1d(layer_out_channels, + out_channels, + proj_kernel, + stride=1, + bias=True)), + nn.Tanh() + ] + self.layers = nn.Sequential(*layers) + + def forward(self, c): + return self.layers(c) + + def inference(self, c): + c = c.to(self.layers[1].weight.device) + c = torch.nn.functional.pad( + c, + (self.inference_padding, self.inference_padding), + 'replicate') + return self.layers(c) + + def remove_weight_norm(self): + for _, layer in enumerate(self.layers): + if len(layer.state_dict()) != 0: + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + layer.remove_weight_norm() diff --git a/mozilla_voice_tts/vocoder/models/melgan_multiscale_discriminator.py b/mozilla_voice_tts/vocoder/models/melgan_multiscale_discriminator.py new file mode 100644 index 00000000..69adcc27 --- /dev/null 
+++ b/mozilla_voice_tts/vocoder/models/melgan_multiscale_discriminator.py @@ -0,0 +1,41 @@ +from torch import nn + +from mozilla_voice_tts.vocoder.models.melgan_discriminator import MelganDiscriminator + + +class MelganMultiscaleDiscriminator(nn.Module): + def __init__(self, + in_channels=1, + out_channels=1, + num_scales=3, + kernel_sizes=(5, 3), + base_channels=16, + max_channels=1024, + downsample_factors=(4, 4, 4), + pooling_kernel_size=4, + pooling_stride=2, + pooling_padding=1): + super(MelganMultiscaleDiscriminator, self).__init__() + + self.discriminators = nn.ModuleList([ + MelganDiscriminator(in_channels=in_channels, + out_channels=out_channels, + kernel_sizes=kernel_sizes, + base_channels=base_channels, + max_channels=max_channels, + downsample_factors=downsample_factors) + for _ in range(num_scales) + ]) + + self.pooling = nn.AvgPool1d(kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False) + + + def forward(self, x): + scores = list() + feats = list() + for disc in self.discriminators: + score, feat = disc(x) + scores.append(score) + feats.append(feat) + x = self.pooling(x) + return scores, feats diff --git a/mozilla_voice_tts/vocoder/models/multiband_melgan_generator.py b/mozilla_voice_tts/vocoder/models/multiband_melgan_generator.py new file mode 100644 index 00000000..5571ed54 --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/multiband_melgan_generator.py @@ -0,0 +1,39 @@ +import torch + +from mozilla_voice_tts.vocoder.models.melgan_generator import MelganGenerator +from mozilla_voice_tts.vocoder.layers.pqmf import PQMF + + +class MultibandMelganGenerator(MelganGenerator): + def __init__(self, + in_channels=80, + out_channels=4, + proj_kernel=7, + base_channels=384, + upsample_factors=(2, 8, 2, 2), + res_kernel=3, + num_res_blocks=3): + super(MultibandMelganGenerator, + self).__init__(in_channels=in_channels, + out_channels=out_channels, + proj_kernel=proj_kernel, + base_channels=base_channels, + 
upsample_factors=upsample_factors, + res_kernel=res_kernel, + num_res_blocks=num_res_blocks) + self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) + + def pqmf_analysis(self, x): + return self.pqmf_layer.analysis(x) + + def pqmf_synthesis(self, x): + return self.pqmf_layer.synthesis(x) + + @torch.no_grad() + def inference(self, cond_features): + cond_features = cond_features.to(self.layers[1].weight.device) + cond_features = torch.nn.functional.pad( + cond_features, + (self.inference_padding, self.inference_padding), + 'replicate') + return self.pqmf_synthesis(self.layers(cond_features)) diff --git a/mozilla_voice_tts/vocoder/models/parallel_wavegan_discriminator.py b/mozilla_voice_tts/vocoder/models/parallel_wavegan_discriminator.py new file mode 100644 index 00000000..8d4f071c --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/parallel_wavegan_discriminator.py @@ -0,0 +1,197 @@ +import math +import torch +from torch import nn + +from mozilla_voice_tts.vocoder.layers.parallel_wavegan import ResidualBlock + + +class ParallelWaveganDiscriminator(nn.Module): + """PWGAN discriminator as in https://arxiv.org/abs/1910.11480. + It classifies each audio window real/fake and returns a sequence + of predictions. + It is a stack of convolutional blocks with dilation. + """ + # pylint: disable=dangerous-default-value + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=10, + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + ): + super(ParallelWaveganDiscriminator, self).__init__() + assert (kernel_size - 1) % 2 == 0, " [!] does not support even number kernel size." + assert dilation_factor > 0, " [!] dilation factor must be > 0." 
+ self.conv_layers = nn.ModuleList() + conv_in_channels = in_channels + for i in range(num_layers - 1): + if i == 0: + dilation = 1 + else: + dilation = i if dilation_factor == 1 else dilation_factor ** i + conv_in_channels = conv_channels + padding = (kernel_size - 1) // 2 * dilation + conv_layer = [ + nn.Conv1d(conv_in_channels, + conv_channels, + kernel_size=kernel_size, + padding=padding, + dilation=dilation, + bias=bias), + getattr(nn, + nonlinear_activation)(inplace=True, + **nonlinear_activation_params) + ] + self.conv_layers += conv_layer + padding = (kernel_size - 1) // 2 + last_conv_layer = nn.Conv1d( + conv_in_channels, out_channels, + kernel_size=kernel_size, padding=padding, bias=bias) + self.conv_layers += [last_conv_layer] + self.apply_weight_norm() + + def forward(self, x): + """ + x : (B, 1, T). + Returns: + Tensor: (B, 1, T) + """ + for f in self.conv_layers: + x = f(x) + return x + + def apply_weight_norm(self): + def _apply_weight_norm(m): + if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + torch.nn.utils.weight_norm(m) + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(m): + try: + # print(f"Weight norm is removed from {m}.") + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + self.apply(_remove_weight_norm) + + +class ResidualParallelWaveganDiscriminator(nn.Module): + # pylint: disable=dangerous-default-value + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=30, + stacks=3, + res_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0.0, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + ): + super(ResidualParallelWaveganDiscriminator, self).__init__() + assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
+ + self.in_channels = in_channels + self.out_channels = out_channels + self.num_layers = num_layers + self.stacks = stacks + self.kernel_size = kernel_size + self.res_factor = math.sqrt(1.0 / num_layers) + + # check the number of num_layers and stacks + assert num_layers % stacks == 0 + layers_per_stack = num_layers // stacks + + # define first convolution + self.first_conv = nn.Sequential( + nn.Conv1d(in_channels, + res_channels, + kernel_size=1, + padding=0, + dilation=1, + bias=True), + getattr(nn, nonlinear_activation)(inplace=True, + **nonlinear_activation_params), + ) + + # define residual blocks + self.conv_layers = nn.ModuleList() + for layer in range(num_layers): + dilation = 2 ** (layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + res_channels=res_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=-1, + dilation=dilation, + dropout=dropout, + bias=bias, + use_causal_conv=False, + ) + self.conv_layers += [conv] + + # define output layers + self.last_conv_layers = nn.ModuleList([ + getattr(nn, nonlinear_activation)(inplace=True, + **nonlinear_activation_params), + nn.Conv1d(skip_channels, + skip_channels, + kernel_size=1, + padding=0, + dilation=1, + bias=True), + getattr(nn, nonlinear_activation)(inplace=True, + **nonlinear_activation_params), + nn.Conv1d(skip_channels, + out_channels, + kernel_size=1, + padding=0, + dilation=1, + bias=True), + ]) + + # apply weight norm + self.apply_weight_norm() + + def forward(self, x): + """ + x: (B, 1, T). 
+ """ + x = self.first_conv(x) + + skips = 0 + for f in self.conv_layers: + x, h = f(x, None) + skips += h + skips *= self.res_factor + + # apply final layers + x = skips + for f in self.last_conv_layers: + x = f(x) + return x + + def apply_weight_norm(self): + def _apply_weight_norm(m): + if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + torch.nn.utils.weight_norm(m) + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + def _remove_weight_norm(m): + try: + print(f"Weight norm is removed from {m}.") + nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) diff --git a/mozilla_voice_tts/vocoder/models/parallel_wavegan_generator.py b/mozilla_voice_tts/vocoder/models/parallel_wavegan_generator.py new file mode 100644 index 00000000..858da282 --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/parallel_wavegan_generator.py @@ -0,0 +1,158 @@ +import math +import numpy as np +import torch + +from mozilla_voice_tts.vocoder.layers.parallel_wavegan import ResidualBlock +from mozilla_voice_tts.vocoder.layers.upsample import ConvUpsample + + +class ParallelWaveganGenerator(torch.nn.Module): + """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. + It is similar to WaveNet with no causal convolution. + It is conditioned on an aux feature (spectrogram) to generate + an output waveform from an input noise. 
+ """ + # pylint: disable=dangerous-default-value + def __init__(self, + in_channels=1, + out_channels=1, + kernel_size=3, + num_res_blocks=30, + stacks=3, + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + dropout=0.0, + bias=True, + use_weight_norm=True, + upsample_factors=[4, 4, 4, 4], + inference_padding=2): + + super(ParallelWaveganGenerator, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.num_res_blocks = num_res_blocks + self.stacks = stacks + self.kernel_size = kernel_size + self.upsample_factors = upsample_factors + self.upsample_scale = np.prod(upsample_factors) + self.inference_padding = inference_padding + + # check the number of layers and stacks + assert num_res_blocks % stacks == 0 + layers_per_stack = num_res_blocks // stacks + + # define first convolution + self.first_conv = torch.nn.Conv1d(in_channels, + res_channels, + kernel_size=1, + bias=True) + + # define conv + upsampling network + self.upsample_net = ConvUpsample(upsample_factors=upsample_factors) + + # define residual blocks + self.conv_layers = torch.nn.ModuleList() + for layer in range(num_res_blocks): + dilation = 2**(layer % layers_per_stack) + conv = ResidualBlock( + kernel_size=kernel_size, + res_channels=res_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias, + ) + self.conv_layers += [conv] + + # define output layers + self.last_conv_layers = torch.nn.ModuleList([ + torch.nn.ReLU(inplace=True), + torch.nn.Conv1d(skip_channels, + skip_channels, + kernel_size=1, + bias=True), + torch.nn.ReLU(inplace=True), + torch.nn.Conv1d(skip_channels, + out_channels, + kernel_size=1, + bias=True), + ]) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, c): + """ + c: (B, C ,T'). 
+ o: Output tensor (B, out_channels, T) + """ + # random noise + x = torch.randn([c.shape[0], 1, c.shape[2] * self.upsample_scale]) + x = x.to(self.first_conv.bias.device) + + # perform upsampling + if c is not None and self.upsample_net is not None: + c = self.upsample_net(c) + assert c.shape[-1] == x.shape[ + -1], f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + + # encode to hidden representation + x = self.first_conv(x) + skips = 0 + for f in self.conv_layers: + x, h = f(x, c) + skips += h + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + # apply final layers + x = skips + for f in self.last_conv_layers: + x = f(x) + + return x + + @torch.no_grad() + def inference(self, c): + c = c.to(self.first_conv.weight.device) + c = torch.nn.functional.pad( + c, (self.inference_padding, self.inference_padding), 'replicate') + return self.forward(c) + + def remove_weight_norm(self): + def _remove_weight_norm(m): + try: + # print(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + def _apply_weight_norm(m): + if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + torch.nn.utils.weight_norm(m) + # print(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + @staticmethod + def _get_receptive_field_size(layers, + stacks, + kernel_size, + dilation=lambda x: 2**x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + @property + def receptive_field_size(self): + return self._get_receptive_field_size(self.layers, self.stacks, + self.kernel_size) diff --git a/mozilla_voice_tts/vocoder/models/random_window_discriminator.py b/mozilla_voice_tts/vocoder/models/random_window_discriminator.py new file mode 100644 index 
00000000..3efd395e --- /dev/null +++ b/mozilla_voice_tts/vocoder/models/random_window_discriminator.py @@ -0,0 +1,225 @@ +import numpy as np +from torch import nn + + +class GBlock(nn.Module): + def __init__(self, in_channels, cond_channels, downsample_factor): + super(GBlock, self).__init__() + + self.in_channels = in_channels + self.cond_channels = cond_channels + self.downsample_factor = downsample_factor + + self.start = nn.Sequential( + nn.AvgPool1d(downsample_factor, stride=downsample_factor), + nn.ReLU(), + nn.Conv1d(in_channels, in_channels * 2, kernel_size=3, padding=1)) + self.lc_conv1d = nn.Conv1d(cond_channels, + in_channels * 2, + kernel_size=1) + self.end = nn.Sequential( + nn.ReLU(), + nn.Conv1d(in_channels * 2, + in_channels * 2, + kernel_size=3, + dilation=2, + padding=2)) + self.residual = nn.Sequential( + nn.Conv1d(in_channels, in_channels * 2, kernel_size=1), + nn.AvgPool1d(downsample_factor, stride=downsample_factor)) + + def forward(self, inputs, conditions): + outputs = self.start(inputs) + self.lc_conv1d(conditions) + outputs = self.end(outputs) + residual_outputs = self.residual(inputs) + outputs = outputs + residual_outputs + + return outputs + + +class DBlock(nn.Module): + def __init__(self, in_channels, out_channels, downsample_factor): + super(DBlock, self).__init__() + + self.in_channels = in_channels + self.downsample_factor = downsample_factor + self.out_channels = out_channels + + self.donwsample_layer = nn.AvgPool1d(downsample_factor, + stride=downsample_factor) + self.layers = nn.Sequential( + nn.ReLU(), + nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv1d(out_channels, + out_channels, + kernel_size=3, + dilation=2, + padding=2)) + self.residual = nn.Sequential( + nn.Conv1d(in_channels, out_channels, kernel_size=1), ) + + def forward(self, inputs): + if self.downsample_factor > 1: + outputs = self.layers(self.donwsample_layer(inputs))\ + + self.donwsample_layer(self.residual(inputs)) + else: + 
outputs = self.layers(inputs) + self.residual(inputs) + return outputs + + +class ConditionalDiscriminator(nn.Module): + def __init__(self, + in_channels, + cond_channels, + downsample_factors=(2, 2, 2), + out_channels=(128, 256)): + super(ConditionalDiscriminator, self).__init__() + + assert len(downsample_factors) == len(out_channels) + 1 + + self.in_channels = in_channels + self.cond_channels = cond_channels + self.downsample_factors = downsample_factors + self.out_channels = out_channels + + self.pre_cond_layers = nn.ModuleList() + self.post_cond_layers = nn.ModuleList() + + # layers before condition features + self.pre_cond_layers += [DBlock(in_channels, 64, 1)] + in_channels = 64 + for (i, channel) in enumerate(out_channels): + self.pre_cond_layers.append( + DBlock(in_channels, channel, downsample_factors[i])) + in_channels = channel + + # condition block + self.cond_block = GBlock(in_channels, cond_channels, + downsample_factors[-1]) + + # layers after condition block + self.post_cond_layers += [ + DBlock(in_channels * 2, in_channels * 2, 1), + DBlock(in_channels * 2, in_channels * 2, 1), + nn.AdaptiveAvgPool1d(1), + nn.Conv1d(in_channels * 2, 1, kernel_size=1), + ] + + def forward(self, inputs, conditions): + batch_size = inputs.size()[0] + outputs = inputs.view(batch_size, self.in_channels, -1) + for layer in self.pre_cond_layers: + outputs = layer(outputs) + outputs = self.cond_block(outputs, conditions) + for layer in self.post_cond_layers: + outputs = layer(outputs) + + return outputs + + +class UnconditionalDiscriminator(nn.Module): + def __init__(self, + in_channels, + base_channels=64, + downsample_factors=(8, 4), + out_channels=(128, 256)): + super(UnconditionalDiscriminator, self).__init__() + + self.downsample_factors = downsample_factors + self.in_channels = in_channels + self.downsample_factors = downsample_factors + self.out_channels = out_channels + + self.layers = nn.ModuleList() + self.layers += [DBlock(self.in_channels, base_channels, 1)] + 
in_channels = base_channels + for (i, factor) in enumerate(downsample_factors): + self.layers.append(DBlock(in_channels, out_channels[i], factor)) + in_channels *= 2 + self.layers += [ + DBlock(in_channels, in_channels, 1), + DBlock(in_channels, in_channels, 1), + nn.AdaptiveAvgPool1d(1), + nn.Conv1d(in_channels, 1, kernel_size=1), + ] + + def forward(self, inputs): + batch_size = inputs.size()[0] + outputs = inputs.view(batch_size, self.in_channels, -1) + for layer in self.layers: + outputs = layer(outputs) + return outputs + + +class RandomWindowDiscriminator(nn.Module): + """Random Window Discriminator as described in + http://arxiv.org/abs/1909.11646""" + def __init__(self, + cond_channels, + hop_length, + uncond_disc_donwsample_factors=(8, 4), + cond_disc_downsample_factors=((8, 4, 2, 2, 2), (8, 4, 2, 2), + (8, 4, 2), (8, 4), (4, 2, 2)), + cond_disc_out_channels=((128, 128, 256, 256), (128, 256, 256), + (128, 256), (256, ), (128, 256)), + window_sizes=(512, 1024, 2048, 4096, 8192)): + + super(RandomWindowDiscriminator, self).__init__() + self.cond_channels = cond_channels + self.window_sizes = window_sizes + self.hop_length = hop_length + self.base_window_size = self.hop_length * 2 + self.ks = [ws // self.base_window_size for ws in window_sizes] + + # check arguments + assert len(cond_disc_downsample_factors) == len( + cond_disc_out_channels) == len(window_sizes) + for ws in window_sizes: + assert ws % hop_length == 0 + + for idx, cf in enumerate(cond_disc_downsample_factors): + assert np.prod(cf) == hop_length // self.ks[idx] + + # define layers + self.unconditional_discriminators = nn.ModuleList([]) + for k in self.ks: + layer = UnconditionalDiscriminator( + in_channels=k, + base_channels=64, + downsample_factors=uncond_disc_donwsample_factors) + self.unconditional_discriminators.append(layer) + + self.conditional_discriminators = nn.ModuleList([]) + for idx, k in enumerate(self.ks): + layer = ConditionalDiscriminator( + in_channels=k, + 
cond_channels=cond_channels, + downsample_factors=cond_disc_downsample_factors[idx], + out_channels=cond_disc_out_channels[idx]) + self.conditional_discriminators.append(layer) + + def forward(self, x, c): + scores = [] + feats = [] + # unconditional pass + for (window_size, layer) in zip(self.window_sizes, + self.unconditional_discriminators): + index = np.random.randint(x.shape[-1] - window_size) + + score = layer(x[:, :, index:index + window_size]) + scores.append(score) + + # conditional pass + for (window_size, layer) in zip(self.window_sizes, + self.conditional_discriminators): + frame_size = window_size // self.hop_length + lc_index = np.random.randint(c.shape[-1] - frame_size) + sample_index = lc_index * self.hop_length + x_sub = x[:, :, + sample_index:(lc_index + frame_size) * self.hop_length] + c_sub = c[:, :, lc_index:lc_index + frame_size] + + score = layer(x_sub, c_sub) + scores.append(score) + return scores, feats diff --git a/mozilla_voice_tts/vocoder/pqmf_output.wav b/mozilla_voice_tts/vocoder/pqmf_output.wav new file mode 100644 index 00000000..8a77747b Binary files /dev/null and b/mozilla_voice_tts/vocoder/pqmf_output.wav differ diff --git a/mozilla_voice_tts/vocoder/tf/layers/melgan.py b/mozilla_voice_tts/vocoder/tf/layers/melgan.py new file mode 100644 index 00000000..f9806579 --- /dev/null +++ b/mozilla_voice_tts/vocoder/tf/layers/melgan.py @@ -0,0 +1,57 @@ +import tensorflow as tf + + +class ReflectionPad1d(tf.keras.layers.Layer): + def __init__(self, padding): + super(ReflectionPad1d, self).__init__() + self.padding = padding + + def call(self, x): + return tf.pad(x, [[0, 0], [self.padding, self.padding], [0, 0], [0, 0]], "REFLECT") + + +class ResidualStack(tf.keras.layers.Layer): + def __init__(self, channels, num_res_blocks, kernel_size, name): + super(ResidualStack, self).__init__(name=name) + + assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 
+ base_padding = (kernel_size - 1) // 2 + + self.blocks = [] + num_layers = 2 + for idx in range(num_res_blocks): + layer_kernel_size = kernel_size + layer_dilation = layer_kernel_size**idx + layer_padding = base_padding * layer_dilation + block = [ + tf.keras.layers.LeakyReLU(0.2), + ReflectionPad1d(layer_padding), + tf.keras.layers.Conv2D(filters=channels, + kernel_size=(kernel_size, 1), + dilation_rate=(layer_dilation, 1), + use_bias=True, + padding='valid', + name=f'blocks.{idx}.{num_layers}'), + tf.keras.layers.LeakyReLU(0.2), + tf.keras.layers.Conv2D(filters=channels, + kernel_size=(1, 1), + use_bias=True, + name=f'blocks.{idx}.{num_layers + 2}') + ] + self.blocks.append(block) + self.shortcuts = [ + tf.keras.layers.Conv2D(channels, + kernel_size=1, + use_bias=True, + name=f'shortcuts.{i}') + for i in range(num_res_blocks) + ] + + def call(self, x): + # breakpoint() + for block, shortcut in zip(self.blocks, self.shortcuts): + res = shortcut(x) + for layer in block: + x = layer(x) + x += res + return x diff --git a/mozilla_voice_tts/vocoder/tf/layers/pqmf.py b/mozilla_voice_tts/vocoder/tf/layers/pqmf.py new file mode 100644 index 00000000..c018971f --- /dev/null +++ b/mozilla_voice_tts/vocoder/tf/layers/pqmf.py @@ -0,0 +1,66 @@ +import numpy as np +import tensorflow as tf + +from scipy import signal as sig + + +class PQMF(tf.keras.layers.Layer): + def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): + super(PQMF, self).__init__() + # define filter coefficient + self.N = N + self.taps = taps + self.cutoff = cutoff + self.beta = beta + + QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta)) + H = np.zeros((N, len(QMF))) + G = np.zeros((N, len(QMF))) + for k in range(N): + constant_factor = (2 * k + 1) * (np.pi / + (2 * N)) * (np.arange(taps + 1) - + ((taps - 1) / 2)) + phase = (-1)**k * np.pi / 4 + H[k] = 2 * QMF * np.cos(constant_factor + phase) + + G[k] = 2 * QMF * np.cos(constant_factor - phase) + + # [N, 1, taps + 1] == [filter_width, in_channels, 
out_channels] + self.H = np.transpose(H[:, None, :], (2, 1, 0)).astype('float32') + self.G = np.transpose(G[None, :, :], (2, 1, 0)).astype('float32') + + # filter for downsampling & upsampling + updown_filter = np.zeros((N, N, N), dtype=np.float32) + for k in range(N): + updown_filter[0, k, k] = 1.0 + self.updown_filter = updown_filter.astype(np.float32) + + def analysis(self, x): + """ + x : B x 1 x T + """ + x = tf.transpose(x, perm=[0, 2, 1]) + x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) + x = tf.nn.conv1d(x, self.H, stride=1, padding='VALID') + x = tf.nn.conv1d(x, + self.updown_filter, + stride=self.N, + padding='VALID') + x = tf.transpose(x, perm=[0, 2, 1]) + return x + + def synthesis(self, x): + """ + x : B x D x T + """ + x = tf.transpose(x, perm=[0, 2, 1]) + x = tf.nn.conv1d_transpose( + x, + self.updown_filter * self.N, + strides=self.N, + output_shape=(tf.shape(x)[0], tf.shape(x)[1] * self.N, + self.N)) + x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) + x = tf.nn.conv1d(x, self.G, stride=1, padding="VALID") + x = tf.transpose(x, perm=[0, 2, 1]) + return x diff --git a/mozilla_voice_tts/vocoder/tf/models/melgan_generator.py b/mozilla_voice_tts/vocoder/tf/models/melgan_generator.py new file mode 100644 index 00000000..60e870cc --- /dev/null +++ b/mozilla_voice_tts/vocoder/tf/models/melgan_generator.py @@ -0,0 +1,128 @@ +import logging +import os + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # FATAL +logging.getLogger('tensorflow').setLevel(logging.FATAL) + +import tensorflow as tf +from mozilla_voice_tts.vocoder.tf.layers.melgan import ResidualStack, ReflectionPad1d + + +#pylint: disable=too-many-ancestors +#pylint: disable=abstract-method +class MelganGenerator(tf.keras.models.Model): + """ Melgan Generator TF implementation dedicated for inference with no + weight norm """ + def __init__(self, + in_channels=80, + out_channels=1, + proj_kernel=7, + base_channels=512, + 
upsample_factors=(8, 8, 2, 2), + res_kernel=3, + num_res_blocks=3): + super(MelganGenerator, self).__init__() + + self.in_channels = in_channels + + # assert model parameters + assert (proj_kernel - + 1) % 2 == 0, " [!] proj_kernel should be an odd number." + + # setup additional model parameters + base_padding = (proj_kernel - 1) // 2 + act_slope = 0.2 + self.inference_padding = 2 + + # initial layer + self.initial_layer = [ + ReflectionPad1d(base_padding), + tf.keras.layers.Conv2D(filters=base_channels, + kernel_size=(proj_kernel, 1), + strides=1, + padding='valid', + use_bias=True, + name="1") + ] + num_layers = 3 # count number of layers for layer naming + + # upsampling layers and residual stacks + self.upsample_layers = [] + for idx, upsample_factor in enumerate(upsample_factors): + layer_out_channels = base_channels // (2**(idx + 1)) + layer_filter_size = upsample_factor * 2 + layer_stride = upsample_factor + # layer_output_padding = upsample_factor % 2 + self.upsample_layers += [ + tf.keras.layers.LeakyReLU(act_slope), + tf.keras.layers.Conv2DTranspose( + filters=layer_out_channels, + kernel_size=(layer_filter_size, 1), + strides=(layer_stride, 1), + padding='same', + # output_padding=layer_output_padding, + use_bias=True, + name=f'{num_layers}'), + ResidualStack(channels=layer_out_channels, + num_res_blocks=num_res_blocks, + kernel_size=res_kernel, + name=f'layers.{num_layers + 1}') + ] + num_layers += num_res_blocks - 1 + + self.upsample_layers += [tf.keras.layers.LeakyReLU(act_slope)] + + # final layer + self.final_layers = [ + ReflectionPad1d(base_padding), + tf.keras.layers.Conv2D(filters=out_channels, + kernel_size=(proj_kernel, 1), + use_bias=True, + name=f'layers.{num_layers + 1}'), + tf.keras.layers.Activation("tanh") + ] + + # self.model_layers = tf.keras.models.Sequential(self.initial_layer + self.upsample_layers + self.final_layers, name="layers") + self.model_layers = self.initial_layer + self.upsample_layers + self.final_layers + + 
#pylint: disable=too-many-ancestors
#pylint: disable=abstract-method
class MultibandMelganGenerator(MelganGenerator):
    """ MelganGenerator whose layer-stack output is passed through
    PQMF synthesis """
    def __init__(self,
                 in_channels=80,
                 out_channels=4,
                 proj_kernel=7,
                 base_channels=384,
                 upsample_factors=(2, 8, 2, 2),
                 res_kernel=3,
                 num_res_blocks=3):
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         proj_kernel=proj_kernel,
                         base_channels=base_channels,
                         upsample_factors=upsample_factors,
                         res_kernel=res_kernel,
                         num_res_blocks=num_res_blocks)
        self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)

    def pqmf_analysis(self, x):
        """ Forward `x` to the PQMF layer's `analysis` """
        return self.pqmf_layer.analysis(x)

    def pqmf_synthesis(self, x):
        """ Forward `x` to the PQMF layer's `synthesis` """
        return self.pqmf_layer.synthesis(x)

    def _run_layers_and_synthesize(self, c):
        """ Shared body of `inference` and `inference_tflite` """
        # add a singleton axis so the 2D layers in model_layers can be applied
        hidden = tf.expand_dims(tf.transpose(c, perm=[0, 2, 1]), 2)
        # FIXME: TF had no replicate padding as in Torch, so no inference
        # padding is applied to the input here.
        for layer in self.model_layers:
            hidden = layer(hidden)
        hidden = tf.transpose(hidden, perm=[0, 3, 2, 1])
        # drop the singleton axis again before PQMF synthesis
        return self.pqmf_layer.synthesis(hidden[:, :, 0, :])

    def inference(self, c):
        return self._run_layers_and_synthesize(c)

    @tf.function(
        experimental_relax_shapes=True,
        input_signature=[
            tf.TensorSpec([1, 80, None], dtype=tf.float32),
        ],)
    def inference_tflite(self, c):
        return self._run_layers_and_synthesize(c)
def convert_tf_name(tf_name):
    """ Convert certain patterns in TF layer names to Torch patterns """
    # Applied in order: the specific LSTM patterns must be rewritten before
    # the generic '/recurrent_kernel' and '/kernel' ones, and the final
    # '/' -> '.' swap has to come last.
    substitutions = (
        (':0', ''),
        ('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0'),
        ('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1'),
        ('/recurrent_kernel', '/weight_hh'),
        ('/kernel', '/weight'),
        ('/gamma', '/weight'),
        ('/beta', '/bias'),
        ('/', '.'),
    )
    torch_name = tf_name
    for tf_pattern, torch_pattern in substitutions:
        torch_name = torch_name.replace(tf_pattern, torch_pattern)
    return torch_name
import importlib
import re


def to_camel(text):
    """ Convert a snake_case name to CamelCase,
    e.g. 'melgan_generator' -> 'MelganGenerator' """
    text = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)


def setup_generator(c):
    """ Instantiate the TF generator model named by `c.generator_model`.

    The module 'mozilla_voice_tts.vocoder.tf.models.<name>' is imported and
    the class whose name is the CamelCase form of <name> is instantiated.

    Args:
        c: config object exposing `generator_model`, `audio['num_mels']` and
            `generator_model_params` ('upsample_factors', 'num_res_blocks').

    Returns:
        The instantiated generator model.

    Raises:
        ValueError: if the model name has no construction recipe here.
    """
    print(" > Generator Model: {}".format(c.generator_model))
    model_name = c.generator_model.lower()
    module = importlib.import_module('mozilla_voice_tts.vocoder.tf.models.' +
                                     model_name)
    MyModel = getattr(module, to_camel(c.generator_model))
    # Compare with equality: `name in 'multiband_melgan_generator'` is a
    # substring test and is also true for 'melgan_generator', which made a
    # plain MelGAN config end up with the multi-band settings.
    if model_name == 'melgan_generator':
        return MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=1,
            proj_kernel=7,
            base_channels=512,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    if model_name == 'multiband_melgan_generator':
        return MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=4,
            proj_kernel=7,
            base_channels=384,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    # 'melgan_fb_generator' and any other name have no recipe; fail with a
    # clear message instead of an unbound local variable.
    raise ValueError(
        " [!] Unsupported generator model: {}".format(c.generator_model))
def load_checkpoint(model, checkpoint_path):
    """ Load TF Vocoder model

    Reads the pickled checkpoint at `checkpoint_path` and copies each stored
    variable value into the variable of `model` that has the same name.

    Args:
        model: object exposing `weights`, an iterable of variables.
        checkpoint_path: path of a pickle file holding a dict with a
            'model' entry.

    Returns:
        `model`, after its variables were updated in place.

    Raises:
        KeyError: if a variable of `model` has no entry in the checkpoint.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load
    # checkpoint files that come from a trusted source.
    # Use a context manager so the file handle is closed deterministically.
    with open(checkpoint_path, 'rb') as checkpoint_file:
        checkpoint = pickle.load(checkpoint_file)
    chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
    for tf_var in model.weights:
        tf.keras.backend.set_value(tf_var, chkp_var_dict[tf_var.name])
    return model
def load_tflite_model(tflite_path):
    """ Create a TFLite interpreter for the model file at `tflite_path`
    and allocate its tensors """
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter
def to_camel(text):
    """ Convert a snake_case name to CamelCase,
    e.g. 'melgan_generator' -> 'MelganGenerator' """
    text = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)


def setup_generator(c):
    """ Instantiate the generator model named by `c.generator_model`.

    The module 'mozilla_voice_tts.vocoder.models.<name>' is imported and the
    class whose name is the CamelCase form of <name> is instantiated.

    Args:
        c: config object exposing `generator_model`, `audio['num_mels']` and
            `generator_model_params`.

    Returns:
        The instantiated generator model.

    Raises:
        ValueError: if the model name has no construction recipe here.
    """
    print(" > Generator Model: {}".format(c.generator_model))
    model_name = c.generator_model.lower()
    module = importlib.import_module('mozilla_voice_tts.vocoder.models.' +
                                     model_name)
    MyModel = getattr(module, to_camel(c.generator_model))
    # Compare with equality: `name in 'multiband_melgan_generator'` is a
    # substring test and is also true for 'melgan_generator', which made a
    # plain MelGAN config end up with the multi-band settings.
    if model_name == 'melgan_generator':
        return MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=1,
            proj_kernel=7,
            base_channels=512,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    if model_name == 'multiband_melgan_generator':
        return MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=4,
            proj_kernel=7,
            base_channels=384,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    if model_name == 'parallel_wavegan_generator':
        return MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'],
            stacks=c.generator_model_params['stacks'],
            res_channels=64,
            gate_channels=128,
            skip_channels=64,
            aux_channels=c.audio['num_mels'],
            dropout=0.0,
            bias=True,
            use_weight_norm=True,
            upsample_factors=c.generator_model_params['upsample_factors'])
    # 'melgan_fb_generator' and any other name have no recipe; fail with a
    # clear message instead of an unbound local variable.
    raise ValueError(
        " [!] Unsupported generator model: {}".format(c.generator_model))


def setup_discriminator(c):
    """ Instantiate the discriminator model named by `c.discriminator_model`.

    Every name containing 'parallel_wavegan' is loaded from the
    'parallel_wavegan_discriminator' module; other names are loaded from the
    module of the same name. The class is the CamelCase form of the name.

    Args:
        c: config object exposing `discriminator_model`, `audio` and
            `discriminator_model_params`.

    Returns:
        The instantiated discriminator model.

    Raises:
        ValueError: if the model name has no construction recipe here.
    """
    print(" > Discriminator Model: {}".format(c.discriminator_model))
    model_name = c.discriminator_model.lower()
    if 'parallel_wavegan' in model_name:
        module = importlib.import_module(
            'mozilla_voice_tts.vocoder.models.parallel_wavegan_discriminator')
    else:
        module = importlib.import_module('mozilla_voice_tts.vocoder.models.' +
                                         model_name)
    MyModel = getattr(module, to_camel(model_name))
    # All recipes are selected by equality so that a partial name can never
    # pick a recipe by accident.
    if model_name == 'random_window_discriminator':
        return MyModel(
            cond_channels=c.audio['num_mels'],
            hop_length=c.audio['hop_length'],
            uncond_disc_donwsample_factors=c.
            discriminator_model_params['uncond_disc_donwsample_factors'],
            cond_disc_downsample_factors=c.
            discriminator_model_params['cond_disc_downsample_factors'],
            cond_disc_out_channels=c.
            discriminator_model_params['cond_disc_out_channels'],
            window_sizes=c.discriminator_model_params['window_sizes'])
    if model_name == 'melgan_multiscale_discriminator':
        return MyModel(
            in_channels=1,
            out_channels=1,
            kernel_sizes=(5, 3),
            base_channels=c.discriminator_model_params['base_channels'],
            max_channels=c.discriminator_model_params['max_channels'],
            downsample_factors=c.
            discriminator_model_params['downsample_factors'])
    if model_name == 'residual_parallel_wavegan_discriminator':
        return MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_layers=c.discriminator_model_params['num_layers'],
            stacks=c.discriminator_model_params['stacks'],
            res_channels=64,
            gate_channels=128,
            skip_channels=64,
            dropout=0.0,
            bias=True,
            nonlinear_activation="LeakyReLU",
            nonlinear_activation_params={"negative_slope": 0.2},
        )
    if model_name == 'parallel_wavegan_discriminator':
        return MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_layers=c.discriminator_model_params['num_layers'],
            conv_channels=64,
            dilation_factor=1,
            nonlinear_activation="LeakyReLU",
            nonlinear_activation_params={"negative_slope": 0.2},
            bias=True
        )
    raise ValueError(" [!] Unsupported discriminator model: {}".format(
        c.discriminator_model))
def save_checkpoint(model, optimizer, scheduler, model_disc, optimizer_disc,
                    scheduler_disc, current_step, epoch, output_folder,
                    **kwargs):
    """ Save the training state to 'checkpoint_<current_step>.pth.tar'
    inside `output_folder` """
    checkpoint_path = os.path.join(
        output_folder, 'checkpoint_{}.pth.tar'.format(current_step))
    print(" > CHECKPOINT : {}".format(checkpoint_path))
    save_model(model, optimizer, scheduler, model_disc, optimizer_disc,
               scheduler_disc, current_step, epoch, checkpoint_path, **kwargs)


def save_best_model(target_loss, best_loss, model, optimizer, scheduler,
                    model_disc, optimizer_disc, scheduler_disc, current_step,
                    epoch, output_folder, **kwargs):
    """ Save the training state to 'best_model.pth.tar' inside
    `output_folder` when `target_loss` is lower than `best_loss`.

    Returns the best loss known after the call: `target_loss` if a new best
    model was saved, otherwise the unchanged `best_loss`.
    """
    # guard clause: nothing to save unless the loss improved
    if not target_loss < best_loss:
        return best_loss
    checkpoint_path = os.path.join(output_folder, 'best_model.pth.tar')
    print(" > BEST MODEL : {}".format(checkpoint_path))
    save_model(model,
               optimizer,
               scheduler,
               model_disc,
               optimizer_disc,
               scheduler_disc,
               current_step,
               epoch,
               checkpoint_path,
               model_loss=target_loss,
               **kwargs)
    return target_loss
```python setup.py install```\n", + "- to checkout right commit versions (given next to the model) of TTS.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repository:\n", + "- TTS: https://github.com/mozilla/TTS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "import importlib\n", + "import random\n", + "import librosa\n", + "import torch\n", + "\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='0'\n", + "\n", + "\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should also adjust all the path constants to point at the relevant locations for you locally" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", + "\n", + "\n", + "DATASETS_NAME = ['vctk'] # list the datasets\n", + "DATASETS_PATH = ['../../../datasets/VCTK/']\n", + "DATASETS_METAFILE = ['']\n", + "\n", + "USE_CUDA = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Preprocess dataset\n", + "meta_data = []\n", + "for i in range(len(DATASETS_NAME)):\n", + " preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", + " preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", 
+ " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", + " \n", + "meta_data= list(meta_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "c = load_config(CONFIG_PATH)\n", + "ap = AudioProcessor(**c['audio'])\n", + "\n", + "model = SpeakerEncoder(**c.model)\n", + "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n", + "model.eval()\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "embeddings_dict = {}\n", + "len_meta_data= len(meta_data)\n", + "\n", + "for i in tqdm(range(len_meta_data)):\n", + " _, wav_file, speaker_id = meta_data[i]\n", + " wav_file_name = os.path.basename(wav_file)\n", + " mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create and export speakers.json\n", + "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", + "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#test load integrity\n", + "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", + "assert speaker_mapping == speaker_mapping_load\n", + "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/CheckSpectrograms.ipynb b/notebooks/CheckSpectrograms.ipynb index 1e12bd61..7829d920 100644 --- a/notebooks/CheckSpectrograms.ipynb +++ b/notebooks/CheckSpectrograms.ipynb @@ -16,9 +16,9 @@ "outputs": [], "source": [ "%matplotlib inline\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.visual import plot_spectrogram\n", - "from TTS.utils.io import load_config\n", + "from mozilla_voice_tts.tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.visual import plot_spectrogram\n", + "from mozilla_voice_tts.tts.utils.generic_utils import load_config\n", "import glob \n", "import IPython.display as ipd" ] diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb new file mode 100644 index 00000000..782e4f61 --- /dev/null +++ b/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb @@ -0,0 +1,329 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6LWsNd3_M3MP", + "colab_type": "text" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAqrSIWgLyP0", + "colab_type": "text" + }, + "source": [ + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder 
Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ku-dA4DKoeXk", + "colab_type": "text" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jGIgnWhGsxU1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", + "tags": [] + }, + "source": [ + "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n", + "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4dnpE0-kvTsu", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", + "tags": [] + }, + "source": [ + "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", + "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n", + "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zlgi8fPdpRF0", + "colab_type": "text" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f-Yc42nQZG5A", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, 
style_wav=None,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.flatten()\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZksegYQepkFg", + "colab_type": "text" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oVa0kOamprgj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from mozilla_voice_tts.tts.utils.generic_utils import setup_model\n", + "from mozilla_voice_tts.utils.io import load_config\n", + "from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes\n", + "from mozilla_voice_tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.synthesis import synthesis" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EY-sHVO8IFSH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# runtime settings\n", + "use_cuda = False" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_1aIUp2FpxOQ", + "colab_type": "code", + 
"colab": {} + }, + "source": [ + "# model paths\n", + "TTS_MODEL = \"data/tts_model.pth.tar\"\n", + "TTS_CONFIG = \"data/config.json\"\n", + "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", + "VOCODER_CONFIG = \"data/config_vocoder.json\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CpgmdBVQplbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmrQxiozIUVE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", + "tags": [] + }, + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8fLoI4ipqMeS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", + "tags": [] + }, + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", + "\n", + "# load model state\n", + "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"zKoq0GgzqzhQ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", + "tags": [] + }, + "source": [ + "from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "# LOAD VOCODER MODEL\n", + "vocoder_model = setup_generator(VOCODER_CONFIG)\n", + "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", + "vocoder_model.remove_weight_norm()\n", + "vocoder_model.inference_padding = 0\n", + "\n", + "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", + "if use_cuda:\n", + " vocoder_model.cuda()\n", + "vocoder_model.eval()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws_YkPKsLgo-", + "colab_type": "text" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FuWxZ9Ey5Puj", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", + "tags": [] + }, + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb new file mode 100644 index 00000000..753d5157 --- /dev/null +++ b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb @@ -0,0 +1,1328 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "DDC-TTS_and_MultiBand-MelGAN_TFLite_Example.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + 
"kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6LWsNd3_M3MP", + "colab_type": "text" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis with TFLite" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAqrSIWgLyP0", + "colab_type": "text" + }, + "source": [ + "**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n", + "\n", + "#### **Notebook Details**\n", + "These TFLite models support TF 2.3rc0 and for different versions you might need to regenerate them. \n", + "\n", + "TFLite optimizations degrades the TTS model performance and we do not apply\n", + "any optimization for the vocoder model due to the same reason. If you like to\n", + "keep the quality, consider to regenerate TFLite model accordingly.\n", + "\n", + "Models optimized with TFLite can be slow on a regular CPU since it is optimized\n", + "specifically for lower-end systems.\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "#### **Model Details** \n", + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ku-dA4DKoeXk", + "colab_type": "text" + }, + "source": [ + "### Download TF Models and configs" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jGIgnWhGsxU1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + 
"outputId": "57af701e-77ec-400d-fee5-64aa7603d357" + }, + "source": [ + "!gdown --id 17PYXCmTe0el_SLTwznrt3vOArNGMGo5v -O tts_model.tflite\n", + "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O config.json" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=17PYXCmTe0el_SLTwznrt3vOArNGMGo5v\n", + "To: /content/tts_model.tflite\n", + "30.1MB [00:00, 36.8MB/s]\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc\n", + "To: /content/config.json\n", + "100% 9.53k/9.53k [00:00<00:00, 7.38MB/s]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4dnpE0-kvTsu", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "6aab0622-9add-4ee4-b9f8-177d6ddc0e86" + }, + "source": [ + "!gdown --id 1aXveT-NjOM1mUr6tM4JfWjshq67GvVIO -O vocoder_model.tflite\n", + "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O config_vocoder.json\n", + "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O scale_stats.npy" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1aXveT-NjOM1mUr6tM4JfWjshq67GvVIO\n", + "To: /content/vocoder_model.tflite\n", + "10.2MB [00:00, 16.5MB/s]\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu\n", + "To: /content/config_vocoder.json\n", + "100% 6.76k/6.76k [00:00<00:00, 11.4MB/s]\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU\n", + "To: /content/scale_stats.npy\n", + "100% 10.5k/10.5k [00:00<00:00, 16.6MB/s]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_ZuDrj_ioqHE", + "colab_type": "text" + }, + "source": [ + "### Setup Libraries" + ] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "X2axt5BYq7gv", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 964 + }, + "outputId": "aa53986f-f218-4d17-8667-0d74bb90c927" + }, + "source": [ + "# need it for char to phoneme conversion\n", + "! sudo apt-get install espeak" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following package was automatically installed and is no longer required:\n", + " libnvidia-common-440\n", + "Use 'sudo apt autoremove' to remove it.\n", + "The following additional packages will be installed:\n", + " espeak-data libespeak1 libportaudio2 libsonic0\n", + "The following NEW packages will be installed:\n", + " espeak espeak-data libespeak1 libportaudio2 libsonic0\n", + "0 upgraded, 5 newly installed, 0 to remove and 35 not upgraded.\n", + "Need to get 1,219 kB of archives.\n", + "After this operation, 3,031 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]\n", + "Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]\n", + "Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]\n", + "Fetched 1,219 kB in 2s (498 kB/s)\n", + "debconf: unable to initialize frontend: Dialog\n", + "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. 
at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 5.)\n", + "debconf: falling back to frontend: Readline\n", + "debconf: unable to initialize frontend: Readline\n", + "debconf: (This frontend requires a controlling tty.)\n", + "debconf: falling back to frontend: Teletype\n", + "dpkg-preconfigure: unable to re-open stdin: \n", + "Selecting previously unselected package libportaudio2:amd64.\n", + "(Reading database ... 144465 files and directories currently installed.)\n", + "Preparing to unpack .../libportaudio2_19.6.0-1_amd64.deb ...\n", + "Unpacking libportaudio2:amd64 (19.6.0-1) ...\n", + "Selecting previously unselected package libsonic0:amd64.\n", + "Preparing to unpack .../libsonic0_0.2.0-6_amd64.deb ...\n", + "Unpacking libsonic0:amd64 (0.2.0-6) ...\n", + "Selecting previously unselected package espeak-data:amd64.\n", + "Preparing to unpack .../espeak-data_1.48.04+dfsg-5_amd64.deb ...\n", + "Unpacking espeak-data:amd64 (1.48.04+dfsg-5) ...\n", + "Selecting previously unselected package libespeak1:amd64.\n", + "Preparing to unpack .../libespeak1_1.48.04+dfsg-5_amd64.deb ...\n", + "Unpacking libespeak1:amd64 (1.48.04+dfsg-5) ...\n", + "Selecting previously unselected package espeak.\n", + "Preparing to unpack .../espeak_1.48.04+dfsg-5_amd64.deb ...\n", + "Unpacking espeak (1.48.04+dfsg-5) ...\n", + "Setting up libportaudio2:amd64 (19.6.0-1) ...\n", + "Setting up espeak-data:amd64 (1.48.04+dfsg-5) ...\n", + "Setting up libsonic0:amd64 (0.2.0-6) ...\n", + "Setting up libespeak1:amd64 (1.48.04+dfsg-5) ...\n", + "Setting up espeak (1.48.04+dfsg-5) ...\n", + "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", + "Processing triggers for libc-bin (2.27-3ubuntu1) ...\n", + "/sbin/ldconfig.real: /usr/local/lib/python3.6/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZduAf-qYYEIT", + "colab_type": "code", + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 144 + }, + "outputId": "c1fcac0d-b8f8-442c-d598-4f549c42b698" + }, + "source": [ + "!git clone https://github.com/mozilla/TTS" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Cloning into 'TTS'...\n", + "remote: Enumerating objects: 107, done.\u001b[K\n", + "remote: Counting objects: 100% (107/107), done.\u001b[K\n", + "remote: Compressing objects: 100% (79/79), done.\u001b[K\n", + "remote: Total 7252 (delta 51), reused 68 (delta 28), pack-reused 7145\u001b[K\n", + "Receiving objects: 100% (7252/7252), 115.36 MiB | 11.38 MiB/s, done.\n", + "Resolving deltas: 100% (4892/4892), done.\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ofPCvPyjZEcT", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "f3d3ea73-eae5-473c-db19-276bd0e721cc" + }, + "source": [ + "%cd TTS\n", + "!git checkout c7296b3\n", + "!pip install -r requirements.txt\n", + "!python setup.py install\n", + "!pip install tensorflow==2.3.0rc0\n", + "%cd .." + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/content/TTS\n", + "Note: checking out 'c7296b3'.\n", + "\n", + "You are in 'detached HEAD' state. You can look around, make experimental\n", + "changes and commit them, and you can discard any commits you make in this\n", + "state without impacting any branches by performing another checkout.\n", + "\n", + "If you want to create a new branch to retain commits you create, you may\n", + "do so (now or later) by using -b with the checkout command again. 
Example:\n", + "\n", + " git checkout -b \n", + "\n", + "HEAD is now at c7296b3 add module requirement\n", + "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (1.18.5)\n", + "Requirement already satisfied: torch>=1.5 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (1.5.1+cu101)\n", + "Requirement already satisfied: librosa>=0.5.1 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 3)) (0.6.3)\n", + "Collecting Unidecode>=0.4.20\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n", + "\u001b[K |████████████████████████████████| 245kB 2.7MB/s \n", + "\u001b[?25hRequirement already satisfied: tensorboard in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 5)) (2.2.2)\n", + "Collecting tensorboardX\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)\n", + "\u001b[K |████████████████████████████████| 317kB 11.6MB/s \n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 7)) (3.2.2)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 8)) (7.0.0)\n", + "Requirement already satisfied: flask in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 9)) (1.1.2)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 10)) (1.4.1)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 11)) (4.41.1)\n", + "Collecting soundfile\n", + " Downloading 
https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl\n", + "Collecting phonemizer\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/93/b24323b7b7d99d65c41188685f423c66b2e53d0fd959851ac224c2aa2bfb/phonemizer-2.2-py3-none-any.whl (47kB)\n", + "\u001b[K |████████████████████████████████| 51kB 6.0MB/s \n", + "\u001b[?25hRequirement already satisfied: bokeh==1.4.0 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 14)) (1.4.0)\n", + "Requirement already satisfied: inflect in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 15)) (2.1.0)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.5->-r requirements.txt (line 2)) (0.16.0)\n", + "Requirement already satisfied: numba>=0.38.0 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (0.48.0)\n", + "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (4.4.2)\n", + "Requirement already satisfied: joblib>=0.12 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (0.16.0)\n", + "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (2.1.8)\n", + "Requirement already satisfied: six>=1.3 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (1.12.0)\n", + "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (0.22.2.post1)\n", + "Requirement already satisfied: resampy>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from librosa>=0.5.1->-r requirements.txt (line 3)) (0.2.2)\n", + "Requirement already satisfied: 
wheel>=0.26; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (0.34.2)\n", + "Requirement already satisfied: protobuf>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (3.10.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (3.2.2)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (2.23.0)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (0.9.0)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (1.17.2)\n", + "Requirement already satisfied: grpcio>=1.24.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (1.30.0)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (49.1.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (1.7.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (1.0.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard->-r requirements.txt (line 5)) (0.4.1)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 7)) (2.4.7)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from 
matplotlib->-r requirements.txt (line 7)) (0.10.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 7)) (2.8.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 7)) (1.2.0)\n", + "Requirement already satisfied: click>=5.1 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 9)) (7.1.2)\n", + "Requirement already satisfied: Jinja2>=2.10.1 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 9)) (2.11.2)\n", + "Requirement already satisfied: itsdangerous>=0.24 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 9)) (1.1.0)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.6/dist-packages (from soundfile->-r requirements.txt (line 12)) (1.14.0)\n", + "Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.6/dist-packages (from phonemizer->-r requirements.txt (line 13)) (19.3.0)\n", + "Collecting segments\n", + " Downloading https://files.pythonhosted.org/packages/5b/a0/0c3fe64787745c39eb3f2f5f5f9ed8d008d9ef22e9d7f9f52f71ea4712f7/segments-2.1.3-py2.py3-none-any.whl\n", + "Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 14)) (20.4)\n", + "Requirement already satisfied: tornado>=4.3 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 14)) (4.5.3)\n", + "Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 14)) (3.13)\n", + "Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba>=0.38.0->librosa>=0.5.1->-r requirements.txt (line 3)) (0.31.0)\n", + "Requirement already satisfied: importlib-metadata; 
python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard->-r requirements.txt (line 5)) (1.7.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 5)) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 5)) (2020.6.20)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 5)) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard->-r requirements.txt (line 5)) (3.0.4)\n", + "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard->-r requirements.txt (line 5)) (4.6)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard->-r requirements.txt (line 5)) (4.1.1)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard->-r requirements.txt (line 5)) (0.2.8)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 5)) (1.3.0)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from Jinja2>=2.10.1->flask->-r requirements.txt (line 9)) (1.1.1)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi>=1.0->soundfile->-r requirements.txt (line 12)) (2.20)\n", + "Collecting clldutils>=1.7.3\n", + "\u001b[?25l 
Downloading https://files.pythonhosted.org/packages/7b/b3/05882a8d5c8a7f7c69a47500334ac99623928edca930278d6ab88ee6d99b/clldutils-3.5.2-py2.py3-none-any.whl (189kB)\n", + "\u001b[K |████████████████████████████████| 194kB 13.2MB/s \n", + "\u001b[?25hCollecting csvw>=1.5.6\n", + " Downloading https://files.pythonhosted.org/packages/d1/b6/8fef6788b8f05b21424a17ae3881eff916d42e5c7e87f57a85d9d7abf0a1/csvw-1.7.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from segments->phonemizer->-r requirements.txt (line 13)) (2019.12.20)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard->-r requirements.txt (line 5)) (3.1.0)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3\"->google-auth<2,>=1.6.3->tensorboard->-r requirements.txt (line 5)) (0.4.8)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 5)) (3.1.0)\n", + "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from clldutils>=1.7.3->segments->phonemizer->-r requirements.txt (line 13)) (0.8.7)\n", + "Collecting colorlog\n", + " Downloading https://files.pythonhosted.org/packages/00/0d/22c73c2eccb21dd3498df7d22c0b1d4a30f5a5fb3feb64e1ce06bc247747/colorlog-4.1.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from csvw>=1.5.6->segments->phonemizer->-r requirements.txt (line 13)) (3.0.1)\n", + "Collecting isodate\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)\n", + "\u001b[K 
|████████████████████████████████| 51kB 6.7MB/s \n", + "\u001b[?25hCollecting rfc3986\n", + " Downloading https://files.pythonhosted.org/packages/78/be/7b8b99fd74ff5684225f50dd0e865393d2265656ef3b4ba9eaaaffe622b8/rfc3986-1.4.0-py2.py3-none-any.whl\n", + "Installing collected packages: Unidecode, tensorboardX, soundfile, isodate, rfc3986, csvw, colorlog, clldutils, segments, phonemizer\n", + "Successfully installed Unidecode-1.1.1 clldutils-3.5.2 colorlog-4.1.0 csvw-1.7.0 isodate-0.6.0 phonemizer-2.2 rfc3986-1.4.0 segments-2.1.3 soundfile-0.10.3.post1 tensorboardX-2.1\n", + "running install\n", + "running bdist_egg\n", + "running egg_info\n", + "creating tts_namespace/TTS.egg-info\n", + "writing tts_namespace/TTS.egg-info/PKG-INFO\n", + "writing dependency_links to tts_namespace/TTS.egg-info/dependency_links.txt\n", + "writing entry points to tts_namespace/TTS.egg-info/entry_points.txt\n", + "writing requirements to tts_namespace/TTS.egg-info/requires.txt\n", + "writing top-level names to tts_namespace/TTS.egg-info/top_level.txt\n", + "writing manifest file 'tts_namespace/TTS.egg-info/SOURCES.txt'\n", + "writing manifest file 'tts_namespace/TTS.egg-info/SOURCES.txt'\n", + "installing library code to build/bdist.linux-x86_64/egg\n", + "running install_lib\n", + "running build_py\n", + "-- Building version 0.0.3+c7296b3\n", + "creating temp_build\n", + "creating temp_build/TTS\n", + "copying tts_namespace/TTS/distribute.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/train.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/version.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/compute_statistics.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/__init__.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/setup.py -> temp_build/TTS\n", + "copying tts_namespace/TTS/synthesize.py -> temp_build/TTS\n", + "creating temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_demo_server.py -> temp_build/TTS/tests\n", + "copying 
tts_namespace/TTS/tests/test_text_processing.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_preprocessors.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_loader.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_audio.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/__init__.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_tacotron2_model.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/generic_utils_text.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_tacotron_model.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/symbols_tests.py -> temp_build/TTS/tests\n", + "copying tts_namespace/TTS/tests/test_layers.py -> temp_build/TTS/tests\n", + "creating temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/data.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/radam.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/training.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/console_logger.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/__init__.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/visual.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/audio.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/tensorboard_logger.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/speakers.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/measures.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/generic_utils.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/io.py -> temp_build/TTS/utils\n", + "copying tts_namespace/TTS/utils/synthesis.py -> temp_build/TTS/utils\n", + "creating temp_build/TTS/vocoder\n", + "copying tts_namespace/TTS/vocoder/train.py -> temp_build/TTS/vocoder\n", + "copying tts_namespace/TTS/vocoder/__init__.py -> 
temp_build/TTS/vocoder\n", + "copying tts_namespace/TTS/vocoder/compute_tts_features.py -> temp_build/TTS/vocoder\n", + "creating temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/compute_embeddings.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/loss.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/train.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/dataset.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/__init__.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/visual.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/model.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/tests.py -> temp_build/TTS/speaker_encoder\n", + "copying tts_namespace/TTS/speaker_encoder/generic_utils.py -> temp_build/TTS/speaker_encoder\n", + "creating temp_build/TTS/models\n", + "copying tts_namespace/TTS/models/tacotron.py -> temp_build/TTS/models\n", + "copying tts_namespace/TTS/models/__init__.py -> temp_build/TTS/models\n", + "copying tts_namespace/TTS/models/tacotron_abstract.py -> temp_build/TTS/models\n", + "copying tts_namespace/TTS/models/tacotron2.py -> temp_build/TTS/models\n", + "creating temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/tacotron.py -> temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/gst_layers.py -> temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/losses.py -> temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/__init__.py -> temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/common_layers.py -> temp_build/TTS/layers\n", + "copying tts_namespace/TTS/layers/tacotron2.py -> temp_build/TTS/layers\n", + "creating temp_build/TTS/server\n", + "copying tts_namespace/TTS/server/server.py -> temp_build/TTS/server\n", + 
"copying tts_namespace/TTS/server/__init__.py -> temp_build/TTS/server\n", + "copying tts_namespace/TTS/server/synthesizer.py -> temp_build/TTS/server\n", + "creating temp_build/TTS/datasets\n", + "copying tts_namespace/TTS/datasets/TTSDataset.py -> temp_build/TTS/datasets\n", + "copying tts_namespace/TTS/datasets/preprocess.py -> temp_build/TTS/datasets\n", + "copying tts_namespace/TTS/datasets/__init__.py -> temp_build/TTS/datasets\n", + "creating temp_build/TTS/utils/text\n", + "copying tts_namespace/TTS/utils/text/symbols.py -> temp_build/TTS/utils/text\n", + "copying tts_namespace/TTS/utils/text/number_norm.py -> temp_build/TTS/utils/text\n", + "copying tts_namespace/TTS/utils/text/cmudict.py -> temp_build/TTS/utils/text\n", + "copying tts_namespace/TTS/utils/text/__init__.py -> temp_build/TTS/utils/text\n", + "copying tts_namespace/TTS/utils/text/cleaners.py -> temp_build/TTS/utils/text\n", + "creating temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_losses.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_pqmf.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_datasets.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_melgan_discriminator.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_melgan_generator.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/__init__.py -> temp_build/TTS/vocoder/tests\n", + "copying tts_namespace/TTS/vocoder/tests/test_rwd.py -> temp_build/TTS/vocoder/tests\n", + "creating temp_build/TTS/vocoder/utils\n", + "copying tts_namespace/TTS/vocoder/utils/console_logger.py -> temp_build/TTS/vocoder/utils\n", + "copying tts_namespace/TTS/vocoder/utils/__init__.py -> temp_build/TTS/vocoder/utils\n", + "copying tts_namespace/TTS/vocoder/utils/generic_utils.py -> temp_build/TTS/vocoder/utils\n", + "copying 
tts_namespace/TTS/vocoder/utils/io.py -> temp_build/TTS/vocoder/utils\n", + "creating temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/melgan_discriminator.py -> temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/random_window_discriminator.py -> temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/__init__.py -> temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/multiband_melgan_generator.py -> temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/melgan_multiscale_discriminator.py -> temp_build/TTS/vocoder/models\n", + "copying tts_namespace/TTS/vocoder/models/melgan_generator.py -> temp_build/TTS/vocoder/models\n", + "creating temp_build/TTS/vocoder/layers\n", + "copying tts_namespace/TTS/vocoder/layers/pqmf.py -> temp_build/TTS/vocoder/layers\n", + "copying tts_namespace/TTS/vocoder/layers/losses.py -> temp_build/TTS/vocoder/layers\n", + "copying tts_namespace/TTS/vocoder/layers/__init__.py -> temp_build/TTS/vocoder/layers\n", + "copying tts_namespace/TTS/vocoder/layers/melgan.py -> temp_build/TTS/vocoder/layers\n", + "creating temp_build/TTS/vocoder/datasets\n", + "copying tts_namespace/TTS/vocoder/datasets/preprocess.py -> temp_build/TTS/vocoder/datasets\n", + "copying tts_namespace/TTS/vocoder/datasets/__init__.py -> temp_build/TTS/vocoder/datasets\n", + "copying tts_namespace/TTS/vocoder/datasets/gan_dataset.py -> temp_build/TTS/vocoder/datasets\n", + "creating temp_build/TTS/server/templates\n", + "copying tts_namespace/TTS/server/templates/index.html -> temp_build/TTS/server/templates\n", + "creating build\n", + "creating build/bdist.linux-x86_64\n", + "creating build/bdist.linux-x86_64/egg\n", + "creating build/bdist.linux-x86_64/egg/TTS\n", + "creating build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_demo_server.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying 
temp_build/TTS/tests/test_text_processing.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_preprocessors.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_loader.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_audio.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/__init__.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_tacotron2_model.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/generic_utils_text.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_tacotron_model.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/symbols_tests.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "copying temp_build/TTS/tests/test_layers.py -> build/bdist.linux-x86_64/egg/TTS/tests\n", + "creating build/bdist.linux-x86_64/egg/TTS/utils\n", + "creating build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/text/symbols.py -> build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/text/number_norm.py -> build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/text/cmudict.py -> build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/text/__init__.py -> build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/text/cleaners.py -> build/bdist.linux-x86_64/egg/TTS/utils/text\n", + "copying temp_build/TTS/utils/data.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/radam.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/training.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/console_logger.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/__init__.py -> 
build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/visual.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/audio.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/tensorboard_logger.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/speakers.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/measures.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/generic_utils.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/io.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/utils/synthesis.py -> build/bdist.linux-x86_64/egg/TTS/utils\n", + "copying temp_build/TTS/distribute.py -> build/bdist.linux-x86_64/egg/TTS\n", + "copying temp_build/TTS/train.py -> build/bdist.linux-x86_64/egg/TTS\n", + "copying temp_build/TTS/version.py -> build/bdist.linux-x86_64/egg/TTS\n", + "copying temp_build/TTS/compute_statistics.py -> build/bdist.linux-x86_64/egg/TTS\n", + "copying temp_build/TTS/__init__.py -> build/bdist.linux-x86_64/egg/TTS\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/test_losses.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/test_pqmf.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/test_datasets.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/test_melgan_discriminator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/test_melgan_generator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying temp_build/TTS/vocoder/tests/__init__.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "copying 
temp_build/TTS/vocoder/tests/test_rwd.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/tests\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder/utils\n", + "copying temp_build/TTS/vocoder/utils/console_logger.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/utils\n", + "copying temp_build/TTS/vocoder/utils/__init__.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/utils\n", + "copying temp_build/TTS/vocoder/utils/generic_utils.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/utils\n", + "copying temp_build/TTS/vocoder/utils/io.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/utils\n", + "copying temp_build/TTS/vocoder/train.py -> build/bdist.linux-x86_64/egg/TTS/vocoder\n", + "copying temp_build/TTS/vocoder/__init__.py -> build/bdist.linux-x86_64/egg/TTS/vocoder\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/melgan_discriminator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/random_window_discriminator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/__init__.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/multiband_melgan_generator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/melgan_multiscale_discriminator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/models/melgan_generator.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/models\n", + "copying temp_build/TTS/vocoder/compute_tts_features.py -> build/bdist.linux-x86_64/egg/TTS/vocoder\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder/layers\n", + "copying temp_build/TTS/vocoder/layers/pqmf.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/layers\n", + "copying temp_build/TTS/vocoder/layers/losses.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/layers\n", + "copying temp_build/TTS/vocoder/layers/__init__.py -> 
build/bdist.linux-x86_64/egg/TTS/vocoder/layers\n", + "copying temp_build/TTS/vocoder/layers/melgan.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/layers\n", + "creating build/bdist.linux-x86_64/egg/TTS/vocoder/datasets\n", + "copying temp_build/TTS/vocoder/datasets/preprocess.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/datasets\n", + "copying temp_build/TTS/vocoder/datasets/__init__.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/datasets\n", + "copying temp_build/TTS/vocoder/datasets/gan_dataset.py -> build/bdist.linux-x86_64/egg/TTS/vocoder/datasets\n", + "creating build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/compute_embeddings.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/loss.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/train.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/dataset.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/__init__.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/visual.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/model.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/tests.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/speaker_encoder/generic_utils.py -> build/bdist.linux-x86_64/egg/TTS/speaker_encoder\n", + "copying temp_build/TTS/setup.py -> build/bdist.linux-x86_64/egg/TTS\n", + "copying temp_build/TTS/synthesize.py -> build/bdist.linux-x86_64/egg/TTS\n", + "creating build/bdist.linux-x86_64/egg/TTS/models\n", + "copying temp_build/TTS/models/tacotron.py -> build/bdist.linux-x86_64/egg/TTS/models\n", + "copying temp_build/TTS/models/__init__.py -> 
build/bdist.linux-x86_64/egg/TTS/models\n", + "copying temp_build/TTS/models/tacotron_abstract.py -> build/bdist.linux-x86_64/egg/TTS/models\n", + "copying temp_build/TTS/models/tacotron2.py -> build/bdist.linux-x86_64/egg/TTS/models\n", + "creating build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/tacotron.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/gst_layers.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/losses.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/__init__.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/common_layers.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "copying temp_build/TTS/layers/tacotron2.py -> build/bdist.linux-x86_64/egg/TTS/layers\n", + "creating build/bdist.linux-x86_64/egg/TTS/server\n", + "copying temp_build/TTS/server/server.py -> build/bdist.linux-x86_64/egg/TTS/server\n", + "creating build/bdist.linux-x86_64/egg/TTS/server/templates\n", + "copying temp_build/TTS/server/templates/index.html -> build/bdist.linux-x86_64/egg/TTS/server/templates\n", + "copying temp_build/TTS/server/__init__.py -> build/bdist.linux-x86_64/egg/TTS/server\n", + "copying temp_build/TTS/server/synthesizer.py -> build/bdist.linux-x86_64/egg/TTS/server\n", + "creating build/bdist.linux-x86_64/egg/TTS/datasets\n", + "copying temp_build/TTS/datasets/TTSDataset.py -> build/bdist.linux-x86_64/egg/TTS/datasets\n", + "copying temp_build/TTS/datasets/preprocess.py -> build/bdist.linux-x86_64/egg/TTS/datasets\n", + "copying temp_build/TTS/datasets/__init__.py -> build/bdist.linux-x86_64/egg/TTS/datasets\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_demo_server.py to test_demo_server.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_text_processing.py to test_text_processing.cpython-36.pyc\n", + "byte-compiling 
build/bdist.linux-x86_64/egg/TTS/tests/test_preprocessors.py to test_preprocessors.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_loader.py to test_loader.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_audio.py to test_audio.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_tacotron2_model.py to test_tacotron2_model.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/generic_utils_text.py to generic_utils_text.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_tacotron_model.py to test_tacotron_model.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/symbols_tests.py to symbols_tests.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/tests/test_layers.py to test_layers.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/text/symbols.py to symbols.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/text/number_norm.py to number_norm.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/text/cmudict.py to cmudict.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/text/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/text/cleaners.py to cleaners.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/data.py to data.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/radam.py to radam.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/training.py to training.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/console_logger.py to console_logger.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/__init__.py to 
__init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/visual.py to visual.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/audio.py to audio.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/tensorboard_logger.py to tensorboard_logger.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/speakers.py to speakers.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/measures.py to measures.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/generic_utils.py to generic_utils.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/io.py to io.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/utils/synthesis.py to synthesis.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/distribute.py to distribute.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/train.py to train.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/version.py to version.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/compute_statistics.py to compute_statistics.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_losses.py to test_losses.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_pqmf.py to test_pqmf.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_datasets.py to test_datasets.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_melgan_discriminator.py to test_melgan_discriminator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_melgan_generator.py to test_melgan_generator.cpython-36.pyc\n", + "byte-compiling 
build/bdist.linux-x86_64/egg/TTS/vocoder/tests/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/tests/test_rwd.py to test_rwd.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/utils/console_logger.py to console_logger.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/utils/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/utils/generic_utils.py to generic_utils.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/utils/io.py to io.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/train.py to train.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/melgan_discriminator.py to melgan_discriminator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/random_window_discriminator.py to random_window_discriminator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/multiband_melgan_generator.py to multiband_melgan_generator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/melgan_multiscale_discriminator.py to melgan_multiscale_discriminator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/models/melgan_generator.py to melgan_generator.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/compute_tts_features.py to compute_tts_features.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/layers/pqmf.py to pqmf.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/layers/losses.py to losses.cpython-36.pyc\n", + 
"byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/layers/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/layers/melgan.py to melgan.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/datasets/preprocess.py to preprocess.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/datasets/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/vocoder/datasets/gan_dataset.py to gan_dataset.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/compute_embeddings.py to compute_embeddings.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/loss.py to loss.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/train.py to train.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/dataset.py to dataset.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/visual.py to visual.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/model.py to model.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/tests.py to tests.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/speaker_encoder/generic_utils.py to generic_utils.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/setup.py to setup.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/synthesize.py to synthesize.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/models/tacotron.py to tacotron.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/models/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling 
build/bdist.linux-x86_64/egg/TTS/models/tacotron_abstract.py to tacotron_abstract.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/models/tacotron2.py to tacotron2.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/tacotron.py to tacotron.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/gst_layers.py to gst_layers.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/losses.py to losses.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/common_layers.py to common_layers.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/layers/tacotron2.py to tacotron2.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/server/server.py to server.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/server/__init__.py to __init__.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/server/synthesizer.py to synthesizer.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/datasets/TTSDataset.py to TTSDataset.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/datasets/preprocess.py to preprocess.cpython-36.pyc\n", + "byte-compiling build/bdist.linux-x86_64/egg/TTS/datasets/__init__.py to __init__.cpython-36.pyc\n", + "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying tts_namespace/TTS.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying tts_namespace/TTS.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying tts_namespace/TTS.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying tts_namespace/TTS.egg-info/entry_points.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying tts_namespace/TTS.egg-info/requires.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying 
tts_namespace/TTS.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "zip_safe flag not set; analyzing archive contents...\n", + "TTS.__pycache__.setup.cpython-36: module references __file__\n", + "TTS.__pycache__.train.cpython-36: module references __file__\n", + "TTS.server.__pycache__.server.cpython-36: module references __file__\n", + "TTS.speaker_encoder.__pycache__.tests.cpython-36: module references __file__\n", + "TTS.speaker_encoder.__pycache__.train.cpython-36: module references __file__\n", + "TTS.tests.__pycache__.__init__.cpython-36: module references __file__\n", + "TTS.tests.__pycache__.test_loader.cpython-36: module references __file__\n", + "TTS.tests.__pycache__.test_tacotron2_model.cpython-36: module references __file__\n", + "TTS.tests.__pycache__.test_tacotron_model.cpython-36: module references __file__\n", + "TTS.vocoder.__pycache__.train.cpython-36: module references __file__\n", + "TTS.vocoder.tests.__pycache__.test_datasets.cpython-36: module references __file__\n", + "TTS.vocoder.tests.__pycache__.test_losses.cpython-36: module references __file__\n", + "creating dist\n", + "creating 'dist/TTS-0.0.3+c7296b3-py3.6.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", + "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", + "Processing TTS-0.0.3+c7296b3-py3.6.egg\n", + "creating /usr/local/lib/python3.6/dist-packages/TTS-0.0.3+c7296b3-py3.6.egg\n", + "Extracting TTS-0.0.3+c7296b3-py3.6.egg to /usr/local/lib/python3.6/dist-packages\n", + "Adding TTS 0.0.3+c7296b3 to easy-install.pth file\n", + "Installing tts-server script to /usr/local/bin\n", + "\n", + "Installed /usr/local/lib/python3.6/dist-packages/TTS-0.0.3+c7296b3-py3.6.egg\n", + "Processing dependencies for TTS==0.0.3+c7296b3\n", + "Searching for attrdict\n", + "Reading https://pypi.org/simple/attrdict/\n", + "Downloading 
https://files.pythonhosted.org/packages/ef/97/28fe7e68bc7adfce67d4339756e85e9fcf3c6fd7f0c0781695352b70472c/attrdict-2.0.1-py2.py3-none-any.whl#sha256=9432e3498c74ff7e1b20b3d93b45d766b71cbffa90923496f82c4ae38b92be34\n", + "Best match: attrdict 2.0.1\n", + "Processing attrdict-2.0.1-py2.py3-none-any.whl\n", + "Installing attrdict-2.0.1-py2.py3-none-any.whl to /usr/local/lib/python3.6/dist-packages\n", + "Adding attrdict 2.0.1 to easy-install.pth file\n", + "\n", + "Installed /usr/local/lib/python3.6/dist-packages/attrdict-2.0.1-py3.6.egg\n", + "Searching for unidecode==0.4.20\n", + "Reading https://pypi.org/simple/unidecode/\n", + "Downloading https://files.pythonhosted.org/packages/c3/6f/05f5deb753d0594583aa1cc0d2fe9d631d9a00e9b28d0da49f8d3763755b/Unidecode-0.04.20-py2.py3-none-any.whl#sha256=eedac7bfd886f43484787206f6a141b232e2b2a58652c54d06499b187fd84660\n", + "Best match: Unidecode 0.4.20\n", + "Processing Unidecode-0.04.20-py2.py3-none-any.whl\n", + "Installing Unidecode-0.04.20-py2.py3-none-any.whl to /usr/local/lib/python3.6/dist-packages\n", + "Adding Unidecode 0.4.20 to easy-install.pth file\n", + "Installing unidecode script to /usr/local/bin\n", + "\n", + "Installed /usr/local/lib/python3.6/dist-packages/Unidecode-0.4.20-py3.6.egg\n", + "Searching for librosa==0.6.2\n", + "Reading https://pypi.org/simple/librosa/\n", + "Downloading https://files.pythonhosted.org/packages/09/b4/5b411f19de48f8fc1a0ff615555aa9124952e4156e94d4803377e50cfa4c/librosa-0.6.2.tar.gz#sha256=2aa868b8aade749b9904eeb7034fcf44115601c367969b6d01f5e1b4b9b6031d\n", + "Best match: librosa 0.6.2\n", + "Processing librosa-0.6.2.tar.gz\n", + "Writing /tmp/easy_install-3oxyyk5x/librosa-0.6.2/setup.cfg\n", + "Running librosa-0.6.2/setup.py -q bdist_egg --dist-dir /tmp/easy_install-3oxyyk5x/librosa-0.6.2/egg-dist-tmp-ky3tcqa8\n", + "zip_safe flag not set; analyzing archive contents...\n", + "librosa.util.__pycache__.deprecation.cpython-36: module MAY be using inspect.stack\n", + "creating 
/usr/local/lib/python3.6/dist-packages/librosa-0.6.2-py3.6.egg\n", + "Extracting librosa-0.6.2-py3.6.egg to /usr/local/lib/python3.6/dist-packages\n", + "Adding librosa 0.6.2 to easy-install.pth file\n", + "\n", + "Installed /usr/local/lib/python3.6/dist-packages/librosa-0.6.2-py3.6.egg\n", + "Searching for phonemizer==2.2\n", + "Best match: phonemizer 2.2\n", + "Adding phonemizer 2.2 to easy-install.pth file\n", + "Installing phonemize script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for SoundFile==0.10.3.post1\n", + "Best match: SoundFile 0.10.3.post1\n", + "Adding SoundFile 0.10.3.post1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for bokeh==1.4.0\n", + "Best match: bokeh 1.4.0\n", + "Adding bokeh 1.4.0 to easy-install.pth file\n", + "Installing bokeh script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for inflect==2.1.0\n", + "Best match: inflect 2.1.0\n", + "Adding inflect 2.1.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tqdm==4.41.1\n", + "Best match: tqdm 4.41.1\n", + "Adding tqdm 4.41.1 to easy-install.pth file\n", + "Installing tqdm script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Flask==1.1.2\n", + "Best match: Flask 1.1.2\n", + "Adding Flask 1.1.2 to easy-install.pth file\n", + "Installing flask script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Pillow==7.0.0\n", + "Best match: Pillow 7.0.0\n", + "Adding Pillow 7.0.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for matplotlib==3.2.2\n", + "Best match: matplotlib 3.2.2\n", + "Adding matplotlib 3.2.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for 
tensorboardX==2.1\n", + "Best match: tensorboardX 2.1\n", + "Adding tensorboardX 2.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for numpy==1.18.5\n", + "Best match: numpy 1.18.5\n", + "Adding numpy 1.18.5 to easy-install.pth file\n", + "Installing f2py script to /usr/local/bin\n", + "Installing f2py3 script to /usr/local/bin\n", + "Installing f2py3.6 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for torch==1.5.1+cu101\n", + "Best match: torch 1.5.1+cu101\n", + "Adding torch 1.5.1+cu101 to easy-install.pth file\n", + "Installing convert-caffe2-to-onnx script to /usr/local/bin\n", + "Installing convert-onnx-to-caffe2 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for scipy==1.4.1\n", + "Best match: scipy 1.4.1\n", + "Adding scipy 1.4.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for attrs==19.3.0\n", + "Best match: attrs 19.3.0\n", + "Adding attrs 19.3.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for segments==2.1.3\n", + "Best match: segments 2.1.3\n", + "Adding segments 2.1.3 to easy-install.pth file\n", + "Installing segments script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for joblib==0.16.0\n", + "Best match: joblib 0.16.0\n", + "Adding joblib 0.16.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for cffi==1.14.0\n", + "Best match: cffi 1.14.0\n", + "Adding cffi 1.14.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for python-dateutil==2.8.1\n", + "Best match: python-dateutil 2.8.1\n", + "Adding python-dateutil 2.8.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for 
packaging==20.4\n", + "Best match: packaging 20.4\n", + "Adding packaging 20.4 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Jinja2==2.11.2\n", + "Best match: Jinja2 2.11.2\n", + "Adding Jinja2 2.11.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for PyYAML==3.13\n", + "Best match: PyYAML 3.13\n", + "Adding PyYAML 3.13 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for six==1.12.0\n", + "Best match: six 1.12.0\n", + "Adding six 1.12.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tornado==4.5.3\n", + "Best match: tornado 4.5.3\n", + "Adding tornado 4.5.3 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Werkzeug==1.0.1\n", + "Best match: Werkzeug 1.0.1\n", + "Adding Werkzeug 1.0.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for itsdangerous==1.1.0\n", + "Best match: itsdangerous 1.1.0\n", + "Adding itsdangerous 1.1.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for click==7.1.2\n", + "Best match: click 7.1.2\n", + "Adding click 7.1.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for pyparsing==2.4.7\n", + "Best match: pyparsing 2.4.7\n", + "Adding pyparsing 2.4.7 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for cycler==0.10.0\n", + "Best match: cycler 0.10.0\n", + "Adding cycler 0.10.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for kiwisolver==1.2.0\n", + "Best match: kiwisolver 1.2.0\n", + "Adding kiwisolver 1.2.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", 
+ "Searching for protobuf==3.10.0\n", + "Best match: protobuf 3.10.0\n", + "Adding protobuf 3.10.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for numba==0.48.0\n", + "Best match: numba 0.48.0\n", + "Adding numba 0.48.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for resampy==0.2.2\n", + "Best match: resampy 0.2.2\n", + "Adding resampy 0.2.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for decorator==4.4.2\n", + "Best match: decorator 4.4.2\n", + "Adding decorator 4.4.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for scikit-learn==0.22.2.post1\n", + "Best match: scikit-learn 0.22.2.post1\n", + "Adding scikit-learn 0.22.2.post1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for audioread==2.1.8\n", + "Best match: audioread 2.1.8\n", + "Adding audioread 2.1.8 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for future==0.16.0\n", + "Best match: future 0.16.0\n", + "Adding future 0.16.0 to easy-install.pth file\n", + "Installing futurize script to /usr/local/bin\n", + "Installing pasteurize script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for clldutils==3.5.2\n", + "Best match: clldutils 3.5.2\n", + "Adding clldutils 3.5.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for regex==2019.12.20\n", + "Best match: regex 2019.12.20\n", + "Adding regex 2019.12.20 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for csvw==1.7.0\n", + "Best match: csvw 1.7.0\n", + "Adding csvw 1.7.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for 
pycparser==2.20\n", + "Best match: pycparser 2.20\n", + "Adding pycparser 2.20 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for MarkupSafe==1.1.1\n", + "Best match: MarkupSafe 1.1.1\n", + "Adding MarkupSafe 1.1.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for setuptools==49.1.0\n", + "Best match: setuptools 49.1.0\n", + "Adding setuptools 49.1.0 to easy-install.pth file\n", + "Installing easy_install script to /usr/local/bin\n", + "Installing easy_install-3.8 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for llvmlite==0.31.0\n", + "Best match: llvmlite 0.31.0\n", + "Adding llvmlite 0.31.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tabulate==0.8.7\n", + "Best match: tabulate 0.8.7\n", + "Adding tabulate 0.8.7 to easy-install.pth file\n", + "Installing tabulate script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for colorlog==4.1.0\n", + "Best match: colorlog 4.1.0\n", + "Adding colorlog 4.1.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for rfc3986==1.4.0\n", + "Best match: rfc3986 1.4.0\n", + "Adding rfc3986 1.4.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for isodate==0.6.0\n", + "Best match: isodate 0.6.0\n", + "Adding isodate 0.6.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for uritemplate==3.0.1\n", + "Best match: uritemplate 3.0.1\n", + "Adding uritemplate 3.0.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Finished processing dependencies for TTS==0.0.3+c7296b3\n", + "Collecting tensorflow==2.3.0rc0\n", + "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/8b/68/7c6c8e2b65ad4a3ff5ef658c04a6c2802ff7fe55fc7eecacb6efee1abc40/tensorflow-2.3.0rc0-cp36-cp36m-manylinux2010_x86_64.whl (320.3MB)\n", + "\u001b[K |████████████████████████████████| 320.3MB 49kB/s \n", + "\u001b[?25hRequirement already satisfied: astunparse==1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.6.3)\n", + "Requirement already satisfied: tensorboard<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (2.2.2)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.34.2)\n", + "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.18.5)\n", + "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.30.0)\n", + "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.9.0)\n", + "Requirement already satisfied: keras-preprocessing<1.2,>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.1.2)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.1.0)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.12.0)\n", + "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.3.3)\n", + "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (2.10.0)\n", + "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.12.1)\n", + "Requirement already satisfied: scipy==1.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.4.1)\n", + 
"Collecting tf-estimator-nightly<2.3.0.dev2020062302,>=2.3.0.dev2020062301\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/17/3b/fb9aafd734da258411bff2a600cabff65c7d201782318791b72422bd973d/tf_estimator_nightly-2.3.0.dev2020062301-py2.py3-none-any.whl (459kB)\n", + "\u001b[K |████████████████████████████████| 460kB 35.1MB/s \n", + "\u001b[?25hRequirement already satisfied: google-pasta>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.2.0)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (3.10.0)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (3.2.1)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2.23.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.0.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.4.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.2.2)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (49.1.0)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.7.0)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.17.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2020.6.20)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.0.4)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.3.0)\n", + "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.7.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.6)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.2.8)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.1.1)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from 
importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3\"->google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.4.8)\n", + "Installing collected packages: tf-estimator-nightly, tensorflow\n", + " Found existing installation: tensorflow 2.2.0\n", + " Uninstalling tensorflow-2.2.0:\n", + " Successfully uninstalled tensorflow-2.2.0\n", + "Successfully installed tensorflow-2.3.0rc0 tf-estimator-nightly-2.3.0.dev2020062301\n", + "/content\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zlgi8fPdpRF0", + "colab_type": "text" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f-Yc42nQZG5A", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def run_vocoder(mel_spec):\n", + " vocoder_inputs = mel_spec[None, :, :]\n", + " # get input and output details\n", + " input_details = vocoder_model.get_input_details()\n", + " # reshape input tensor for the new input shape\n", + " vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n", + " vocoder_model.allocate_tensors()\n", + " detail = input_details[0]\n", + " vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n", + " # run the model\n", + " vocoder_model.invoke()\n", + " # collect outputs\n", + " output_details = vocoder_model.get_output_details()\n", + " waveform = vocoder_model.get_tensor(output_details[0]['index'])\n", + " return waveform \n", + "\n", + "\n", + "def tts(model, text, CONFIG, p):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", + " 
backend='tflite')\n", + " waveform = run_vocoder(mel_postnet_spec.T)\n", + " waveform = waveform[0, 0]\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZksegYQepkFg", + "colab_type": "text" + }, + "source": [ + "### Load TF Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oVa0kOamprgj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from mozilla_voice_tts.tf.utils.tflite import load_tflite_model\n", + "from mozilla_voice_tts.tf.utils.io import load_checkpoint\n", + "from mozilla_voice_tts.utils.io import load_config\n", + "from mozilla_voice_tts.utils.text.symbols import symbols, phonemes\n", + "from mozilla_voice_tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.synthesis import synthesis" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EY-sHVO8IFSH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# runtime settings\n", + "use_cuda = False" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_1aIUp2FpxOQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# model paths\n", + "TTS_MODEL = \"tts_model.tflite\"\n", + "TTS_CONFIG = \"config.json\"\n", + "VOCODER_MODEL = \"vocoder_model.tflite\"\n", + "VOCODER_CONFIG = \"config_vocoder.json\"" + ], + 
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CpgmdBVQplbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmrQxiozIUVE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "outputId": "ca7e9016-4c28-4cef-efe7-0613d399aa4c" + }, + "source": [ + "# load the audio processor\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:./scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8fLoI4ipqMeS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the models\n", + "model = load_tflite_model(TTS_MODEL)\n", + "vocoder_model = load_tflite_model(VOCODER_MODEL)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws_YkPKsLgo-", + 
"colab_type": "text" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FuWxZ9Ey5Puj", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "outputId": "d1888ebd-3208-42a4-aaf9-78d0e3ec987d" + }, + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(185856,)\n", + " > Run-time: 3.8069238662719727\n", + " > Real-time factor: 0.45162849859449977\n", + " > Time per step: 2.048206938938661e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb new file mode 100644 index 00000000..e675948c --- /dev/null +++ b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb @@ -0,0 +1,316 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "DDC-TTS_and_MultiBand-MelGAN_TF_Example.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6LWsNd3_M3MP", + "colab_type": "text" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis with Tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAqrSIWgLyP0", + "colab_type": "text" + }, + "source": [ + "**These models are converted from released [PyTorch 
models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n", + "\n", + "These TF models support TF 2.2 and for different versions you might need to\n", + "regenerate them. \n", + "\n", + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ku-dA4DKoeXk", + "colab_type": "text" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jGIgnWhGsxU1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "outputId": "08b0dddd-4edf-48c9-e8e5-a419b36a5c3d", + "tags": [] + }, + "source": [ + "!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl\n", + "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4dnpE0-kvTsu", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "2fe836eb-c7e7-4f1e-9352-0142126bb19f", + "tags": [] + }, + "source": [ + "!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl\n", + "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n", + "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zlgi8fPdpRF0", + "colab_type": "text" + }, + "source": [ 
+ "### Define TTS function" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f-Yc42nQZG5A", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def tts(model, text, CONFIG, p):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", + " backend='tf')\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.numpy()[0, 0]\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZksegYQepkFg", + "colab_type": "text" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oVa0kOamprgj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from mozilla_voice_tts.tts.tf.utils.generic_utils import setup_model\n", + "from mozilla_voice_tts.tts.tf.utils.io import load_checkpoint\n", + "from mozilla_voice_tts.utils.io import load_config\n", + "from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes\n", + "from mozilla_voice_tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.synthesis import synthesis" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "EY-sHVO8IFSH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# runtime settings\n", + "use_cuda = False" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_1aIUp2FpxOQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# model paths\n", + "TTS_MODEL = \"data/tts_model.pkl\"\n", + "TTS_CONFIG = \"data/config.json\"\n", + "VOCODER_MODEL = \"data/vocoder_model.pkl\"\n", + "VOCODER_CONFIG = \"data/config_vocoder.json\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CpgmdBVQplbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmrQxiozIUVE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "outputId": "fa71bd05-401f-4e5b-a6f7-60ae765966db", + "tags": [] + }, + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8fLoI4ipqMeS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 72 + }, + "outputId": "595d990f-930d-4698-ee14-77796b5eed7d", + "tags": [] + }, + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", + "model.build_inference()\n", + "model = load_checkpoint(model, TTS_MODEL)\n", + "model.decoder.set_max_decoder_steps(1000)" + ], + "execution_count": null, + 
"outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zKoq0GgzqzhQ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 489 + }, + "outputId": "2cc3deae-144f-4465-da3b-98628d948506" + }, + "source": [ + "from mozilla_voice_tts.vocoder.tf.utils.generic_utils import setup_generator\n", + "from mozilla_voice_tts.vocoder.tf.utils.io import load_checkpoint\n", + "\n", + "# LOAD VOCODER MODEL\n", + "vocoder_model = setup_generator(VOCODER_CONFIG)\n", + "vocoder_model.build_inference()\n", + "vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)\n", + "vocoder_model.inference_padding = 0\n", + "\n", + "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws_YkPKsLgo-", + "colab_type": "text" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FuWxZ9Ey5Puj", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "outputId": "07ede6e5-06e6-4612-f687-7984d20e5254" + }, + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb b/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb new file mode 100644 index 00000000..661ef579 --- /dev/null +++ b/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb @@ -0,0 +1,329 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + 
"accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6LWsNd3_M3MP", + "colab_type": "text" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAqrSIWgLyP0", + "colab_type": "text" + }, + "source": [ + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ku-dA4DKoeXk", + "colab_type": "text" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jGIgnWhGsxU1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", + "tags": [] + }, + "source": [ + "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n", + "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4dnpE0-kvTsu", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", + "tags": [] + }, + "source": [ + "!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n", + "!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n", + "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"Zlgi8fPdpRF0", + "colab_type": "text" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f-Yc42nQZG5A", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.flatten()\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZksegYQepkFg", + "colab_type": "text" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oVa0kOamprgj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from mozilla_voice_tts.tts.utils.generic_utils import setup_model\n", + "from mozilla_voice_tts.utils.io import load_config\n", + "from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes\n", + "from mozilla_voice_tts.utils.audio import 
AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.synthesis import synthesis" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EY-sHVO8IFSH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# runtime settings\n", + "use_cuda = False" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_1aIUp2FpxOQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# model paths\n", + "TTS_MODEL = \"data/tts_model.pth.tar\"\n", + "TTS_CONFIG = \"data/config.json\"\n", + "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", + "VOCODER_CONFIG = \"data/config_vocoder.json\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CpgmdBVQplbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmrQxiozIUVE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", + "tags": [] + }, + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8fLoI4ipqMeS", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", + "tags": [] + }, + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), 
TTS_CONFIG)\n", + "\n", + "# load model state\n", + "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zKoq0GgzqzhQ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", + "tags": [] + }, + "source": [ + "from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "# LOAD VOCODER MODEL\n", + "vocoder_model = setup_generator(VOCODER_CONFIG)\n", + "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", + "vocoder_model.remove_weight_norm()\n", + "vocoder_model.inference_padding = 0\n", + "\n", + "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", + "if use_cuda:\n", + " vocoder_model.cuda()\n", + "vocoder_model.eval()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws_YkPKsLgo-", + "colab_type": "text" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FuWxZ9Ey5Puj", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", + "tags": [] + }, + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git 
a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb new file mode 100755 index 00000000..458422c0 --- /dev/null +++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb @@ -0,0 +1,637 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb", + "provenance": [], + "collapsed_sections": [ + "vnV-FigfvsS2", + "hkvv7gRcx4WV", + "QJ6VgT2a4vHW" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "yZK6UdwSFnOO", + "colab_type": "text" + }, + "source": [ + "# **Download and install Mozilla TTS**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yvb0pX3WY6MN", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os \n", + "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iB9nl2UEG3SY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!apt-get install espeak\n", + "os.chdir('TTS')\n", + "!pip install -r requirements.txt\n", + "!python setup.py develop\n", + "os.chdir('..')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w6Krn8k1inC_", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "**Download Checkpoint**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "PiYHf3lKhi9z", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n", + "!unzip ./TTS-checkpoint.zip\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"MpYNgqrZcJKn", + "colab_type": "text" + }, + "source": [ + "**Utils Functions**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4KZA4b_CbMqx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import argparse\n", + "import json\n", + "# pylint: disable=redefined-outer-name, unused-argument\n", + "import os\n", + "import string\n", + "import time\n", + "import sys\n", + "import numpy as np\n", + "\n", + "TTS_PATH = \"../content/TTS\"\n", + "# add libraries into environment\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "\n", + "import torch\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config\n", + "from TTS.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "\n", + "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n", + " t_1 = time.time()\n", + " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " if use_cuda and not use_gl:\n", + " waveform = waveform.cpu()\n", + " if not use_gl:\n", + " waveform = waveform.numpy()\n", + " waveform = waveform.squeeze()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: 
{}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " return waveform\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ENA2OumIVeMA", + "colab_type": "text" + }, + "source": [ + "# **Vars definitions**\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jPD0d_XpVXmY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "TEXT = ''\n", + "OUT_PATH = 'tests-audios/'\n", + "# create output path\n", + "os.makedirs(OUT_PATH, exist_ok=True)\n", + "\n", + "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", + "\n", + "# model vars \n", + "MODEL_PATH = 'best_model.pth.tar'\n", + "CONFIG_PATH = 'config.json'\n", + "SPEAKER_JSON = 'speakers.json'\n", + "\n", + "# vocoder vars\n", + "VOCODER_PATH = ''\n", + "VOCODER_CONFIG_PATH = ''\n", + "\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dV6cXXlfi72r", + "colab_type": "text" + }, + "source": [ + "# **Restore TTS Model**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x1WgLFauWUPe", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "C.forward_attn_mask = True\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", + "\n", + "speaker_embedding = None\n", + "speaker_embedding_dim = None\n", + "num_speakers = 0\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " num_speakers = len(speaker_mapping)\n", + " if C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID is not None:\n", + " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", + " else: # 
if speaker_fileid is not specificated use the first sample in speakers.json\n", + " choise_speaker = list(speaker_mapping.keys())[0]\n", + " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", + " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", + " speaker_embedding_dim = len(speaker_embedding)\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "model.decoder.set_r(cp['r'])\n", + "\n", + "# load vocoder model\n", + "if VOCODER_PATH!= \"\":\n", + " VC = load_config(VOCODER_CONFIG_PATH)\n", + " vocoder_model = setup_generator(VC)\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", + " vocoder_model.remove_weight_norm()\n", + " if USE_CUDA:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval()\n", + "else:\n", + " vocoder_model = None\n", + " VC = None\n", + "\n", + "# synthesize voice\n", + "use_griffin_lim = VOCODER_PATH== \"\"\n", + "\n", + "if not C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID.isdigit():\n", + " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", + " else:\n", + " SPEAKER_FILEID = None\n", + "else:\n", + " SPEAKER_FILEID = None\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNvVEoE30qY6", + "colab_type": "text" + }, + "source": [ + "Synthesize sentence with Speaker\n", + "\n", + "> Stop running the cell to leave!\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2o8fXkVSyXOa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + 
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnV-FigfvsS2", + "colab_type": "text" + }, + "source": [ + "# **Select Speaker**\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RuCGOnJ_fgDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "# VCTK speakers not seen in training (new speakers)\n", + "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", + "\n", + "# VCTK speakers seen in training\n", + "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 
'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", + "\n", + "\n", + "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hkvv7gRcx4WV", + "colab_type": "text" + }, + "source": [ + "## **Example select a VCTK seen speaker in training**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BviNMI9UyCYz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5e5_XnLsx3jg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the 
results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QJ6VgT2a4vHW" + }, + "source": [ + "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", + "\n", + "\n", + "> Fitting new Speakers :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "SZS57ZK-4vHa", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_test_Speakers[0] # choose one of the speakers not seen in training\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # take the average of the selected speaker's embedding samples\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "bbs85vzz4vHo", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID,
speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LEE6mQLh5Who" + }, + "source": [ + "# **Example Synthesizing with your own voice :)**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "La70gSB65nrs", + "colab_type": "text" + }, + "source": [ + " Download and load GE2E Speaker Encoder " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "r0IEFZ0B5vQg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", + "!unzip ./SpeakerEncoder-checkpoint.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jEH8HCTh5mF6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", + "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", + "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tOwkfQqT6-Qo", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "se_config = load_config(SE_CONFIG_PATH)\n", + "se_ap = AudioProcessor(**se_config['audio'])\n", + "\n", + "se_model = 
SpeakerEncoder(**se_config.model)\n", + "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", + "se_model.eval()\n", + "if USE_CUDA:\n", + " se_model.cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0TLlbUFG8O36", + "colab_type": "text" + }, + "source": [ + "Upload a wav audio file in your voice.\n", + "\n", + "\n", + "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_FWwHPjJ8NXl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from google.colab import files\n", + "file_list = files.upload()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WWOf6sgbBbGY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# extract embedding from wav files\n", + "speaker_embeddings = []\n", + "for name in file_list.keys():\n", + " if '.wav' in name:\n", + " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " speaker_embeddings.append(embedd)\n", + " else:\n", + " print(\" You need upload Wav files, others files is not supported !!\")\n", + "\n", + "# takes the average of the embedings samples of the announcers\n", + "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "xmItcGac5WiG", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", + "while 
True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb new file mode 100755 index 00000000..e059461e --- /dev/null +++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb @@ -0,0 +1,834 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb", + "provenance": [], + "collapsed_sections": [ + "yZK6UdwSFnOO", + "ENA2OumIVeMA", + "dV6cXXlfi72r", + "vnV-FigfvsS2", + "g_G_HweN04W-", + "LEE6mQLh5Who" + ], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "yZK6UdwSFnOO", + "colab_type": "text" + }, + "source": [ + "# **Download and install Mozilla TTS**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yvb0pX3WY6MN", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os \n", + "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iB9nl2UEG3SY", + "colab_type": "code", + 
"colab": {} + }, + "source": [ + "!apt-get install espeak\n", + "os.chdir('TTS')\n", + "!pip install -r requirements.txt\n", + "!python setup.py develop\n", + "os.chdir('..')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w6Krn8k1inC_", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "**Download Checkpoint**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "PiYHf3lKhi9z", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n", + "!unzip ./TTS-checkpoint.zip\n", + "\n", + "# Download gst style example\n", + "!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MpYNgqrZcJKn", + "colab_type": "text" + }, + "source": [ + "**Utils Functions**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4KZA4b_CbMqx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import argparse\n", + "import json\n", + "# pylint: disable=redefined-outer-name, unused-argument\n", + "import os\n", + "import string\n", + "import time\n", + "import sys\n", + "import numpy as np\n", + "\n", + "TTS_PATH = \"../content/TTS\"\n", + "# add libraries into environment\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "\n", + "import torch\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config\n", + "from TTS.vocoder.utils.generic_utils import 
setup_generator\n", + "\n", + "\n", + "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n", + " t_1 = time.time()\n", + " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " if use_cuda and not use_gl:\n", + " waveform = waveform.cpu()\n", + " if not use_gl:\n", + " waveform = waveform.numpy()\n", + " waveform = waveform.squeeze()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " return waveform\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ENA2OumIVeMA", + "colab_type": "text" + }, + "source": [ + "# **Vars definitions**\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jPD0d_XpVXmY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "TEXT = ''\n", + "OUT_PATH = 'tests-audios/'\n", + "# create output path\n", + "os.makedirs(OUT_PATH, exist_ok=True)\n", + "\n", + "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", + "\n", + "# model vars \n", + "MODEL_PATH = 'best_model.pth.tar'\n", + "CONFIG_PATH = 'config.json'\n", + "SPEAKER_JSON = 'speakers.json'\n", + "\n", + "# vocoder vars\n", + "VOCODER_PATH = ''\n", + "VOCODER_CONFIG_PATH = ''\n", + "\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "dV6cXXlfi72r", + "colab_type": "text" + }, + "source": [ + "# **Restore TTS Model**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x1WgLFauWUPe", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "C.forward_attn_mask = True\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", + "\n", + "speaker_embedding = None\n", + "speaker_embedding_dim = None\n", + "num_speakers = 0\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " num_speakers = len(speaker_mapping)\n", + " if C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID is not None:\n", + " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", + " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", + " choise_speaker = list(speaker_mapping.keys())[0]\n", + " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", + " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", + " speaker_embedding_dim = len(speaker_embedding)\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "model.decoder.set_r(cp['r'])\n", + "\n", + "# load vocoder model\n", + "if VOCODER_PATH!= \"\":\n", + " VC = load_config(VOCODER_CONFIG_PATH)\n", + " vocoder_model = setup_generator(VC)\n", + " 
vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", + " vocoder_model.remove_weight_norm()\n", + " if USE_CUDA:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval()\n", + "else:\n", + " vocoder_model = None\n", + " VC = None\n", + "\n", + "# synthesize voice\n", + "use_griffin_lim = VOCODER_PATH== \"\"\n", + "\n", + "if not C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID.isdigit():\n", + " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", + " else:\n", + " SPEAKER_FILEID = None\n", + "else:\n", + " SPEAKER_FILEID = None\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNvVEoE30qY6", + "colab_type": "text" + }, + "source": [ + "Synthesize sentence with Speaker\n", + "\n", + "> Stop running the cell to leave!\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2o8fXkVSyXOa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnV-FigfvsS2", + 
"colab_type": "text" + }, + "source": [ + "# **Select Speaker**\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RuCGOnJ_fgDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "# VCTK speakers not seen in training (new speakers)\n", + "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", + "\n", + "# VCTK speakers seen in training\n", + "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", + "\n", + "\n", + "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hkvv7gRcx4WV", + "colab_type": "text" + }, + "source": [ + "## **Example select a VCTK seen speaker in training**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BviNMI9UyCYz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if 
C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5e5_XnLsx3jg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QJ6VgT2a4vHW" + }, + "source": [ + "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", + "\n", + "\n", + "> Fitting new Speakers :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "SZS57ZK-4vHa", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = 
VCTK_test_Speakers[0] # choose one of the speakers not seen in training\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # take the average of the selected speaker's embedding samples\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "bbs85vzz4vHo", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g_G_HweN04W-", + "colab_type": "text" + }, + "source": [ + "# **Changing GST tokens manually (without wav reference)**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id":
"jyFP5syW2bjt", + "colab_type": "text" + }, + "source": [ + "You can also set the GST tokens manually, which lets you increase or decrease the influence of a given token. For example, if a token controls the length of the speaker's pauses, increasing its value produces longer pauses and decreasing it produces shorter ones." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "SpwjDjCM2a3Y", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# set gst tokens, in this model we have 5 tokens\n", + "gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qWChMbI_0z5X", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uFjUi9xQ3mG3", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT
= input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Uw0d6gWg4L27", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V9izw4-54-Tl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new 
speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LEE6mQLh5Who" + }, + "source": [ + "# **Example Synthesizing with your own voice :)**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "La70gSB65nrs", + "colab_type": "text" + }, + "source": [ + " Download and load GE2E Speaker Encoder " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "r0IEFZ0B5vQg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", + "!unzip ./SpeakerEncoder-checkpoint.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jEH8HCTh5mF6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", + "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", + "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tOwkfQqT6-Qo", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from TTS.utils.audio import 
AudioProcessor\n", + "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "se_config = load_config(SE_CONFIG_PATH)\n", + "se_ap = AudioProcessor(**se_config['audio'])\n", + "\n", + "se_model = SpeakerEncoder(**se_config.model)\n", + "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", + "se_model.eval()\n", + "if USE_CUDA:\n", + " se_model.cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0TLlbUFG8O36", + "colab_type": "text" + }, + "source": [ + "Upload one or more wav audio files in your voice.\n", + "\n", + "\n", + "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_FWwHPjJ8NXl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# select one or more wav files\n", + "from google.colab import files\n", + "file_list = files.upload()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WWOf6sgbBbGY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# extract embedding from wav files\n", + "speaker_embeddings = []\n", + "for name in file_list.keys():\n", + " if '.wav' in name:\n", + " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " speaker_embeddings.append(embedd)\n", + " else:\n", + " print(\"You need upload Wav files, others files is not supported !!\")\n", + "\n", + "# takes the average of the embedings samples of the announcers\n", + "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AQ7eP31d9yzq", + "colab_type": "code", + 
"colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", + "gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n", + "gst_style = 'gst-style-example.wav'\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "11i10yE1-LMJ", + "colab_type": "text" + }, + "source": [ + "Uploading your own GST reference wav file" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eKohSQG1-KkT", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# select one wav file for GST reference\n", + "from google.colab import files\n", + "file_list = files.upload()\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "xmItcGac5WiG", + "colab": {} + }, + "source": [ + "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", + "gst_style = list(file_list.keys())[0]\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, 
speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index c747c764..37687517 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -22,12 +22,12 @@ "import numpy as np\n", "from tqdm import tqdm as tqdm\n", "from torch.utils.data import DataLoader\n", - "from TTS.datasets.TTSDataset import MyDataset\n", - "from TTS.layers.losses import L1LossMasked\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.visual import plot_spectrogram\n", - "from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", + "from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset\n", + "from mozilla_voice_tts.tts.layers.losses import L1LossMasked\n", + "from mozilla_voice_tts.tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.visual import plot_spectrogram\n", + "from mozilla_voice_tts.tts.utils.generic_utils import load_config, setup_model, sequence_mask\n", + "from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "%matplotlib inline\n", "\n", @@ -108,7 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", + "preprocessor = importlib.import_module('mozilla_voice_tts.tts.datasets.preprocess')\n", "preprocessor = 
getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", diff --git a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb new file mode 100644 index 00000000..576a95fe --- /dev/null +++ b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb @@ -0,0 +1,25495 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a notebook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n", + "\n", + "Before running this script please DON'T FORGET:\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import importlib\n", + "import random\n", + "import librosa\n", + "import torch\n", + "\n", + "import numpy as np\n", + "from TTS.utils.io import load_config\n", + "from tqdm import tqdm\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='0'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'Real-Time-Voice-Cloning'...\n", + "remote: Enumerating objects: 5, done.\u001b[K\n", + "remote: Counting objects: 100% (5/5), done.\u001b[K\n", 
+ "remote: Compressing objects: 100% (5/5), done.\u001b[K\n", + "remote: Total 2508 (delta 0), reused 3 (delta 0), pack-reused 2503\u001b[K\n", + "Receiving objects: 100% (2508/2508), 360.78 MiB | 17.84 MiB/s, done.\n", + "Resolving deltas: 100% (1387/1387), done.\n", + "Checking connectivity... done.\n" + ] + } + ], + "source": [ + "# Clone encoder \n", + "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n", + "os.chdir('Real-Time-Voice-Cloning/')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Install voxceleb_trainer Requirements\n", + "!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-08-05 06:51:05-- https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", + "Resolving github.com (github.com)... 18.231.5.6\n", + "Connecting to github.com (github.com)|18.231.5.6|:443... connected.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip [following]\n", + "--2020-08-05 06:51:05-- https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip\n", + "Reusing existing connection to github.com:443.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream [following]\n", + "--2020-08-05 06:51:05-- https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream\n", + "Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.18.24\n", + "Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.18.24|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 383640573 (366M) [application/octet-stream]\n", + "Saving to: ‘pretrained.zip’\n", + "\n", + "pretrained.zip 100%[===================>] 365,87M 6,62MB/s in 56s \n", + "\n", + "2020-08-05 06:52:03 (6,48 MB/s) - ‘pretrained.zip’ saved [383640573/383640573]\n", + "\n", + "Archive: pretrained.zip\n", + " creating: encoder/saved_models/\n", + " inflating: encoder/saved_models/pretrained.pt \n", + " creating: synthesizer/saved_models/\n", + " creating: synthesizer/saved_models/logs-pretrained/\n", + " creating: synthesizer/saved_models/logs-pretrained/taco_pretrained/\n", + " extracting: synthesizer/saved_models/logs-pretrained/taco_pretrained/checkpoint \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001 \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.index \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.meta \n", + " creating: vocoder/saved_models/\n", + " creating: vocoder/saved_models/pretrained/\n", + " inflating: vocoder/saved_models/pretrained/pretrained.pt \n" + ] + } + ], + "source": [ + "#Download encoder Checkpoint\n", + "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", + "!unzip pretrained.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from encoder import inference as encoder\n", + "from encoder.params_model import model_embedding_size as speaker_embedding_size\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing the encoder, the synthesizer and the vocoder...\n", + "Loaded encoder \"pretrained.pt\" trained to step 1564501\n", + "Testing your configuration with small inputs.\n", + 
"\tTesting the encoder...\n", + "(256,)\n" + ] + } + ], + "source": [ + "print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n", + "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n", + "print(\"Testing your configuration with small inputs.\")\n", + "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n", + "# sampling rate, which may differ.\n", + "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n", + "# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.\n", + "# The sampling rate is the number of values (samples) recorded per second, it is set to\n", + "# 16000 for the encoder. Creating an array of length equal to the sampling rate will always correspond \n", + "# to an audio of 1 second.\n", + "print(\"\\tTesting the encoder...\")\n", + "\n", + "wav = np.zeros(encoder.sampling_rate) \n", + "embed = encoder.embed_utterance(wav)\n", + "print(embed.shape)\n", + "\n", + "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n", + "# embeddings it will be).\n", + "#embed /= np.linalg.norm(embed) # for random embedding\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "SAVE_PATH = '../'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Set constants\n", + "DATASETS_NAME = ['vctk'] # list the datasets\n", + "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n", + "DATASETS_METAFILE = ['']\n", + "USE_CUDA = True" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " 0%| | 0/44063 [00:00 TF -> TFLite" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tLhz8SAf8Pgp", + "colab_type": "text" + }, + "source": [ + "## Converting PyTorch to 
Tensorflow\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Xsrvr_WQ8Ib5", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "dae96616-e5f7-41b6-cdb9-5026cfcd3214", + "tags": [] + }, + "source": [ + "# convert TTS model to Tensorflow\n", + "!python ../TTS/bin/convert_tacotron2_torch_to_tf.py --config_path data/config.json --torch_model_path data/tts_model.pth.tar --output_path data/tts_model_tf.pkl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VJ4NA5If9ljv", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "1520dca8-1db8-4e07-bc0c-b1d5941c775e", + "tags": [] + }, + "source": [ + "# convert Vocoder model to Tensorflow\n", + "!python ../TTS/bin/convert_melgan_torch_to_tf.py --config_path data/config_vocoder.json --torch_model_path data/vocoder_model.pth.tar --output_path data/vocoder_model_tf.pkl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7d5vTkBZ-BYQ", + "colab_type": "text" + }, + "source": [ + "## Converting Tensorflow to TFLite" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "33hTfpuU99cg", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 927 + }, + "outputId": "8a0e5be1-23a2-4128-ee37-8232adcb8ff0", + "tags": [] + }, + "source": [ + "# convert TTS model to TFLite\n", + "!python ../TTS/bin/convert_tacotron2_tflite.py --config_path data/config.json --tf_model data/tts_model_tf.pkl --output_path data/tts_model.tflite" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "e00Hm75Y-wZ2", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 364 + }, + "outputId": "42381b05-3c9d-44f0-dac7-d81efd95eadf", + "tags": [] + }, + "source": [ + "# convert Vocoder model to 
TFLite\n", + "!python ../TTS/bin/convert_melgan_tflite.py --config_path data/config_vocoder.json --tf_model data/vocoder_model_tf.pkl --output_path data/vocoder_model.tflite" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zlgi8fPdpRF0", + "colab_type": "text" + }, + "source": [ + "# Run Inference with TFLite " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "f-Yc42nQZG5A", + "colab_type": "code", + "colab": {} + }, + "source": [ + "def run_vocoder(mel_spec):\n", + " vocoder_inputs = mel_spec[None, :, :]\n", + " # get input and output details\n", + " input_details = vocoder_model.get_input_details()\n", + " # reshape input tensor for the new input shape\n", + " vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n", + " vocoder_model.allocate_tensors()\n", + " detail = input_details[0]\n", + " vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n", + " # run the model\n", + " vocoder_model.invoke()\n", + " # collect outputs\n", + " output_details = vocoder_model.get_output_details()\n", + " waveform = vocoder_model.get_tensor(output_details[0]['index'])\n", + " return waveform \n", + "\n", + "\n", + "def tts(model, text, CONFIG, p):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", + " backend='tflite')\n", + " waveform = run_vocoder(mel_postnet_spec.T)\n", + " waveform = waveform[0, 0]\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " 
IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZksegYQepkFg", + "colab_type": "text" + }, + "source": [ + "### Load TF Models" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oVa0kOamprgj", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from mozilla_voice_tts.tts.tf.utils.tflite import load_tflite_model\n", + "from mozilla_voice_tts.tts.tf.utils.io import load_checkpoint\n", + "from mozilla_voice_tts.utils.io import load_config\n", + "from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes\n", + "from mozilla_voice_tts.utils.audio import AudioProcessor\n", + "from mozilla_voice_tts.tts.utils.synthesis import synthesis" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EY-sHVO8IFSH", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# runtime settings\n", + "use_cuda = False" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_1aIUp2FpxOQ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# model paths\n", + "TTS_MODEL = \"data/tts_model.tflite\"\n", + "TTS_CONFIG = \"data/config.json\"\n", + "VOCODER_MODEL = \"data/vocoder_model.tflite\"\n", + "VOCODER_CONFIG = \"data/config_vocoder.json\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CpgmdBVQplbv", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmrQxiozIUVE", + 
"colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "outputId": "21cda136-de87-4d55-fd46-7d5306103d90", + "tags": [] + }, + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8fLoI4ipqMeS", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the models\n", + "model = load_tflite_model(TTS_MODEL)\n", + "vocoder_model = load_tflite_model(VOCODER_MODEL)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws_YkPKsLgo-", + "colab_type": "text" + }, + "source": [ + "## Run Sample Sentence" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FuWxZ9Ey5Puj", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "outputId": "535c2df1-c27c-458b-e14b-41a977635aa1", + "tags": [] + }, + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb b/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb new file mode 100644 index 00000000..0fe81d4e --- /dev/null +++ b/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb @@ -0,0 +1,3406 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": 
[ + "TTS_PATH = \"/home/erogol/projects/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "import glob\n", + "import librosa\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import norm\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from multiprocessing import Pool\n", + "from matplotlib import pylab as plt\n", + "from collections import Counter\n", + "from mozilla_voice_tts.tts.datasets.preprocess import *\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "DATA_PATH = \"/home/erogol/Data/Spectie/audio/output/\"\n", + "META_DATA = \"metadata.txt\"\n", + "NUM_PROC = 8" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/home/erogol/Data/Spectie/audio/output/metadata.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# use your own preprocessor at this stage - TTS/datasets/proprocess.py\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mitems\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmozilla_de\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMETA_DATA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Number of audio files: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/TTS/tts_namespace/TTS/datasets/preprocess.py\u001b[0m in \u001b[0;36mmozilla_de\u001b[0;34m(root_path, meta_file)\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0mitems\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mspeaker_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"mozilla\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtxt_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ISO 8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mttf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 84\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mttf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0mcols\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'|'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/erogol/Data/Spectie/audio/output/metadata.txt'" + ] + } + ], + "source": [ + "# use your own 
preprocessor at this stage - TTS/datasets/preprocess.py\n", + "items = mozilla_de(DATA_PATH, META_DATA)\n", + "print(\" > Number of audio files: {}\".format(len(items)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# check that the wav files exist\n", + "wav_files = []\n", + "for item in items:\n", + " wav_file = item[1].strip()\n", + " wav_files.append(wav_file)\n", + " if not os.path.exists(wav_file):\n", + " print(wav_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav']\n" + ] + } + ], + "source": [ + "# show duplicate items\n", + "c = Counter(wav_files)\n", + "duplicates = [item for item, count in c.items() if count > 1]\n", + "print(duplicates)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "folders = [w.split('/')[5] for w in wav_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'BATCH_10_FINAL',\n", + " 'BATCH_11_FINAL',\n", + " 'BATCH_12_FINAL',\n", + " 'BATCH_13_FINAL',\n", + " 'BATCH_14_FINAL',\n", + " 'BATCH_15_FINAL',\n", + " 'BATCH_16_FINAL',\n", + " 'BATCH_17_FINAL',\n", + " 'BATCH_18_FINAL',\n", + " 'BATCH_19_FINAL',\n", + " 'BATCH_1_FINAL',\n", + " 'BATCH_20_FINAL',\n", + " 'BATCH_2_FINAL',\n", + " 'BATCH_3_FINAL',\n", + " 'BATCH_4_FINAL',\n", + " 'BATCH_5_FINAL',\n", + " 'BATCH_6_FINAL',\n", + " 'BATCH_7_FINAL',\n", + " 'BATCH_8_FINAL',\n", + " 'BATCH_9_FINAL'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(folders)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + 
"Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "afbb94c274fe4913b256a8756584c0f6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "def load_item(item):\n", + " file_name = item[1].strip()\n", + " text = item[0].strip()\n", + " audio = librosa.load(file_name, sr=None)\n", + " sr = audio[1]\n", + " audio = audio[0]\n", + " audio_len = len(audio) / sr\n", + " text_len = len(text)\n", + " return file_name, text, text_len, audio, audio_len\n", + "\n", + "# This will take a while depending on size of dataset\n", + "if NUM_PROC == 1:\n", + " data = []\n", + " for m in tqdm(items):\n", + " data += [load_item(m)]\n", + "else:\n", + " with Pool(8) as p:\n", + " data = list(tqdm(p.imap(load_item, items), total=len(items)))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "53b7f6adb4db47279927ec064addb3c7", + "version_major": 2, + 
"version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " > Number of words: 27102\n" + ] + } + ], + "source": [ + "# count words in the dataset\n", + "w_count = Counter()\n", + "for item in tqdm(data):\n", + " text = item[1].lower().strip()\n", + " for word in text.split():\n", + " w_count[word] += 1\n", + "print(\" > Number of words: {}\".format(len(w_count)))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8b48c3415e2a4ac1a174502c2308501d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "text_vs_durs = {} # text length vs audio duration\n", + "text_len_counter = Counter() # number of sentences with the keyed length\n", + "for item in tqdm(data):\n", + " text = item[1].lower().strip()\n", + " text_len = len(text)\n", + " text_len_counter[text_len] += 1\n", + " audio_len = item[-1]\n", + " try:\n", + " text_vs_durs[text_len] += [audio_len]\n", + " except:\n", + " text_vs_durs[text_len] = [audio_len]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "Collapsed": 
"false" + }, + "outputs": [], + "source": [ + "# text_len vs avg_audio_len, median_audio_len, std_audio_len\n", + "text_vs_avg = {}\n", + "text_vs_median = {}\n", + "text_vs_std = {}\n", + "for key, durs in text_vs_durs.items():\n", + " text_vs_avg[key] = np.mean(durs)\n", + " text_vs_median[key] = np.median(durs)\n", + " text_vs_std[key] = np.std(durs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Avg audio length per char" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_7.wav', 'Schickes Heimkino!', 18, array([1.28518932e-05, 1.68334354e-05, 1.03571265e-05, ...,\n", + " 2.77877753e-05, 1.10460878e-05, 2.05760971e-05], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_12.wav', 'Das sieht ihm ähnlich.', 23, array([7.6380376e-05, 9.3327515e-05, 6.1386294e-05, ..., 3.4380835e-05,\n", + " 2.6692895e-05, 2.2882025e-06], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_20.wav', 'Oh, das Programm ist mir neu.', 29, array([-3.6327918e-05, -5.8332487e-05, -5.0294046e-05, ...,\n", + " -3.2606560e-05, -5.3037817e-05, -3.6754736e-05], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_21.wav', 'Niemand ist ein Alleskönner.', 29, array([2.5469655e-05, 1.5675920e-05, 2.6378759e-05, ..., 3.4840865e-05,\n", + " 3.4687979e-05, 2.3448023e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_25.wav', 'Dagegen ist kein Kraut gewachsen.', 33, array([8.6409571e-05, 1.6211446e-04, 1.2149933e-04, ..., 1.4264301e-05,\n", + " 2.6473885e-05, 4.1174495e-05], dtype=float32), 1.91225)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_39.wav', 'Seid gegrüÃ\\x9ft!', 15, array([-4.95165441e-05, -9.18527076e-05, -1.06668835e-04, ...,\n", + " -4.00948884e-05, -6.23805026e-05, -4.42093369e-05], dtype=float32), 1.1808541666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_43.wav', 'Nicht mit dem FuÃ\\x9f!', 19, array([-2.4153460e-05, -9.5195399e-05, -1.8093537e-04, ...,\n", + " 2.0667248e-05, 2.7399163e-05, 5.0344559e-05], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_44.wav', 'Wissen ist Macht.', 17, array([-1.9221216e-05, -2.1811753e-05, -4.0165878e-06, ...,\n", + " -5.0537183e-06, -1.3825783e-05, -2.8384518e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_45.wav', 'Guck mal, ein Eichhörnchen!', 28, array([-8.8387278e-05, -7.1484370e-05, -9.1183894e-05, ...,\n", + " -2.6602589e-05, 1.1369466e-05, -1.4236821e-06], dtype=float32), 1.5245208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_58.wav', 'Ich bin mein eigener Hund.', 26, array([-1.3441265e-05, -1.3771249e-05, 2.1415319e-06, ...,\n", + " -2.9998329e-05, 6.4692267e-06, 1.6420488e-05], dtype=float32), 1.91225)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_68.wav', 'Lach ich, oder was?', 19, array([1.20631594e-04, 2.69133277e-04, 3.61918297e-04, ...,\n", + " 2.52288628e-05, 1.12787602e-05, 2.01150815e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_76.wav', 'Moment mal, das ist neu.', 24, array([-4.0444505e-05, -5.6087447e-05, -7.0869857e-05, ...,\n", + " -5.9735464e-07, 1.4513580e-05, 1.7241922e-05], dtype=float32), 1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_79.wav', 'Wie lange zieht der Tee schon?', 30, array([ 1.3359761e-05, 1.4845427e-06, -8.4266394e-06, ...,\n", + " 8.4090761e-06, 5.6682808e-07, 1.4266146e-06], 
dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_95.wav', 'Schlaf gut!', 11, array([-8.3705861e-05, -1.3769916e-04, -1.0772650e-04, ...,\n", + " -1.2876300e-05, -3.5042558e-05, -1.5538299e-05], dtype=float32), 1.0839166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_97.wav', 'Entschuldigen Sie die Verwechslung!', 35, array([-4.3585667e-05, -4.9360351e-05, -2.4610319e-05, ...,\n", + " -1.4282005e-05, -7.0760620e-07, -2.8634834e-06], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_99.wav', 'Schönes Ding!', 14, array([-4.9598326e-05, -4.2029962e-05, -2.2566113e-05, ...,\n", + " 7.5142352e-06, -3.1275456e-05, -1.8421564e-05], dtype=float32), 0.9252916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_101.wav', 'Dann nichts wie weg hier!', 25, array([ 1.2582598e-05, 1.4227808e-05, 1.0588883e-05, ...,\n", + " 1.8725707e-07, -4.0784824e-05, -7.0644560e-06], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_120.wav', \"Wie geht's?\", 11, array([ 3.6131805e-05, 2.3445213e-05, 4.7948160e-05, ...,\n", + " -3.3656095e-05, -4.0791183e-05, -4.5296023e-05], dtype=float32), 0.9341041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_179.wav', 'Das ganze Haus hat gewackelt.', 29, array([ 1.31893430e-05, -2.02163919e-05, -5.92077959e-06, ...,\n", + " -8.03239527e-06, -1.91841791e-05, -1.46886205e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_186.wav', 'Woher kommt all der Hass?', 25, array([-1.0393358e-05, -4.2540119e-05, -1.8952907e-05, ...,\n", + " 1.9931360e-05, 2.8833035e-06, 2.6874868e-06], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_189.wav', 'Stillgestanden!', 15, array([ 4.4343769e-06, 1.3210945e-05, 1.7683087e-05, ...,\n", + " 
2.6131744e-05, -5.4923967e-06, 9.4311863e-06], dtype=float32), 1.2689791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_199.wav', 'Eine Sache zur Zeit.', 20, array([5.1501018e-05, 6.3279913e-05, 7.3763011e-05, ..., 1.0348874e-05,\n", + " 1.0562905e-05, 3.0424892e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_218.wav', 'Nichts für ungut!', 18, array([-4.0355466e-05, -4.5107645e-05, -7.7510209e-05, ...,\n", + " -2.0305148e-05, -3.0419576e-05, -1.7718892e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_220.wav', 'Sieh genau hin!', 15, array([-1.2045763e-02, -1.6849384e-02, -1.4799301e-02, ...,\n", + " 1.6059141e-06, -1.4713467e-05, 1.0609662e-05], dtype=float32), 1.3042291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_224.wav', 'Und welches Baujahr?', 20, array([-3.5566740e-05, -2.3342436e-05, -2.8526230e-05, ...,\n", + " 3.1306794e-05, 3.2872085e-05, 2.9171426e-05], dtype=float32), 1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_226.wav', 'Sofort umkehren!', 16, array([ 1.2734158e-04, 1.4998924e-04, 1.2418727e-04, ...,\n", + " -6.3872926e-06, -5.1714401e-06, -1.2052229e-05], dtype=float32), 1.3923541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_232.wav', 'Da muss man locker bleiben.', 27, array([-3.2585725e-05, -3.3840271e-05, 1.3126293e-05, ...,\n", + " -1.8632261e-05, -6.3017387e-06, -5.6675367e-06], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_237.wav', 'Probier es mal mit Aceton.', 26, array([ 7.5771743e-05, 1.0223542e-04, 1.0343192e-04, ...,\n", + " -2.1570906e-05, -3.1918564e-05, -1.1135696e-05], dtype=float32), 1.8858125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_244.wav', 'Kommt drauf an.', 15, array([ 2.7207607e-05, 1.8057373e-05, 1.2512723e-05, 
...,\n", + " -6.0103289e-06, -2.1828011e-05, -8.1472344e-06], dtype=float32), 1.3571041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_246.wav', 'Man darf gespannt sein.', 23, array([-2.3668355e-03, -3.7321844e-03, -3.6732492e-03, ...,\n", + " 1.7768043e-06, 2.0778492e-05, 5.1516781e-06], dtype=float32), 1.5685833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_251.wav', 'Daran scheiden sich die Geister.', 32, array([-2.39492147e-05, -4.70898958e-05, -2.53186899e-05, ...,\n", + " -4.88899059e-06, -1.34801885e-05, 1.04552892e-05], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_258.wav', 'Was habt ihr heute erlebt?', 26, array([ 3.5868085e-05, 8.2530729e-05, 4.6677309e-05, ...,\n", + " -8.4167405e-06, -2.0942105e-05, -6.2113932e-06], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_265.wav', 'Lass das sein!', 14, array([2.4356419e-05, 5.5347311e-05, 5.1189338e-05, ..., 2.7182332e-05,\n", + " 1.6106302e-05, 2.1714099e-05], dtype=float32), 1.2425208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_267.wav', 'Auch heute noch.', 16, array([ 1.6202603e-05, 1.8275598e-05, 1.5345126e-05, ...,\n", + " -9.9319268e-06, -1.4463866e-05, 7.9376441e-06], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_270.wav', 'Wir sehen uns in Bielefeld.', 27, array([5.0975410e-05, 4.6619494e-05, 5.2299667e-05, ..., 2.4641362e-05,\n", + " 2.0409352e-05, 1.7508868e-05], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_274.wav', 'Gerald muss Dampf ablassen.', 27, array([-1.4112990e-04, -2.2197423e-04, -2.2060136e-04, ...,\n", + " -4.0291343e-05, -3.2744192e-05, -1.7507429e-05], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_277.wav', 'Sehen Sie selbst!', 17, 
array([-3.6524234e-05, -2.8097162e-05, 4.4066533e-06, ...,\n", + " 2.1528131e-06, -1.2273627e-05, -8.5409883e-06], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_282.wav', 'Haben wir jemanden vergessen?', 29, array([-2.1900923e-05, -8.0311016e-05, -4.5058856e-05, ...,\n", + " 8.6369282e-06, 2.3358027e-05, 1.4141980e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_300.wav', 'Oh, der Besuch ist da!', 22, array([-1.1763951e-06, -6.4509544e-07, -2.1343028e-05, ...,\n", + " 8.3751611e-06, -2.0755753e-05, -3.9365756e-07], dtype=float32), 1.5157083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_303.wav', 'Kannst du das bitte übernehmen?', 32, array([1.9790201e-05, 2.5795589e-05, 2.3016226e-05, ..., 4.4700668e-05,\n", + " 2.9440445e-05, 4.1151830e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_309.wav', 'Ich muss verrückt sein.', 24, array([-3.7773843e-05, -2.5238944e-05, -4.5549310e-05, ...,\n", + " -1.4228171e-05, -1.3738420e-05, -2.5079733e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_312.wav', 'Gestrichen!', 11, array([4.6765574e-05, 8.2428312e-05, 6.1315681e-05, ..., 1.7959255e-06,\n", + " 5.7119927e-08, 3.7900886e-06], dtype=float32), 0.9693541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_321.wav', 'Gott atmet nicht.', 17, array([3.9337472e-05, 4.7041980e-05, 5.6819965e-05, ..., 1.6601467e-05,\n", + " 1.5404070e-05, 3.0179035e-05], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_327.wav', 'Das ist mir auch klar.', 22, array([ 6.4578126e-05, 9.0902526e-05, 7.7864941e-05, ...,\n", + " -1.0411938e-05, -3.7324537e-06, 1.4365208e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_329.wav', 'Es sieht 
nach Unsinn aus.', 25, array([ 1.1480927e-06, 7.0667493e-06, -3.8140864e-05, ...,\n", + " 5.6332779e-06, 3.7668069e-05, 7.3043757e-06], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_333.wav', 'Das ist nur von auÃ\\x9fen.', 23, array([-3.8521201e-05, -4.7468315e-05, -3.4236415e-05, ...,\n", + " 5.2493826e-05, 3.7984686e-05, 3.3584591e-05], dtype=float32), 1.9915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_334.wav', 'Ich habe gerade ein DéjÃ\\xa0-vu.', 30, array([ 4.4728897e-04, 3.7400136e-04, -4.0894563e-04, ...,\n", + " 2.4757979e-05, 1.1479871e-05, 2.5551706e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_336.wav', 'Ich muss mich verzählt haben.', 30, array([-3.9173494e-05, -2.9986420e-05, -1.9012801e-05, ...,\n", + " -6.0724019e-06, 2.7600961e-05, -3.4350986e-05], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_342.wav', 'So kann man sich täuschen.', 27, array([-3.5296402e-05, -6.0332448e-05, -5.2051670e-05, ...,\n", + " -1.2274999e-05, -6.2373409e-05, 1.2240975e-05], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_345.wav', 'Ich weiÃ\\x9f nicht woher.', 22, array([-2.05518299e-05, -1.30783865e-05, -1.48754107e-05, ...,\n", + " -5.49699544e-05, -3.01012133e-05, -1.70801268e-05], dtype=float32), 1.4980833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_352.wav', 'Bist du jetzt beleidigt?', 24, array([-1.0385954e-05, 1.1672010e-05, -2.3844843e-05, ...,\n", + " 6.0053999e-06, -2.3204884e-05, -9.7573111e-06], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_357.wav', 'Gib mir zwei Minuten, ja?', 25, array([-1.8705783e-05, -3.0273133e-05, -2.4814160e-05, ...,\n", + " 1.4705538e-05, 9.7520942e-06, 1.7873571e-06], dtype=float32), 1.8065208333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_360.wav', 'Voll der Psycho-Blick!', 22, array([ 5.0691519e-06, 1.2665058e-05, 1.4902340e-06, ...,\n", + " 9.9865492e-06, -2.0948526e-05, -1.1750392e-05], dtype=float32), 1.4980833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_365.wav', 'Mein Freund ist Musiker.', 24, array([ 4.2413834e-05, 2.3999601e-05, 1.0646096e-05, ...,\n", + " -1.9632445e-05, -2.5183452e-05, -1.8877656e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_386.wav', 'Hast du Knoblauch gegessen?', 27, array([ 4.2124993e-06, 1.6061234e-05, 1.6008022e-05, ...,\n", + " 4.7057729e-05, -5.8230005e-05, -6.6850065e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_391.wav', 'Ist mir gar nicht aufgefallen.', 30, array([-1.2801524e-04, -1.8332504e-04, -1.6864720e-04, ...,\n", + " -1.7935792e-05, 1.3743926e-05, 4.5144670e-06], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_396.wav', 'Verdammt noch mal!', 18, array([-1.9188805e-05, 2.9282862e-06, 3.1274089e-06, ...,\n", + " 3.8011989e-05, 4.4447512e-05, 3.0465781e-05], dtype=float32), 1.3218541666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_403.wav', 'Klingt moralisch einwandfrei.', 29, array([-1.5154625e-06, -1.1907745e-05, -3.7140951e-06, ...,\n", + " 1.4816231e-06, -1.0694354e-05, -2.7909247e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_412.wav', 'Wie wunderschön du bist.', 25, array([ 8.1452117e-06, 1.2316134e-05, 1.2410718e-05, ...,\n", + " -2.5919973e-05, -1.5394140e-05, -1.6787388e-05], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_419.wav', 'Ich kann nichts erkennen.', 25, array([-2.1261691e-05, -2.6662590e-05, -3.2895186e-05, ...,\n", + " -8.6166056e-06, 
1.0871788e-06, -5.8716050e-06], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_423.wav', 'Jetzt aber zackig!', 18, array([ 2.4374567e-06, 2.0842881e-05, -1.5250983e-05, ...,\n", + " -1.6002667e-05, -4.2002972e-05, -2.0723968e-05], dtype=float32), 1.2953958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_425.wav', 'Ich bin schon ganz wirr im Kopf.', 32, array([2.9025901e-05, 3.5920395e-05, 4.5607205e-05, ..., 1.6718976e-05,\n", + " 2.1111184e-05, 3.3797973e-05], dtype=float32), 1.98275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_430.wav', 'Ihr gefällt die Kulisse.', 25, array([ 2.0069625e-05, 6.2984320e-05, 4.6121866e-05, ...,\n", + " -3.1357740e-05, -2.2353357e-05, -2.2545100e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_445.wav', 'GrüÃ\\x9f dich!', 12, array([-1.0602423e-05, -7.0546007e-06, 1.1231577e-05, ...,\n", + " -4.8423290e-06, -2.5039872e-05, -2.4532073e-05], dtype=float32), 0.7842916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_456.wav', 'Nach mir die Sintflut!', 22, array([ 2.0728099e-05, -9.0359263e-06, -4.4944873e-06, ...,\n", + " 6.8659042e-06, -1.2404760e-05, -2.2153192e-06], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_460.wav', 'Was soll das denn bringen?', 26, array([ 3.9292016e-05, 5.6996982e-05, 6.4746971e-05, ...,\n", + " -3.1001658e-05, -9.7075417e-06, -1.9902369e-05], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_461.wav', 'Er lädt immer noch.', 20, array([-1.6651324e-05, -5.8167420e-06, 5.8412393e-06, ...,\n", + " -5.8599158e-05, -5.3942535e-05, -2.6054968e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_479.wav', 'Was sollen wir nur tun?', 23, array([-4.4440752e-05, -5.3991145e-05, 
-4.1732972e-05, ...,\n", + " -5.2980035e-06, 1.0908753e-05, 1.9730707e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_481.wav', 'Schluss damit!', 14, array([-2.9023191e-05, -4.2109135e-05, -3.8624265e-05, ...,\n", + " -1.9805097e-05, -6.0203884e-06, 1.1789062e-05], dtype=float32), 0.9605416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_483.wav', 'Können sie mir ihr Passwort geben?', 35, array([ 2.5537942e-05, 5.2574283e-05, 5.7736743e-05, ...,\n", + " -5.4731267e-06, -2.9014491e-05, 3.6238887e-06], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_491.wav', 'Sie muss zum BogenschieÃ\\x9fen.', 28, array([-3.1108371e-05, -5.1357423e-05, -7.0860064e-05, ...,\n", + " -4.0438888e-05, -2.6810346e-06, -1.3582417e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_513.wav', 'Gib ihm die Schaufel wieder!', 28, array([-2.5840678e-05, -2.4174828e-05, -1.2895588e-05, ...,\n", + " 3.6998503e-05, 3.0887943e-05, 1.9229607e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_514.wav', 'Ich will mich kurzfassen.', 25, array([-5.4538796e-06, 1.6863480e-05, -2.4184583e-05, ...,\n", + " -7.9238208e-07, 9.8597202e-06, 2.5041477e-06], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_515.wav', 'Die ist hart im Nehmen.', 23, array([ 3.2496322e-05, 3.8166479e-05, 3.2249674e-05, ...,\n", + " -1.0363748e-05, 1.9095280e-05, 9.2708688e-06], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_516.wav', 'Oh mein Gott!', 13, array([ 1.0293347e-05, 2.3256578e-05, -2.6419082e-06, ...,\n", + " -1.2127157e-05, 1.4263560e-06, 3.2800324e-06], dtype=float32), 0.8812291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_517.wav', 'Einer noch!', 
11, array([ 1.8490386e-05, 9.7866017e-05, 1.1555837e-04, ...,\n", + " -5.3282761e-08, -1.5481584e-05, 1.1070631e-06], dtype=float32), 0.7578541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_536.wav', 'Da hat er sich verhaspelt.', 26, array([-1.2101016e-05, -4.1350278e-05, -2.5068364e-05, ...,\n", + " -9.8568984e-05, 1.2527088e-04, 2.5078503e-04], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_538.wav', 'Kann ich mir nicht vorstellen.', 30, array([-7.1259085e-05, -6.6917557e-05, -7.5606287e-05, ...,\n", + " -1.7281625e-05, 1.9208239e-06, 9.8984492e-06], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_541.wav', 'Kannst du sie mal anstupsen?', 28, array([-3.0119493e-06, 3.5770699e-06, 8.4955855e-06, ...,\n", + " 1.3389642e-05, 2.2122082e-05, 1.8456800e-05], dtype=float32), 1.67875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_554.wav', 'Das wird nicht billig.', 22, array([-1.2833251e-05, -2.6942225e-05, -1.1592191e-05, ...,\n", + " -1.1226616e-05, 2.4460544e-05, 4.6120007e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_555.wav', 'Ã\\x9cberall wird hier gebaut.', 26, array([ 3.0397489e-06, 1.6576083e-05, 1.7184460e-05, ...,\n", + " -4.7443868e-06, 1.7984281e-07, 1.7898132e-05], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_556.wav', 'Was möchten Sie zu trinken?', 28, array([3.6597925e-05, 3.9522194e-05, 3.4265908e-05, ..., 4.9602304e-04,\n", + " 4.0240673e-04, 2.1699475e-04], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_559.wav', 'Waren Sie schon einmal bei uns?', 31, array([ 2.5204083e-06, -9.7146321e-06, 1.0508998e-05, ...,\n", + " 1.6337053e-05, 4.2958636e-05, 3.6466561e-05], dtype=float32), 1.8858333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_579.wav', 'Traut sich sonst noch jemand?', 29, array([-3.4311914e-05, -1.9934920e-05, -3.6420348e-05, ...,\n", + " -8.5477677e-06, -8.7745884e-06, -2.7311040e-05], dtype=float32), 1.9739583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_587.wav', 'Hier noch mal die Kurzform.', 27, array([ 4.8683055e-06, -9.0082349e-06, -6.4492651e-06, ...,\n", + " 1.2890940e-05, 1.4272653e-05, 9.0988487e-06], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_594.wav', 'Haste mal nen Euro?', 19, array([-8.6395357e-06, -1.0812845e-05, -3.0906973e-05, ...,\n", + " 9.5510404e-06, 1.9230547e-05, 3.1346096e-06], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_599.wav', 'Wie schreibt man das?', 21, array([-3.6024519e-06, -2.5525418e-05, -2.9170100e-05, ...,\n", + " -1.0803048e-05, 3.5519159e-05, 6.3340508e-06], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_600.wav', 'Er kann es nicht mehr hören.', 29, array([-3.8066657e-05, -3.2469205e-05, -5.3206204e-05, ...,\n", + " 2.6021740e-05, -1.0833596e-06, 1.9787998e-05], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_601.wav', 'Bleib einfach cool.', 19, array([-4.1984731e-05, -2.3916245e-05, -3.1576215e-05, ...,\n", + " -1.8820670e-05, 6.2404342e-07, -9.7557686e-06], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_603.wav', 'Davon können Sie ausgehen.', 27, array([ 1.0824577e-05, -1.7968627e-05, -1.6179658e-05, ...,\n", + " -5.5361601e-05, -4.2508735e-05, -3.1106232e-05], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_606.wav', 'So ist das im Leben.', 20, array([ 1.0786475e-05, -1.3495748e-05, 6.5641157e-06, ...,\n", + " -3.1349493e-05, -2.5596510e-05, 
-2.9100025e-05], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_625.wav', 'Du musst anders fragen.', 23, array([ 4.8367940e-03, 6.8724523e-03, 6.1804145e-03, ...,\n", + " -7.8923513e-06, 1.7550767e-06, 7.2876783e-06], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_628.wav', 'Es war nicht alles schlecht.', 28, array([ 1.08825125e-05, 1.04639130e-05, 8.46001694e-06, ...,\n", + " -2.05042506e-05, 7.06381434e-06, 2.37766089e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_643.wav', 'Das lasse ich mir nicht bieten!', 31, array([-8.2775728e-07, -4.0987805e-05, -1.7558119e-05, ...,\n", + " -2.1388867e-06, -4.9800960e-06, -1.3807499e-05], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_665.wav', 'Hallo, ich bin der Neue!', 24, array([-2.4004371e-04, -3.8098267e-04, -3.8909691e-04, ...,\n", + " -3.5481004e-05, 3.5560199e-05, -1.3612277e-05], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_667.wav', 'Fastest du?', 11, array([-6.0218765e-05, -8.1393919e-05, -8.6645297e-05, ...,\n", + " 6.8678496e-06, -8.2385115e-05, -5.4868913e-05], dtype=float32), 1.2072708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_675.wav', 'Nur um das klarzustellen.', 25, array([ 2.7598284e-05, 4.3499585e-05, -7.3542742e-06, ...,\n", + " 4.4517365e-06, -9.3571025e-06, 3.8795395e-05], dtype=float32), 1.8681875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_699.wav', 'Jetzt wird es gemein.', 21, array([ 2.8973442e-05, 5.4584369e-05, 2.5356880e-05, ...,\n", + " 7.6631528e-05, 5.6628844e-05, -4.1394928e-06], dtype=float32), 1.8681875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_704.wav', 'So sieht das aus.', 17, array([7.2620540e-05, 1.0683333e-04, 1.9689680e-04, ..., 
2.9477818e-05,\n", + " 1.5229379e-05, 4.7805424e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_710.wav', 'Gute Nacht ihr Lausbuben!', 25, array([-3.4681521e-04, -4.7425818e-04, -4.6133957e-04, ...,\n", + " 8.0735008e-06, -6.7210376e-06, 6.1622823e-06], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_727.wav', 'Tschüss, Mädels!', 18, array([ 5.8768086e-07, -7.6773445e-05, -4.4017674e-05, ...,\n", + " -7.9999263e-05, 3.1158263e-06, 9.4530027e-05], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_750.wav', 'Geh mir nicht auf den Keks.', 27, array([ 3.7033031e-05, -1.8765691e-05, 3.5605895e-05, ...,\n", + " -4.1894207e-05, -5.0918239e-05, -8.2971856e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_758.wav', \"Dir werd ich's zeigen.\", 22, array([ 5.9986287e-05, 3.1676023e-05, 9.2681257e-05, ...,\n", + " -2.7595996e-05, -4.2494954e-05, -1.1851616e-06], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_764.wav', 'Macht euch bereit!', 18, array([1.5598367e-04, 1.9868747e-04, 1.1692408e-04, ..., 8.2378487e-05,\n", + " 6.5455366e-05, 4.8687412e-05], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_766.wav', 'Da kiekste wa?', 14, array([ 5.4184136e-07, -6.1094812e-05, -6.1461476e-05, ...,\n", + " 9.7159907e-05, 2.3223305e-05, 8.9147768e-05], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_778.wav', 'Das gibt es ja nicht!', 21, array([ 2.0350570e-04, 3.1676778e-04, 2.1080665e-04, ...,\n", + " -6.1200735e-05, 1.1813832e-05, -2.1792879e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_789.wav', 'Das ist nicht mein Problem.', 27, array([-5.5885310e-05, 
-6.4690561e-05, -3.0270432e-05, ...,\n", + " -7.1330876e-05, -1.6931441e-05, -1.1536635e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_23.wav', 'Finde dich damit ab.', 20, array([ 7.2009592e-05, -2.1050539e-05, -8.4551131e-05, ...,\n", + " 5.7306173e-05, 9.7603959e-05, 1.5820342e-04], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_41.wav', 'Wie im Wilden Westen!', 21, array([ 1.4756477e-05, 3.1426986e-05, 9.2355578e-05, ...,\n", + " 8.1666811e-05, 7.9924212e-06, -1.6274511e-05], dtype=float32), 1.9915729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_63.wav', 'Da gehe ich mit.', 16, array([-1.10742374e-04, -1.88132090e-05, 1.54691588e-05, ...,\n", + " 2.89936361e-06, -3.01086147e-05, 3.05973408e-05], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_75.wav', 'Warum nur werktags?', 19, array([-0.00052728, -0.00052381, -0.00042873, ..., -0.00014365,\n", + " -0.00010449, -0.00010741], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_76.wav', 'Geht ihr zur Kommunion?', 23, array([-1.0898075e-04, -9.7388023e-05, -6.8978305e-05, ...,\n", + " -5.0831288e-05, -1.5921889e-05, 6.4072694e-05], dtype=float32), 1.7271979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_80.wav', 'Ihr Blick spricht Bände.', 25, array([-4.6483423e-05, -1.6536529e-04, -9.5357966e-05, ...,\n", + " -8.0715154e-06, -4.8390953e-05, -5.0536739e-05], dtype=float32), 1.6655104166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_85.wav', 'Ruhe in Frieden.', 16, array([ 1.12481954e-04, 1.02392871e-04, 1.89193961e-05, ...,\n", + " -1.02047234e-05, -6.91346722e-05, -7.76782108e-05], dtype=float32), 1.7095729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_97.wav', 'Es wird hart gekämpft.', 23, 
array([-0.0001628 , -0.00018412, -0.00010292, ..., 0.0001769 ,\n", + " 0.00018152, 0.00018817], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_98.wav', 'Warum das alles?', 16, array([-9.8717544e-05, -8.1991704e-05, -1.4659751e-04, ...,\n", + " -6.5778313e-06, -7.7343866e-05, 1.8901783e-05], dtype=float32), 1.3218333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_109.wav', 'Und Action!', 11, array([-2.8484770e-05, 8.8463985e-06, 5.4628901e-05, ...,\n", + " 6.9029898e-05, -7.5049247e-06, 2.7110993e-05], dtype=float32), 1.23371875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_112.wav', 'Bist du dir sicher?', 19, array([ 1.8312603e-05, -8.6757791e-07, -5.3837293e-06, ...,\n", + " 1.1187289e-05, -3.2346459e-05, 9.6363983e-06], dtype=float32), 1.6302708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_113.wav', 'Nur über meine Leiche!', 23, array([ 7.7449629e-05, 1.5036203e-04, 1.0243297e-04, ...,\n", + " -9.4819125e-06, -6.9288013e-05, 2.3950559e-05], dtype=float32), 1.8858229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_120.wav', 'Hoffentlich schafft er das.', 27, array([-1.6298418e-05, 1.6150392e-05, 2.2071041e-04, ...,\n", + " 5.1459443e-05, -2.1589445e-05, 3.2091139e-05], dtype=float32), 1.9210729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_147.wav', 'Komm, spiel mit mir!', 20, array([ 1.9483854e-05, 1.7799211e-06, 3.3775228e-05, ...,\n", + " 2.8417478e-05, -4.2961314e-05, -3.5597783e-05], dtype=float32), 1.9386979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_152.wav', 'Ui ui ui!', 9, array([5.5120941e-05, 5.6017692e-05, 4.3216096e-06, ..., 7.1505703e-05,\n", + " 3.5192006e-05, 7.0440023e-05], dtype=float32), 1.14559375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_157.wav', 'Riech mal!', 10, array([ 1.6765174e-05, 6.2451771e-05, 
1.0707039e-04, ...,\n", + " -7.5908087e-05, -1.0923214e-04, -7.9517071e-05], dtype=float32), 1.03984375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_165.wav', 'Ich war nicht dabei.', 20, array([-9.2572387e-05, -7.4509240e-05, -3.5020537e-05, ...,\n", + " 2.8946462e-05, 6.8536661e-05, 1.4004428e-05], dtype=float32), 1.8065104166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_170.wav', 'Danke für die Einladung.', 25, array([-5.4829288e-05, -5.2409945e-05, -1.6216440e-05, ...,\n", + " 1.8202516e-05, 1.6152997e-05, 7.3245174e-05], dtype=float32), 1.5597708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_197.wav', 'So soll es sein.', 16, array([ 6.0843304e-05, 1.4244186e-05, -1.4521269e-05, ...,\n", + " -1.3551622e-04, -8.4085783e-05, -1.3086156e-04], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_200.wav', 'Erschütternd!', 14, array([-1.85466139e-04, -1.61985561e-04, -1.26282161e-04, ...,\n", + " 6.37752237e-05, 1.00840225e-04, 1.20959485e-04], dtype=float32), 1.1543958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_203.wav', 'Nur das Ã\\x9cbliche.', 17, array([ 7.9542246e-05, 8.5164116e-05, 5.9246326e-05, ...,\n", + " -2.9600615e-05, 4.1036237e-05, 5.5239609e-05], dtype=float32), 1.8153229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_206.wav', 'Die hat nämlich ein Loch.', 26, array([-1.4263311e-05, 3.4131535e-05, -3.4750206e-05, ...,\n", + " -5.7866608e-05, 1.9035106e-05, 3.3172044e-05], dtype=float32), 1.9827604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_230.wav', 'Hol das Stöckchen.', 19, array([-0.00064988, -0.00065917, -0.00059873, ..., 0.00020419,\n", + " 0.00022752, 0.00016691], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_237.wav', 'Und bei dir?', 12, array([-2.9914919e-04, -2.2948935e-04, 
-2.3748397e-04, ...,\n", + " 1.1257434e-05, -3.9087045e-05, -2.3366434e-05], dtype=float32), 1.07509375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_264.wav', 'Es liegt in der Natur der Sache.', 32, array([ 3.1785059e-04, 3.4756004e-04, 3.4774767e-04, ...,\n", + " -3.1788899e-05, -7.7856974e-05, -7.3492403e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_268.wav', 'Mission erfolgreich!', 20, array([-5.1757845e-05, -2.9873547e-05, -5.2602922e-05, ...,\n", + " -1.0881226e-04, -7.0386566e-05, -4.1912252e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_274.wav', 'Kommt nicht in die Tüte!', 25, array([-2.6346192e-05, -6.4550313e-06, -4.2296477e-05, ...,\n", + " 6.7257854e-05, 5.5296507e-05, 6.6974962e-06], dtype=float32), 1.8505729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_284.wav', 'Ja, guten Tag!', 14, array([ 3.1975062e-05, 7.6259523e-05, 7.8669080e-05, ...,\n", + " -1.8048113e-05, -4.4206077e-05, -4.7247828e-05], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_308.wav', 'Es ist noch nicht lange her.', 28, array([ 2.2859822e-06, 6.0211198e-05, 5.7821064e-05, ...,\n", + " -8.3175619e-06, -2.3456680e-05, -1.9626390e-05], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_316.wav', 'Wiedersehen!', 12, array([2.8599703e-05, 6.1528997e-05, 8.9646070e-05, ..., 2.7208553e-06,\n", + " 2.9898734e-05, 9.2172457e-05], dtype=float32), 1.12796875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_328.wav', 'Mir ist schwindelig.', 20, array([ 2.4521294e-05, 5.4549360e-05, 2.9534258e-06, ...,\n", + " -8.9185494e-05, -1.0303867e-04, -5.3436386e-05], dtype=float32), 1.7976979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_329.wav', 'Sprechen sie deutsch?', 21, array([-2.4279220e-04, 
-2.6937225e-04, -2.3713916e-04, ...,\n", + " -2.8695989e-05, -2.7513888e-06, 5.1191882e-06], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_360.wav', 'So war es nicht gemeint.', 24, array([-5.8561371e-05, 8.4504954e-06, 3.6038864e-06, ...,\n", + " 9.6144824e-05, 5.4328477e-05, 8.8002511e-05], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_399.wav', 'Schluss jetzt!', 14, array([ 1.60011361e-04, 1.10784895e-04, 1.05728453e-04, ...,\n", + " 1.56215738e-05, -7.51677726e-06, 3.21154062e-06], dtype=float32), 1.1940625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_414.wav', 'Sehen Sie genau hin!', 20, array([ 4.0775692e-05, 7.8341225e-05, 5.9709568e-05, ...,\n", + " 1.6227934e-05, 3.3044285e-05, -1.1752409e-06], dtype=float32), 1.7448229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_432.wav', 'Christina Habeck?', 17, array([-7.0921145e-05, -8.7887020e-05, -1.0741340e-04, ...,\n", + " 6.9928697e-05, 6.0020051e-05, 4.4092048e-05], dtype=float32), 1.6831354166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_491.wav', 'Olé, olé!', 11, array([-3.5300669e-05, -3.0546897e-05, -4.6127847e-05, ...,\n", + " -4.5910983e-06, 9.3032322e-06, 4.1992083e-05], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_498.wav', 'Nur mal so als Anregung.', 24, array([-5.8754493e-05, -2.6690983e-05, -4.8782116e-05, ...,\n", + " -4.1356816e-05, -3.8702921e-05, -2.8129245e-05], dtype=float32), 1.929875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_503.wav', 'Ich glaube ihr kein Wort.', 25, array([-1.92081643e-06, -2.77346317e-05, -5.22437476e-05, ...,\n", + " 6.71621965e-05, 1.27864005e-05, 3.48269168e-05], dtype=float32), 1.9915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_537.wav', 'Wie könnt ihr es wagen?', 24, array([-1.4561453e-03, 
-1.4608348e-03, -1.4617005e-03, ...,\n", + " 7.5047151e-06, -8.1957251e-07, 1.6147833e-05], dtype=float32), 1.8417604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_540.wav', 'Nach was schmeckt das genau?', 28, array([5.2316565e-05, 4.9443977e-05, 5.7626901e-05, ..., 2.5021756e-05,\n", + " 4.5578519e-05, 5.3426527e-05], dtype=float32), 1.9651354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_548.wav', 'Gänsehaut pur!', 15, array([-9.5325144e-05, -7.7983823e-05, -6.6722314e-05, ...,\n", + " 5.7276593e-05, 2.5111651e-05, 1.1992834e-05], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_564.wav', 'Höret, höret!', 15, array([-6.9055131e-05, -6.1163970e-05, -7.0053116e-05, ...,\n", + " -1.7221355e-05, -7.2541329e-06, 1.8846076e-06], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_574.wav', 'Das Haus ist umstellt.', 22, array([ 4.3151813e-05, 5.5632776e-05, 2.7663889e-05, ...,\n", + " -4.0600127e-05, -3.0027895e-05, -4.6370071e-05], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_606.wav', 'Den versteht keiner.', 20, array([-6.2417603e-05, -8.2428480e-05, -4.4267428e-05, ...,\n", + " -6.2675332e-05, -4.0452942e-05, -5.3965356e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_612.wav', 'Halten Sie sich fest!', 21, array([2.8007184e-05, 3.2632157e-05, 6.2635645e-06, ..., 5.3581707e-06,\n", + " 1.5780075e-05, 2.3362747e-06], dtype=float32), 1.6390729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_640.wav', 'Können Sie sich ausweisen?', 27, array([-4.1133004e-05, -3.4346365e-05, -2.0997140e-06, ...,\n", + " 2.5395755e-05, 1.5488129e-05, 1.3214269e-05], dtype=float32), 1.9298854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_645.wav', 'Genug ist genug.', 16, 
array([1.4217473e-04, 1.3088981e-04, 1.2007774e-04, ..., 8.0914921e-05,\n", + " 5.1820301e-05, 7.9144287e-05], dtype=float32), 1.7448229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_647.wav', 'Da bin ich ganz bei Ihnen!', 26, array([-6.2454426e-05, -7.3873220e-05, -9.7365184e-05, ...,\n", + " 1.7943923e-05, 1.8189858e-05, 2.0363577e-05], dtype=float32), 1.7183854166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_666.wav', 'Ich hasse dich!', 15, array([-4.7738231e-06, 1.0362664e-06, 9.6731110e-06, ...,\n", + " 3.2887896e-05, 6.7240894e-06, 7.3296378e-06], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_676.wav', 'Jetzt weiÃ\\x9f ich es wieder.', 26, array([-2.9731807e-05, -2.5498804e-05, -5.7221558e-05, ...,\n", + " -1.3199271e-05, -1.1122796e-05, -1.5994978e-05], dtype=float32), 1.9915729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_708.wav', 'Täuschkörper einsetzen!', 25, array([3.3980694e-05, 5.6047942e-05, 3.6845995e-05, ..., 2.0433601e-05,\n", + " 5.5359560e-05, 3.6635800e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_711.wav', 'So sind die Regeln.', 19, array([ 1.0646171e-05, 2.1217951e-05, -8.0062582e-06, ...,\n", + " -4.2156036e-05, -1.8816583e-05, -4.4005763e-05], dtype=float32), 1.6038229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_712.wav', 'Es schmeckt nach Zimt.', 22, array([ 2.2929296e-05, 2.9111379e-05, 4.6064979e-05, ...,\n", + " -1.8768259e-06, 7.4329464e-06, 1.2982395e-05], dtype=float32), 1.6831354166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_717.wav', 'Auch bei feuchtem Wetter nicht.', 31, array([1.6887316e-05, 6.2355371e-05, 7.5977659e-05, ..., 1.6490449e-05,\n", + " 2.1054177e-05, 1.1164552e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_731.wav', 
'Warum denn nicht?', 17, array([ 6.4304750e-06, -6.7788221e-07, -1.0204109e-06, ...,\n", + " -9.7024295e-06, -3.1934254e-05, -2.7286467e-05], dtype=float32), 1.25134375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_746.wav', 'Was isst du da?', 15, array([ 4.1260464e-05, 1.0193682e-05, 3.5085955e-05, ...,\n", + " -3.5494733e-05, -1.2306450e-05, 1.2647797e-05], dtype=float32), 1.6919479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_750.wav', 'Alle schreien hier!', 19, array([-1.3079788e-04, -1.3171590e-04, -1.1580650e-04, ...,\n", + " -2.0512020e-05, -2.3779969e-05, -2.4454272e-05], dtype=float32), 1.7007708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_754.wav', 'Das ist genau mein Ding.', 24, array([-1.1629934e-05, -2.1403244e-05, 1.6778110e-06, ...,\n", + " 1.0532378e-05, 4.3498221e-05, 4.0848565e-05], dtype=float32), 1.6390729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_757.wav', 'Wo denken Sie hin?', 18, array([ 2.1430247e-05, 2.1772265e-05, 2.0838190e-05, ...,\n", + " 2.2910473e-05, -5.1848092e-06, -1.5559262e-06], dtype=float32), 1.4540208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_758.wav', 'Reine Gewöhnungssache.', 23, array([-4.3785589e-05, -4.8620215e-05, -4.8604503e-05, ...,\n", + " 1.0856102e-05, 7.9429465e-06, 6.5844351e-06], dtype=float32), 1.6126458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_760.wav', 'Tschüss!', 9, array([1.6893557e-05, 3.7733011e-05, 4.6923491e-05, ..., 3.5450230e-05,\n", + " 5.7595411e-05, 5.0426086e-05], dtype=float32), 0.6873541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_765.wav', 'Vergiss die Waschtasche nicht!', 30, array([-5.2931227e-05, -5.9350517e-05, -5.4635959e-05, ...,\n", + " -3.9712177e-05, -3.0881067e-05, -1.9957897e-05], dtype=float32), 1.929875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_788.wav', 
'Längs oder quer?', 17, array([-5.8456011e-05, -4.5964895e-05, -2.6546955e-05, ...,\n", + " 1.1356072e-05, 1.8672996e-05, -7.0059104e-07], dtype=float32), 1.5597708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_14.wav', 'Wer hat euch geschickt?', 23, array([-1.1148760e-04, 2.4612555e-05, 9.3476447e-05, ...,\n", + " -9.7927412e-05, -3.4095574e-05, -1.7279797e-05], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_34.wav', 'Wo bin ich hier nur gelandet?', 29, array([-1.3307537e-05, -1.0089541e-04, -1.2360289e-05, ...,\n", + " -4.9649680e-05, -7.3272109e-05, -6.8251233e-05], dtype=float32), 1.9306666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_40.wav', 'Natürlich behauptet sie das.', 29, array([ 1.2778574e-04, 5.9959311e-05, -8.1008322e-05, ...,\n", + " 1.9905625e-04, 2.6344018e-05, 1.1490170e-04], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_43.wav', 'Du hattest recht.', 17, array([-1.1000242e-04, -1.6242996e-04, -2.2294538e-04, ...,\n", + " 1.1730633e-04, -8.3676481e-05, -2.5764350e-05], dtype=float32), 1.152)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_44.wav', 'Verklagen Sie mich doch!', 24, array([ 1.94306958e-05, 1.91541476e-04, 6.15894969e-05, ...,\n", + " -1.00529454e-04, -2.00755429e-04, 5.24241113e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_45.wav', 'Die Bremse schleift.', 20, array([ 1.8599353e-04, 8.8273533e-05, 1.5005667e-04, ...,\n", + " -1.6525917e-04, -2.2365544e-05, -2.3978014e-04], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_51.wav', 'Hilfe!', 6, array([-1.7958642e-04, -2.2338594e-04, -2.7969983e-04, ...,\n", + " -1.4840752e-04, -3.4539087e-05, 3.2946355e-06], dtype=float32), 0.704)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_57.wav', 'Jetzt liegt es an dir.', 
22, array([ 2.1328227e-04, 8.1810067e-05, -1.6158322e-04, ...,\n", + " 1.6350237e-04, 1.0099774e-04, 1.6040609e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_58.wav', 'Wo kann ich das kaufen?', 23, array([-9.1674337e-05, -1.6169342e-04, -1.8347435e-04, ...,\n", + " 4.6268760e-06, 2.3974455e-05, -1.1637783e-04], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_60.wav', 'Kann man jetzt auch nicht mehr ändern.', 39, array([-3.5826775e-04, -3.3033665e-04, -2.3628448e-04, ...,\n", + " -1.9967039e-04, -1.7616056e-05, 6.7053217e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_68.wav', 'Hör mir doch mal zu.', 21, array([-1.0109342e-04, -3.4855773e-06, 9.0611480e-05, ...,\n", + " -1.0345047e-04, -4.0894301e-05, -6.3259591e-05], dtype=float32), 1.4613333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_75.wav', 'Gibt es die Person wirklich?', 28, array([1.8891362e-04, 2.3809298e-04, 1.1160582e-04, ..., 2.3936841e-06,\n", + " 4.5461587e-05, 9.1474227e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_79.wav', 'Wo waren wir stehen geblieben?', 30, array([-6.7620305e-05, 3.2152042e-05, 6.8106332e-05, ...,\n", + " -1.8769420e-04, -6.5137865e-05, -2.5653889e-04], dtype=float32), 1.824)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_91.wav', 'Grundgütiger!', 14, array([ 7.70497209e-05, -5.13312625e-05, 7.22193681e-06, ...,\n", + " -1.11605725e-04, -1.26782295e-04, 8.50337819e-05], dtype=float32), 1.3546666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_92.wav', 'Wer bist du?', 12, array([-4.3348764e-04, -4.4667200e-04, -4.2408684e-04, ...,\n", + " -3.9185648e-05, -3.1797776e-05, -2.2222506e-04], dtype=float32), 1.024)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_95.wav', 'Schon gut.', 10, array([-3.07407812e-04, 
-4.31929773e-04, -5.19388705e-04, ...,\n", + " -1.07154076e-04, -7.57433227e-05, -1.24133236e-04], dtype=float32), 0.9173333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_99.wav', 'Murat, was ist los mit dir?', 27, array([-3.84323685e-05, 6.48807691e-05, -5.84455011e-05, ...,\n", + " 1.45171012e-04, -1.50349506e-05, 1.20676006e-04], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_101.wav', 'HeiÃ\\x9fe Würstchen!', 18, array([-0.00027939, -0.00039175, -0.00025548, ..., 0.00027689,\n", + " 0.00011903, 0.00012768], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_106.wav', 'Ich will auch mal einer werden.', 31, array([ 1.36086979e-04, -1.76298781e-05, -4.00176577e-05, ...,\n", + " 1.72844579e-04, 1.29597363e-04, -1.02162725e-04], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_112.wav', 'Ich will auch haben!', 20, array([-4.40885342e-05, -2.34828622e-04, -3.29593284e-04, ...,\n", + " -3.05666414e-04, -1.31685141e-04, -1.00833015e-04], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_116.wav', 'Setz dich bitte gerade hin!', 27, array([-2.2211492e-04, -2.0630175e-04, -1.4655131e-04, ...,\n", + " 1.6456892e-04, 1.0634777e-06, -1.4669505e-04], dtype=float32), 1.9306666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_122.wav', 'Findest du mich erwachsen?', 26, array([3.0208268e-04, 3.6579225e-04, 3.3154435e-04, ..., 6.2579543e-06,\n", + " 4.9250040e-05, 1.8107957e-04], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_123.wav', 'Schrei nicht so!', 16, array([ 8.03208750e-05, 1.33657450e-04, -1.13144284e-04, ...,\n", + " 4.64295183e-04, 4.82034549e-04, 2.86602415e-04], dtype=float32), 1.152)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_136.wav', 'Das kam 
unerwartet.', 19, array([-3.3067852e-05, -4.8878199e-05, 5.8831414e-05, ...,\n", + " -3.5621467e-04, -3.7723745e-04, -2.3875662e-04], dtype=float32), 1.7386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_149.wav', 'Das ergibt doch keinen Sinn.', 28, array([6.0471892e-05, 8.1125305e-05, 2.7437322e-04, ..., 9.1583250e-05,\n", + " 2.0055164e-04, 2.2477485e-04], dtype=float32), 1.9733333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_155.wav', 'Aller Abschied fällt schwer.', 29, array([-2.2813781e-04, -5.5478893e-05, 1.6814301e-04, ...,\n", + " 1.2765558e-04, 1.7368943e-04, 2.6105065e-04], dtype=float32), 1.6533333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_165.wav', 'Erkennst du mich nicht?', 23, array([-2.3624673e-04, -3.1934463e-04, -2.9434697e-04, ...,\n", + " 1.7059442e-04, 1.9742029e-06, 1.3172596e-04], dtype=float32), 1.4293333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_169.wav', 'Willst du sie mal streicheln?', 29, array([ 1.9991475e-04, 3.4090909e-04, 3.2008073e-04, ...,\n", + " 4.6425943e-05, -8.5656990e-05, -1.2934266e-05], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_181.wav', 'Zur Anmeldung klicken Sie hier.', 31, array([ 5.3989668e-05, -9.8630007e-05, -1.1361165e-04, ...,\n", + " -2.2555150e-05, 3.3015600e-05, 1.0129590e-04], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_194.wav', 'Elvis war nie tot.', 18, array([-6.78355209e-05, -5.90024465e-05, -1.47034181e-04, ...,\n", + " 1.19253775e-04, 2.40493591e-05, 3.28276219e-04], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_196.wav', 'Irgendetwas zu verzollen?', 25, array([-1.2399687e-04, -3.0497483e-06, -1.2210968e-04, ...,\n", + " 1.4703360e-05, 4.4073422e-05, 2.5880148e-04], dtype=float32), 1.696)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_216.wav', 'Du bist doch nicht aus Zucker.', 30, array([-3.7417009e-05, -2.1370529e-04, -1.0503333e-04, ...,\n", + " -3.4687804e-05, -1.0006884e-04, 8.2270970e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_217.wav', 'Bald hat er sein Abi.', 21, array([-7.6955817e-05, -7.4724245e-05, -5.4779473e-05, ...,\n", + " -3.2609492e-05, -1.9532166e-04, -4.0988740e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_227.wav', 'Da lacht das Herz.', 18, array([0.000232 , 0.00019664, 0.00015979, ..., 0.00012966, 0.0001156 ,\n", + " 0.00015061], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_238.wav', 'Steht mir die Bluse?', 20, array([ 5.00293754e-05, 1.15090246e-04, -1.61606382e-04, ...,\n", + " -1.10758898e-04, 9.87306703e-05, 2.25929121e-04], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_240.wav', 'Kommt ihr zurecht?', 18, array([-1.4166623e-04, -1.7185905e-04, -1.0146119e-04, ...,\n", + " -1.9281202e-05, -4.6475827e-05, -7.9622550e-05], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_246.wav', 'Her damit!', 10, array([-1.0743736e-04, -6.3287393e-05, 5.4618115e-05, ...,\n", + " 1.7166793e-04, 1.5052129e-04, -4.3305259e-05], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_256.wav', 'Talente muss man fördern.', 26, array([ 2.9789119e-06, 2.0445570e-05, 3.6582744e-05, ...,\n", + " -8.0595542e-05, 2.8049317e-06, -2.4196431e-04], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_298.wav', 'Kein Kommentar!', 15, array([2.0757825e-04, 2.0225085e-05, 1.0584419e-04, ..., 2.2611262e-05,\n", + " 2.2597586e-04, 5.2457988e-05], dtype=float32), 1.1093333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_304.wav', 'Der atmet noch.', 15, array([-0.0001642 , -0.00022683, -0.00021831, ..., 0.00013961,\n", + " 0.00017319, 0.00013602], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_308.wav', 'Das dauert aber lange!', 22, array([4.1067542e-05, 4.3461972e-05, 1.7915755e-04, ..., 1.1849359e-04,\n", + " 1.6261388e-04, 1.4937650e-05], dtype=float32), 1.44)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_311.wav', 'Du kennst mich, Danton.', 23, array([-5.2089547e-04, -4.7035489e-04, -5.9835758e-04, ...,\n", + " -9.4374191e-05, -2.0053205e-05, 1.2992002e-06], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_314.wav', 'Mein Gott, Walter!', 18, array([ 4.9858125e-05, -2.4514409e-05, -4.7797763e-05, ...,\n", + " -2.9001143e-05, -1.4190034e-04, -2.5762929e-05], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_318.wav', 'Und was machst du sonst so?', 27, array([ 0.00041733, 0.00037329, 0.00035271, ..., -0.00016106,\n", + " -0.00041058, -0.00029774], dtype=float32), 1.6106666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_331.wav', 'Dort wird dir geholfen.', 23, array([-1.9671346e-04, -1.1574107e-04, 5.4965103e-06, ...,\n", + " 4.3039094e-05, -3.2543256e-05, -7.8007070e-05], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_336.wav', 'Was ist denn hier los?', 22, array([0.00012079, 0.00029083, 0.00013022, ..., 0.00036718, 0.00031168,\n", + " 0.00049887], dtype=float32), 1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_358.wav', 'Gleich sind wir dort.', 21, array([ 1.5992192e-04, 2.5509403e-04, 2.3052108e-04, ...,\n", + " 1.9194868e-04, 6.2326435e-05, -2.0080882e-04], dtype=float32), 1.7706666666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_366.wav', 'Sind sie gut informiert?', 24, array([-1.2915327e-04, 5.4154119e-05, 9.4311297e-05, ...,\n", + " 1.4842945e-04, 1.6595995e-04, 1.6055972e-04], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_376.wav', \"Was soll's, ich bin bereit.\", 27, array([-0.00025371, -0.00037118, -0.00054651, ..., -0.00013142,\n", + " 0.000133 , 0.0001903 ], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_387.wav', 'Was soll das heiÃ\\x9fen?', 21, array([ 6.26799228e-05, -1.15550021e-04, -1.60253039e-04, ...,\n", + " -1.14853225e-04, 3.62789683e-06, -1.25641367e-04], dtype=float32), 1.6106666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_398.wav', 'Oder so!', 8, array([-0.00011172, -0.00021632, -0.0003379 , ..., 0.00016637,\n", + " 0.00021105, 0.00035037], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_431.wav', 'Fauche mich nicht so an!', 24, array([-1.69856430e-04, -2.14659201e-04, -1.17017007e-04, ...,\n", + " 1.06098436e-04, 1.30685687e-04, 8.11223654e-05], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_439.wav', 'Genau zweihundert.', 18, array([ 4.3691549e-04, 4.2721629e-04, 2.1283170e-04, ...,\n", + " -1.0831581e-05, 6.4474931e-05, 1.3399551e-04], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_446.wav', 'Ja ja, das schickt!', 19, array([-1.5079082e-05, 1.2119063e-04, 1.9518439e-04, ...,\n", + " -8.6470172e-05, -3.4930470e-04, -3.7717246e-04], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_460.wav', 'Stein schlägt Schere.', 22, array([ 5.7708825e-05, 1.6740670e-04, 1.9982990e-04, ...,\n", + " -3.3077580e-05, 1.1591193e-04, 7.5874494e-05], dtype=float32), 1.936)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_468.wav', 'Simsalabim!', 11, array([-1.8192175e-05, -1.2427589e-04, 4.0916457e-05, ...,\n", + " -3.6532696e-05, 2.9238325e-05, 2.0148496e-05], dtype=float32), 1.0506666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_478.wav', 'Bitte Zutreffendes ankreuzen.', 29, array([-5.4858734e-05, -6.8480607e-05, -7.1117909e-05, ...,\n", + " -3.5092820e-05, 4.6205354e-05, 3.1237360e-05], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_480.wav', 'Dich kenne ich doch!', 20, array([-3.4106572e-04, -2.6489299e-04, -1.9887066e-04, ...,\n", + " 5.8086891e-05, 2.0823347e-04, -4.3870667e-05], dtype=float32), 1.4026666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_484.wav', 'Und los!', 8, array([ 2.0759732e-04, 2.4903464e-04, -3.9741102e-05, ...,\n", + " -1.4017121e-04, -2.2582384e-04, -2.2852831e-04], dtype=float32), 0.8906666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_485.wav', 'Der Patient ist eh schon tot.', 29, array([ 2.8383749e-04, 1.6098749e-04, 5.8996215e-05, ...,\n", + " -1.5776475e-04, -1.0137054e-04, -1.0374457e-04], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_487.wav', 'Und zwar nicht zu knapp!', 24, array([-4.9983555e-05, 1.0859955e-04, 1.3262806e-04, ...,\n", + " 1.4716771e-04, 2.1034098e-04, 2.6678585e-04], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_488.wav', 'Was ist mit dem Co-Piloten?', 27, array([-4.6707326e-04, -3.3664281e-04, -1.6913723e-04, ...,\n", + " 9.7057833e-05, -3.0600113e-05, -3.3933247e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_510.wav', 'Sie würde ihr letztes Hemd geben.', 34, array([ 1.5112071e-04, 9.9046929e-06, -7.1756775e-05, ...,\n", + " 1.4958363e-04, 2.2523174e-04, 4.5510088e-04], dtype=float32), 1.92)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_521.wav', 'Das wird eh nur Werbung sein.', 29, array([-0.00043494, -0.00045403, -0.00052693, ..., -0.00037776,\n", + " -0.00013905, -0.00029146], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_539.wav', 'Jetzt gibt es Kloppe.', 21, array([ 5.6757370e-05, 1.2752461e-05, -1.0132902e-04, ...,\n", + " -2.8363563e-04, -4.8957689e-04, -4.9631519e-04], dtype=float32), 1.4666666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_549.wav', 'Nee, lieber nicht.', 18, array([-6.2041539e-03, -6.1025852e-03, -5.7721483e-03, ...,\n", + " -4.7201215e-06, -8.9430447e-05, -4.9632461e-05], dtype=float32), 1.5626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_563.wav', 'Er soll schlieÃ\\x9flich etwas lernen.', 34, array([-5.03349729e-05, -2.22053477e-05, 5.14282438e-05, ...,\n", + " 1.08890556e-04, 3.83222614e-05, 6.10036659e-05], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_566.wav', 'Angeblich ja.', 13, array([ 1.7242544e-04, 1.8572621e-04, 1.3631192e-04, ...,\n", + " -4.0973751e-05, -1.5965881e-04, -1.0953719e-04], dtype=float32), 1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_567.wav', 'Wie wäre es mit Wiesbaden?', 27, array([-9.5517004e-05, -2.3826263e-04, -1.0132407e-04, ...,\n", + " 4.5667308e-05, 1.4000830e-04, 2.1524900e-05], dtype=float32), 1.9093333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_576.wav', 'Hört mal zu, ihr Checker!', 26, array([-0.00049925, -0.00049119, -0.00044878, ..., 0.00019171,\n", + " 0.00023476, 0.00022403], dtype=float32), 1.7013333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_580.wav', \"Irgendwann wird's langweilig.\", 29, array([-0.00039041, -0.00038523, -0.00025343, ..., -0.00031044,\n", + " -0.00019142, -0.00014154], dtype=float32), 
1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_586.wav', 'Spuck ihn wieder aus!', 21, array([ 0.00012375, 0.00025117, 0.0001871 , ..., -0.00021903,\n", + " -0.00034992, -0.00024192], dtype=float32), 1.712)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_589.wav', 'Unterschätze den Knirps nicht.', 31, array([2.5606243e-04, 2.5400775e-04, 2.3841709e-04, ..., 2.1033855e-05,\n", + " 1.9420990e-04, 1.0694992e-04], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_596.wav', 'Darf ich vorkosten?', 19, array([-1.3477511e-04, -2.3315112e-04, 1.3153857e-05, ...,\n", + " 1.0751128e-04, 1.8084023e-04, 1.6106233e-04], dtype=float32), 1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_597.wav', 'Ich traue mich nicht!', 21, array([-2.9329595e-04, -3.9892262e-04, -2.9478277e-04, ...,\n", + " -1.0763263e-04, 1.1553553e-04, 7.1091476e-05], dtype=float32), 1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_602.wav', 'Warum seid ihr so leise?', 24, array([ 2.9226076e-05, 1.6949150e-04, 1.3950269e-04, ...,\n", + " 2.4965027e-05, 7.3044146e-05, -1.8916466e-05], dtype=float32), 1.5786666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_603.wav', 'Nun stellt euch nicht so an!', 28, array([1.4806543e-04, 1.4012858e-04, 7.7195640e-05, ..., 1.4235765e-04,\n", + " 1.3738184e-04, 1.3289873e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_608.wav', 'Das Essen wird kalt.', 20, array([ 2.36780070e-05, -1.06394495e-04, -1.18256241e-04, ...,\n", + " 8.05624004e-05, -4.60968913e-05, -8.52375670e-05], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_614.wav', 'Fachidioten soll es auch geben.', 31, array([ 7.9924423e-05, 2.0709680e-04, -6.6771558e-05, ...,\n", + " 2.4189356e-05, 6.7659719e-05, -2.3424522e-05], 
dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_627.wav', 'Du bist vielleicht eine Knalltüte!', 35, array([ 1.7171216e-04, -3.8676033e-05, -8.2237340e-05, ...,\n", + " -1.8530877e-04, -1.3380373e-04, -1.6169780e-04], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_629.wav', 'Natürlich nicht seine eigene.', 30, array([-2.2751655e-04, -1.5005520e-04, -9.8528086e-05, ...,\n", + " 1.8771169e-04, 2.7484499e-04, 3.0332521e-04], dtype=float32), 1.8026666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_630.wav', 'Halten Sie die Presse zurück!', 30, array([ 3.1129293e-06, 7.3669260e-05, 3.3459681e-05, ...,\n", + " -1.5276406e-04, 2.6472675e-05, -1.9852230e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_636.wav', 'Ruf schnell die Polizei!', 24, array([5.1400399e-05, 6.7014749e-05, 5.1501669e-05, ..., 1.8976731e-04,\n", + " 2.0147586e-04, 1.5075490e-04], dtype=float32), 1.5573333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_637.wav', 'Dann nimmt man sie sich.', 24, array([-0.00050762, -0.00047607, -0.00053025, ..., 0.00035113,\n", + " 0.00017673, 0.00026363], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_674.wav', 'Gibst du mir deine Nummer?', 26, array([-1.0660516e-04, -1.8238377e-05, 9.7913333e-05, ...,\n", + " 3.0329258e-05, 9.0803427e-05, 2.0600615e-05], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_675.wav', 'Man kann nicht alles haben.', 27, array([ 3.6246947e-04, 3.3836463e-04, 3.9515106e-04, ...,\n", + " 1.9603693e-05, -1.0797187e-07, 4.7195343e-05], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_686.wav', 'Wie oft denn noch?', 18, array([-0.00025807, -0.00045327, -0.00041516, ..., -0.00053778,\n", + " -0.00065512, -0.00057833], dtype=float32), 
1.2906666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_703.wav', 'Der Erste in was?', 17, array([3.7513164e-05, 2.3692524e-05, 9.2795723e-05, ..., 1.8559145e-04,\n", + " 8.4898209e-05, 1.3820640e-05], dtype=float32), 1.4323645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_720.wav', 'Wie denn nun?', 13, array([-7.8975081e-06, -2.1718148e-05, 2.7641279e-05, ...,\n", + " 3.3564411e-05, 3.3564411e-05, 1.9743769e-05], dtype=float32), 0.9525625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_722.wav', 'Ihm wäre das zu müÃ\\x9fig.', 25, array([ 5.1333802e-05, 6.3180065e-05, -1.3820640e-05, ...,\n", + " -1.9743769e-05, 3.9487541e-06, -4.7385049e-05], dtype=float32), 1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_724.wav', 'Ã\\x96l ist ausgelaufen.', 20, array([-3.7513164e-05, -7.8975081e-06, -1.5795016e-05, ...,\n", + " -1.3820640e-05, -1.3820640e-05, 4.5410670e-05], dtype=float32), 1.6087708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_730.wav', 'Willkommen im Neuland!', 22, array([-6.910320e-05, -6.515444e-05, 1.382064e-05, ..., -3.356441e-05,\n", + " -1.974377e-06, 8.489821e-05], dtype=float32), 1.6652083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_737.wav', 'Kannst du mich mal zwicken?', 27, array([ 3.9487539e-05, 3.9487541e-06, 3.3564411e-05, ...,\n", + " -1.3820640e-05, -3.1590032e-05, 5.9231312e-05], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_752.wav', 'Friede sei mit dir.', 19, array([-0.00018362, -0.00025075, -0.00027839, ..., -0.00025864,\n", + " -0.0002389 , -0.00026457], dtype=float32), 1.2347916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_753.wav', 'Mit Speck fängt man Mäuse.', 28, array([-1.61898919e-04, -1.04641986e-04, -8.68725911e-05, ...,\n", + " -5.92313118e-05, 6.31800649e-05, 7.70007027e-05], dtype=float32), 
1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_766.wav', 'Bin ich die Auskunft oder was?', 30, array([2.96156559e-05, 1.04641986e-04, 1.26360130e-04, ...,\n", + " 2.46797135e-04, 2.94182188e-04, 3.25772213e-04], dtype=float32), 1.99684375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_767.wav', 'Sesam, öffne dich!', 19, array([-3.8500351e-04, -3.3366971e-04, -3.5933661e-04, ...,\n", + " -5.9231312e-05, -2.3692524e-05, 2.9615656e-05], dtype=float32), 1.4253125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_772.wav', 'Er kennt seine Pappenheimer.', 28, array([-3.7513164e-05, -1.9743769e-05, -1.3820640e-05, ...,\n", + " -8.6872591e-05, -1.5202703e-04, -1.7177081e-04], dtype=float32), 1.7146145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_792.wav', 'Da geht noch was.', 17, array([ 2.0336083e-04, 1.6979642e-04, 1.6189892e-04, ...,\n", + " -4.9359427e-05, -2.9615656e-05, -7.3051953e-05], dtype=float32), 1.25596875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_803.wav', 'Er macht es eben gründlich.', 28, array([-5.5282559e-05, -8.2923834e-05, 1.9743769e-05, ...,\n", + " -9.4770097e-05, -1.8361707e-04, -2.5469463e-04], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_816.wav', 'Spionierst du mich aus?', 23, array([3.5538786e-04, 4.5015797e-04, 4.8767112e-04, ..., 4.3436296e-05,\n", + " 1.7769393e-04, 1.7769393e-04], dtype=float32), 1.7992708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_817.wav', 'Komm zurück!', 13, array([4.0672167e-04, 2.2902773e-04, 6.3180065e-05, ..., 3.7513164e-05,\n", + " 4.7385049e-05, 6.3180065e-05], dtype=float32), 1.11484375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_819.wav', 'Sie schwebt auf Wolke sieben.', 29, array([7.5026328e-05, 1.2438576e-04, 1.5005266e-04, ..., 1.1056512e-04,\n", + " 1.4215514e-04, 1.3820639e-04], dtype=float32), 
1.9756770833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_826.wav', 'Wehret den Anfängen!', 21, array([ 1.4610391e-04, 1.3425764e-04, 1.2636013e-04, ...,\n", + " -5.9231311e-06, -1.5795016e-05, -2.9615656e-05], dtype=float32), 1.8486666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_834.wav', 'Altes Haus, lass dich drücken!', 31, array([1.75719557e-04, 1.63873294e-04, 8.88469658e-05, ...,\n", + " 1.04641986e-04, 2.15207096e-04, 1.46103906e-04], dtype=float32), 1.9333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_837.wav', 'Nicht nötig.', 13, array([-1.6189892e-04, -7.7000703e-05, -5.7256933e-05, ...,\n", + " 3.5538786e-05, 4.5410670e-05, 1.9743769e-05], dtype=float32), 1.2277395833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_838.wav', 'Wir sind eine Familie.', 22, array([-1.2241138e-04, -1.5992454e-04, -2.3100211e-04, ...,\n", + " 7.3051953e-05, 5.9231312e-05, 6.9103196e-05], dtype=float32), 1.7146041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_842.wav', 'Was schlagen Sie vor?', 21, array([ 3.1590032e-05, 3.5538786e-05, 4.9359427e-05, ...,\n", + " -8.6872591e-05, -6.1205690e-05, -1.2438576e-04], dtype=float32), 1.3406458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_844.wav', 'Probier mal!', 12, array([ 1.4018077e-04, 1.6782204e-04, 2.2902773e-04, ...,\n", + " -2.1718148e-05, 4.9359427e-05, 7.3051953e-05], dtype=float32), 1.0583958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_855.wav', 'Der Schein trügt.', 18, array([ 1.3228325e-04, 4.3436296e-05, 9.8718847e-06, ...,\n", + " 7.5026328e-05, 7.8975081e-06, -3.9487541e-06], dtype=float32), 1.45353125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_861.wav', 'Du hast mich nie geliebt.', 25, array([ 1.02667604e-04, 1.57950155e-04, 1.50052656e-04, ...,\n", + " -2.17181478e-05, 2.76412793e-05, 0.00000000e+00], 
dtype=float32), 1.7146041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_872.wav', 'Chili ist scharf.', 17, array([-1.1253949e-04, -8.6872591e-05, -1.1648824e-04, ...,\n", + " -1.1846262e-04, -2.5666901e-05, 1.9743770e-06], dtype=float32), 1.7710520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_874.wav', 'Das lässt mich kalt.', 21, array([ 2.1718148e-05, 3.3564411e-05, 5.3308180e-05, ...,\n", + " -1.1846262e-05, -1.9743769e-05, -7.3051953e-05], dtype=float32), 1.5805416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_881.wav', 'Kinder brauchen Helden.', 23, array([-1.8361707e-04, -1.4610391e-04, -1.1846262e-04, ...,\n", + " -1.9743770e-06, -2.7641279e-05, 5.9231312e-05], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_3.wav', 'Voll der gute Vergleich!', 24, array([-3.94875406e-06, -1.08590735e-04, -1.40180768e-04, ...,\n", + " 3.94875387e-05, 1.12539492e-04, 1.16488241e-04], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_14.wav', 'Gibt es das überhaupt?', 23, array([-1.0069323e-04, -1.5202703e-04, -1.8164268e-04, ...,\n", + " -6.9103196e-05, -3.9487539e-05, -6.5154440e-05], dtype=float32), 1.5523125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_35.wav', 'Bleib wachsam.', 14, array([-1.5597578e-04, -1.4807828e-04, -3.1590032e-05, ...,\n", + " -1.9743770e-06, -5.9231311e-06, 4.5410670e-05], dtype=float32), 1.2983020833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_43.wav', 'Jeder hat das Recht auf Bildung.', 32, array([ 5.72569334e-05, 1.04641986e-04, 1.89540195e-04, ...,\n", + " -7.50263280e-05, -5.92313118e-05, -1.14513867e-04], dtype=float32), 1.8204479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_64.wav', 'Nur nicht politisch werden!', 27, array([-7.8975081e-06, 8.2923834e-05, 1.3425764e-04, ...,\n", + " 
-8.0949460e-05, -6.3180065e-05, -1.3623202e-04], dtype=float32), 1.6652083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_67.wav', 'Wir sprechen uns später noch mal.', 34, array([ 6.8037030e-03, 6.8649091e-03, 7.0327311e-03, ...,\n", + " 5.9231311e-06, -3.1590032e-05, -1.5795016e-05], dtype=float32), 1.9051145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_69.wav', 'Wem gehört welcher Becher?', 27, array([ 6.0810812e-04, 1.8756582e-04, 8.8846966e-05, ...,\n", + " 8.6872591e-05, -1.5795016e-05, -2.1323272e-04], dtype=float32), 1.7498854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_74.wav', 'Was kann der Arbeiter dafür?', 29, array([ 6.71288217e-05, 7.89750775e-05, 1.02667604e-04, ...,\n", + " -5.52825586e-05, -2.56669009e-05, -1.57950162e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_94.wav', 'Wir möchten abreisen.', 22, array([-1.2636013e-04, -7.3051953e-05, -7.7000703e-05, ...,\n", + " -3.1590032e-05, -4.1461917e-05, -1.7769393e-05], dtype=float32), 1.7075520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_97.wav', 'Halbe Fahrt voraus!', 19, array([ 5.3308180e-05, 2.7641279e-05, -1.1253949e-04, ...,\n", + " -7.8975081e-06, 1.9743769e-05, 7.3051953e-05], dtype=float32), 1.5382083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_101.wav', 'Gute Wahl!', 10, array([-5.3308180e-05, -4.1461917e-05, -4.3436296e-05, ...,\n", + " 1.9743769e-05, 2.5666901e-05, -1.9743769e-05], dtype=float32), 0.8608333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_111.wav', 'Ich kenne den doch gar nicht!', 29, array([ 4.9359427e-05, 3.5538786e-05, 6.9103196e-05, ...,\n", + " -2.7641279e-05, 1.3228325e-04, 7.7000703e-05], dtype=float32), 1.98978125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_135.wav', 'Die Hände auf den Rücken!', 27, array([-7.7000703e-05, 
-5.1333802e-05, -7.1077571e-05, ...,\n", + " -2.7641279e-05, -4.1461917e-05, 1.7769393e-05], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_142.wav', 'Am Deal wird nichts geändert.', 30, array([1.4412952e-04, 1.6979642e-04, 1.7571956e-04, ..., 4.5410670e-05,\n", + " 5.7256933e-05, 6.1205690e-05], dtype=float32), 1.9051145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_148.wav', 'Das ist eine Wucht.', 19, array([-4.93594271e-05, -1.57950155e-04, -1.08590735e-04, ...,\n", + " 2.44822761e-04, 1.61898919e-04, 1.16488241e-04], dtype=float32), 1.58053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_152.wav', 'Renitent!', 9, array([2.8233591e-04, 2.6061776e-04, 2.2902773e-04, ..., 1.5795015e-04,\n", + " 1.5202703e-04, 2.9615656e-05], dtype=float32), 1.3124166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_161.wav', 'Ist noch Kaffee da?', 19, array([-1.46103906e-04, -6.91031964e-05, -1.02667604e-04, ...,\n", + " -7.89750775e-05, -2.17181478e-05, 7.89750811e-06], dtype=float32), 1.6369895833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_166.wav', 'Da werden Erinnerungen wach.', 28, array([ 2.1718148e-05, 1.9743769e-05, -9.8718854e-05, ...,\n", + " 8.4898209e-05, 9.2795723e-05, 1.1846262e-05], dtype=float32), 1.7922291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_175.wav', 'Suchen Sie die Herausforderung?', 31, array([-1.4215514e-04, -9.4770097e-05, -1.2833450e-04, ...,\n", + " -4.5410670e-05, -8.2923834e-05, -6.9103196e-05], dtype=float32), 1.764)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_213.wav', 'Kommt ihr mit zur Demo?', 23, array([-7.3051953e-05, -3.7513164e-05, -6.3180065e-05, ...,\n", + " 6.1205690e-05, 1.2241138e-04, 1.4807828e-04], dtype=float32), 1.7781041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_214.wav', 'Was sagt er?', 12, 
array([-2.6456651e-04, -2.2507898e-04, -2.0928397e-04, ...,\n", + " 4.3436296e-05, 8.0949460e-05, 1.8164268e-04], dtype=float32), 1.622875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_215.wav', 'Ich will mehr Geld!', 19, array([-8.4898209e-05, -9.4770097e-05, -1.1451387e-04, ...,\n", + " -1.1056512e-04, -8.2923834e-05, -1.1846262e-04], dtype=float32), 1.5664270833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_219.wav', 'Du bist überstimmt.', 20, array([ 1.04641986e-04, 6.91031964e-05, 2.76412793e-05, ...,\n", + " -1.02667604e-04, -2.58643384e-04, -2.05335207e-04], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_242.wav', 'Rutsch mir doch den Buckel runter.', 34, array([-5.7256933e-05, -3.9487541e-06, 4.5410670e-05, ...,\n", + " 1.6979642e-04, 7.5026328e-05, -1.5795016e-05], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_280.wav', 'Und ab dafür!', 14, array([ 7.1077571e-05, 1.1056512e-04, 2.0138646e-04, ...,\n", + " -4.3436296e-05, 2.7641279e-05, -6.9103196e-05], dtype=float32), 1.2030416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_283.wav', 'Er meint den Doppeldecker.', 26, array([-8.0949460e-05, -7.7000703e-05, -2.9615656e-05, ...,\n", + " -1.2833450e-04, -8.0949460e-05, -1.8164268e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_284.wav', 'Oder spricht etwas dagegen?', 27, array([ 2.0533521e-04, 1.4215514e-04, 1.4018077e-04, ...,\n", + " -1.3820639e-04, -7.8975077e-05, -1.6584767e-04], dtype=float32), 1.7851666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_288.wav', 'Auf zu neuen Ufern!', 19, array([ 3.5736224e-04, 4.6990174e-04, 6.1798003e-04, ...,\n", + " 9.2795723e-05, 2.1718148e-05, -4.9359427e-05], dtype=float32), 1.7216666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_291.wav', 'Kostprobe 
gefällig?', 20, array([-1.7571956e-04, -2.3889962e-04, -1.9348894e-04, ...,\n", + " -2.5864338e-04, -1.6584767e-04, -2.9615656e-05], dtype=float32), 1.4182604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_299.wav', 'Der Wein muss noch atmen.', 25, array([-3.5341349e-04, -2.4482276e-04, -2.2705336e-04, ...,\n", + " -6.1205690e-05, 5.9231311e-06, 4.5410670e-05], dtype=float32), 1.9333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_300.wav', 'Das ist nichts Ernstes.', 23, array([1.5597578e-04, 1.7177081e-04, 6.1205690e-05, ..., 2.7641279e-05,\n", + " 3.1590032e-05, 4.9359427e-05], dtype=float32), 1.9121666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_311.wav', 'Nee, lass mal stecken.', 22, array([ 2.3692524e-05, 3.1590032e-05, -4.7385049e-05, ...,\n", + " 3.8105476e-04, 4.1264479e-04, 6.8313448e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_321.wav', 'Ha, das war die Rache!', 22, array([-1.6979642e-04, 3.3564411e-05, 1.1056512e-04, ...,\n", + " 1.6387329e-04, 2.7048966e-04, 2.0533521e-04], dtype=float32), 1.7922291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_326.wav', 'Eigentlich ist es logisch.', 26, array([ 7.3051953e-05, 3.9487541e-06, 2.5666901e-05, ...,\n", + " -1.5795016e-05, -7.1077571e-05, 7.8975081e-06], dtype=float32), 1.7075520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_351.wav', 'Der Wein muss atmen können.', 28, array([2.9615656e-05, 4.3436296e-05, 8.0949460e-05, ..., 4.7385049e-05,\n", + " 1.7769393e-05, 1.9743770e-06], dtype=float32), 1.8063229166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_354.wav', 'Mieter haben Rechte.', 20, array([-1.5795016e-05, -9.8718847e-06, 3.3564411e-05, ...,\n", + " -2.1520710e-04, -1.5992454e-04, -4.5410670e-05], dtype=float32), 1.7216666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_369.wav', 'Was für eine Erkenntnis!', 25, array([-1.02667604e-04, -8.68725911e-05, -4.73850487e-05, ...,\n", + " 3.35644108e-05, 7.70007027e-05, 8.68725911e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_371.wav', 'Ich schieÃ\\x9fe mit rechts.', 24, array([ 1.3623202e-04, 7.8975077e-05, 4.3436296e-05, ...,\n", + " -1.1056512e-04, -1.1451387e-04, -7.3051953e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_376.wav', 'Ist Scooter nicht eine Band?', 28, array([ 3.5538786e-05, 0.0000000e+00, -5.9231311e-06, ...,\n", + " -6.3180065e-05, -1.3820639e-04, -1.2043700e-04], dtype=float32), 1.9474479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_387.wav', 'Wir sind Dickhäuter.', 21, array([1.52027031e-04, 1.12539492e-04, 1.02667604e-04, ...,\n", + " 1.38206397e-05, 5.92313108e-06, 8.09494595e-05], dtype=float32), 1.5946458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_389.wav', 'Sei nicht so streng mit ihm!', 28, array([-3.5538786e-05, 1.7769393e-05, 7.1077571e-05, ...,\n", + " -1.1451387e-04, -1.6189892e-04, -2.0928397e-04], dtype=float32), 1.9474479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_390.wav', 'Na ja, was willst du machen?', 28, array([-1.3820640e-05, -4.1461917e-05, -4.5410670e-05, ...,\n", + " -9.0821341e-05, -1.1846262e-05, -4.3436296e-05], dtype=float32), 1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_419.wav', 'Die Einschläge kommen näher.', 30, array([ 7.5026328e-05, 5.5282559e-05, 1.5597578e-04, ...,\n", + " 3.1590032e-05, 2.1718148e-05, -4.7385049e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_420.wav', 'Willst du mit mir gehen?', 24, array([-1.2438576e-04, -1.9546332e-04, -1.6782204e-04, ...,\n", + " -3.7513164e-05, 
-1.0661636e-04, 7.7000703e-05], dtype=float32), 1.8204479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_426.wav', 'Hier bitte eine Unterschrift.', 29, array([1.6979642e-04, 1.8361707e-04, 1.7177081e-04, ..., 1.5202703e-04,\n", + " 2.1718148e-05, 0.0000000e+00], dtype=float32), 1.891)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_430.wav', 'Zum Glück nicht.', 17, array([-6.7128822e-05, -9.8718854e-05, -3.1590032e-05, ...,\n", + " -7.3051953e-05, -9.4770097e-05, -1.1056512e-04], dtype=float32), 1.1571770833333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_437.wav', 'Einfach nur top!', 16, array([-3.9684979e-04, -4.2646544e-04, -4.0277292e-04, ...,\n", + " -2.4087400e-04, -3.7513164e-05, -1.4412952e-04], dtype=float32), 1.35475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_439.wav', 'Mach dir nichts daraus.', 23, array([ 4.2843982e-04, 5.0938927e-04, 4.6595297e-04, ...,\n", + " 7.8975081e-06, -3.1590032e-05, 2.5666901e-05], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_443.wav', 'Lauf doch nicht immer durchs Bild!', 34, array([-3.5538786e-04, -1.7769393e-04, -1.1451387e-04, ...,\n", + " -3.9487539e-05, -4.3436296e-05, -3.9487539e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_464.wav', 'Hände hoch!', 12, array([-2.0138646e-04, -1.3425764e-04, -8.0949460e-05, ...,\n", + " 2.0138646e-04, 1.8756582e-04, 2.6061776e-04], dtype=float32), 1.04428125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_475.wav', 'Was weiÃ\\x9f ich denn?', 19, array([-2.7641279e-05, -1.9743770e-06, 8.2923834e-05, ...,\n", + " 7.3051953e-05, 9.8718854e-05, -4.9359427e-05], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_487.wav', 'Ich will noch nicht ins Bett!', 29, array([ 5.7256933e-05, -7.8975081e-06, 1.7769393e-05, ...,\n", + " 
-2.9615656e-05, -1.1846262e-05, 2.5666901e-05], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_499.wav', 'Darüber kann man streiten.', 27, array([ 0.00011846, 0.00020534, 0.00027839, ..., -0.00031195,\n", + " -0.00021521, -0.00017769], dtype=float32), 1.8486666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_500.wav', 'Dazu braucht man Ruhe.', 22, array([-1.1056512e-04, -1.4610391e-04, -1.3425764e-04, ...,\n", + " 6.9103196e-05, 1.6189892e-04, 2.2507898e-04], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_524.wav', 'Das Zeug ist wirklich gut.', 26, array([-3.9487541e-06, 3.5538786e-05, -9.8718847e-06, ...,\n", + " 1.1846262e-05, 1.9743769e-05, 9.8718847e-06], dtype=float32), 1.891)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_539.wav', 'Betäubungsgewehr geladen!', 26, array([ 3.5538786e-05, 2.3692524e-05, 0.0000000e+00, ...,\n", + " -9.2795723e-05, -1.9151457e-04, -1.8756582e-04], dtype=float32), 1.8768854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_547.wav', 'Was wird wie geschrieben?', 25, array([-0.00012636, -0.00020336, -0.0002231 , ..., 0.00021521,\n", + " 0.00020336, 0.0001619 ], dtype=float32), 1.8768854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_569.wav', 'Nicht schon wieder, bitte.', 26, array([ 0.00035736, 0.00043436, 0.00037316, ..., -0.00013821,\n", + " -0.00013031, -0.0001619 ], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_582.wav', 'Bist du blind?', 14, array([ 1.9743769e-05, -1.5795016e-05, -5.7256933e-05, ...,\n", + " 0.0000000e+00, 8.6872591e-05, 4.5410670e-05], dtype=float32), 1.2841875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_592.wav', 'Blinzeln zählt nicht.', 22, array([ 5.9231311e-06, -4.5410670e-05, -9.8718854e-05, ...,\n", + " 1.6387329e-04, 1.3820639e-04, 
7.1077571e-05], dtype=float32), 1.8768958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_595.wav', 'Ja, warum denn bitte schön nicht?', 34, array([-0.00036329, -0.00033959, -0.00036131, ..., -0.00016585,\n", + " -0.00021521, -0.0001619 ], dtype=float32), 1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_597.wav', 'Mir wäre das peinlich.', 23, array([ 5.9231312e-05, 1.2241138e-04, 7.5026328e-05, ...,\n", + " -1.5795016e-05, -8.2923834e-05, -6.7128822e-05], dtype=float32), 1.8345520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_637.wav', 'So kann es gehen.', 17, array([-7.8975081e-06, 3.9487541e-06, 4.1461917e-05, ...,\n", + " 5.1333802e-05, 1.3030888e-04, 3.9487539e-05], dtype=float32), 1.3688645833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_642.wav', 'Es bleibt spannend.', 19, array([1.8559145e-04, 1.8559145e-04, 1.5597578e-04, ..., 7.3051953e-05,\n", + " 5.7256933e-05, 1.1451387e-04], dtype=float32), 1.559375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_647.wav', 'Marek will noch mal.', 20, array([-2.44822761e-04, -1.04641986e-04, -8.09494595e-05, ...,\n", + " 1.48078281e-04, 1.81642681e-04, 2.50745885e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_661.wav', 'Ruhig Brauner!', 14, array([-1.02667604e-04, -4.73850487e-05, 8.09494595e-05, ...,\n", + " -9.87188469e-06, -8.88469658e-05, -1.12539492e-04], dtype=float32), 1.2771354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_682.wav', 'Meine Rede!', 11, array([5.9428747e-04, 5.0544052e-04, 2.0730958e-04, ..., 7.5026328e-05,\n", + " 6.5154440e-05, 6.5154440e-05], dtype=float32), 1.0725104166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_685.wav', 'Was versprichst du dir davon?', 29, array([-1.1846262e-05, -9.8718847e-06, 4.3436296e-05, ...,\n", + " -2.3692524e-05, 1.9743770e-06, 
2.7641279e-05], dtype=float32), 1.7710520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_693.wav', 'Ich nehme euch alle.', 20, array([-3.1590032e-05, -5.9231311e-06, -7.5026328e-05, ...,\n", + " -8.8846966e-05, -7.3051953e-05, -5.1333802e-05], dtype=float32), 1.7851666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_711.wav', 'Warum nämlich?', 15, array([ 5.9231312e-05, 5.9231312e-05, 3.1590032e-05, ...,\n", + " 1.1846262e-05, -5.9231311e-06, -7.5026328e-05], dtype=float32), 1.3688645833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_712.wav', 'Das hätte ich beinahe vergessen.', 33, array([-2.1125835e-04, -2.4482276e-04, -1.4610391e-04, ...,\n", + " 9.0821341e-05, 1.7966831e-04, 1.0661636e-04], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_726.wav', 'Möchtest du auch einen Muffin?', 31, array([-3.9487539e-05, -2.7641279e-05, 6.3180065e-05, ...,\n", + " 1.7769393e-05, 6.7128822e-05, 7.1077571e-05], dtype=float32), 1.9545104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_727.wav', 'Es hat nichts mit dir zu tun.', 29, array([-1.6584767e-04, -1.9348894e-04, -2.7641279e-04, ...,\n", + " 6.5154440e-05, 4.3436296e-05, 1.2438576e-04], dtype=float32), 1.7569479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_732.wav', 'Vielleicht war ich etwas vorschnell.', 36, array([1.7177081e-04, 1.6584767e-04, 8.6872591e-05, ..., 1.9546332e-04,\n", + " 1.8954019e-04, 1.5597578e-04], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_735.wav', 'Hatschi!', 8, array([ 1.2043700e-04, -1.7769393e-05, -1.9743770e-06, ...,\n", + " -1.1846262e-05, -4.5410670e-05, -7.7000703e-05], dtype=float32), 0.8114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_739.wav', 'Ich bleibe dabei.', 17, array([-2.0533521e-04, -1.2438576e-04, -5.5282559e-05, ...,\n", 
+ " 4.5410670e-05, -1.3820640e-05, -7.7000703e-05], dtype=float32), 1.2418541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_762.wav', 'Nicht zu fassen!', 16, array([1.7414006e-03, 1.4353720e-03, 9.6547039e-04, ..., 6.3180065e-05,\n", + " 1.8164268e-04, 8.0949460e-05], dtype=float32), 1.1712916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_773.wav', 'Gute Besserung!', 15, array([-5.1333802e-05, 0.0000000e+00, 2.1718148e-05, ...,\n", + " -1.2636013e-04, -1.9546332e-04, -1.4215514e-04], dtype=float32), 1.2771354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_775.wav', 'Ja, so ist es wohl.', 19, array([-1.61898919e-04, 1.97437694e-05, 1.02667604e-04, ...,\n", + " -6.51544397e-05, -1.26360130e-04, -6.71288217e-05], dtype=float32), 1.44646875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_778.wav', 'Mich selbst hat das überrascht.', 32, array([7.7000703e-05, 1.1846262e-04, 1.2241138e-04, ..., 1.3820639e-04,\n", + " 9.8718847e-06, 1.3820640e-05], dtype=float32), 1.8275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_782.wav', 'Wer kennt das nicht?', 20, array([-2.7641279e-05, 7.8975081e-06, -3.7513164e-05, ...,\n", + " -2.3297648e-04, -2.2902773e-04, -2.4087400e-04], dtype=float32), 1.72871875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_787.wav', 'Ich liebe diese Musik!', 22, array([-1.8361707e-04, -6.9103196e-05, -9.0821341e-05, ...,\n", + " 5.6862057e-04, 6.2587752e-04, 5.3110742e-04], dtype=float32), 1.8580729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_800.wav', 'Na endlich!', 11, array([-1.1846262e-04, -1.5202703e-04, -8.4898209e-05, ...,\n", + " 9.0821341e-05, -9.0821341e-05, -7.8975081e-06], dtype=float32), 0.91021875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_805.wav', 'Juliane gruselt sich.', 21, array([1.3425764e-04, 7.1077571e-05, 6.5154440e-05, ..., 9.8718854e-05,\n", + " 
8.6872591e-05, 5.1333802e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_808.wav', 'Der andere nimmt.', 17, array([-8.6872591e-05, -1.1451387e-04, -8.2923834e-05, ...,\n", + " 2.5666901e-05, -7.3051953e-05, -7.5026328e-05], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_814.wav', 'Wieso ich?', 10, array([-1.14513867e-04, -1.02667604e-04, -1.77693932e-04, ...,\n", + " -1.18462622e-05, 0.00000000e+00, 1.38206397e-05], dtype=float32), 0.9031666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_816.wav', 'Die Haare müssen ab.', 21, array([ 1.9546332e-04, 1.2636013e-04, 2.1125835e-04, ...,\n", + " 9.8718847e-06, -4.1461917e-05, -5.5282559e-05], dtype=float32), 1.2065729166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_844.wav', 'Die ganze Woche steht das schon an.', 35, array([ 1.0602404e-03, 1.1017023e-03, 9.0031594e-04, ...,\n", + " -3.3564411e-05, -3.5538786e-05, 0.0000000e+00], dtype=float32), 1.8839479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_7.wav', 'Meinen Respekt hast du.', 23, array([-8.1613541e-07, 3.6258320e-05, 5.8615900e-05, ...,\n", + " -3.0361010e-05, 4.6051988e-05, 6.1613529e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_12.wav', 'Mein SchweiÃ\\x9f stinkt nicht.', 27, array([1.2758464e-03, 1.4472028e-03, 1.4819785e-03, ..., 1.1448720e-05,\n", + " 2.5002395e-05, 5.3266147e-05], dtype=float32), 1.872)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_24.wav', 'So sieht es jedenfalls aus.', 27, array([ 3.5462443e-05, -3.6511621e-05, -2.4387444e-05, ...,\n", + " 7.4399744e-05, 7.2159133e-07, 2.3660252e-05], dtype=float32), 1.808)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_31.wav', 'Es brennt lichterloh.', 21, array([-7.8527468e-05, -1.9054073e-04, -1.8275550e-04, ...,\n", + " -1.4771417e-05, 
2.4868292e-05, -1.4910699e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_36.wav', 'Hat jemand Deo dabei?', 21, array([5.0298637e-05, 4.8803475e-05, 5.4532258e-05, ..., 3.4226623e-06,\n", + " 9.2322180e-06, 3.0618612e-05], dtype=float32), 1.7386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_44.wav', 'Der Hund will raus.', 19, array([-8.2374172e-05, -8.4805586e-05, -9.4096496e-05, ...,\n", + " 2.0108973e-05, 3.4747383e-05, -3.9627314e-05], dtype=float32), 1.5413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_46.wav', 'Nur Fliegen ist schöner.', 25, array([-2.5430196e-05, -6.4560918e-05, -6.8181558e-05, ...,\n", + " 6.0105547e-05, 9.7991426e-05, 2.9888753e-05], dtype=float32), 1.6693333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_57.wav', 'Endlich wieder Nachschub!', 25, array([-3.0662410e-05, -3.7799236e-05, -1.0512020e-04, ...,\n", + " -1.2799338e-04, -3.7069469e-05, 3.4687200e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_63.wav', \"Jetzt langt's dann aber.\", 24, array([ 1.3113129e-06, -5.7142366e-05, 3.9664551e-06, ...,\n", + " 4.8476216e-04, 4.0935431e-04, 5.0957059e-04], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_76.wav', 'Ist hier noch ein Platz frei?', 29, array([ 4.6084756e-06, 2.1333383e-06, 1.0840034e-05, ...,\n", + " 4.7717163e-05, -4.3301993e-06, 5.9024904e-07], dtype=float32), 1.7653333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_91.wav', 'Möchten Sie durch?', 19, array([5.3242915e-05, 1.1775635e-04, 9.1564674e-05, ..., 6.9772730e-05,\n", + " 3.2825061e-05, 5.5504606e-05], dtype=float32), 1.1786666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_94.wav', 'Du hast sie angemalt.', 21, array([-8.2009647e-06, -7.8560508e-05, -1.1781590e-04, ...,\n", + " 
5.8809797e-05, 3.5827401e-05, -3.8682600e-05], dtype=float32), 1.5946666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_97.wav', 'Anfassen heiÃ\\x9ft kaufen.', 23, array([ 6.7132327e-04, 6.4567651e-04, 4.5344225e-04, ...,\n", + " -2.1742040e-05, -1.2411790e-04, -3.8199389e-05], dtype=float32), 1.472)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_104.wav', 'Warum nicht lieber hier?', 24, array([-1.0701143e-05, -1.5738879e-06, 6.8153045e-06, ...,\n", + " -6.3156702e-05, -1.6941859e-04, -6.0139148e-05], dtype=float32), 1.4986666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_112.wav', 'Das war ein Abenteuer.', 22, array([ 2.6408197e-05, -6.0915321e-05, -9.1295704e-05, ...,\n", + " -5.6715970e-05, -3.1489210e-05, 1.5612791e-06], dtype=float32), 1.9466666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_127.wav', 'Das wäre fatal.', 16, array([ 4.4660061e-05, -6.5924425e-05, -5.6830704e-05, ...,\n", + " -5.5352357e-06, 3.0260082e-05, 9.7271128e-05], dtype=float32), 1.4666666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_141.wav', 'Nicht doch!', 11, array([-1.4546166e-04, -1.4626759e-04, -9.7611184e-05, ...,\n", + " 9.3360104e-05, 3.5025540e-05, -1.6926177e-06], dtype=float32), 0.928)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_146.wav', 'Heiliger Strohsack!', 19, array([-3.7175673e-04, -2.1206291e-04, -8.9090288e-05, ...,\n", + " 1.0547445e-04, 1.0614831e-04, 5.8346381e-05], dtype=float32), 1.376)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_153.wav', 'Gehen wir in die Eisdiele?', 26, array([-3.72752729e-05, -6.43968451e-05, -1.19852075e-05, ...,\n", + " 6.90084271e-05, -1.81738214e-05, -2.24471933e-05], dtype=float32), 1.4826666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_158.wav', 'Das ist halt so.', 16, array([ 2.1661433e-05, -9.2656213e-05, -2.0038491e-05, ...,\n", + " 
3.4980503e-06, 8.1309692e-05, -1.6156602e-05], dtype=float32), 1.2853333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_174.wav', 'Ich habe dich noch nie gesehen.', 31, array([ 1.68298247e-05, 2.35711445e-06, -1.13152724e-04, ...,\n", + " -5.31522637e-05, 5.38938584e-05, 1.89053408e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_179.wav', 'Das muss hart für dich sein.', 29, array([-9.2038817e-06, -9.7612574e-06, -6.3460277e-05, ...,\n", + " -5.0950723e-05, 2.0168585e-05, -1.5738755e-05], dtype=float32), 1.5893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_183.wav', \"Packen wir's!\", 13, array([-2.2114466e-05, 6.0876686e-05, -8.3392551e-05, ...,\n", + " 3.5826326e-06, -1.4385004e-05, -5.6348257e-05], dtype=float32), 0.9546666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_194.wav', 'Wir werden siegen!', 18, array([ 1.6911860e-04, 7.4598174e-05, 1.0261347e-04, ...,\n", + " 6.5378241e-05, 3.2076507e-06, -6.6169787e-06], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_197.wav', 'Darf ich mal bei dir abbeiÃ\\x9fen?', 31, array([-1.0340806e-05, 7.1646286e-06, 3.3313339e-05, ...,\n", + " -7.5323747e-05, -2.6892374e-07, -3.3816039e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_199.wav', 'Das ging aber fix!', 18, array([-9.3143040e-05, -4.3784836e-05, -1.1206182e-04, ...,\n", + " 8.7669920e-05, 1.0557293e-05, 4.2041685e-07], dtype=float32), 1.328)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_211.wav', 'Ich habe nachgedacht.', 21, array([ 5.0232731e-05, 1.2072114e-04, 1.8210443e-04, ...,\n", + " -6.5402834e-05, -5.1763345e-05, -6.0046054e-06], dtype=float32), 1.5093333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_229.wav', 'Wir lassen uns nicht erpressen.', 31, array([1.37981799e-04, 
1.52958339e-04, 1.10953624e-04, ...,\n", + " 6.50644288e-05, 8.02592767e-05, 1.01248879e-04], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_236.wav', 'Sag du es mir.', 14, array([ 7.4462928e-06, -2.0409609e-05, -3.6314952e-05, ...,\n", + " -2.1986765e-05, -8.3042978e-05, 8.2145634e-06], dtype=float32), 1.216)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_240.wav', 'Ich vermisse ihn seit gestern.', 30, array([ 2.9365596e-04, 3.4678026e-04, 3.5397714e-04, ...,\n", + " -1.5735781e-05, -2.9272232e-05, 4.2558597e-05], dtype=float32), 1.9893333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_257.wav', 'So kannte ich sie gar nicht.', 28, array([ 4.4733344e-05, 7.7341829e-05, 1.1480036e-04, ...,\n", + " -1.8965245e-04, -1.4387793e-04, -1.2223862e-04], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_258.wav', 'Dem Kind geht es gut.', 21, array([ 2.3389544e-05, -1.0488247e-05, 1.0429079e-05, ...,\n", + " -8.0030593e-05, -9.8967379e-05, -4.5314195e-05], dtype=float32), 1.3066666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_260.wav', 'Lasst es krachen!', 17, array([-2.1083563e-04, -8.3892046e-05, -3.2037347e-05, ...,\n", + " -6.8306355e-05, -1.3884228e-04, -6.5104126e-05], dtype=float32), 1.2)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_266.wav', 'Wie sehen Sie überhaupt aus?', 29, array([-1.0680479e-05, -1.9320854e-05, -7.0852952e-06, ...,\n", + " -1.0408241e-05, 3.3198389e-06, 2.1512881e-06], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_281.wav', 'Damit könnte es klappen.', 25, array([-2.3432081e-05, -2.4900844e-05, -1.3450766e-04, ...,\n", + " 2.1617279e-05, 3.1534404e-05, -2.2315735e-05], dtype=float32), 1.488)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_307.wav', 'Tut das Husten weh?', 19, array([ 
9.1145994e-06, 1.5820089e-05, 5.0116945e-05, ...,\n", + " 1.9206882e-05, -2.6969181e-05, -2.7526901e-05], dtype=float32), 1.5626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_310.wav', 'Und jetzt kräftig kurbeln!', 27, array([-8.4867512e-05, -1.3528325e-05, 6.7344299e-05, ...,\n", + " -5.5355646e-05, 3.2757125e-05, -1.3706725e-05], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_311.wav', 'Und was bekommt man geboten?', 28, array([-9.42486338e-07, -6.20736901e-05, -1.13615904e-04, ...,\n", + " 1.05647247e-04, 4.75407724e-05, 7.68981190e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_313.wav', 'Nimm doch mal den Hut ab!', 25, array([-1.4411381e-06, 1.8580539e-04, 1.8933907e-04, ...,\n", + " -1.0257358e-04, -9.1900030e-05, -2.2193763e-04], dtype=float32), 1.5733333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_322.wav', 'Der ist sauber.', 15, array([1.3459381e-04, 1.1068168e-04, 1.4088971e-04, ..., 1.4206764e-04,\n", + " 1.0958829e-05, 9.0381429e-05], dtype=float32), 1.344)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_333.wav', 'Danke für nichts!', 18, array([-2.6258719e-04, -2.9124424e-04, -4.0630574e-04, ...,\n", + " 9.1923815e-05, -9.6123731e-06, 3.9555922e-05], dtype=float32), 1.408)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_351.wav', 'Hier ist sie.', 13, array([-3.23740860e-05, -1.03745086e-04, -6.84802653e-05, ...,\n", + " 6.36538107e-06, 6.47425259e-05, -2.68384956e-05], dtype=float32), 1.2693333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_354.wav', 'Ist sie international bekannt?', 30, array([ 1.5060005e-05, 5.7448578e-05, 1.3811006e-04, ...,\n", + " 6.0413648e-05, -4.7934391e-05, -1.9190535e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_363.wav', 'Ich meine ja nur.', 17, 
array([ 5.6321147e-05, 9.9655284e-05, -8.9936962e-05, ...,\n", + " 1.1549123e-05, 3.7268135e-05, 7.3645397e-06], dtype=float32), 1.1253333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_390.wav', 'Gib mal die Seriennummer durch.', 31, array([ 7.2849958e-05, 9.1718932e-05, 5.6555116e-05, ...,\n", + " -2.9702240e-05, 3.8465154e-05, 2.2035034e-05], dtype=float32), 1.9466666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_399.wav', 'Steht das Wasser auf dem Herd?', 30, array([6.5801214e-05, 1.3084775e-04, 8.1372353e-05, ..., 6.8494905e-05,\n", + " 2.1234882e-06, 2.7409065e-05], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_401.wav', 'Oh ja!', 6, array([ 2.2632883e-05, -2.7574149e-05, 2.7717488e-05, ...,\n", + " 2.9032512e-07, 1.7548422e-05, -1.3465881e-05], dtype=float32), 0.7146666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_409.wav', 'Ja oder nein?', 13, array([ 3.4988134e-05, -6.8858870e-05, -8.5955844e-06, ...,\n", + " -4.4800227e-06, 1.7184280e-05, 3.7901282e-05], dtype=float32), 1.4346666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_411.wav', 'Ist doch Jacke wie Hose.', 24, array([ 1.1507938e-04, 5.0565839e-05, -2.7287895e-05, ...,\n", + " 3.7775626e-05, -1.4040452e-05, 1.4159415e-06], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_413.wav', 'Ich habe es nie gelernt.', 24, array([ 2.58978853e-05, 6.50478396e-05, -1.03702390e-04, ...,\n", + " 8.01785427e-05, 3.00699157e-05, -1.05522995e-04], dtype=float32), 1.776)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_429.wav', 'Nicht schon wieder eine Razzia!', 31, array([-5.1378167e-05, -2.5352152e-05, -3.2764001e-05, ...,\n", + " 2.1145966e-05, 5.4651609e-05, -7.9359561e-05], dtype=float32), 1.888)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_431.wav', 'Niemand will es gewesen sein.', 29, 
array([6.13634029e-06, 1.00043821e-04, 1.26646410e-04, ...,\n", + " 4.00160025e-05, 6.57281998e-05, 1.20079676e-04], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_443.wav', 'Ihr seid doch bloÃ\\x9f neidisch.', 29, array([ 4.71922749e-06, -1.42986255e-05, 4.10590292e-05, ...,\n", + " -1.13690789e-04, -4.82848300e-05, 3.64537264e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_453.wav', 'Lesen lohnt sich.', 17, array([-1.1143904e-04, -9.7466742e-05, -1.4505965e-04, ...,\n", + " -1.1429377e-04, -8.0892445e-05, -8.6921274e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_458.wav', 'Oder er wurde dabei gestört.', 29, array([-1.8823694e-05, -3.1060394e-05, -9.3846960e-05, ...,\n", + " -1.2105788e-05, -3.4755056e-05, 3.5802004e-05], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_459.wav', 'Die Seele baumeln lassen.', 25, array([-4.6934008e-05, -1.4115409e-04, -1.9004452e-04, ...,\n", + " -4.7015623e-05, -2.2894224e-07, -4.3300730e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_468.wav', 'Der Nächste, bitte!', 20, array([ 8.1093880e-05, 2.9958397e-05, -3.9947310e-05, ...,\n", + " 6.6704742e-05, 1.2609754e-04, 1.1871241e-04], dtype=float32), 1.3386666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_469.wav', 'Wird schon schiefgehen.', 23, array([-1.8012641e-05, -6.1548446e-05, -1.2534855e-04, ...,\n", + " -2.9845067e-05, 3.1653948e-05, 1.2874776e-04], dtype=float32), 1.552)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_476.wav', 'Keine falsche Bewegung!', 23, array([-1.3065083e-04, -1.9577878e-04, -9.6719399e-05, ...,\n", + " 9.7838973e-05, -1.6546634e-05, 3.1119489e-05], dtype=float32), 1.7706666666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_484.wav', 'Danach geht es ins Bett.', 24, array([1.4125947e-04, 1.4533960e-04, 1.3352933e-04, ..., 4.6569412e-06,\n", + " 8.5400243e-06, 1.0347654e-04], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_495.wav', 'Vorwärts immer, rückwarts nimmer!', 35, array([ 9.8868964e-05, 1.4638813e-04, 8.2029030e-05, ...,\n", + " 3.1947344e-05, -3.3244356e-05, -8.5653497e-05], dtype=float32), 1.5893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_508.wav', 'Ein Spanngurt ist gerissen.', 27, array([-1.3210842e-05, 5.2183852e-05, 1.1509426e-05, ...,\n", + " -6.6147322e-06, -1.3790486e-05, 4.0188141e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_511.wav', 'Das musst du gerade sagen!', 26, array([ 8.16162283e-05, 1.48853534e-04, 1.20252385e-04, ...,\n", + " -2.43115683e-05, 3.36854064e-05, -3.11621625e-05], dtype=float32), 1.9893333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_517.wav', 'Lösen Sie das Captcha!', 23, array([-3.2288870e-05, 5.6598521e-05, 4.2188087e-05, ...,\n", + " 7.7064447e-05, -4.7475376e-05, 4.4163811e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_520.wav', 'Ihr werdet schon sehen.', 23, array([-6.5363100e-05, 4.7253379e-05, 5.9942446e-05, ...,\n", + " 3.2326661e-05, 8.2957842e-05, 7.4098658e-05], dtype=float32), 1.7973333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_521.wav', 'Ich erkläre es dir.', 20, array([ 5.3491673e-05, -1.2072490e-05, 3.4197161e-05, ...,\n", + " -3.4515979e-05, -5.6132449e-05, 1.3709931e-04], dtype=float32), 1.5093333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_525.wav', 'Hau rein!', 9, array([ 2.57931824e-04, 2.11816674e-04, 1.78339556e-04, ...,\n", + " 7.76832676e-05, 1.51795175e-05, -4.37384588e-05], 
dtype=float32), 1.104)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_526.wav', 'Tief durchatmen!', 16, array([-2.6787920e-05, -3.2204316e-05, -5.5490927e-05, ...,\n", + " 2.2508255e-05, 5.4639313e-05, 1.8989524e-05], dtype=float32), 1.5253333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_538.wav', 'Und was bringt das?', 19, array([-5.9224880e-05, -4.4477289e-05, 3.8521583e-05, ...,\n", + " 9.5605545e-05, 1.2830349e-06, 1.5070126e-05], dtype=float32), 1.6213333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_541.wav', 'Karnickelfangschlag?', 20, array([3.9227842e-05, 3.2782922e-05, 4.6346566e-05, ..., 1.3389443e-05,\n", + " 3.6067817e-05, 6.0468155e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_562.wav', 'Ist ja mega!', 12, array([-1.1508126e-04, -1.5385580e-04, -1.8046032e-04, ...,\n", + " -4.1180385e-05, 2.7804810e-05, -9.9901524e-07], dtype=float32), 0.992)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_570.wav', 'Jasmin, du bist dran.', 21, array([-6.0017886e-05, 3.1120195e-05, 1.0854354e-04, ...,\n", + " -2.5416332e-06, 4.4546370e-05, -4.6334655e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_579.wav', 'Läuft es separat ab?', 21, array([ 2.2939121e-05, 2.0304271e-05, 4.7305216e-06, ...,\n", + " -4.0958774e-05, 8.3991254e-06, -4.0800154e-05], dtype=float32), 1.7813333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_594.wav', 'Ich zitiere!', 12, array([ 7.3269119e-05, 4.1316580e-06, -7.5483302e-05, ...,\n", + " 4.5700057e-05, 1.0702889e-06, 1.2143076e-05], dtype=float32), 1.2853333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_597.wav', 'Die Karten sind ja markiert!', 28, array([-7.7787427e-06, 1.3373171e-05, 1.1130486e-04, ...,\n", + " -3.4429740e-05, -9.2525712e-05, -3.0399795e-05], dtype=float32), 
1.8613333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_600.wav', 'Weniger ist manchmal mehr.', 26, array([-3.2105188e-05, -1.2411436e-04, -1.7373836e-04, ...,\n", + " 1.9536817e-05, 4.0033923e-05, -4.9835093e-05], dtype=float32), 1.6693333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_610.wav', 'Zur Hölle mit ihm!', 19, array([ 4.1287938e-05, -1.5668693e-05, -4.7829257e-05, ...,\n", + " 1.2091287e-04, 3.0301053e-05, 5.0707073e-05], dtype=float32), 1.28)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_622.wav', 'Sonst kommt die Polizei.', 24, array([ 1.33967542e-05, -2.86651575e-05, 1.20430150e-05, ...,\n", + " -4.97728324e-05, -9.77511445e-05, -1.07504595e-04], dtype=float32), 1.9786666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_623.wav', 'Papa fährt immer schneller.', 28, array([-4.1551000e-05, 1.8333099e-05, -4.5995697e-05, ...,\n", + " 7.4864365e-05, -2.8456698e-05, -3.1763777e-06], dtype=float32), 1.7653333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_627.wav', 'Das Problem kenne ich.', 22, array([-1.6575548e-06, -6.4681786e-05, -2.4183499e-05, ...,\n", + " -6.1924133e-05, 4.0877181e-05, -4.8742072e-06], dtype=float32), 1.3973333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_646.wav', 'Gerade jetzt wird es spannend.', 30, array([-7.0382644e-05, -2.6976499e-05, -8.4537001e-05, ...,\n", + " 1.9848225e-05, 1.8570287e-05, 1.1454727e-04], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_650.wav', 'Pass mal auf!', 13, array([ 8.8038476e-05, 6.2287538e-05, 8.6767104e-05, ...,\n", + " -4.7867183e-05, 1.7106903e-06, -2.8001863e-05], dtype=float32), 1.0773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_653.wav', 'Führe mich nicht in Versuchung!', 32, array([ 1.5389375e-04, 8.4856605e-05, 1.1764471e-04, ...,\n", + " -4.1702488e-06, 4.8200640e-05, 
3.7042355e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_658.wav', 'Dabei soll es bleiben.', 22, array([-6.8817273e-05, -1.4116750e-04, -2.5068663e-04, ...,\n", + " 3.3109423e-05, -1.2034771e-05, 5.3297503e-05], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_677.wav', 'Ich denke nicht daran.', 22, array([ 2.7965652e-06, -8.1217448e-05, -1.5171595e-04, ...,\n", + " -6.0021226e-05, 5.8105360e-07, -2.3721210e-05], dtype=float32), 1.472)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_702.wav', 'Sieh zu, dass du Land gewinnst!', 31, array([-3.9686485e-05, -4.1371659e-05, -5.1444043e-05, ...,\n", + " -6.5746033e-05, -6.9277223e-05, -3.0258396e-05], dtype=float32), 1.9466666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_705.wav', 'Was sagt uns das?', 17, array([-1.11950721e-04, -1.12432775e-04, -1.54395209e-04, ...,\n", + " 1.18786911e-05, -6.98161457e-05, -2.93514750e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_715.wav', 'Von nichts komm nichts.', 23, array([ 5.0694278e-05, -1.0824220e-04, -7.8278521e-05, ...,\n", + " 5.2878531e-05, 3.1005864e-05, 2.5896241e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_718.wav', 'Warum auch?', 11, array([ 2.5824769e-05, 7.0119269e-05, 3.9937982e-05, ...,\n", + " 1.3905319e-05, -2.6308078e-05, -5.1800267e-05], dtype=float32), 0.9493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_721.wav', 'Wo wohne ich noch mal?', 22, array([ 1.1702570e-04, 1.8368529e-04, 1.5237987e-04, ...,\n", + " -3.3846823e-05, -4.2944125e-06, 2.2590933e-05], dtype=float32), 1.6)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_725.wav', 'Zum Wohl!', 9, array([-2.1576473e-06, 2.8079157e-05, -2.9355248e-05, ...,\n", + " -2.9330091e-05, -3.0764484e-05, 
-1.3724362e-05], dtype=float32), 0.7466666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_735.wav', 'Wie geht es dir?', 16, array([ 3.0780422e-05, -4.9582297e-05, -8.5829226e-05, ...,\n", + " 2.1407772e-05, -4.8474238e-05, -4.5784309e-05], dtype=float32), 1.232)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_746.wav', 'Einmal drücken reicht.', 23, array([-4.4286557e-05, -5.6155724e-05, -5.2055671e-05, ...,\n", + " -5.5887984e-05, 1.7236773e-05, 9.8498596e-05], dtype=float32), 1.4373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_761.wav', 'Ersatz muss her.', 16, array([ 8.3686442e-05, 9.1279635e-06, -8.3661522e-05, ...,\n", + " 3.3542208e-05, 9.7035401e-05, -4.7421363e-05], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_769.wav', 'Kennen Sie diesen Eisbären?', 28, array([ 1.8226114e-04, 1.1602399e-04, 8.7942906e-05, ...,\n", + " -3.1415253e-05, 6.8828485e-05, 2.8598015e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_774.wav', 'Du tüdelst wohl!', 17, array([4.2244592e-05, 4.7479767e-05, 4.4327684e-05, ..., 2.9398587e-05,\n", + " 1.3265206e-04, 9.8947305e-05], dtype=float32), 1.312)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_776.wav', 'Einen Versuch ist es wert.', 26, array([-2.0919964e-05, -8.0129103e-05, -7.8644814e-05, ...,\n", + " 3.4572986e-05, 8.1091166e-05, 5.6626621e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_779.wav', 'Kruzifix noch mal!', 18, array([ 5.9276794e-05, 7.1346542e-05, 1.3115312e-05, ...,\n", + " -7.0933937e-05, 2.6771322e-05, 3.3997876e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_781.wav', 'Sind die echt?', 14, array([-3.2039690e-05, -4.8189206e-05, -9.0187306e-05, ...,\n", + " 2.1210299e-05, 9.5539394e-07, -6.0049209e-05], dtype=float32), 
1.1946666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_789.wav', 'Wie war euer Jahrgangstreffen?', 30, array([ 9.86098894e-05, 1.05807514e-04, 1.31781504e-04, ...,\n", + " -6.47349443e-05, 5.55652514e-06, 6.68639914e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_796.wav', 'Langt das?', 10, array([-2.58835917e-05, -1.11602596e-04, -2.00994928e-05, ...,\n", + " 3.40378210e-05, 4.15314862e-05, -2.47353237e-05], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_851.wav', 'Nein, das gehört so.', 21, array([4.30460314e-05, 1.00948644e-04, 1.14135793e-04, ...,\n", + " 2.88395531e-04, 1.62498865e-04, 8.75307087e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_852.wav', 'Stellen Sie Blickkontakt her.', 29, array([-2.3877754e-05, -3.1883523e-05, -1.3378897e-04, ...,\n", + " -3.8810729e-05, 4.3067663e-05, 3.8920269e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_858.wav', 'Also echt jetzt!', 16, array([ 1.62354499e-05, 4.22473058e-05, -1.46273105e-05, ...,\n", + " -2.93930316e-05, 5.34094252e-05, 7.98595574e-05], dtype=float32), 1.216)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_7.wav', 'Ich glaube nicht.', 17, array([-1.0143876e-05, -3.8619244e-05, 8.2748767e-05, ...,\n", + " -9.9806406e-05, -4.3946784e-05, 6.9558562e-05], dtype=float32), 1.1946666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_18.wav', 'Hier ist es sicherer.', 21, array([ 4.6870970e-05, 9.9823235e-05, -4.0877108e-05, ...,\n", + " -1.4616339e-05, 7.3614872e-05, 1.0970575e-04], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_49.wav', 'Ja ja, als ob!', 14, array([ 5.3198488e-05, 1.8346685e-04, -2.1753046e-06, ...,\n", + " 1.7834389e-05, 5.3522737e-05, 
8.4725587e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_59.wav', 'Geh, such deine Schwester!', 26, array([ 9.13840049e-05, 1.68439132e-04, 3.04173911e-04, ...,\n", + " -8.56241095e-05, -1.02150196e-04, 8.91289255e-06], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_94.wav', 'Gib mir meinen Becher wieder!', 29, array([-2.1092707e-04, -2.3195105e-04, -2.0152969e-04, ...,\n", + " 8.9153917e-05, -2.4260396e-06, 5.9283586e-05], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_100.wav', 'Das führt doch zu nichts.', 26, array([-1.0273771e-04, -8.6229462e-05, -1.2574486e-04, ...,\n", + " 2.4963025e-05, 4.4582037e-05, 4.7964921e-05], dtype=float32), 1.9733333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_105.wav', 'Wo denn?', 8, array([-4.0845240e-05, 1.0149255e-04, 5.9910049e-05, ...,\n", + " -3.8421931e-05, 2.8110459e-05, 1.7339922e-05], dtype=float32), 0.9493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_106.wav', 'Du sitzt hinten.', 16, array([ 1.1350374e-04, 1.3197908e-04, 5.9344729e-05, ...,\n", + " -1.6409816e-04, -7.1399249e-05, -4.2459251e-05], dtype=float32), 1.44)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_112.wav', 'Das kann ich nicht.', 19, array([-9.4199102e-05, -3.3980414e-05, 9.0330948e-05, ...,\n", + " 1.1509175e-04, 2.2319029e-05, 5.1328014e-05], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_139.wav', 'Das hat sie gelernt.', 20, array([ 1.5456244e-04, 3.1872053e-04, 3.7880472e-04, ...,\n", + " -8.6764321e-06, -1.7240205e-05, -5.7155878e-05], dtype=float32), 1.4826666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_153.wav', 'Nicht alle Teenager sind so.', 28, array([7.9220721e-05, 5.8759109e-05, 1.1493213e-04, ..., 6.8786328e-05,\n", + " 
1.5815135e-04, 8.5130850e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_156.wav', 'Frische Seeluft macht gesund.', 29, array([ 1.8124521e-04, 1.7306159e-04, 5.9669415e-05, ...,\n", + " 4.9480139e-05, 1.2296322e-04, -5.5897519e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_164.wav', 'Gönn dir!', 10, array([ 5.2993961e-05, 2.8179937e-05, 7.8242076e-05, ...,\n", + " -4.9057824e-05, 1.8003910e-05, 8.8817593e-05], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_176.wav', 'Sag ich doch!', 13, array([ 4.2398951e-05, 5.6847359e-05, 7.0788061e-05, ...,\n", + " -3.2739328e-05, 9.7135853e-05, 6.0795941e-05], dtype=float32), 1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_177.wav', 'Das darf doch nicht wahr sein.', 30, array([-5.1426803e-05, -5.0517308e-05, 4.6803252e-05, ...,\n", + " -8.1146150e-05, 2.9068062e-05, 7.5193479e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_199.wav', 'Jetzt sind wir quitt.', 21, array([-2.4918138e-05, 8.0159109e-05, -7.1328832e-05, ...,\n", + " -2.1099215e-04, -3.0862509e-05, -3.5725458e-05], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_207.wav', 'Eben ging das noch.', 19, array([-5.0324921e-05, 1.3549793e-04, -3.3347860e-05, ...,\n", + " 9.8024408e-05, 1.5384333e-04, 1.5966935e-04], dtype=float32), 1.53875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_208.wav', 'Bug oder Feature?', 17, array([-3.7243055e-06, 6.9413843e-05, 7.5392752e-05, ...,\n", + " 5.2070121e-05, 2.8219682e-05, 8.4193009e-05], dtype=float32), 1.8053020833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_274.wav', 'Wir brauchen mehr davon!', 24, array([-2.0753406e-04, -1.9484414e-05, -2.8117347e-04, ...,\n", + " 1.2726737e-04, 2.6360145e-04, 
2.9073044e-04], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_280.wav', 'Lass uns raus gehen.', 20, array([ 1.03469618e-04, 1.97744346e-04, -7.93442814e-06, ...,\n", + " 8.44921742e-05, 2.30915975e-05, -1.33781205e-05], dtype=float32), 1.5508645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_286.wav', 'SchluÃ\\x9f mit lustig.', 19, array([ 2.99623178e-04, 2.43378381e-04, 1.65333462e-04, ...,\n", + " -2.71533063e-05, 7.85075972e-05, -1.17198346e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_302.wav', 'Woher nehmt ihr eure Bildung?', 29, array([1.7700881e-04, 2.1893253e-04, 1.3036304e-04, ..., 1.3868474e-04,\n", + " 1.0062666e-04, 8.4173589e-05], dtype=float32), 1.9749270833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_308.wav', 'Du fährst, ich schieÃ\\x9fe!', 25, array([1.5563566e-04, 1.4856170e-04, 2.2446582e-04, ..., 6.8505600e-05,\n", + " 2.0769508e-04, 1.1925176e-04], dtype=float32), 1.99915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_328.wav', 'Wirkt die Betäubung noch?', 26, array([-8.7537330e-05, -3.0825776e-04, -2.8424736e-04, ...,\n", + " 1.1261477e-04, 2.0012977e-04, 1.0000553e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_385.wav', 'Es kann nur einen geben!', 24, array([-1.8947560e-04, -2.3450297e-05, -1.2145152e-04, ...,\n", + " -6.9378242e-05, -1.1301338e-04, -2.5457976e-04], dtype=float32), 1.8901145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_400.wav', 'Wer weiÃ\\x9f es?', 13, array([ 8.2401210e-05, 1.2261249e-05, 1.3193028e-04, ...,\n", + " -9.9374527e-05, -2.4473227e-05, 7.3499345e-05], dtype=float32), 1.49028125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_406.wav', 'Tja, das ist Pech.', 18, array([2.4313416e-04, 4.7331341e-05, 1.6022228e-04, ..., 
3.0806483e-04,\n", + " 2.9170502e-04, 3.0395557e-04], dtype=float32), 1.7810729166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_412.wav', 'Alles muss raus.', 16, array([2.3146431e-04, 2.1641712e-04, 1.4716707e-04, ..., 1.4341300e-04,\n", + " 3.7975753e-06, 9.1287213e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_415.wav', 'Stell die Heizung höher.', 25, array([-3.96930409e-05, 1.02812344e-04, 1.21250734e-04, ...,\n", + " -3.47016321e-05, -2.01824150e-04, -9.76954325e-05], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_418.wav', 'Etwa über mich?', 16, array([-0.00020996, -0.00011494, -0.00010331, ..., -0.00017556,\n", + " -0.00020319, -0.00027111], dtype=float32), 1.7689479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_421.wav', 'Das ist natürlich bitter.', 26, array([-3.3627803e-04, -2.5203897e-04, -2.3072124e-04, ...,\n", + " 4.6018063e-06, 1.7239379e-05, 4.0267703e-05], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_440.wav', 'Hier knicken.', 13, array([-0.000481 , -0.00023708, -0.00018911, ..., -0.00022185,\n", + " -0.00025873, -0.00026997], dtype=float32), 1.30853125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_464.wav', 'Alles Lügen!', 13, array([-0.00027017, -0.00016623, -0.00022159, ..., -0.00033337,\n", + " -0.00044782, -0.00022404], dtype=float32), 1.4175833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_465.wav', 'Alles oder nichts!', 18, array([2.8375158e-05, 6.5034241e-05, 9.6457785e-05, ..., 1.0699107e-04,\n", + " 9.6596435e-05, 1.2572719e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_467.wav', 'Warum bleibst du stehen?', 24, array([-1.4808709e-04, -1.8631479e-04, -1.2836477e-04, ...,\n", + " -6.0794730e-05, -1.5104183e-05, -2.5347929e-04], dtype=float32), 1.91434375)\n", 
+ "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_473.wav', 'Zumindest ein bisschen.', 23, array([-0.00024013, -0.00025727, -0.00025987, ..., -0.00023257,\n", + " -0.00033333, -0.00025996], dtype=float32), 1.5993229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_474.wav', 'Sprich mir nach!', 16, array([-1.7584162e-04, -1.6248986e-04, -8.6785782e-05, ...,\n", + " 3.5318243e-04, 3.7314874e-04, 3.2366288e-04], dtype=float32), 1.4175833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_500.wav', 'Sehr witzig!', 12, array([ 7.5077987e-05, 1.1926649e-04, 1.8323194e-04, ...,\n", + " -3.8680941e-04, -3.2216642e-04, -3.3234112e-04], dtype=float32), 1.39334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_502.wav', 'Achtung, Achtung!', 17, array([-4.0950408e-04, -2.9606355e-04, -3.7786187e-04, ...,\n", + " -2.1742952e-05, 3.0543149e-05, 8.8129680e-05], dtype=float32), 1.5145104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_512.wav', 'Wo bitte schön steht das?', 26, array([ 2.2647387e-04, 1.4740237e-04, 1.2381608e-04, ...,\n", + " -1.1670060e-04, -5.8438465e-05, -5.2704141e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_513.wav', 'SchlieÃ\\x9fen Sie bitte die Luke.', 30, array([ 0.00012086, 0.00019177, 0.00012352, ..., -0.00014259,\n", + " -0.00024671, -0.00014045], dtype=float32), 1.69625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_549.wav', 'Ich hasse meinen Wecker.', 24, array([-1.9575720e-05, -1.5009989e-04, -1.6873972e-04, ...,\n", + " -6.5268898e-05, -1.8595096e-04, -1.7330179e-04], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_576.wav', 'Nicht so laut!', 14, array([-1.6541444e-04, -8.3816949e-06, -1.0135791e-04, ...,\n", + " 3.1510697e-04, 4.1878404e-04, 3.6531710e-04], dtype=float32), 1.4539375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_595.wav', 'Ich tu mein Bestes.', 19, array([ 8.3501960e-05, 1.7197721e-04, 2.2250456e-04, ...,\n", + " -1.2569079e-04, -1.3276993e-04, -2.5823418e-04], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_597.wav', 'Alle guten Dinge sind drei.', 27, array([-1.1909505e-05, -8.7172106e-05, -1.2401433e-04, ...,\n", + " -1.4987224e-04, -1.3219267e-05, -7.9211000e-05], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_607.wav', 'Welche Vase?', 12, array([-1.8119848e-04, -2.7736003e-04, -1.8833524e-04, ...,\n", + " 5.6385907e-05, 1.3869893e-04, 1.9968288e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_651.wav', 'Zeig mal deine Muckis.', 22, array([-0.00038406, -0.0003124 , -0.00026326, ..., 0.00032153,\n", + " 0.00029355, 0.0004676 ], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_661.wav', 'Wir sind umzingelt.', 19, array([ 4.0317194e-05, 2.1714004e-04, 1.5210512e-04, ...,\n", + " 1.1821459e-04, 9.8579549e-05, -3.1008281e-06], dtype=float32), 1.57509375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_669.wav', 'Du zitterst ja!', 15, array([-0.0002655 , -0.00018808, -0.00023504, ..., 0.00028222,\n", + " 0.00025013, 0.00041103], dtype=float32), 1.2116145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_687.wav', 'Ob sie schon Hunger haben?', 26, array([-7.1925861e-05, 1.8567745e-06, -5.7103756e-05, ...,\n", + " 2.6770154e-04, 7.6355340e-05, 2.2662200e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_699.wav', 'Das wird schon wieder.', 22, array([-2.5816666e-04, -8.4095438e-05, -1.2401373e-05, ...,\n", + " -1.9085000e-04, -2.3972438e-04, -1.5835713e-04], dtype=float32), 1.69625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_700.wav', 'Köpfe runter!', 14, array([ 8.14295272e-05, 1.14302085e-04, 1.28549975e-04, ...,\n", + " -2.10746948e-04, -2.65351351e-04, -3.40027531e-04], dtype=float32), 1.32065625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_712.wav', 'Sie sollten sich schämen!', 26, array([ 2.6346499e-04, 9.5443167e-05, 1.6159609e-04, ...,\n", + " -2.1241463e-04, -1.5395934e-04, -8.9938527e-05], dtype=float32), 1.6477916666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_732.wav', 'Schwund ist überall.', 21, array([-0.00039054, -0.00025168, -0.00026237, ..., 0.00020222,\n", + " 0.0002156 , 0.00019633], dtype=float32), 1.6356666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_734.wav', 'Schon fertig?', 13, array([-6.8748363e-06, 5.9082297e-05, -3.8726441e-05, ...,\n", + " -1.3909466e-04, -2.0350730e-04, -1.0977411e-04], dtype=float32), 1.2237291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_743.wav', 'Musst du da reinschieÃ\\x9fen?', 26, array([0.00038867, 0.00026221, 0.0002308 , ..., 0.0001513 , 0.00017203,\n", + " 0.00012958], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_764.wav', 'Das wäre mir neu.', 18, array([-1.6335897e-04, -1.3920359e-04, -6.9949492e-05, ...,\n", + " 3.2939854e-05, 3.5769459e-05, -3.7220154e-05], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_773.wav', 'Mission gescheitert!', 20, array([ 5.22215014e-05, 1.20894714e-04, 1.96668057e-04, ...,\n", + " -2.58956774e-04, -1.39872835e-04, -1.39142721e-04], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_782.wav', 'Dir kann geholfen werden.', 25, array([-5.4091932e-05, -2.9271763e-05, 1.2364880e-04, ...,\n", + " -1.4125406e-04, -2.3545137e-04, -2.5170582e-04], dtype=float32), 1.7810625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_790.wav', 'Vertraust du mir blind?', 23, array([-1.3496955e-04, -4.5282133e-05, 1.7263924e-04, ...,\n", + " 1.0330205e-05, -1.9022463e-04, -1.3715150e-04], dtype=float32), 1.6235520833333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_793.wav', 'Wie stellen Sie sich das vor?', 29, array([5.7090012e-05, 9.3246163e-05, 1.4314597e-04, ..., 1.8600497e-04,\n", + " 1.2342732e-04, 2.2610810e-04], dtype=float32), 1.8901145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_802.wav', 'Ist es nicht so?', 16, array([ 8.5881460e-05, 1.9039282e-04, 2.1635044e-04, ...,\n", + " 1.2600829e-04, 4.5968747e-05, -1.7667595e-05], dtype=float32), 1.4297083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_808.wav', 'Willst du mich umbringen?', 25, array([3.4704231e-04, 2.2213293e-04, 1.1007244e-04, ..., 1.0426929e-05,\n", + " 6.0499657e-05, 4.4495686e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_817.wav', 'Da ist die Tür!', 16, array([ 0.00014472, 0.00027025, 0.00040617, ..., -0.0001791 ,\n", + " -0.00014576, -0.00017543], dtype=float32), 1.7931770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_820.wav', 'Ihr könnt nicht fliehen.', 25, array([ 3.3208958e-04, 1.8373384e-04, 2.8849186e-05, ...,\n", + " -1.9994991e-04, -4.2732576e-05, 5.1437601e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_8.wav', 'Erkennst du ihn wieder?', 23, array([-7.1132112e-05, 1.8191178e-04, 2.2640963e-04, ...,\n", + " -1.5948209e-04, -4.8810096e-05, -7.1736489e-05], dtype=float32), 1.69625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_29.wav', 'Du bist so ein Charmeur!', 24, array([ 8.7156383e-05, -7.5441625e-05, -8.7413508e-05, ...,\n", + " -3.7287452e-04, -2.6756592e-04, -2.7199855e-04], dtype=float32), 1.99915625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_30.wav', 'Das Wochenende war sehr schón.', 31, array([0.00010696, 0.00019241, 0.00022398, ..., 0.00018996, 0.00018264,\n", + " 0.00021606], dtype=float32), 1.9506875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_44.wav', 'Na, GroÃ\\x9fer!', 12, array([2.9556373e-05, 1.2606342e-04, 2.0366564e-04, ..., 1.8486078e-04,\n", + " 1.2593277e-04, 1.4429759e-04], dtype=float32), 1.7083645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_53.wav', 'Lassen wir das!', 15, array([-1.0015550e-03, -1.1123064e-03, -1.0633026e-03, ...,\n", + " -8.7814760e-06, 1.5665671e-04, 2.6885752e-04], dtype=float32), 1.2843020833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_57.wav', 'Es kann jeden treffen.', 22, array([-1.2930187e-04, -3.5622310e-05, 1.1325534e-04, ...,\n", + " 2.8466255e-05, -1.7107872e-04, -3.0454184e-04], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_61.wav', 'Das dürfen Sie nicht!', 22, array([7.9696401e-05, 2.5238540e-05, 2.6919068e-05, ..., 2.0004300e-04,\n", + " 1.7159608e-04, 2.0384404e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_69.wav', 'Oder muss man die einfrieren?', 29, array([ 2.3387831e-04, 2.0287969e-04, 2.3305746e-04, ...,\n", + " -2.0109433e-04, -1.5938835e-04, 1.9864538e-06], dtype=float32), 1.9628020833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_75.wav', 'Nur für einen Tag.', 19, array([ 1.1010072e-04, 7.5059768e-05, 1.5811465e-04, ...,\n", + " -1.6034159e-04, 6.0707155e-09, -5.6600587e-05], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_85.wav', 'Ã\\x9cberall lauern Fallen.', 23, array([ 7.3672440e-05, 1.1084337e-04, 5.4723707e-05, ...,\n", + " -3.4976221e-04, -1.6772485e-04, -2.3993225e-04], dtype=float32), 1.7931875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_89.wav', 'Schön, dass du da warst.', 25, array([-1.5644990e-04, -1.6062504e-04, -1.5125731e-04, ...,\n", + " -1.4215022e-04, -3.6906120e-05, -1.1689674e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_104.wav', 'Bleib, wo du bist!', 18, array([-6.7565779e-05, -2.1604590e-06, 1.6737869e-04, ...,\n", + " -5.7721576e-05, -1.0027820e-05, -4.2661872e-05], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_113.wav', 'Erwischt!', 9, array([ 8.53675301e-05, -1.39195807e-04, -1.12849986e-04, ...,\n", + " -6.49508947e-05, -6.88307264e-05, -2.25101539e-04], dtype=float32), 1.06621875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_119.wav', 'Dann lass es liegen.', 20, array([-1.4928725e-04, 2.6696865e-05, -8.1158723e-05, ...,\n", + " 1.0134692e-04, 7.8540448e-05, -3.6887606e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_125.wav', \"Mach 'ne Fliege!\", 16, array([-4.2133670e-05, -4.1710995e-05, -9.2710856e-05, ...,\n", + " 6.1932937e-05, 5.9015078e-05, 1.2269965e-04], dtype=float32), 1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_132.wav', 'Bei wem?', 8, array([ 3.2050626e-05, -1.8802975e-05, 6.2951531e-06, ...,\n", + " 3.6152644e-05, 5.9682232e-05, 1.7530509e-04], dtype=float32), 1.2479583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_164.wav', 'Einer reicht.', 13, array([-2.7248763e-05, -1.8096254e-04, -6.8749752e-05, ...,\n", + " -5.8457640e-06, -6.7224923e-06, -2.3102484e-05], dtype=float32), 1.5145104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_167.wav', 'Komm du mal hier her!', 21, array([-1.5554769e-04, 3.7891259e-06, 4.7066398e-05, ...,\n", + " -2.3639805e-05, 2.0737947e-05, 4.9913662e-05], dtype=float32), 1.9022291666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_187.wav', 'Die Dämmerung bricht an.', 25, array([-1.3250955e-06, 2.9998255e-05, 7.1768205e-05, ...,\n", + " 8.1620914e-05, -2.1789680e-05, -2.0792277e-04], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_209.wav', 'Ich will nur mal gucken!', 24, array([ 2.0323754e-05, -4.8527312e-05, 7.2813884e-05, ...,\n", + " 5.2759733e-05, -1.1957207e-05, -4.8190817e-05], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_219.wav', 'Weg damit!', 10, array([-3.5334317e-05, -1.1389485e-04, -8.2927254e-05, ...,\n", + " 9.7957432e-05, 2.3025880e-04, 8.2124512e-05], dtype=float32), 0.9935208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_228.wav', 'Der kleine Tümpel?', 19, array([-1.11052366e-04, -1.58417228e-04, 1.12858004e-04, ...,\n", + " -7.95750821e-05, 1.25983679e-05, 3.80305464e-05], dtype=float32), 1.7810729166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_261.wav', 'Danke vielmals!', 15, array([-1.0886707e-04, -2.8663597e-04, -2.3995244e-04, ...,\n", + " -9.9315126e-05, -1.0518550e-04, 8.9717643e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_262.wav', 'Greifen Sie zu!', 15, array([ 1.7402765e-04, 5.4675427e-05, -2.1378555e-05, ...,\n", + " -3.0241612e-05, -1.6510607e-05, 1.9972253e-05], dtype=float32), 1.5145208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_270.wav', 'Sein Telefon ist verwanzt.', 26, array([ 1.7227376e-05, 1.3369569e-04, 2.4036576e-04, ...,\n", + " -1.2941840e-04, -7.5057469e-05, 4.6790487e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_280.wav', 'Das kann ich nicht gutheiÃ\\x9fen.', 30, array([-2.46016367e-04, -1.46169405e-04, -1.01338104e-04, ...,\n", + " -2.12353916e-06, -4.44089965e-05, 4.71521271e-05], dtype=float32), 
1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_284.wav', 'Nicht im Geringsten.', 20, array([ 6.9896785e-05, 4.9565413e-05, -5.2745858e-05, ...,\n", + " 4.9021692e-05, 4.1371193e-05, -4.8943206e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_290.wav', 'Magst du Sushi?', 15, array([ 4.5281922e-06, -7.7349956e-05, -9.6111427e-05, ...,\n", + " 6.7945102e-06, 5.8605725e-05, -4.7947608e-05], dtype=float32), 1.5993229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_294.wav', 'Ich hätte warten sollen.', 25, array([ 1.3215349e-05, 2.5886698e-05, 9.2406181e-06, ...,\n", + " 3.3613727e-05, -7.8962090e-05, 3.6267331e-05], dtype=float32), 1.5872083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_300.wav', 'Vielen Dank für den Hinweis.', 29, array([ 1.21899466e-04, 1.44075893e-04, 1.06153289e-04, ...,\n", + " 1.94679887e-04, -1.92022708e-05, -8.20819259e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_306.wav', 'Her mit dem Zaster!', 19, array([ 9.2032889e-05, -7.7123856e-05, 1.8857928e-06, ...,\n", + " 5.2272848e-05, 1.2463648e-04, -4.8004724e-05], dtype=float32), 1.90221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_324.wav', 'Moment mal!', 11, array([-9.6486969e-05, -8.5642452e-05, 1.3726056e-05, ...,\n", + " 3.6692109e-05, 2.4882122e-05, -5.4820499e-05], dtype=float32), 1.2721875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_332.wav', 'Lass es sein.', 13, array([-6.2611114e-05, 8.5420121e-05, 1.1575574e-06, ...,\n", + " 1.8824625e-05, 2.6618896e-05, 5.5844474e-05], dtype=float32), 1.4296979166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_334.wav', 'Wir kommen bei ihnen vorbei.', 28, array([ 3.2983281e-04, 5.1712846e-05, -1.6061698e-04, ...,\n", + " 8.1734914e-05, -2.4410097e-05, 1.5291570e-04], 
dtype=float32), 1.99915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_350.wav', 'Es fehlt nicht mehr viel.', 25, array([ 3.4581102e-05, -3.2403619e-05, 6.4223466e-05, ...,\n", + " -4.1160070e-05, 2.3247363e-05, 1.4443042e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_351.wav', 'So entdeckt man Fehler.', 23, array([-1.5804017e-05, -7.4724383e-05, 1.1222719e-05, ...,\n", + " 4.8898462e-05, 3.6749603e-05, -3.3983986e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_356.wav', 'Salve!', 6, array([-1.3447071e-04, 5.3523188e-05, 8.5717998e-05, ...,\n", + " 4.4749868e-05, -5.5393906e-05, 1.0913220e-05], dtype=float32), 1.0056354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_374.wav', 'Angeber!', 8, array([ 4.8461781e-05, 1.5487269e-04, 9.4685849e-05, ...,\n", + " -1.4769383e-04, -1.8351457e-05, -1.8764535e-05], dtype=float32), 1.1146875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_376.wav', 'Wer duckt sich da weg?', 22, array([ 8.9025889e-05, 2.0651723e-04, -8.5901571e-05, ...,\n", + " 8.8148518e-05, 1.3756873e-04, 1.2379605e-04], dtype=float32), 1.6356770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_396.wav', 'Schlaf schön.', 14, array([ 1.56835347e-04, 2.10795515e-05, 6.19498023e-05, ...,\n", + " -4.29836909e-05, -1.05784595e-04, 4.19116714e-06], dtype=float32), 1.1631458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_22.wav', 'Eindeutig nein.', 15, array([ 1.7040480e-06, -2.4771760e-05, 2.0656289e-05, ...,\n", + " -4.9639581e-05, -6.2789266e-05, -6.4883228e-05], dtype=float32), 1.885)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_26.wav', 'Sie nickte.', 11, array([ 1.3571361e-04, 1.4810856e-04, 1.6444136e-04, ...,\n", + " -8.4158353e-05, -6.3345658e-05, -6.6707049e-05], dtype=float32), 1.3556458333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_34.wav', 'Von wegen Rabenmutter!', 22, array([ 3.9614300e-05, 3.0917236e-05, 1.4100775e-05, ...,\n", + " 3.3664131e-05, -3.6520869e-05, -5.6032222e-05], dtype=float32), 1.7171666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_35.wav', 'Woran liegt das?', 16, array([ 1.03992148e-04, 8.12370126e-05, 1.09074477e-04, ...,\n", + " 5.26995609e-05, -2.80062741e-05, -1.37729285e-05], dtype=float32), 1.4718541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_68.wav', 'Das ist schlecht fürs Geschäft.', 33, array([ 3.3433552e-04, 4.7215325e-04, 3.9332887e-04, ...,\n", + " -3.3291522e-05, -7.3073941e-05, -6.2871884e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_70.wav', 'Das überlege ich mir noch.', 27, array([-2.7926452e-04, -4.7232458e-04, -4.5905521e-04, ...,\n", + " -5.0401053e-05, -7.6573851e-05, -1.9868592e-05], dtype=float32), 1.7688125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_71.wav', 'Oder er behält ihn.', 20, array([-1.95691573e-05, 1.42454119e-05, -1.12822245e-05, ...,\n", + " 6.27729896e-05, 6.37731318e-06, 7.33020497e-05], dtype=float32), 1.7429791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_76.wav', 'Viel Vergnügen!', 16, array([-1.4641756e-04, -2.3690579e-04, -2.0291538e-04, ...,\n", + " -6.4597036e-05, -3.9596798e-05, -5.9615340e-05], dtype=float32), 1.2975625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_82.wav', 'Sehr schön erklärt.', 21, array([ 2.8675116e-04, 4.4330378e-04, 3.8435950e-04, ...,\n", + " 9.6497361e-06, 3.9338884e-06, -3.2766162e-05], dtype=float32), 1.5880625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_104.wav', 'Du bist nicht fair.', 19, array([ 8.8045017e-05, 1.6864744e-04, 1.3682757e-04, ...,\n", + " -9.7046555e-05, -1.7125324e-04, -8.5282416e-05], dtype=float32), 1.5105833333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_119.wav', 'Die in Pulverform.', 18, array([ 2.4460370e-04, 3.1504090e-04, 2.7829470e-04, ...,\n", + " 4.2608990e-05, -1.4765085e-05, -1.9486206e-05], dtype=float32), 1.7817083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_141.wav', 'Hier machen wir einen Schnitt.', 30, array([ 1.7057944e-04, 2.5346698e-04, 2.6541931e-04, ...,\n", + " -5.5827346e-05, -5.5662604e-05, -4.4612902e-05], dtype=float32), 1.6074166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_161.wav', 'Ganz und gar nicht!', 19, array([-1.09976885e-04, -1.06159037e-04, -9.40025275e-05, ...,\n", + " 5.14636531e-06, -7.86106375e-06, -1.38592986e-05], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_162.wav', 'Du schnarchst.', 14, array([ 9.8031691e-05, 1.0789345e-04, 1.0408189e-04, ...,\n", + " 2.8527650e-06, 1.8555178e-05, -1.7833072e-05], dtype=float32), 1.2911041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_175.wav', 'Die in der zweiten Reihe.', 25, array([-7.3503418e-04, -1.0330433e-03, -9.6690352e-04, ...,\n", + " 1.0845856e-04, 9.5128053e-05, 1.3117766e-04], dtype=float32), 1.6590625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_185.wav', 'Viertel nach neun.', 18, array([-0.00025316, -0.00042128, -0.00041847, ..., 0.00012852,\n", + " 0.00010431, 0.00010823], dtype=float32), 1.794625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_214.wav', 'Der hat gut reden!', 18, array([-3.1999915e-04, -4.8188152e-04, -4.3341244e-04, ...,\n", + " 7.4479853e-05, 1.0070496e-04, 9.9988407e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_230.wav', \"Was gibt's denn?\", 16, array([ 5.3894956e-04, 7.5124111e-04, 6.7086820e-04, ...,\n", + " -4.5820485e-05, -5.6413213e-05, -2.6967809e-05], dtype=float32), 1.6719791666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_233.wav', 'Fahren Sie bitte schneller.', 27, array([ 4.8254660e-04, 7.2192971e-04, 6.9296843e-04, ...,\n", + " -3.3325745e-05, 1.5315249e-05, 3.6237780e-05], dtype=float32), 1.8204583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_243.wav', 'Keine Ursache!', 14, array([-2.2485174e-04, -3.4637007e-04, -2.4121681e-04, ...,\n", + " -5.3969983e-05, -1.2160699e-05, -7.7381246e-06], dtype=float32), 1.2588333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_245.wav', 'Ich glaube, es geht los.', 24, array([-7.1201968e-05, -1.1457155e-04, -8.4426887e-05, ...,\n", + " 6.9712019e-05, 1.4468420e-05, 7.2575887e-05], dtype=float32), 1.8398125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_266.wav', 'Nicht sehr lange.', 17, array([-4.25688399e-04, -5.72862104e-04, -4.54291090e-04, ...,\n", + " 1.15649045e-05, -7.03342175e-06, 9.42021143e-06], dtype=float32), 1.34275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_268.wav', 'Fahr vorsichtig!', 16, array([ 4.7249952e-05, 6.6685003e-05, 8.1438702e-05, ...,\n", + " -7.7767829e-05, -4.4103599e-05, -3.7954072e-05], dtype=float32), 1.4589375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_278.wav', 'Dann aber mit Fanfare.', 22, array([ 3.0009818e-04, 5.0011458e-04, 4.6210812e-04, ...,\n", + " -1.1364354e-04, -6.8604320e-05, -7.7980949e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_280.wav', 'Habe ich doch!', 14, array([-0.00020733, -0.00032169, -0.00027389, ..., -0.00016337,\n", + " -0.00020018, -0.00013392], dtype=float32), 1.233)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_311.wav', 'Regnet es drauÃ\\x9fen?', 19, array([ 7.4172771e-04, 9.9716149e-04, 9.2472351e-04, ...,\n", + " -1.0082213e-04, -1.2750884e-04, -8.1061611e-05], dtype=float32), 1.8721041666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_316.wav', 'Das ist eine lange Geschichte.', 30, array([ 2.24287433e-04, 1.93610642e-04, 1.16401294e-04, ...,\n", + " -1.26720734e-05, 2.45919164e-05, 5.34417049e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_342.wav', 'Welches Rad?', 12, array([4.8121543e-05, 4.5563989e-05, 2.0835963e-05, ..., 3.9729348e-05,\n", + " 3.7650581e-05, 3.3080996e-05], dtype=float32), 1.6397083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_367.wav', 'Nichts zu danken!', 17, array([-3.7277619e-05, -4.9238584e-05, -7.1403243e-05, ...,\n", + " -3.3696429e-05, 3.0755796e-06, -3.4646106e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_376.wav', 'Bitte noch einmal!', 18, array([ 0.00030744, 0.00045197, 0.00040104, ..., -0.00010688,\n", + " -0.00015312, -0.00013671], dtype=float32), 1.6267916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_378.wav', 'Immer in diese Richtung!', 24, array([-4.00174977e-05, 3.99114288e-05, 1.92868242e-06, ...,\n", + " -1.14653565e-04, -7.80621922e-05, -3.85478379e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_380.wav', 'Gefällt dir die Farbe rot?', 27, array([ 3.0378540e-04, 4.3046009e-04, 3.8851614e-04, ...,\n", + " 2.1661093e-05, -2.6406319e-06, -1.4788465e-05], dtype=float32), 1.9495625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_393.wav', 'Gib mir mal die Knarre.', 23, array([-4.2177099e-04, -5.7642709e-04, -4.9111585e-04, ...,\n", + " -8.2453604e-05, -1.6147584e-05, -7.7549201e-05], dtype=float32), 1.8075416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_410.wav', 'Einer geht noch.', 16, array([ 1.5394030e-04, 2.1875372e-04, 2.0080485e-04, ...,\n", + " -5.6117624e-05, -5.4007505e-05, -1.0993878e-05], dtype=float32), 1.3169166666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_418.wav', 'Setzen Sie sich!', 16, array([-2.4320874e-05, -3.2748470e-05, -2.0884192e-05, ...,\n", + " 6.3705025e-05, 1.3131127e-04, 7.7887824e-05], dtype=float32), 1.4976875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_423.wav', 'Es geht ja nicht anders.', 24, array([-2.8326374e-04, -3.8826582e-04, -3.3924755e-04, ...,\n", + " -6.5105633e-05, -6.3098807e-05, -8.6217944e-05], dtype=float32), 1.9495625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_451.wav', 'Kopf hoch!', 10, array([-1.68412909e-04, -1.73757420e-04, -1.55442147e-04, ...,\n", + " -8.23870796e-05, -1.52904060e-04, -1.15380506e-04], dtype=float32), 1.3685625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_458.wav', 'Endlich geht es weiter!', 23, array([-1.5851386e-03, 2.6465717e-03, 5.2893539e-03, ...,\n", + " 3.7729558e-06, 3.5277069e-05, -3.3997758e-06], dtype=float32), 1.70425)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_460.wav', 'Schluss mit der Raserei!', 24, array([ 2.88873882e-04, 4.21624194e-04, 4.14417736e-04, ...,\n", + " -1.55140384e-04, -1.10896304e-04, -8.53765887e-05], dtype=float32), 1.6526041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_470.wav', 'Der Kerl ist dufte.', 19, array([-7.1235799e-04, -1.0205780e-03, -9.3518692e-04, ...,\n", + " -1.5202124e-04, -1.4708345e-04, -7.5756463e-05], dtype=float32), 1.9624791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_475.wav', 'Nicht hauen!', 12, array([-3.9160714e-04, -5.2419491e-04, -4.0734027e-04, ...,\n", + " -3.5391298e-05, -1.9862022e-05, -4.2017076e-05], dtype=float32), 1.613875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_489.wav', 'Davon ist auszugehen.', 21, array([-3.8098158e-05, -1.8117305e-05, -9.3444651e-05, ...,\n", + " -4.6410118e-05, -5.4083579e-05, -6.1566949e-05], dtype=float32), 1.8591875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_490.wav', 'Ã\\x84ndern wir das!', 16, array([-3.1039584e-04, -5.0911406e-04, -3.8009215e-04, ...,\n", + " -1.0358073e-05, 2.3063526e-06, -3.8572562e-05], dtype=float32), 1.4847708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_537.wav', 'Viel hilft viel.', 16, array([-7.0020906e-04, -9.7590697e-04, -8.4232452e-04, ...,\n", + " 2.6748754e-05, 3.9436178e-05, -1.5542679e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_561.wav', 'Voll abgezogen!', 15, array([0.0009425 , 0.00131688, 0.00114336, ..., 0.00054311, 0.00053014,\n", + " 0.00059172], dtype=float32), 1.304)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_569.wav', 'Was ist Liebe?', 14, array([ 1.7119097e-04, 2.4002905e-04, 1.4028113e-04, ...,\n", + " -1.1777198e-05, 4.3154125e-07, 1.1548834e-05], dtype=float32), 1.3814791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_613.wav', 'Bitte wenden Sie.', 17, array([-3.2704248e-04, -4.7001868e-04, -4.4811977e-04, ...,\n", + " 3.9887604e-05, 4.2593329e-05, -1.2635800e-05], dtype=float32), 1.5751458333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_618.wav', \"Das spar'n wir uns jetzt.\", 25, array([-2.1968294e-04, -2.5130660e-04, -2.3470224e-04, ...,\n", + " 4.6512545e-05, 1.0168094e-04, 8.9639499e-05], dtype=float32), 1.9882916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_625.wav', 'Doppelt hält besser.', 21, array([1.0242802e-04, 1.4422902e-04, 1.5433358e-04, ..., 1.8618872e-05,\n", + " 2.6657151e-05, 8.0320706e-06], dtype=float32), 1.3169166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_645.wav', 'Die beiden werden bestimmt schwer.', 34, array([ 2.2716461e-04, 3.7214963e-04, 3.4043228e-04, ...,\n", + " -7.0017355e-05, -5.9255068e-05, -4.9753759e-05], dtype=float32), 1.975375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_650.wav', 'Dort steppt der Bär.', 21, array([3.4111144e-05, 3.9471229e-06, 1.3943841e-05, ..., 2.8798750e-04,\n", + " 3.4306329e-04, 2.3900693e-04], dtype=float32), 1.8204583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_658.wav', 'Offensichtlich nicht.', 21, array([-6.4643849e-05, -1.4843927e-04, -1.9616121e-04, ...,\n", + " 6.0427959e-05, 2.8176541e-05, 1.0887287e-04], dtype=float32), 1.8462708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_690.wav', 'Ganz sicher.', 12, array([-2.1219352e-04, -2.6916104e-04, -2.2152660e-04, ...,\n", + " -8.3999286e-05, -3.9927592e-05, -1.1057539e-04], dtype=float32), 1.4460208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_734.wav', 'Bin ich ein Mensch?', 19, array([7.3225739e-05, 8.4229468e-05, 6.0397753e-05, ..., 1.4409037e-04,\n", + " 5.4610227e-05, 2.8432718e-05], dtype=float32), 1.8721041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_737.wav', 'Wohl bekommts.', 14, array([3.8544985e-04, 5.4862851e-04, 4.7615587e-04, ..., 1.1308860e-05,\n", + " 1.5347328e-05, 3.9165672e-05], dtype=float32), 1.5880416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_747.wav', 'So eine will ich auch.', 22, array([-0.00023466, -0.00034498, -0.00035786, ..., 0.00014857,\n", + " 0.00014895, 0.00018565], dtype=float32), 1.9366458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_751.wav', 'Guter Rat ist teuer.', 20, array([-5.8482616e-05, -9.7700511e-05, -1.4372601e-04, ...,\n", + " 8.8569423e-06, 4.0626270e-05, -2.2441051e-05], dtype=float32), 1.885)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_785.wav', 'Noch Fragen?', 12, array([-5.4637530e-05, -9.7329437e-05, -6.5443433e-05, ...,\n", + " 1.3526098e-05, -1.7008400e-05, -2.3395469e-05], dtype=float32), 1.542875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_790.wav', 'Wie tut man das?', 16, array([ 1.4673925e-06, -7.7766053e-06, 2.2737586e-05, ...,\n", + " -2.2371720e-04, -2.6603421e-04, -2.1358255e-04], dtype=float32), 1.6009583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_794.wav', 'Ein billiger Trick.', 19, array([ 2.5642010e-05, 5.9448335e-05, 7.9047953e-05, ...,\n", + " -1.4398795e-05, -2.7475784e-05, -3.0437941e-05], dtype=float32), 1.4912291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_1.wav', 'Woher soll ich sie kennen?', 26, array([-7.0670452e-05, -2.2751169e-04, 3.6274258e-05, ...,\n", + " 6.2137144e-05, -1.4069478e-04, 1.5651318e-04], dtype=float32), 1.865875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_4.wav', 'Wo soll es hingehen?', 20, array([-4.3062766e-05, 6.9635964e-05, 2.7200711e-05, ...,\n", + " 7.3389943e-05, 9.7813630e-05, 7.5023250e-05], dtype=float32), 1.526625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_5.wav', 'Ã\\x84tsch!', 7, array([-5.5343335e-05, -1.0754153e-04, 1.0636374e-04, ...,\n", + " -2.3993191e-04, -1.1428300e-04, -1.9587418e-04], dtype=float32), 1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_24.wav', 'Den mit dem Hund.', 17, array([-2.9083933e-05, -4.5743432e-06, -1.1590145e-04, ...,\n", + " -6.4060594e-05, -5.3663935e-06, -6.9100148e-05], dtype=float32), 1.5993333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_42.wav', 'Sieh mal schnell nach!', 22, array([ 1.1187345e-05, -2.7101662e-04, -4.0457569e-05, ...,\n", + " 3.8478026e-04, 1.3185160e-04, 1.9724603e-04], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_43.wav', 'Zieh Leine!', 11, array([ 1.4326118e-04, 1.4733149e-04, 2.3666536e-04, ...,\n", + " -8.1889502e-06, -2.2159066e-04, -1.0789347e-04], dtype=float32), 1.4115416666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_55.wav', 'Meistens eher nicht.', 20, array([-1.2833555e-04, -4.5777502e-04, -2.9062675e-04, ...,\n", + " 3.7303114e-05, 1.7912805e-04, 9.5502997e-05], dtype=float32), 1.968875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_89.wav', \"Komm Du 'mal hier her!\", 22, array([-2.4593925e-05, -1.5391175e-04, -3.5177112e-05, ...,\n", + " -2.8054212e-05, -8.3761133e-06, -3.3427594e-05], dtype=float32), 1.8840625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_102.wav', 'Keine halben Sachen.', 20, array([-0.00010684, -0.00018609, -0.00036967, ..., 0.00014736,\n", + " 0.00013171, 0.00024668], dtype=float32), 1.7810625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_172.wav', 'Zugriff!', 8, array([-1.7191633e-04, -2.6422989e-04, -1.8970467e-04, ...,\n", + " 1.4085844e-05, -6.5849432e-05, -1.2668260e-04], dtype=float32), 1.3994166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_187.wav', 'Viel SpaÃ\\x9f dabei!', 17, array([ 1.26890489e-04, 4.78873408e-04, 3.36644967e-04, ...,\n", + " -1.14277915e-04, 1.15070587e-04, -4.50995103e-05], dtype=float32), 1.8234791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_189.wav', 'Krass, oder?', 12, array([ 5.2673863e-06, -3.2042470e-05, 6.3032145e-05, ...,\n", + " 4.9474946e-04, 4.8315409e-04, 3.1584961e-04], dtype=float32), 1.4054791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_195.wav', 'Hat es geregnet?', 16, array([-1.5500655e-05, -2.4765370e-05, -1.3535780e-04, ...,\n", + " 1.0218658e-04, -7.7519953e-06, 8.1419450e-05], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_200.wav', 'Die Maschine läuft heiÃ\\x9f.', 26, array([-2.9662095e-05, -1.3571499e-04, -4.9048278e-05, ...,\n", + " 4.0860983e-04, 3.3467117e-04, 2.8713685e-04], dtype=float32), 1.890125)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_232.wav', 'Friss ScheiÃ\\x9fe!', 15, array([-3.1462018e-04, -4.3994249e-04, -1.8601233e-04, ...,\n", + " 1.2004693e-04, 6.4006366e-05, 1.4038217e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_244.wav', 'Wasser marsch!', 14, array([-5.2966818e-05, -1.3111959e-06, -2.3756520e-05, ...,\n", + " -4.7830945e-05, -1.0526282e-04, 5.8504538e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_255.wav', 'Ein Halbstarker!', 16, array([8.9307170e-05, 4.3556365e-04, 5.6998286e-04, ..., 7.5660588e-05,\n", + " 1.9409347e-04, 7.0803260e-05], dtype=float32), 1.6841458333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_266.wav', 'Stell den Fernseher ab!', 23, array([-6.1183324e-05, -1.4089182e-04, -1.1948228e-04, ...,\n", + " -1.9923897e-04, -1.7150129e-04, -2.3940729e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_283.wav', 'Kopf oder Zahl?', 15, array([-1.3454640e-04, -4.2848653e-05, -2.3553993e-04, ...,\n", + " -6.3240882e-06, -5.2672884e-05, -1.6467538e-04], dtype=float32), 1.550875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_300.wav', 'Jetzt verstanden?', 17, array([-1.7701862e-04, 3.8073360e-06, 6.6768931e-05, ...,\n", + " 1.5635177e-04, 2.4184166e-04, 2.0308173e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_305.wav', 'Jeder nur eine Kugel!', 21, array([ 1.0893906e-04, 3.5140860e-05, -8.6934997e-05, ...,\n", + " -1.5842280e-04, -7.1798029e-05, -2.1561602e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_327.wav', 'Leider nein.', 12, array([ 0.00020925, 0.00038225, 0.00030209, ..., -0.0002834 ,\n", + " -0.00024066, -0.000164 ], dtype=float32), 1.0783333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_345.wav', 'Irgendwas ist anders.', 21, array([-0.00026138, -0.00012453, -0.00022627, ..., -0.00013074,\n", + " -0.00016786, -0.00011485], dtype=float32), 1.9991666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_371.wav', 'Ein bisschen.', 13, array([ 6.19466882e-05, 1.81855256e-04, 2.56517378e-04, ...,\n", + " 9.61327260e-06, 2.89863237e-05, -1.07233864e-04], dtype=float32), 1.211625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_375.wav', 'Wir sitzen fest.', 16, array([-1.0016920e-05, -5.8360743e-05, -5.3961080e-06, ...,\n", + " -1.4201126e-07, -8.1081940e-05, -1.3083526e-05], dtype=float32), 1.5326875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_424.wav', 'Hat er nicht gesagt.', 20, array([1.4237937e-04, 3.5439979e-04, 4.2451522e-04, ..., 2.9889754e-05,\n", + " 4.3811939e-05, 5.3790947e-05], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_428.wav', 'Mach das ordentlich!', 20, array([-2.2249017e-04, -3.4736985e-04, -2.4423364e-04, ...,\n", + " -4.8614937e-05, 1.6576583e-04, 1.4303469e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_430.wav', 'Zurück zum Thema.', 18, array([0.00021488, 0.00048195, 0.00039156, ..., 0.00020808, 0.0002092 ,\n", + " 0.00014525], dtype=float32), 1.7023125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_433.wav', 'Auf mich hört sowieso niemand.', 31, array([-4.4078504e-05, 1.2701395e-04, 1.5659831e-04, ...,\n", + " 3.2407068e-05, 1.3882274e-04, 3.7292095e-06], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_434.wav', 'Weiter so!', 10, array([ 7.8434707e-05, 2.3782127e-04, 2.0620505e-04, ...,\n", + " -3.0293613e-06, 7.3579846e-05, 2.1203174e-04], dtype=float32), 0.9571666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_437.wav', 'Darüber herrscht Konsens.', 26, array([ 2.0915098e-04, 1.6340525e-04, -4.4762099e-05, ...,\n", + " 3.0228088e-05, -5.6204710e-05, 1.4202976e-04], dtype=float32), 1.9991666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_440.wav', 'Was ist so schlimm daran?', 25, array([ 5.3402138e-05, -1.7599798e-04, 1.1747003e-04, ...,\n", + " 1.8220089e-04, 2.5114723e-04, 2.9130204e-04], dtype=float32), 1.8053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_447.wav', 'Brüllend komisch!', 18, array([2.5463186e-04, 3.0699532e-04, 1.7949699e-04, ..., 1.3379526e-04,\n", + " 6.0049937e-05, 4.3341170e-05], dtype=float32), 1.4660416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_462.wav', 'Sehr einfallsreich!', 19, array([ 1.6625131e-04, 1.4804797e-04, 6.6010347e-05, ...,\n", + " -2.8519373e-05, -1.5197203e-05, -1.2542940e-04], dtype=float32), 1.6356875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_468.wav', 'Einer fehlt hier noch.', 22, array([0.00021585, 0.0002281 , 0.00034421, ..., 0.00031288, 0.00025684,\n", + " 0.00014126], dtype=float32), 1.5448125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_505.wav', 'Wollen wir?', 11, array([-0.000173 , -0.00033364, -0.00012876, ..., 0.00012244,\n", + " 0.00032144, 0.00014797], dtype=float32), 1.029875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_514.wav', 'Und wir singen zusammen!', 24, array([ 0.00028886, 0.00030063, 0.00037314, ..., -0.00011231,\n", + " -0.00017524, -0.00013442], dtype=float32), 1.890125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_541.wav', 'Hier, fang!', 11, array([-9.6539197e-06, 9.8090044e-05, 7.5100412e-05, ...,\n", + " 1.8568999e-04, 3.1414471e-04, 1.8397035e-04], dtype=float32), 1.5326875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_548.wav', 'Ignorieren Sie die Warnung nicht.', 
33, array([-7.0703449e-05, -2.1341034e-06, -2.6835096e-05, ...,\n", + " 1.0051801e-04, 6.5389222e-06, 2.1216212e-04], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_558.wav', 'Nirgends ist ein Ausweg.', 24, array([ 0.0002789 , 0.00025432, 0.00026059, ..., -0.0001307 ,\n", + " -0.00015316, -0.0001602 ], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_563.wav', 'Er will schmusen.', 17, array([ 8.3865758e-05, -4.9942853e-05, 5.9117421e-05, ...,\n", + " -4.3004973e-05, -1.0278272e-04, -8.9234527e-05], dtype=float32), 1.3146041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_590.wav', 'GrüÃ\\x9f Gott!', 12, array([ 3.8686660e-05, 8.4167688e-05, -4.1444160e-05, ...,\n", + " 7.9078745e-05, 6.6285960e-05, 7.3457479e-05], dtype=float32), 1.1328541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_591.wav', 'Doch, muss es.', 14, array([-2.7301039e-05, -9.8715776e-05, -5.1679286e-05, ...,\n", + " 1.7480909e-04, 8.8697474e-05, -8.7942986e-05], dtype=float32), 1.5811666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_594.wav', 'Höchst verdächtig!', 20, array([-1.5668831e-04, -1.4814634e-05, 1.2133464e-06, ...,\n", + " 1.1010807e-04, 5.0348262e-05, 3.2340708e-05], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_597.wav', 'Hat man das schon mal gehört?', 30, array([ 2.9468083e-05, 8.5217485e-05, -1.1223685e-05, ...,\n", + " 1.4429020e-05, -3.4263925e-05, -1.7569761e-04], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_628.wav', 'Habt noch ein wenig Geduld.', 27, array([-3.1721203e-05, -6.6361958e-05, 6.2947714e-05, ...,\n", + " 9.7825025e-05, -1.3173591e-04, 3.6439680e-05], dtype=float32), 1.9143541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_630.wav', 'Och, Schnucki!', 14, 
array([-3.5877591e-05, -2.9018152e-04, -1.0041694e-04, ...,\n", + " 1.2557590e-04, 8.4289997e-05, 1.0620209e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_654.wav', 'Womit kann ich dienen?', 22, array([-3.79744961e-05, 4.58159229e-05, 5.13197449e-07, ...,\n", + " 5.17356311e-05, 2.12984141e-05, 1.14942064e-04], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_657.wav', 'Ich bin der Gerichtsvollzieher.', 31, array([ 2.9084453e-05, -2.4720324e-05, 1.8879551e-06, ...,\n", + " -2.5064335e-04, -1.8888044e-04, -4.7750240e-05], dtype=float32), 1.9749166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_670.wav', 'Gute Nacht zusammen!', 20, array([ 1.36263785e-04, 8.22485454e-05, 1.07259955e-04, ...,\n", + " -1.70976884e-04, -4.60869487e-05, -1.28792832e-04], dtype=float32), 1.7810833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_691.wav', 'Läuft die Waschmaschine noch?', 30, array([-1.7628371e-04, 3.7217360e-05, 5.7620698e-05, ...,\n", + " 4.7630738e-06, -1.4578988e-04, -2.1564976e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_728.wav', 'Der zweite war nicht mehr so chic.', 34, array([ 3.7413691e-05, 2.5557930e-04, 3.8776739e-06, ...,\n", + " -1.6214621e-04, -2.7943292e-05, -4.3322394e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_748.wav', 'Das Licht wird schwächer.', 26, array([-8.6605805e-06, -9.4557421e-05, -4.0338778e-05, ...,\n", + " -4.2446409e-05, 4.2122399e-05, -6.5777012e-06], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_789.wav', 'Du hast mich durchschaut.', 25, array([-7.2653616e-05, -5.6117566e-05, -2.1032026e-04, ...,\n", + " -1.6650984e-05, -4.1212854e-05, 1.1137113e-04], dtype=float32), 1.7144375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_797.wav', 'Kennt ihr den Weg?', 18, array([-1.6756072e-04, -1.5301499e-04, -6.5641878e-05, ...,\n", + " 2.0324395e-04, 1.4747797e-04, 2.2508665e-04], dtype=float32), 1.5205833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_835.wav', 'Alles klar bei dir?', 19, array([ 1.1695884e-04, 1.1995935e-05, -1.2846527e-04, ...,\n", + " -1.9988464e-04, -2.4078601e-05, -4.2752044e-06], dtype=float32), 1.4054583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_841.wav', 'Kommt jemand mit?', 17, array([ 4.9882954e-05, 4.0318602e-05, 1.2408203e-04, ...,\n", + " -1.1336284e-04, -1.6859797e-04, -3.4263285e-05], dtype=float32), 1.6356875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_6.wav', 'Nur vom Hörensagen.', 20, array([ 4.0408637e-04, 5.5643718e-04, 5.7215214e-04, ...,\n", + " -7.1763410e-05, -1.0798458e-04, -3.2582655e-05], dtype=float32), 1.7205)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_14.wav', 'Ich weiÃ\\x9f es nicht mehr.', 24, array([0.00023374, 0.00015971, 0.0001749 , ..., 0.00011659, 0.00024648,\n", + " 0.00010209], dtype=float32), 1.6233125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_17.wav', 'Lass es raus!', 13, array([-3.1531116e-04, -3.3344212e-04, -5.9053692e-04, ...,\n", + " 5.4772248e-05, -1.1641844e-05, -6.8900968e-05], dtype=float32), 1.4902916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_68.wav', 'Sie müssen mir glauben!', 24, array([ 1.4851260e-04, 2.9638095e-04, 2.5485444e-04, ...,\n", + " -1.8143297e-05, 4.6757654e-05, 4.2184558e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_92.wav', 'Ihre Bestellung, bitte!', 23, array([ 9.8706114e-05, 2.2661808e-04, 1.6781769e-04, ...,\n", + " 5.1173961e-06, -2.6828362e-04, -2.2934456e-04], dtype=float32), 1.4297083333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_95.wav', 'Was können Sie mir anbieten?', 29, array([-1.9375395e-04, -3.1588171e-04, -3.9896931e-04, ...,\n", + " 1.0834881e-04, -1.4949654e-05, -1.3323028e-05], dtype=float32), 1.7689583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_100.wav', 'Also nicht missverstehen!', 25, array([-1.1475936e-04, 3.5450608e-05, 5.9234120e-05, ...,\n", + " 7.9908222e-07, -7.6752185e-05, 3.1952815e-05], dtype=float32), 1.9507083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_101.wav', 'Jeder macht mal Fehler.', 23, array([-2.0121370e-05, 3.3358188e-05, 1.4433647e-05, ...,\n", + " 2.5029780e-04, 1.0649080e-04, 2.8118977e-04], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_107.wav', 'Immer dasselbe mit dir.', 23, array([ 4.04063358e-05, 2.61971072e-05, -1.03683014e-04, ...,\n", + " -2.34830455e-04, -1.33784546e-04, -7.84191070e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_113.wav', 'Jetzt erinnere ich mich.', 24, array([ 5.7016779e-05, 9.8553166e-05, 8.2001083e-05, ...,\n", + " 2.6238111e-05, 1.3704958e-05, -8.3586237e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_131.wav', 'Freiwillige vor!', 16, array([ 5.72854588e-05, 1.07770924e-04, 1.99439557e-04, ...,\n", + " -4.32070919e-05, -3.67913685e-06, 1.42182573e-04], dtype=float32), 1.5300416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_137.wav', 'Ich lehne ihn sogar ab.', 23, array([ 4.1758478e-05, 1.8570285e-05, 2.1333873e-04, ...,\n", + " 2.0144802e-05, -3.2468499e-05, 4.0363415e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_148.wav', 'Setz dich!', 10, array([-1.4053716e-04, -1.2715683e-04, -3.6183195e-04, ...,\n", + " 8.8158406e-05, -4.2700492e-05, 1.4811622e-04], 
dtype=float32), 1.1631458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_159.wav', 'Wie lief die Klausur?', 21, array([4.5470217e-05, 1.4640424e-04, 9.2724607e-05, ..., 1.4090222e-04,\n", + " 1.8730978e-04, 8.1763144e-05], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_188.wav', 'So viel Zeit muss sein!', 23, array([-7.5860844e-05, -1.8835207e-04, -2.0893685e-04, ...,\n", + " -5.3442498e-05, -6.1138802e-05, -8.8275759e-05], dtype=float32), 1.7810833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_194.wav', 'Zeit fürs Bettchen.', 20, array([-9.7486656e-05, -5.1642677e-05, -8.1966471e-05, ...,\n", + " -7.5118078e-05, -3.0586343e-05, -7.1709837e-05], dtype=float32), 1.6599166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_236.wav', 'Wir sind gleich da.', 19, array([-6.8177519e-06, 6.7671383e-05, -1.0620675e-04, ...,\n", + " 4.5802376e-06, -7.1226568e-05, -5.8944144e-05], dtype=float32), 1.6622916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_243.wav', 'Herrgott noch mal!', 18, array([ 1.7256364e-04, 1.5818405e-04, 2.4684667e-04, ...,\n", + " -1.7978776e-04, -2.2976559e-05, -3.1599044e-05], dtype=float32), 1.4440208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_262.wav', 'Früher war alles besser.', 25, array([ 1.6410025e-04, 2.0620895e-04, 2.0922835e-04, ...,\n", + " 4.5493864e-05, -7.6417935e-05, 7.0160553e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_264.wav', 'Wie heiÃ\\x9ft du?', 14, array([ 2.3004458e-04, 3.3690900e-04, 3.8855671e-04, ...,\n", + " -1.7735986e-04, -6.0517366e-05, 1.4090910e-05], dtype=float32), 1.24025)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_267.wav', 'Siehst du?', 10, array([ 8.0912840e-05, 5.0722783e-06, 6.0588944e-05, ...,\n", + " -1.2716564e-04, 2.9675630e-05, -1.6470523e-05], 
dtype=float32), 1.187375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_269.wav', 'Totgesagte leben länger.', 25, array([-1.0916409e-05, -1.7836766e-05, -5.1411305e-05, ...,\n", + " -1.2148214e-04, -2.2084620e-04, 8.5974034e-06], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_291.wav', 'Bin ich männlich?', 18, array([-2.0014251e-05, 2.6616051e-05, 1.2375216e-04, ...,\n", + " 1.3375390e-04, 5.5609209e-05, -7.4272582e-05], dtype=float32), 1.4418125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_295.wav', 'Was war in dem Umschlag?', 24, array([-6.2635612e-05, -4.7769913e-06, -1.3995348e-05, ...,\n", + " 7.0862757e-06, 9.2074784e-05, 9.0880349e-06], dtype=float32), 1.9507083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_316.wav', 'Ich bin bedient.', 16, array([2.5768091e-05, 1.6018275e-05, 3.7452736e-04, ..., 7.7061843e-05,\n", + " 1.8039568e-04, 7.1911185e-05], dtype=float32), 1.6599166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_317.wav', 'Tschüssikowski!', 16, array([ 1.3183661e-04, 8.4080348e-05, -2.6853681e-05, ...,\n", + " 5.1806877e-05, 1.5268542e-05, -6.9305977e-05], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_323.wav', 'Fang mich, wenn du kannst!', 26, array([-4.0345873e-05, 3.4187411e-05, -3.7680857e-05, ...,\n", + " -8.6350832e-05, -1.6245214e-04, -5.1246581e-05], dtype=float32), 1.7447291666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_343.wav', 'Ich bin kein Einbrecher!', 24, array([ 2.2356608e-05, -6.4235406e-05, -9.0699705e-06, ...,\n", + " 1.2990409e-04, 7.6688739e-05, -4.0372826e-05], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_346.wav', 'Hör nicht auf ihn.', 19, array([-2.9778299e-05, 3.8957646e-06, -7.7031938e-05, ...,\n", + " 1.9274552e-04, 1.7162508e-04, -1.3842691e-06], dtype=float32), 
1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_349.wav', 'Eine letzte Windung noch.', 25, array([-1.8898114e-05, -4.0488834e-05, 1.2324851e-04, ...,\n", + " -7.7293364e-05, 8.3202161e-05, 1.5701227e-04], dtype=float32), 1.5508541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_359.wav', 'Mir nach!', 9, array([ 9.4505500e-05, 2.3980458e-04, 3.7063317e-05, ...,\n", + " -4.1811028e-04, -4.7733358e-04, -4.6703668e-04], dtype=float32), 1.3489375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_360.wav', 'Schon wieder?', 13, array([-2.7792374e-04, -4.0585164e-04, -4.3411212e-04, ...,\n", + " -6.9041176e-05, -2.6838092e-07, 5.3586686e-05], dtype=float32), 1.0783333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_363.wav', 'Heidi funkelt ihn an.', 21, array([-1.39060983e-04, -9.78735334e-05, 9.33348783e-05, ...,\n", + " -1.00029130e-04, -1.25095859e-04, -1.00360034e-04], dtype=float32), 1.9506875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_375.wav', 'Kein Signal gefunden.', 21, array([-1.1299809e-04, -9.9104131e-05, -2.1005377e-05, ...,\n", + " -2.4724935e-04, 5.5919631e-06, 4.7323024e-06], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_403.wav', 'Entschuldigen Sie die Störung!', 31, array([ 6.84832412e-05, 1.86067002e-04, -1.04915016e-04, ...,\n", + " 1.84468547e-04, 4.62387870e-05, -5.50564218e-05], dtype=float32), 1.8174166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_404.wav', 'Guter Mann!', 11, array([ 4.2475749e-05, -3.8101676e-05, 8.2924860e-05, ...,\n", + " -9.0844223e-06, 8.0864724e-05, -4.9711874e-05], dtype=float32), 1.1268125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_416.wav', 'Oder etwa nicht?', 16, array([ 1.6924678e-05, 8.7618108e-05, 1.1962327e-04, ...,\n", + " -1.5572428e-04, -1.2718650e-04, -2.7018292e-05], dtype=float32), 
1.5266458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_427.wav', 'Wer weiÃ\\x9f das schon.', 20, array([-1.2090163e-05, -1.1217411e-04, -3.4340650e-05, ...,\n", + " -1.9305095e-05, 1.0599474e-04, -7.2453047e-05], dtype=float32), 1.9157916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_440.wav', 'Walter hat es verpatzt.', 23, array([-9.9328121e-05, -3.7155328e-07, -5.4411164e-05, ...,\n", + " 1.3715628e-04, -4.9349186e-05, -1.4098950e-04], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_451.wav', 'So läuft das nicht.', 20, array([-1.21481185e-04, -1.13304653e-04, -2.73915475e-07, ...,\n", + " 1.47375540e-04, 1.44234422e-04, -2.10445778e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_457.wav', 'Unverhofft kommt oft.', 21, array([-1.8882036e-05, -2.5487921e-05, 2.6220470e-04, ...,\n", + " 5.6016044e-05, -7.5536453e-05, -4.1967660e-06], dtype=float32), 1.865875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_461.wav', 'Bist du noch Single?', 20, array([-7.4286567e-05, -1.6158549e-04, -1.6719839e-04, ...,\n", + " -9.1800161e-05, -1.2240406e-04, 3.6517587e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_468.wav', 'Mein Licht ist kaputt.', 22, array([ 4.02122387e-05, -1.00659774e-04, -8.88236755e-05, ...,\n", + " -4.64872028e-05, -2.63940365e-06, 7.19727832e-05], dtype=float32), 1.7735)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_489.wav', 'Gemeinsam sind wir dumm!', 24, array([-1.4583243e-04, -2.6087323e-04, -2.3470599e-05, ...,\n", + " -2.4694938e-04, -1.5543406e-04, -6.1786144e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_518.wav', 'Er hat Mama gesagt!', 19, array([ 2.6662483e-05, -7.8772522e-05, -5.4227519e-05, ...,\n", + " 1.4953410e-05, -6.7233414e-05, -9.8744909e-05], 
dtype=float32), 1.9264791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_551.wav', 'Ciao!', 5, array([-8.1419050e-05, -2.2554104e-05, -9.1002643e-05, ...,\n", + " 8.3599451e-05, -1.5038802e-05, 1.8543131e-05], dtype=float32), 0.8966041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_557.wav', 'Die Welt ist ungerecht.', 23, array([-7.9495927e-05, -2.2434435e-04, -1.8575993e-05, ...,\n", + " 4.3908138e-05, 4.8930386e-05, 1.4439608e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_573.wav', 'Wer weiÃ\\x9f?', 10, array([-2.4007348e-05, 2.8211702e-05, 1.1010996e-04, ...,\n", + " 3.2032028e-04, 2.8236501e-04, 3.1412503e-04], dtype=float32), 1.6356666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_594.wav', 'Feierabend!', 11, array([-1.4223782e-05, -5.6433430e-05, -3.3835067e-06, ...,\n", + " -1.2677837e-04, 4.7294146e-05, 7.4652962e-05], dtype=float32), 1.6356875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_599.wav', 'Sag bloÃ\\x9f!', 10, array([ 2.12539035e-05, -1.20294884e-04, -8.79466315e-05, ...,\n", + " 2.56883359e-04, 2.45794392e-04, 4.15721239e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_618.wav', 'Geht das in Ordnung?', 20, array([-1.7039385e-04, -4.3828294e-04, -3.7954788e-04, ...,\n", + " 2.5719850e-04, 3.6655194e-05, 4.4241093e-05], dtype=float32), 1.7204791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_649.wav', 'Komm noch etwas näher!', 23, array([ 2.4222159e-06, -1.3579089e-04, -4.4756231e-05, ...,\n", + " -1.4951664e-04, -2.2786215e-04, -3.1124309e-04], dtype=float32), 1.4418125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_656.wav', 'Lach nicht!', 11, array([ 1.79771829e-04, 1.79155570e-04, 4.07271327e-05, ...,\n", + " 1.34896531e-04, 1.24606095e-05, -4.19603248e-06], dtype=float32), 
1.3327708333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_678.wav', 'Ich fasse zusammen.', 19, array([-1.0120855e-04, 6.3165186e-05, -2.2567945e-05, ...,\n", + " 6.0140010e-05, 9.6748437e-05, 3.0506399e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_695.wav', 'Umtausch ausgeschlossen!', 24, array([ 7.3856318e-05, 2.8886712e-05, 1.5315624e-04, ...,\n", + " -9.7581760e-05, 8.5684667e-05, -3.2478438e-05], dtype=float32), 1.6720208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_699.wav', 'Setzt euch.', 11, array([-1.6188849e-04, -1.0612092e-04, -6.7996967e-05, ...,\n", + " -1.1114984e-04, -2.0633070e-04, -1.5339212e-05], dtype=float32), 1.3085416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_703.wav', 'Ja, ist sie.', 12, array([-8.3997344e-05, -2.7474607e-05, -1.9123188e-05, ...,\n", + " 1.8876011e-04, 5.0511160e-05, 9.6139847e-05], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_707.wav', 'Nehmt sie ihnen ab!', 19, array([ 4.0254617e-04, 4.7474771e-04, 3.5727478e-04, ...,\n", + " -1.1594634e-06, -1.5993090e-04, -1.5013713e-05], dtype=float32), 1.8477083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_718.wav', 'Bitte schön lächeln.', 22, array([-5.2708318e-05, -1.2709903e-04, -3.1722573e-04, ...,\n", + " -1.4999519e-04, 1.3614057e-04, -2.6379108e-05], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_732.wav', 'Kamelle!', 8, array([ 3.4038111e-04, 4.9238594e-04, 3.1708140e-04, ...,\n", + " -8.7314249e-05, -4.2823103e-05, 4.8170114e-06], dtype=float32), 1.0541041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_733.wav', 'Nichts daran war schlimm.', 25, array([2.4388860e-04, 1.5891306e-04, 1.7636098e-04, ..., 6.8294656e-05,\n", + " 7.4376767e-05, 9.9975718e-05], dtype=float32), 1.79925)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_742.wav', 'Hast du das auch gehört?', 25, array([ 2.2057726e-04, 3.3742579e-04, 1.5720318e-05, ...,\n", + " 1.6000369e-05, -1.9323647e-04, -1.1723922e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_751.wav', 'Irritiert dich das?', 19, array([ 3.1597829e-05, -1.0975795e-04, -4.8185088e-05, ...,\n", + " -7.8868754e-05, 9.2668552e-06, 1.6543895e-04], dtype=float32), 1.7750208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_758.wav', 'Das ist gar nicht so lange her.', 31, array([-0.00046848, -0.00072762, -0.00048674, ..., 0.00027484,\n", + " 0.00023592, 0.00020132], dtype=float32), 1.7750208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_759.wav', 'Die Chemie muss stimmen.', 24, array([ 2.8143785e-04, 3.1653995e-04, 3.5444429e-04, ...,\n", + " 8.1970691e-05, -5.0139199e-05, -1.7111432e-05], dtype=float32), 1.9446458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_763.wav', 'Stimmt eigentlich.', 18, array([-1.2765415e-06, -4.4488741e-05, -1.0883755e-04, ...,\n", + " 2.9581884e-04, 4.5865582e-04, 6.1051000e-04], dtype=float32), 1.3024791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_782.wav', 'Sagen Sie den Zielort.', 22, array([ 1.0137472e-04, 2.3555224e-04, 2.6113808e-04, ...,\n", + " -2.9943618e-05, 3.1559110e-05, 2.7199069e-06], dtype=float32), 1.7810625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_792.wav', 'Meldet euch freiwillig!', 23, array([2.3276571e-04, 3.9564463e-04, 2.9302380e-04, ..., 1.1956793e-04,\n", + " 7.0350601e-05, 1.8581332e-04], dtype=float32), 1.8052916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_793.wav', 'Die Boote liegen auf dem Trockenen.', 35, array([-0.00011364, -0.00017169, -0.00019618, ..., 0.00044204,\n", + " 0.00018713, 0.00049593], dtype=float32), 
1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_802.wav', 'Gesundheit!', 11, array([-8.3673913e-05, -7.9538848e-05, -6.8612273e-05, ...,\n", + " 4.4534498e-04, 4.3816061e-04, 2.6374889e-04], dtype=float32), 1.2722083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_18.wav', 'Aber auch nur gerade so.', 24, array([-2.2079168e-05, -1.6145856e-05, 2.9195176e-06, ...,\n", + " -1.0078496e-05, -6.2482263e-06, -5.8464525e-06], dtype=float32), 1.8333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_83.wav', 'Aber gerne!', 11, array([ 8.6439795e-06, -4.9609935e-07, -6.4880319e-06, ...,\n", + " 3.4692115e-05, 2.2026890e-05, 7.4778809e-06], dtype=float32), 1.0416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_84.wav', 'Aber heute bleiben wir nicht so lang.', 37, array([-1.8894493e-06, 2.0465507e-06, 9.1691445e-06, ...,\n", + " -7.1275235e-06, -1.7749519e-05, -2.3891846e-05], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_91.wav', 'Aber ich kÃ\\x83¶nnte das nicht.', 29, array([-1.3084863e-05, -2.4588813e-05, -3.0510082e-05, ...,\n", + " 9.0740468e-06, 7.3771143e-06, 4.7309027e-06], dtype=float32), 1.7916666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_93.wav', 'Aber ich schweife ab.', 21, array([ 2.0572887e-05, 5.2324990e-06, 8.2274501e-06, ...,\n", + " -4.5831721e-06, -5.6718955e-06, 1.2206646e-06], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_107.wav', 'Aber ja!', 8, array([ 6.5076074e-06, 9.5467785e-06, 6.4050842e-06, ...,\n", + " -2.8310139e-06, -1.7247042e-06, 4.6768464e-06], dtype=float32), 1.25)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_115.wav', 'Aber locker!', 12, array([-3.1642696e-05, -3.3065215e-05, -3.9417675e-05, ...,\n", + " 5.7364587e-06, 8.1942826e-06, 2.0739385e-06], dtype=float32), 1.125)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_135.wav', 'Aber nicht mein Koch.', 21, array([-3.2864332e-06, 6.4927585e-06, 1.8139610e-05, ...,\n", + " -1.9440764e-05, 6.6915834e-07, -2.3949342e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_150.wav', 'Aber sie wirkt.', 15, array([ 4.7021126e-06, 7.9376932e-06, 1.9524101e-05, ...,\n", + " -1.0560079e-05, 2.2925117e-07, 7.0664414e-06], dtype=float32), 1.6666666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_154.wav', 'Aber sonst schon.', 17, array([ 1.3162755e-05, 5.1608640e-06, 2.6601656e-06, ...,\n", + " -1.9497929e-05, -1.3883044e-05, -2.9709727e-05], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_172.wav', 'Aber wie kann das sein?', 23, array([-1.0407030e-05, -1.3223411e-05, -2.4366140e-05, ...,\n", + " 3.1900552e-06, -6.4861370e-06, -5.3326958e-06], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_183.wav', 'Abgemacht!', 10, array([ 4.3209253e-05, 3.8841117e-05, 2.0105661e-05, ...,\n", + " 3.7174163e-07, -1.4371894e-05, -1.6794727e-05], dtype=float32), 1.375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_203.wav', 'Ach Mann.', 9, array([-1.1161302e-05, -4.8241122e-06, 1.0564104e-06, ...,\n", + " 5.0679973e-06, 7.8539133e-06, 9.7488000e-06], dtype=float32), 1.0833333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_205.wav', 'Ach die!', 8, array([-1.2094329e-05, -6.8277895e-06, -9.1963557e-07, ...,\n", + " 1.1451033e-05, -2.4406472e-06, 1.2908078e-05], dtype=float32), 1.125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_206.wav', 'Ach du ScheiÃ\\x83Â\\x9fe!', 18, array([ 2.9578983e-05, 1.8899245e-05, 2.3418788e-05, ...,\n", + " -2.3013935e-07, 1.0615421e-05, 1.1895302e-05], dtype=float32), 1.5416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_208.wav', 'Ach 
du liebe Zeit!', 18, array([-1.7297025e-05, -4.8105571e-06, 4.0550490e-06, ...,\n", + " 1.3112809e-06, 2.7569813e-06, -5.3473241e-06], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_209.wav', 'Ach du meine GÃ\\x83¼te!', 21, array([-1.4435645e-06, 1.5456475e-05, 7.5820367e-06, ...,\n", + " -5.9919462e-06, -2.8870822e-06, -8.3686264e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_219.wav', 'Ach nein?', 9, array([ 2.6512873e-05, 3.2190139e-05, 2.3575940e-05, ...,\n", + " 1.2494418e-06, -4.9369064e-06, 5.6602944e-06], dtype=float32), 1.0416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_220.wav', 'Ach so das.', 11, array([1.7692106e-05, 1.0481614e-05, 2.4560395e-05, ..., 1.1682997e-05,\n", + " 1.4096242e-05, 1.0814229e-05], dtype=float32), 1.25)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_221.wav', 'Ach so geht das.', 16, array([-4.7354648e-04, -1.6085681e-04, 6.9589930e-04, ...,\n", + " 2.8736700e-05, 3.1944357e-05, 3.1408650e-05], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_223.wav', 'Ach so.', 7, array([ 1.7158927e-04, 2.4213194e-04, 3.3745603e-04, ...,\n", + " -7.4672876e-06, -9.1694219e-06, 5.6827762e-06], dtype=float32), 0.75)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_227.wav', 'Ach, da bist du ja!', 19, array([2.9949500e-05, 1.6420616e-05, 3.4700156e-06, ..., 1.3191027e-05,\n", + " 1.0943100e-05, 1.8516728e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_247.wav', 'Achte auf den Verkehr.', 22, array([-3.3732314e-05, -1.7520404e-05, 3.1957079e-05, ...,\n", + " 9.2553882e-06, 1.9688600e-06, 8.4563535e-06], dtype=float32), 1.8333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_22.wav', 'Eine letzte Sache noch.', 23, array([6.1327388e-05, 1.8792783e-04, 6.4210355e-05, ..., 
9.2773196e-05,\n", + " 9.0997717e-05, 9.3233648e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_23.wav', 'Es ist aus und vorbei.', 22, array([-1.40103046e-04, -1.22702273e-04, 9.30938695e-05, ...,\n", + " 3.74735857e-04, 3.98035394e-04, 1.15837705e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_57.wav', 'Wie machst du das?', 18, array([ 2.4910548e-04, 4.8663982e-04, 3.5670877e-04, ...,\n", + " -7.4250769e-05, -2.8972838e-05, 5.8696533e-05], dtype=float32), 1.4660625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_88.wav', 'Die Göre lügt wie gedruckt.', 29, array([-2.1256006e-04, -1.5941747e-04, -9.0014306e-05, ...,\n", + " 8.4916828e-05, -1.1791480e-04, 2.8579583e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_94.wav', 'Nur wenn das Essen nicht schmeckt.', 34, array([-7.44715726e-05, -1.21678349e-04, 3.31091655e-07, ...,\n", + " -1.03946346e-04, -1.27610518e-04, -1.86876860e-04], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_97.wav', 'Niemals!', 8, array([-4.9271861e-05, 5.3212247e-05, 3.3188411e-05, ...,\n", + " 6.3736064e-05, 4.1986009e-06, 8.9537862e-05], dtype=float32), 1.2479583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_98.wav', 'Und nun zum Wetter.', 19, array([ 5.21471120e-05, -9.25690911e-05, -1.22024496e-04, ...,\n", + " 6.86152780e-05, -3.58715624e-05, 9.09384198e-06], dtype=float32), 1.6356666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_99.wav', 'Die Ã\\x96ffnung ist dehnbar.', 25, array([ 5.6826313e-05, 6.8275417e-06, 9.2087415e-05, ...,\n", + " 3.3015142e-05, 6.6053515e-05, -1.5007930e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_108.wav', 'Habt ihr schon angefangen?', 26, array([1.8725816e-04, 
1.5125435e-04, 1.8410715e-04, ..., 7.2607516e-05,\n", + " 2.0626400e-04, 8.0785358e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_120.wav', 'Wie konnte das passieren?', 25, array([8.6616979e-05, 1.3365489e-04, 4.9586175e-05, ..., 2.3242908e-06,\n", + " 9.4004557e-05, 2.2714035e-04], dtype=float32), 1.5751041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_121.wav', 'Schnappen Sie die!', 18, array([ 3.8893679e-05, -8.0967751e-05, 9.0245063e-05, ...,\n", + " -1.8313204e-04, 3.8293081e-05, -2.9012112e-06], dtype=float32), 1.38125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_127.wav', 'Kommst du mit auf die Demo?', 27, array([2.5501425e-04, 3.7619186e-04, 2.3280202e-04, ..., 1.0214894e-04,\n", + " 8.1334627e-05, 1.0037446e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_131.wav', 'Neymar schummelt immer.', 23, array([ 9.5439558e-05, -2.0274975e-04, -2.7297903e-05, ...,\n", + " -1.8293603e-04, -8.1430808e-05, 2.3813642e-05], dtype=float32), 1.6962708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_146.wav', 'Hä, wie jetzt?', 15, array([ 1.0428468e-05, 1.2862872e-04, 1.4709163e-04, ...,\n", + " 2.5179393e-06, -3.9250128e-05, 1.4990567e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_154.wav', 'Gehst du mit mir kicken?', 24, array([-2.3674214e-05, 1.5158611e-04, 2.0247647e-04, ...,\n", + " -5.0921575e-05, 1.6530334e-04, 2.6747581e-05], dtype=float32), 1.5508541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_165.wav', 'Die Erlösung naht.', 19, array([-2.7500470e-05, 4.6476634e-05, 9.3239294e-05, ...,\n", + " 1.3720182e-04, 3.3580043e-05, 1.6966692e-04], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_170.wav', 'Worauf wartest du noch?', 23, array([ 1.9643597e-04, 
1.8858226e-04, 1.2341220e-04, ...,\n", + " 1.9399264e-04, 7.9539248e-05, -8.9550871e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_174.wav', 'Wegen der Sicherheit.', 21, array([ 5.0312192e-05, -4.7642745e-05, 7.9094330e-05, ...,\n", + " 1.6562216e-04, -3.8164351e-05, -8.3325220e-05], dtype=float32), 1.53875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_179.wav', 'Was tun Sie da?', 15, array([8.1822727e-05, 1.5520566e-04, 2.9996689e-04, ..., 9.4358256e-05,\n", + " 6.1927640e-05, 1.5151841e-04], dtype=float32), 1.550875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_190.wav', 'Die Play-Offs haben begonnen.', 29, array([-2.7691500e-04, -2.5398214e-04, -1.5421546e-04, ...,\n", + " 3.4238459e-05, -1.6769451e-04, -1.3444168e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_193.wav', 'Spinnst du?', 11, array([ 1.0871515e-04, 1.6241276e-04, -7.8830650e-05, ...,\n", + " -1.6421604e-04, -1.6669222e-04, -1.5261788e-04], dtype=float32), 1.5993333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_194.wav', 'Nicht mit mir!', 14, array([ 4.5433408e-05, -1.3075510e-04, 6.4006963e-05, ...,\n", + " -2.2528745e-04, -1.7135930e-05, -1.1135123e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_197.wav', 'Lang lebe die Königin!', 23, array([-1.6047362e-04, -1.5451153e-05, -1.0221335e-04, ...,\n", + " 7.2540395e-05, 9.8553333e-05, -3.9703427e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_220.wav', 'Der Punkt geht an euch.', 23, array([ 6.8754802e-05, -3.1321447e-06, 2.6729414e-05, ...,\n", + " 5.2136878e-05, 6.9546691e-06, 1.5569202e-04], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_253.wav', 'Ich geh jetzt duschen.', 22, array([-3.17401755e-05, 7.48557359e-05, -5.43324859e-05, ...,\n", + " 
-1.39205178e-04, -6.44034174e-07, 1.28346255e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_270.wav', 'Ich bin stärker.', 17, array([ 9.11816460e-05, 1.44324003e-04, -2.98500763e-05, ...,\n", + " 1.31568195e-05, 6.36509794e-05, 6.90339657e-05], dtype=float32), 1.3933541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_273.wav', 'Wie lange fahre ich noch?', 25, array([ 2.9487488e-05, -1.3105408e-04, 5.8441510e-05, ...,\n", + " 3.1229702e-05, -5.4796135e-05, -6.3286854e-05], dtype=float32), 1.6841458333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_275.wav', 'Wer hat es dir verraten?', 24, array([ 1.2313928e-04, 1.3087156e-04, -1.2932777e-04, ...,\n", + " 4.8921556e-05, 1.4495553e-04, -3.3808697e-05], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_293.wav', 'Gib nicht anderen die Schuld.', 29, array([-7.5512668e-05, -3.6905835e-06, 6.9531779e-05, ...,\n", + " 4.3623371e-05, 1.8721327e-04, 7.1873088e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_297.wav', 'Ist es schon so weit gekommen?', 30, array([-4.3128319e-05, -1.7937485e-04, -1.0890597e-04, ...,\n", + " -2.6245858e-04, -1.7716063e-04, 2.2997918e-04], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_318.wav', 'Einfach reinstechen!', 20, array([ 9.1551570e-05, 8.9795518e-05, -6.6505017e-05, ...,\n", + " 1.0614502e-04, 1.8572784e-05, 1.7793228e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_320.wav', 'Also bleibt alles beim Alten.', 29, array([-4.5057204e-06, 1.0390608e-04, 2.8324797e-05, ...,\n", + " -9.8345605e-05, -4.1500021e-05, -2.5271966e-05], dtype=float32), 1.8053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_324.wav', 'Fragen wir das Publikum!', 24, array([ 3.0478600e-06, 
-1.7624698e-04, -1.1634296e-04, ...,\n", + " 1.3709384e-04, 8.2070706e-05, 1.4319613e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_336.wav', 'Nicht nur in Norddeutschland.', 29, array([ 1.6894817e-05, 7.2304661e-05, -1.7737957e-04, ...,\n", + " 7.4396456e-05, 1.5326528e-04, -3.0850897e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_339.wav', 'Lass uns welche wegräumen.', 27, array([-1.3355519e-04, 3.6361063e-05, 1.2765500e-04, ...,\n", + " -4.6465106e-05, -9.3052886e-06, -3.1085176e-06], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_356.wav', 'Bauch schlägt Hirn.', 20, array([-8.7791312e-05, -9.9132430e-06, -7.8506528e-05, ...,\n", + " -1.2898828e-04, 1.9388601e-05, -7.8024947e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_381.wav', 'Polen Sie die Maschine um!', 26, array([ 8.2736617e-05, 1.0996176e-04, 9.2422182e-05, ...,\n", + " -2.2247934e-05, 7.0410904e-05, -2.1137239e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_125.wav', 'Danke sehr!', 11, array([ 3.3982175e-05, 3.0489264e-05, -3.2230830e-05, ...,\n", + " 1.3063883e-04, 6.5418164e-05, 1.0737507e-04], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_132.wav', 'Nerve ich dich?', 15, array([-1.3204044e-04, -3.8424434e-05, -1.6640245e-04, ...,\n", + " 2.0048997e-04, 2.0114701e-04, 2.8921696e-04], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_188.wav', 'Kann ich mal riechen?', 21, array([7.4782380e-05, 1.5360968e-04, 1.7683143e-04, ..., 7.1163136e-05,\n", + " 3.2413329e-05, 1.6134117e-04], dtype=float32), 1.5949583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_195.wav', 'Sehe ich das richtig?', 21, 
array([-4.4274679e-03, -6.2118913e-03, -5.6534973e-03, ...,\n", + " -5.3494594e-05, 1.0948109e-05, 2.8244473e-05], dtype=float32), 1.8706875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_211.wav', 'Ein Dessert gefällig?', 22, array([ 6.1982937e-05, 8.9088433e-05, 2.1896411e-04, ...,\n", + " -5.3060539e-05, 5.5113655e-05, 2.0669409e-06], dtype=float32), 1.6305416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_264.wav', 'Was schätzen Sie?', 18, array([-9.39443475e-04, -1.31584110e-03, -1.22378767e-03, ...,\n", + " 5.19938067e-06, -1.39896365e-05, 3.26375412e-05], dtype=float32), 1.9933125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_266.wav', 'Hast du Geld dabei?', 19, array([-1.3200377e-05, 3.8996362e-04, 1.0263748e-03, ...,\n", + " -2.9147041e-05, 9.2981281e-06, -4.0353654e-05], dtype=float32), 1.707375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_327.wav', 'Augen auf die StraÃ\\x9fe!', 22, array([-1.1210357e-04, -1.8035798e-04, -1.8643556e-04, ...,\n", + " 8.4691441e-05, 5.8400867e-05, 5.8256945e-05], dtype=float32), 1.8399791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_346.wav', 'Was soll ich da machen?', 23, array([-2.5878362e-05, 2.1881026e-05, -1.2260079e-05, ...,\n", + " 4.5499460e-06, 4.0606970e-05, -2.3619448e-05], dtype=float32), 1.9433541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_349.wav', 'Immer dasselbe mit euch!', 24, array([-2.3236821e-04, -3.3517351e-04, -3.0884243e-04, ...,\n", + " 8.0186677e-05, 1.6797509e-05, -1.6808892e-05], dtype=float32), 1.9652708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_373.wav', 'Kennen wir uns?', 15, array([-5.0764916e-06, -7.3543859e-05, 1.1312031e-05, ...,\n", + " -3.2780910e-05, -1.3342450e-04, -8.3744824e-05], dtype=float32), 1.2833125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_378.wav', 'Redet ihr nicht miteinander?', 28, array([ 
3.3598881e-05, 2.8617033e-05, -4.8224880e-05, ...,\n", + " 7.4195086e-06, -4.8723170e-05, 6.5784006e-05], dtype=float32), 1.9491458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_420.wav', 'Ich hasse Rituale.', 18, array([ 7.1912136e-06, 3.0618376e-06, 8.3010753e-05, ...,\n", + " -1.4567961e-05, 1.1762774e-05, 3.1641615e-05], dtype=float32), 1.9995833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_486.wav', 'Wie groÃ\\x9f ist er denn?', 22, array([ 3.0858202e-05, 7.4509022e-05, 1.3619277e-04, ...,\n", + " -3.3022930e-06, 9.8051796e-06, -2.7459086e-05], dtype=float32), 1.867625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_537.wav', 'Es ist zum Heulen.', 18, array([-1.91718082e-05, 6.43216190e-05, 1.19517106e-04, ...,\n", + " 1.98961898e-05, 2.61543628e-05, -1.34301990e-06], dtype=float32), 1.9879583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_544.wav', 'Nimm es ihm nicht übel.', 24, array([ 4.2532893e-08, -6.0193088e-05, 4.5228205e-07, ...,\n", + " 1.0533330e-04, 4.6245714e-05, -1.5597003e-05], dtype=float32), 1.7243333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_547.wav', 'Um Gottes Willen!', 17, array([-1.3659755e-05, -1.1149528e-04, -7.7302495e-05, ...,\n", + " -5.2225241e-05, -6.4986933e-05, -1.9107327e-05], dtype=float32), 1.5258125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_570.wav', 'Voll der Lauch!', 15, array([ 2.3544633e-05, -8.2356913e-05, -8.4443280e-05, ...,\n", + " -8.3270104e-05, -1.1799393e-04, -4.4736080e-05], dtype=float32), 1.8773958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_587.wav', 'Das will ich meinen!', 20, array([ 1.15228731e-05, -1.00152036e-04, -3.91713802e-05, ...,\n", + " -3.00788033e-05, -2.60362140e-05, -2.54406623e-05], dtype=float32), 1.823375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_595.wav', 'Gib dir keine Mühe!', 20, array([1.1918874e-05, 
7.7710565e-06, 2.2653954e-05, ..., 1.2088865e-06,\n", + " 7.3900424e-05, 4.7324560e-05], dtype=float32), 1.7467083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_612.wav', 'Entschuldige', 12, array([-3.3377805e-06, -1.3742609e-05, -3.8612947e-05, ...,\n", + " -4.1617693e-07, -5.6907498e-05, -6.3263155e-06], dtype=float32), 1.096375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_21.wav', 'Ich glaube, ja.', 15, array([-8.5291895e-06, -1.9790486e-05, 2.0588757e-05, ...,\n", + " 4.3540977e-06, 3.3659559e-05, 2.8167133e-05], dtype=float32), 1.7166458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_48.wav', 'Was denn jetzt?', 15, array([3.3551037e-06, 7.2315837e-05, 9.8261240e-05, ..., 1.8147666e-04,\n", + " 1.3495231e-04, 1.4128252e-05], dtype=float32), 1.5235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_53.wav', 'Ist es das wert?', 16, array([ 6.972987e-06, -6.975743e-05, -8.996664e-05, ..., -8.399185e-06,\n", + " -8.876120e-05, -7.246290e-05], dtype=float32), 1.8518125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_118.wav', 'Findest du?', 11, array([-1.12564965e-04, -6.36710465e-05, -1.04058718e-05, ...,\n", + " 9.31948132e-04, 8.68959934e-04, 9.69569141e-04], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_139.wav', \"Wohl bekommt's.\", 15, array([-5.15776883e-05, -1.17497526e-04, -1.66595215e-04, ...,\n", + " 2.18412912e-04, 1.14814145e-04, 9.11775787e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_242.wav', 'An die Latte!', 13, array([-2.9736115e-05, 6.2128674e-05, -1.7713173e-06, ...,\n", + " -9.5688220e-06, -3.3155960e-05, -2.0475885e-05], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_256.wav', 'Wie lange noch?', 15, array([-2.0701043e-05, 4.3786262e-05, -9.4478482e-06, ...,\n", + " -5.2062300e-05, -2.7314949e-05, -9.1643757e-05], 
dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_287.wav', 'Halt die Klappe!', 16, array([5.4399417e-05, 1.7967819e-04, 1.5970672e-04, ..., 6.5669185e-05,\n", + " 5.5145654e-05, 4.6019220e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_320.wav', 'Mach selber!', 12, array([-6.9740723e-05, 4.4339331e-06, -8.3184044e-05, ...,\n", + " 1.4031340e-05, 1.2219901e-05, 7.0223352e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_406.wav', 'Alles frisch?', 13, array([-1.15522525e-04, -1.33178124e-04, -1.96026522e-04, ...,\n", + " 5.01462309e-05, 9.76682568e-05, 2.38532848e-05], dtype=float32), 1.4626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_577.wav', 'Nun ja.', 7, array([-4.87583275e-05, -1.09872217e-05, -2.24729556e-05, ...,\n", + " 4.66253441e-05, 1.96394685e-04, 1.52344255e-05], dtype=float32), 1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_584.wav', 'Wer macht Kaffee?', 17, array([ 3.8115049e-05, -9.6357744e-06, 7.8119905e-05, ...,\n", + " -2.0809734e-04, -1.8620661e-04, -1.3914006e-04], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_666.wav', 'Verflixt noch mal!', 18, array([-2.2882066e-04, -2.9250007e-04, -2.8351255e-04, ...,\n", + " 1.1955178e-04, 1.7373663e-04, 7.4429918e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_30.wav', 'Schweigen Sie!', 14, array([-3.1788008e-05, -3.4064793e-05, -2.7987528e-05, ...,\n", + " -1.5091732e-05, -2.6680038e-05, -3.8527149e-05], dtype=float32), 1.7066666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_152.wav', 'Danke für die Blumen.', 22, array([ 1.7122936e-06, 6.9385942e-06, 3.6246149e-07, ...,\n", + " -1.4888439e-05, 2.3918087e-06, -7.6587348e-06], dtype=float32), 1.8791666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_218.wav', 'Und das stimmt sogar.', 21, array([ 4.1728057e-05, 5.5362845e-05, 6.8501140e-05, ...,\n", + " -2.8829272e-05, -9.4307861e-06, -1.7323953e-05], dtype=float32), 1.77075)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_228.wav', 'Oder etwa doch?', 15, array([-1.9058538e-05, -1.6082793e-05, -2.4990761e-05, ...,\n", + " -3.7682898e-05, -2.6903717e-05, -2.3563476e-05], dtype=float32), 1.8430416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_235.wav', 'Lass es gut sein.', 17, array([2.5800218e-05, 2.4886122e-05, 2.6301905e-05, ..., 2.0628368e-05,\n", + " 1.3992375e-05, 1.1405512e-05], dtype=float32), 1.8430416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_243.wav', 'Was für ein Schwachsinn!', 25, array([-3.7606616e-05, -4.6087491e-05, -5.2579282e-05, ...,\n", + " -9.6937197e-07, -2.7171711e-05, -4.9796104e-06], dtype=float32), 1.79625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_247.wav', 'Meinen Sie etwa mich?', 21, array([3.4092998e-05, 2.4871710e-05, 3.1290274e-05, ..., 3.8184229e-05,\n", + " 3.8311930e-05, 1.9864283e-05], dtype=float32), 1.7936666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_266.wav', 'Doch, der kommt mit.', 20, array([-8.7682038e-06, 3.3905403e-06, -2.5130439e-06, ...,\n", + " -7.3065071e-06, -4.2862930e-06, -2.6758978e-06], dtype=float32), 1.9898125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_324.wav', 'Du willst eine Revanche?', 24, array([ 7.33632942e-06, 5.97303369e-06, 5.83600695e-06, ...,\n", + " 1.49849775e-05, 1.08204476e-05, -3.58769762e-06], dtype=float32), 1.9875833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_359.wav', 'Achtung, Lebensgefahr!', 22, array([ 1.4763166e-05, 2.4559184e-05, -6.1735605e-06, ...,\n", + " -4.0966352e-06, -3.3091931e-06, -8.6383498e-06], dtype=float32), 1.9786666666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_368.wav', 'Sag doch was!', 13, array([ 2.2444649e-06, 7.6022111e-06, 4.6965952e-06, ...,\n", + " -3.8131137e-05, -2.2596261e-05, -3.6410544e-05], dtype=float32), 1.6553333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_377.wav', 'Klar geht das!', 14, array([ 7.9997551e-07, 7.2854018e-06, 1.5502587e-06, ...,\n", + " 4.2983497e-06, 1.1067883e-06, -6.2062031e-06], dtype=float32), 1.6706666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_399.wav', 'Ganz wie ihre Mutter!', 21, array([-1.3625373e-05, -1.5324851e-05, -8.2329316e-06, ...,\n", + " -3.1325493e-05, -3.4243036e-05, -3.8296192e-05], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_404.wav', \"Und ab geht's!\", 14, array([-1.6434673e-05, -4.6597820e-06, -3.0193429e-05, ...,\n", + " 5.6945028e-06, 4.0367054e-06, 2.6991445e-06], dtype=float32), 1.7606666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_409.wav', 'Mahlzeit!', 9, array([-1.6801674e-05, -1.1057600e-05, -2.5246043e-05, ...,\n", + " -5.8098987e-08, -1.3756068e-05, 7.1873791e-07], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_417.wav', 'Was für ein Ding?', 18, array([ 6.9620419e-06, 2.2064933e-05, -7.5111966e-06, ...,\n", + " -2.0811036e-05, -7.9874835e-06, -4.7895933e-06], dtype=float32), 1.6473333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_468.wav', 'Genau einen.', 12, array([-7.29009771e-05, -8.52458907e-05, -1.06200605e-04, ...,\n", + " -5.32185413e-06, -1.07338547e-05, -8.40487064e-06], dtype=float32), 1.3666666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_514.wav', 'Zu Befehl!', 10, array([-2.3591008e-05, -3.5732090e-05, -3.4227767e-05, ...,\n", + " -2.8442626e-05, 1.2019399e-05, -1.3777444e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_611.wav', 'So 
viel dazu.', 13, array([ 7.4472086e-06, 7.6988908e-06, 1.9191646e-05, ...,\n", + " -3.9837760e-06, -5.9473659e-06, -1.5347923e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_633.wav', 'Doch nicht diese!', 17, array([-1.5188496e-05, -1.3384078e-05, -2.5278267e-05, ...,\n", + " -9.0744479e-06, -1.7723884e-05, -8.7737453e-06], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_637.wav', 'Da musste durch.', 16, array([-6.1405983e-05, -6.6703440e-05, -6.7519111e-05, ...,\n", + " -3.0437115e-05, -1.0807975e-05, -2.7072128e-05], dtype=float32), 1.752)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_660.wav', 'Bitte haben Sie Geduld.', 23, array([-5.3847558e-05, -7.3710136e-05, -6.7579982e-05, ...,\n", + " -1.0283680e-05, -3.1539796e-05, -2.2386694e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_668.wav', 'Na logo!', 8, array([-2.3636436e-05, -1.5810723e-05, -2.8241622e-05, ...,\n", + " -1.3751334e-06, 1.1204750e-05, 6.0684874e-06], dtype=float32), 0.992)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_676.wav', 'Ich bin Student.', 16, array([ 7.12830888e-06, -1.04677674e-05, 5.06380366e-06, ...,\n", + " 2.56778890e-06, 2.41716316e-06, 1.42220715e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_721.wav', 'Warum glaubst du ihm?', 21, array([-2.8855115e-05, -2.1601849e-05, -4.5714023e-05, ...,\n", + " 1.0700950e-06, -8.6324471e-06, -1.1586128e-05], dtype=float32), 1.888)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_767.wav', 'Alle Lichter einschalten', 24, array([ 3.82986327e-05, 4.59369221e-05, 5.11867729e-05, ...,\n", + " -3.22036831e-05, -1.03011635e-05, -3.75456489e-06], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_775.wav', 'Schlaf dich gesund!', 19, array([ 8.9927544e-06, 3.7294924e-07, 2.0666816e-07, 
...,\n", + " -1.4574092e-05, 9.9155943e-07, -1.1447136e-05], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_785.wav', 'Wer spricht da?', 15, array([-5.0560098e-05, -5.3028423e-05, -5.4164509e-05, ...,\n", + " 1.4739732e-05, 9.2475852e-07, 2.9554553e-06], dtype=float32), 1.8953333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_9.wav', 'Kannst du häkeln?', 18, array([ 5.7386926e-05, 8.2160957e-05, 5.5038501e-05, ...,\n", + " -4.3172963e-06, 4.1677453e-05, 4.7943948e-05], dtype=float32), 1.6993333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_44.wav', 'Bitte kommen!', 13, array([1.0956727e-04, 1.5614097e-04, 1.3331856e-04, ..., 1.3650022e-05,\n", + " 1.1109641e-05, 1.3527738e-06], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_53.wav', 'Hör zu!', 8, array([-6.0608932e-06, -4.1002470e-05, 2.2774377e-05, ...,\n", + " -8.5628499e-06, -1.7102975e-05, -5.2866948e-05], dtype=float32), 1.3013333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_54.wav', 'Bitte, bleib da.', 16, array([ 3.5020625e-05, 5.4955650e-05, 8.0653575e-05, ...,\n", + " -2.3735600e-05, 3.2219548e-05, -2.8188835e-05], dtype=float32), 1.3893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_64.wav', 'Was piept hier so?', 18, array([4.8969712e-05, 1.0184415e-04, 1.0672094e-04, ..., 1.0047335e-04,\n", + " 8.2428909e-05, 7.4903524e-05], dtype=float32), 1.476)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_65.wav', 'Die Tränen sind echt.', 22, array([-2.5628888e-04, -3.2446094e-04, -2.8078147e-04, ...,\n", + " 6.0525483e-05, 4.5224155e-05, 3.3287215e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_66.wav', 'Oh, wie ist das schön!', 23, array([-1.3561957e-04, -2.9620592e-04, -1.1127204e-04, ...,\n", + " -1.3441611e-05, -2.0591922e-05, -4.1845051e-05], 
dtype=float32), 1.9373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_70.wav', 'Nein, die andere.', 17, array([1.08759763e-04, 2.17104956e-04, 2.50456098e-04, ...,\n", + " 1.99571132e-05, 1.15319264e-04, 1.09982837e-04], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_73.wav', 'Der Hunger treibt es hinein!', 28, array([-7.6006359e-04, -1.0618430e-03, -9.1635465e-04, ...,\n", + " -2.1929874e-05, -3.9133694e-05, -2.3749919e-05], dtype=float32), 1.8006666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_81.wav', 'Dann machen alle Mann kehrt.', 28, array([-1.5950583e-04, -1.6477516e-04, -1.3784993e-04, ...,\n", + " 6.2336148e-05, 1.8180552e-05, 9.2034599e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_90.wav', 'Komm mal klar.', 14, array([2.0439363e-04, 2.6905714e-04, 1.8548965e-04, ..., 3.1710202e-05,\n", + " 2.3530252e-05, 2.1564969e-05], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_95.wav', 'Ist noch alles dran?', 20, array([-2.2047247e-04, -3.2201153e-04, -2.8738266e-04, ...,\n", + " -7.7452714e-05, -4.3362299e-05, 7.5945250e-06], dtype=float32), 1.632)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_99.wav', 'Nie glaubt sie mir.', 19, array([ 1.5801163e-05, 5.7899309e-05, 3.1942949e-05, ...,\n", + " -3.0608622e-05, -8.0015372e-05, -3.3063152e-05], dtype=float32), 1.5613333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_207.wav', 'Sperrt sie ein!', 15, array([1.7913821e-04, 3.0638310e-04, 2.4345164e-04, ..., 5.7913669e-05,\n", + " 2.3223187e-05, 5.4880878e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_249.wav', 'Ja was geht denn ab?', 20, array([-1.0661902e-04, -9.4065879e-05, -6.9818758e-05, ...,\n", + " -3.3508950e-05, 3.7770699e-06, 2.3758860e-06], dtype=float32), 1.9973333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_250.wav', 'Dümmste Ausrede ever!', 22, array([ 3.16905534e-05, 3.74705655e-06, -2.55898794e-05, ...,\n", + " 4.44019097e-05, 2.41961206e-05, 1.06514235e-05], dtype=float32), 1.9806666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_251.wav', 'Wir sind hier ja unter uns.', 27, array([-3.3862656e-04, -5.0057843e-04, -4.7798100e-04, ...,\n", + " 3.9128430e-05, -4.0246316e-05, -1.3086459e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_278.wav', 'Er ist ein User!', 16, array([ 5.7516689e-05, 4.9558192e-05, 6.3942927e-05, ...,\n", + " -2.3214375e-06, 1.1798247e-05, 3.6477853e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_282.wav', 'Zurückbleiben, bitte!', 22, array([ 1.8404999e-04, 2.6386097e-04, 3.0643051e-04, ...,\n", + " -6.5650514e-05, -5.8646885e-05, -6.5778695e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_287.wav', 'Gut getrollt.', 13, array([-3.0470208e-05, -6.1425657e-05, -3.8205933e-05, ...,\n", + " 6.9129404e-05, 1.1258064e-04, 1.2031732e-04], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_336.wav', 'Ganz sicher sogar.', 18, array([2.2912030e-04, 2.5114618e-04, 1.9525687e-04, ..., 8.7549386e-05,\n", + " 8.5029111e-05, 7.8950601e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_341.wav', 'Wohl kaum.', 10, array([ 1.6102573e-04, 1.7911245e-04, 1.5706589e-04, ...,\n", + " -2.9753184e-05, -4.4280365e-05, 3.1124373e-06], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_405.wav', 'Wie geht das?', 13, array([-1.6796951e-04, -1.9163813e-04, -1.9830326e-04, ...,\n", + " -5.0582935e-06, 1.2309533e-05, -2.6891148e-05], dtype=float32), 1.536)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_407.wav', 'Befehl ist Befehl!', 18, array([ 9.3892188e-05, 1.0890782e-04, 9.6308002e-05, ...,\n", + " -3.0468544e-05, -2.8461071e-05, -7.1021976e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_412.wav', 'Mit wem spreche ich?', 20, array([ 7.7782068e-05, 9.2144561e-05, 2.8574361e-05, ...,\n", + " -1.1466493e-05, 5.7958755e-06, 6.2275390e-06], dtype=float32), 1.7813333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_422.wav', 'An schlechten Tagen ja.', 23, array([ 4.2690190e-05, -2.3120232e-05, -2.5523063e-05, ...,\n", + " 2.1898361e-05, -2.7946093e-05, 4.6620054e-05], dtype=float32), 1.9833333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_460.wav', 'Sie haben richtig geraten!', 26, array([-9.0950904e-05, -1.4647168e-04, -7.1847418e-05, ...,\n", + " 2.8589966e-05, -2.2244849e-05, 1.1577226e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_476.wav', 'Alle sprechen so leise.', 23, array([-6.9834332e-06, -3.1972188e-05, -3.9375213e-05, ...,\n", + " -2.6475973e-05, 1.4716678e-05, -4.5046556e-05], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_477.wav', 'Woher willst du das wissen?', 27, array([-2.12417421e-04, -2.56415573e-04, -2.42886104e-04, ...,\n", + " 9.67599408e-05, 9.51452384e-05, 1.15144765e-04], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_488.wav', 'Anders als man denkt.', 21, array([ 1.8948530e-04, 3.4113604e-04, 1.9700162e-04, ...,\n", + " -7.6619792e-05, -3.6041514e-05, -1.6451453e-06], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_495.wav', 'Runter mit den Waffen!', 22, array([ 1.12369155e-04, 4.44092657e-05, 8.84383553e-05, ...,\n", + " -7.52444794e-06, -4.84231314e-05, -4.22670855e-05], dtype=float32), 
1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_504.wav', 'Und jetzt?', 10, array([-5.6267181e-06, -5.9708807e-05, -3.4106170e-06, ...,\n", + " -1.0430286e-04, -1.2670284e-04, -1.4261479e-04], dtype=float32), 1.344)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_511.wav', 'Jein.', 5, array([ 5.89297160e-05, 1.19100565e-04, 6.77589633e-05, ...,\n", + " -1.61726966e-05, -7.95948727e-05, -2.88161173e-05], dtype=float32), 1.0453333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_522.wav', 'Vorsicht Stufe!', 15, array([ 6.2581657e-06, 4.7380847e-05, 8.6832886e-05, ...,\n", + " 6.6710568e-06, 2.2640632e-05, -3.9922857e-06], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_526.wav', 'War ich zu zickig?', 18, array([1.6193213e-03, 2.2825657e-03, 2.0064272e-03, ..., 6.6650551e-05,\n", + " 7.2444294e-05, 8.5881074e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_530.wav', 'Wo drückt der Schuh?', 21, array([-1.46389175e-05, 3.62552214e-06, -9.26516877e-05, ...,\n", + " -3.03967099e-05, -1.01135854e-04, 3.96938458e-06], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_534.wav', 'Kann das noch warten?', 21, array([1.74110639e-04, 1.80995979e-04, 2.26840231e-04, ...,\n", + " 1.18193166e-04, 7.83515134e-05, 5.11603030e-05], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_539.wav', 'Passen die Sätze so?', 21, array([-3.1769360e-04, -4.7089945e-04, -4.3369626e-04, ...,\n", + " 1.6810809e-04, 5.3649095e-05, 1.4577823e-04], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_542.wav', 'Ã\\x9cbermorgen.', 12, array([-2.4301407e-04, -3.5653665e-04, -2.1825638e-04, ...,\n", + " 6.1351508e-05, 9.2918686e-05, 8.8779299e-05], dtype=float32), 1.1306666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_589.wav', 'Ich mag deinen Mantel.', 22, array([-2.1532472e-04, -3.8814778e-04, -2.9697348e-04, ...,\n", + " -3.1324416e-05, -3.5802710e-05, 8.7614599e-06], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_604.wav', 'Wie macht er das bloÃ\\x9f?', 23, array([-1.8150010e-04, -2.0398400e-04, -1.5460433e-04, ...,\n", + " -3.4698380e-05, -6.5080814e-05, -1.8794183e-06], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_666.wav', 'Was soll ich sagen?', 19, array([-8.8535160e-07, -7.4019059e-05, 7.4082243e-05, ...,\n", + " -6.2706102e-05, 2.9464120e-06, -1.1627621e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_694.wav', 'Wie misst man das?', 18, array([ 2.5176766e-04, 1.8225121e-04, 3.6178919e-04, ...,\n", + " 2.0104897e-06, 5.5382880e-05, -2.6957323e-05], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_704.wav', 'Mir fehlen die Worte.', 21, array([ 1.7020236e-04, 3.3776514e-04, 3.4704659e-04, ...,\n", + " 4.7222587e-05, -1.5073445e-05, -1.6250522e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_714.wav', 'Gehen wir?', 10, array([ 1.5890028e-04, 1.6513607e-04, 1.7650245e-04, ...,\n", + " 1.3219027e-05, 3.1738135e-05, -9.3036484e-05], dtype=float32), 1.3226666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_723.wav', 'Ich komme noch mal dran.', 24, array([-4.6879621e-05, -1.1869792e-04, -5.2995206e-06, ...,\n", + " 1.0155864e-05, -8.1713588e-05, -3.8661747e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_742.wav', 'Bitte schön!', 13, array([-3.4623430e-04, -4.4416677e-04, -3.0297900e-04, ...,\n", + " 5.3006592e-05, 5.1509913e-05, 7.1368544e-05], dtype=float32), 1.1733333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_743.wav', 'Das haben sie gesagt.', 21, array([-2.3902958e-05, 4.5714452e-05, 7.7266725e-07, ...,\n", + " -5.0056198e-05, 3.0718882e-05, 6.8078203e-05], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_799.wav', 'Ist der Kugelschreiber blau?', 28, array([-1.6907173e-04, -2.9390136e-04, -2.4633619e-04, ...,\n", + " 5.9892503e-05, 6.6163295e-05, 1.4039288e-04], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_21.wav', 'Alles nach vorne!', 17, array([2.0106880e-04, 3.4844220e-04, 2.3129249e-04, ..., 9.6451986e-05,\n", + " 7.4439027e-05, 9.3146300e-05], dtype=float32), 1.5786666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_55.wav', 'Nichts dergleichen.', 19, array([-2.7673854e-04, -3.7996779e-04, -2.6658855e-04, ...,\n", + " -4.9654176e-07, -4.3088527e-05, -2.0399790e-05], dtype=float32), 1.5786666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_66.wav', \"Langsam nervt's.\", 16, array([ 7.7058452e-05, 4.7672478e-05, 2.6094380e-05, ...,\n", + " -6.2562191e-05, 2.7688688e-07, -1.2926825e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_98.wav', 'Seid ihr verrückt?', 19, array([-1.3435316e-04, -1.8146966e-04, -1.6307829e-04, ...,\n", + " -3.7551112e-07, 1.6737657e-05, 1.7336246e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_110.wav', 'Gib mir fünf!', 14, array([1.9428060e-04, 2.9409130e-04, 2.5521498e-04, ..., 1.9916235e-05,\n", + " 3.7017526e-05, 2.2721317e-05], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_140.wav', 'Auch wieder wahr.', 17, array([-6.21244908e-05, -1.39888449e-04, -1.16935575e-04, ...,\n", + " -9.32170296e-05, -7.70114566e-05, -1.37492418e-04], dtype=float32), 1.3653333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_167.wav', 'Sicher ist sicher.', 18, array([ 1.6774700e-04, 2.7458806e-04, 1.3175888e-04, ...,\n", + " -3.9984116e-05, -4.5541576e-05, 2.3846082e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_175.wav', 'Wie soll ich sagen?', 19, array([-2.0688836e-05, -6.4790765e-05, -1.1548823e-05, ...,\n", + " -1.0844359e-05, -3.6513706e-05, -4.4623717e-05], dtype=float32), 1.6213333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_200.wav', 'Ist doch Ehrensache!', 20, array([ 1.07319385e-04, 1.08591557e-04, 6.78624638e-05, ...,\n", + " 3.66282293e-05, -4.84154953e-05, -2.46383879e-05], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_258.wav', 'Jeder Mensch ist anders.', 24, array([ 9.4392788e-05, 1.3444535e-04, 1.5623294e-04, ...,\n", + " -9.0343368e-05, -1.2968398e-04, -2.8964683e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_262.wav', 'Nächstes Mal vielleicht.', 25, array([-4.9963495e-04, -7.3549181e-04, -5.7168922e-04, ...,\n", + " 5.7476438e-05, 8.7852583e-05, 6.3541149e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_290.wav', 'Ich wollte nur nett sein.', 25, array([-3.0248266e-04, -4.1539475e-04, -4.3182663e-04, ...,\n", + " -6.8298694e-05, -3.5496461e-05, -8.2268067e-05], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_293.wav', 'Sie haben Post.', 15, array([7.0743052e-05, 1.5683858e-04, 7.2936782e-05, ..., 3.4985551e-05,\n", + " 2.5512374e-05, 4.4657580e-05], dtype=float32), 1.6)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_302.wav', 'Mit dem Raumschiff bitte!', 25, array([-3.3868386e-05, -4.2923082e-05, 2.2873657e-05, ...,\n", + " 2.9917417e-05, -9.9794874e-05, -1.3378082e-04], dtype=float32), 1.5470625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_309.wav', 'Hä, wieso das denn?', 20, array([-4.2834796e-05, -1.3094838e-04, -2.1130700e-05, ...,\n", + " -4.5203033e-05, -6.0939405e-05, -4.7152938e-05], dtype=float32), 1.9385)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_321.wav', 'Lass dich nicht so hängen!', 27, array([ 3.3312430e-05, 1.1557561e-04, 1.7304946e-04, ...,\n", + " -5.3516556e-05, -6.5977452e-05, -8.5248823e-05], dtype=float32), 1.6589166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_333.wav', 'Sehen wir uns in der Bib?', 25, array([1.8330962e-04, 1.0809512e-04, 2.0564985e-04, ..., 5.3472275e-05,\n", + " 1.1819158e-04, 1.3498007e-04], dtype=float32), 1.9571458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_340.wav', 'Klingt logisch.', 15, array([-4.9080444e-07, -4.6037778e-05, -1.0552061e-04, ...,\n", + " -7.5399061e-05, -1.1574150e-04, -1.1011600e-04], dtype=float32), 1.137)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_361.wav', 'Lies mir etwas vor!', 19, array([-5.9860780e-05, -1.2714561e-04, -4.6063276e-05, ...,\n", + " 1.3993531e-04, 1.7140653e-04, 1.5545388e-04], dtype=float32), 1.5284375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_368.wav', 'Nö, nicht wirklich.', 20, array([1.4233610e-05, 5.8029418e-05, 2.2922040e-05, ..., 2.8016962e-04,\n", + " 1.9504840e-04, 1.6919435e-04], dtype=float32), 1.77075)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_401.wav', 'Besser als gar nichts.', 22, array([-1.9661777e-04, -3.8629526e-04, -3.8140707e-04, ...,\n", + " 4.2625456e-06, 9.6469674e-05, 2.5569330e-05], dtype=float32), 1.7055)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_402.wav', 'Lass mich doch mal träumen.', 28, array([5.1605228e-05, 2.0454232e-05, 5.4702823e-06, ..., 1.0539140e-04,\n", + " 9.8325436e-05, 6.1908002e-05], dtype=float32), 1.87325)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_407.wav', 'Wochenende!', 11, array([-7.1158116e-05, -1.3735623e-04, -1.4360537e-04, ...,\n", + " 7.2980845e-05, -2.7338607e-05, -2.3744215e-06], dtype=float32), 1.0251666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_410.wav', 'Ich habe dich gewarnt.', 22, array([-2.9008405e-04, -3.9160642e-04, -3.8535651e-04, ...,\n", + " -8.1862388e-05, -2.1166212e-04, -1.1729619e-04], dtype=float32), 1.5563958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_420.wav', 'Kann schon sein.', 16, array([8.5848145e-04, 1.2030958e-03, 1.0428407e-03, ..., 9.0862151e-05,\n", + " 1.8885999e-04, 1.3144755e-04], dtype=float32), 1.2395208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_430.wav', 'Schön gespielt.', 16, array([-3.1265599e-04, -3.5982658e-04, -3.4920897e-04, ...,\n", + " -5.9947542e-05, -2.8197737e-05, -8.6103646e-05], dtype=float32), 1.3606666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_432.wav', 'Gut geschlafen?', 15, array([-4.6266021e-05, -4.5735891e-05, -1.5800438e-04, ...,\n", + " -5.1101240e-05, -4.5094261e-05, -1.9669098e-05], dtype=float32), 1.2488333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_435.wav', 'Auf das Wetter natürlich auch.', 31, array([-3.5034932e-04, -4.7157385e-04, -4.0150300e-04, ...,\n", + " 1.4378574e-04, 3.5348174e-05, 1.3807646e-04], dtype=float32), 1.9664583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_437.wav', 'Komm, geh weg!', 14, array([ 3.15589714e-05, 1.08517845e-04, 6.59165744e-05, ...,\n", + " -1.43856349e-04, -9.36611250e-05, -1.37200404e-04], dtype=float32), 1.4119375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_469.wav', 'Schluss mit lustig!', 19, array([ 1.0199297e-04, 1.2600295e-04, 1.6211855e-04, ...,\n", + " -1.5054672e-04, -7.8931960e-05, 6.7272131e-06], dtype=float32), 1.4259166666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_483.wav', 'Das spart Geschirr.', 19, array([ 6.6607544e-04, 7.1844418e-04, 6.1214896e-04, ...,\n", + " -3.3901462e-05, 1.3226962e-04, 3.8378406e-05], dtype=float32), 1.8080208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_486.wav', 'Das haben Recherchen ergeben.', 29, array([-9.0566078e-05, -2.1272554e-04, -1.9089306e-04, ...,\n", + " 9.4858078e-05, 8.9547662e-05, 7.4881907e-05], dtype=float32), 1.9571458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_487.wav', 'Frohes Schaffen!', 16, array([ 6.8461159e-05, 1.5294057e-04, 2.2618793e-04, ...,\n", + " -2.1603348e-05, -5.1863241e-05, -6.0653092e-06], dtype=float32), 1.337375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_500.wav', 'Sie sind ja noch blutjung!', 26, array([-0.00065145, -0.00103323, -0.00116705, ..., -0.0001188 ,\n", + " -0.00014697, -0.00013791], dtype=float32), 1.8639375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_516.wav', 'Lebe ich noch?', 14, array([-4.3064877e-04, -5.6503405e-04, -4.1817623e-04, ...,\n", + " -1.6641241e-04, -1.2653919e-04, -8.6205284e-05], dtype=float32), 1.1090416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_527.wav', 'Nicht dafür!', 13, array([ 3.5247151e-04, 4.8163909e-04, 3.9777748e-04, ...,\n", + " -5.2257688e-05, -3.3391923e-05, -1.8325276e-05], dtype=float32), 1.137)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_534.wav', 'Genau hundert Stück.', 21, array([-0.00059065, -0.00093307, -0.00079542, ..., 0.00016691,\n", + " 0.00026112, 0.00016139], dtype=float32), 1.8732708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_535.wav', 'Wie ist das möglich?', 21, array([ 3.7494919e-04, 5.0490367e-04, 3.7185123e-04, ...,\n", + " 4.3858363e-06, -5.6393877e-05, -6.9622547e-05], dtype=float32), 1.3886458333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_536.wav', 'Alles wiederholt sich.', 22, array([-7.8303702e-03, -9.4565414e-03, 4.3799067e-03, ...,\n", + " -7.5256619e-05, -4.4781635e-05, -4.8768667e-05], dtype=float32), 1.37)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_538.wav', 'Der Klügere gibt nach.', 23, array([-3.3002507e-04, -4.8394629e-04, -4.5790782e-04, ...,\n", + " -1.5844591e-04, -3.2335000e-05, -1.1339883e-04], dtype=float32), 1.4259166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_559.wav', 'Schwing die Hufe!', 17, array([-0.00077766, -0.00118464, -0.00101971, ..., -0.00019519,\n", + " -0.00011075, -0.00013927], dtype=float32), 1.3233958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_561.wav', 'Was das wieder kostet!', 22, array([ 8.5937936e-04, 1.1237016e-03, 9.1907283e-04, ...,\n", + " 2.4701139e-05, -1.2547316e-04, -5.1732359e-06], dtype=float32), 1.6775416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_563.wav', 'Wieso immer ich?', 16, array([4.5056498e-04, 7.2014128e-04, 6.0793286e-04, ..., 8.4482606e-05,\n", + " 9.7867851e-05, 2.6745778e-05], dtype=float32), 1.5843541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_582.wav', 'Dann gäbe es dich jetzt nicht.', 31, array([-2.4657813e-04, -3.9872411e-04, -3.3457237e-04, ...,\n", + " 1.6457469e-05, -1.5761821e-05, 1.1328906e-04], dtype=float32), 1.9944166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_585.wav', 'Dem werde ich Beine machen!', 27, array([0.00027461, 0.00040794, 0.00034263, ..., 0.00012492, 0.00024055,\n", + " 0.00019042], dtype=float32), 1.9850833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_588.wav', 'Wollen wir Ihn herein lassen?', 29, array([-3.2398489e-04, -4.3375781e-04, -3.6100275e-04, ...,\n", + " 1.1542152e-04, 9.4435090e-05, 1.1465035e-04], dtype=float32), 1.9198541666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_590.wav', 'Richtig geraten!', 16, array([-2.6969259e-04, -4.4567345e-04, -5.3715584e-04, ...,\n", + " 6.1917281e-06, 1.5911644e-05, 3.0031568e-05], dtype=float32), 1.2954375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_634.wav', 'Nun sag schon!', 14, array([-0.00074525, -0.0010401 , -0.00091129, ..., 0.00015909,\n", + " 0.00022603, 0.00013058], dtype=float32), 1.0997291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_658.wav', 'Mit Vergnügen!', 15, array([-1.9300323e-04, -2.6942717e-04, -2.3031878e-04, ...,\n", + " 6.9992027e-05, 5.8482234e-05, 1.2584617e-04], dtype=float32), 1.1929166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_659.wav', 'Komm sofort her!', 16, array([ 5.0228823e-04, 8.3419622e-04, 7.3006074e-04, ...,\n", + " 4.1768268e-05, -4.2891694e-05, -7.8192716e-05], dtype=float32), 1.4725208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_674.wav', 'Chill mal!', 10, array([ 3.6116564e-04, 5.9050595e-04, 4.8674442e-04, ...,\n", + " -1.4056740e-04, -6.9539550e-05, -1.2587184e-04], dtype=float32), 1.0624583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_677.wav', 'Jetzt mal Butter bei die Fische.', 32, array([-0.00017322, -0.00025202, -0.0003011 , ..., -0.00014372,\n", + " -0.00011187, -0.00014939], dtype=float32), 1.9198541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_705.wav', 'Das wird Macken geben.', 22, array([ 2.6667553e-06, 2.4150137e-05, 6.4756452e-05, ...,\n", + " -7.3486663e-05, -7.0459449e-05, 4.1346510e-05], dtype=float32), 1.7334583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_707.wav', 'Hilf mir mal auf die Sprünge.', 30, array([ 3.0066914e-04, 4.8592529e-04, 4.8968260e-04, ...,\n", + " -2.9595327e-05, -4.5949713e-05, -2.5512512e-05], dtype=float32), 1.8452916666666668)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_747.wav', 'Versuch macht klug.', 19, array([-6.13919692e-04, -8.45544797e-04, -7.43770273e-04, ...,\n", + " 9.61075566e-05, -8.48421769e-05, -1.16592164e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_768.wav', 'Kapiere ich nicht.', 18, array([ 4.0008963e-04, 6.7968445e-04, 6.0982589e-04, ...,\n", + " -7.4681542e-05, 2.5036192e-05, -4.9270067e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_776.wav', 'Der ist ja mickrig!', 19, array([-1.7217337e-04, -2.9700578e-04, -2.6711932e-04, ...,\n", + " -1.2146128e-04, -3.9679853e-05, -5.6118748e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_777.wav', 'Ja, sogar mehrere.', 18, array([ 1.1276272e-03, 1.6285295e-03, 1.3798362e-03, ...,\n", + " -2.8823823e-05, 3.4296296e-05, -5.9779604e-06], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_778.wav', 'Fünf oder lieber sechs?', 24, array([-0.00051076, -0.00086243, -0.00095237, ..., -0.00015284,\n", + " -0.00011934, -0.00010978], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_1.wav', 'Wen interessiert das schon?', 27, array([-2.0386204e-04, -1.6595512e-04, -3.4064340e-04, ...,\n", + " -5.8528771e-05, -4.0259012e-05, -2.3960278e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_4.wav', 'Das sieht man sofort.', 21, array([-4.7220071e-04, -6.1083253e-04, -5.2480790e-04, ...,\n", + " 3.0703570e-05, 5.0339484e-05, -4.0401741e-05], dtype=float32), 1.7007708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_9.wav', 'Kannst du ein Instrument spielen?', 33, array([-5.8206980e-04, -9.0975891e-04, -9.2016242e-04, ...,\n", + " -3.6644913e-05, -8.9309695e-05, 5.9820622e-06], dtype=float32), 1.9519166666666667)\n", 
+ "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_17.wav', 'Nein, hör mir zu!', 18, array([1.8352878e-04, 2.3541819e-04, 1.9473537e-04, ..., 3.8015917e-06,\n", + " 3.0260228e-05, 4.7941758e-05], dtype=float32), 1.6038333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_24.wav', 'Sowas ist schade.', 17, array([ 5.2204914e-04, 7.2680251e-04, 7.3363306e-04, ...,\n", + " -3.0053505e-05, -6.5714506e-05, -9.0218302e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_48.wav', 'Ich will zocken!', 16, array([-0.00016469, -0.00039593, -0.00179843, ..., 0.00018615,\n", + " 0.00012972, 0.00017355], dtype=float32), 1.5773958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_55.wav', 'Ein Insider berichtet.', 22, array([ 3.7575817e-05, 2.7695228e-04, 1.8994253e-04, ...,\n", + " 2.4524426e-05, 4.0446877e-05, -2.5534926e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_65.wav', 'Evelyn ist seekrank.', 20, array([0.00062829, 0.00093936, 0.0008276 , ..., 0.00017747, 0.00012535,\n", + " 0.00013539], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_66.wav', 'Zunächst der Blick aufs Wetter.', 32, array([-0.00092968, -0.00141539, -0.00128506, ..., 0.00019455,\n", + " 0.00034253, 0.00020309], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_77.wav', 'Was schmeckt am besten?', 23, array([6.4622820e-04, 1.0704662e-03, 1.1439651e-03, ..., 1.9296777e-04,\n", + " 9.2506059e-05, 4.9435432e-05], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_78.wav', 'Wir rufen Sie dann auf.', 23, array([-1.0261516e-03, -1.4563096e-03, -1.2881490e-03, ...,\n", + " 5.2330338e-06, 6.4821052e-06, -3.7749737e-06], dtype=float32), 1.6655208333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_84.wav', 'Das Essen war vorzüglich.', 26, array([-5.0324254e-04, -7.2285999e-04, -5.4835685e-04, ...,\n", + " -4.1776315e-05, -4.3907283e-05, 3.2214456e-07], dtype=float32), 1.9959791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_89.wav', 'SüÃ\\x9fes oder Saures!', 20, array([-2.1448301e-04, -3.2685092e-04, -1.9420320e-04, ...,\n", + " 5.3501964e-05, 3.9838564e-05, 9.8899181e-05], dtype=float32), 1.5641875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_96.wav', 'Woran das wohl liegt?', 21, array([7.9406239e-04, 1.0801835e-03, 8.6238224e-04, ..., 1.5784081e-04,\n", + " 1.3262879e-04, 7.3408869e-06], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_118.wav', 'Hier hast du deinen Fisch.', 26, array([0.00047934, 0.0008143 , 0.00071459, ..., 0.00040429, 0.00026866,\n", + " 0.00011292], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_135.wav', 'Und zwar hochverdient!', 22, array([-3.4465449e-04, -5.7459215e-04, -4.8516967e-04, ...,\n", + " 2.8431052e-05, 9.6089265e-05, 2.6090011e-05], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_144.wav', 'Da kräht kein Hahn nach.', 25, array([-2.4579404e-05, -2.7367115e-04, -1.3865142e-04, ...,\n", + " 6.7543602e-05, 4.0894251e-05, 2.7544003e-05], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_148.wav', 'Du zuerst.', 10, array([-8.7500273e-05, -8.8356370e-05, 3.9270883e-05, ...,\n", + " -1.0109833e-04, 5.8080084e-05, -1.4014350e-04], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_149.wav', 'Hier mal eine Faustregel.', 25, array([7.6173781e-04, 9.7895204e-04, 8.7399769e-04, ..., 5.2696447e-05,\n", + " 1.8836032e-06, 6.7383153e-06], dtype=float32), 1.7624583333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_150.wav', 'Ich sehe kein Leerzeichen.', 26, array([-2.0238354e-05, -3.9017228e-05, -1.8151976e-04, ...,\n", + " -2.8073411e-05, -8.1482809e-05, -9.7252036e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_151.wav', 'Hast du mal einen Fünfziger?', 29, array([-6.5894198e-04, -9.4568409e-04, -8.3610136e-04, ...,\n", + " -1.5597163e-04, -1.5190896e-04, -4.1842508e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_155.wav', 'Mit Pommes?', 11, array([ 0.0003422 , 0.0003448 , 0.00032375, ..., -0.00023719,\n", + " -0.00028336, -0.00012051], dtype=float32), 0.9252916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_172.wav', 'Noch fünf Minuten bitte, Schatz!', 33, array([-4.4656807e-04, -5.2705233e-04, -5.8281276e-04, ...,\n", + " -1.7271057e-05, 3.9541996e-05, 1.4292495e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_174.wav', 'Es ist wie verhext.', 19, array([3.7680543e-04, 6.3684850e-04, 4.2467855e-04, ..., 1.3614137e-05,\n", + " 8.9109992e-05, 1.3674991e-04], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_179.wav', 'Unter uns ist ein Verräter.', 28, array([-2.2123450e-04, -3.2310621e-04, -2.8145462e-04, ...,\n", + " -1.0567834e-04, 3.1090029e-05, 6.3631160e-05], dtype=float32), 1.8682083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_190.wav', 'Nimm die Maske ab!', 18, array([4.6733877e-04, 6.9651386e-04, 5.4769457e-04, ..., 1.6475593e-04,\n", + " 7.5979711e-05, 7.9883583e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_193.wav', 'Nicht dass ich wüsste.', 23, array([ 0.0001971 , 0.00045662, 0.00023958, ..., -0.00011544,\n", + " -0.00016933, -0.00016841], dtype=float32), 1.5862083333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_194.wav', 'Der Tee zieht noch.', 19, array([-0.00024223, -0.00046848, -0.00045602, ..., -0.00014842,\n", + " -0.00016475, -0.00012201], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_210.wav', 'Tu es für mich!', 16, array([4.4054058e-04, 7.1835978e-04, 6.8089634e-04, ..., 6.5819913e-05,\n", + " 6.3534033e-05, 2.4601215e-04], dtype=float32), 1.5685833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_216.wav', 'Bölken Sie woanders herum!', 27, array([-4.3733866e-04, -5.8234221e-04, -6.0285319e-04, ...,\n", + " -2.0549475e-04, -5.1659747e-05, -6.9836286e-05], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_217.wav', 'So, so.', 7, array([5.1622407e-04, 8.1000535e-04, 6.2310486e-04, ..., 1.1862206e-04,\n", + " 7.1799346e-05, 3.3523640e-06], dtype=float32), 1.3747291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_220.wav', 'Leicht verdientes Geld.', 23, array([ 1.47327999e-04, 1.87759506e-04, -1.56362767e-05, ...,\n", + " 1.08211556e-04, 8.50987126e-05, -3.97509648e-05], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_226.wav', 'Wie lautet der Zwischenstand?', 29, array([ 5.1066454e-04, 7.2763517e-04, 6.3450093e-04, ...,\n", + " -8.1010330e-05, -1.8156270e-05, -5.7707053e-05], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_273.wav', 'Was hat ihn geritten?', 21, array([-3.4532882e-04, -5.6787761e-04, -6.2309759e-04, ...,\n", + " -3.4597360e-05, -1.2706745e-05, -1.1419446e-04], dtype=float32), 1.6214583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_282.wav', 'So nicht, Freundchen!', 21, array([-2.2482723e-03, -3.3393281e-03, -3.0241525e-03, ...,\n", + " 8.9230271e-05, 8.0567042e-05, -1.7856433e-05], dtype=float32), 1.7800833333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_287.wav', 'So ein feiner Hund!', 19, array([-0.00024811, -0.00028893, -0.00043056, ..., -0.0001634 ,\n", + " -0.00015287, -0.00012142], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_297.wav', 'Ah, die Feuerwehr!', 18, array([-5.8479345e-05, 1.3606872e-06, -3.1950235e-04, ...,\n", + " 4.5466539e-04, 4.1461250e-04, 3.1427949e-04], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_298.wav', 'Nachricht bitte faxen!', 22, array([-3.4957391e-04, -4.1374876e-04, -4.3978900e-04, ...,\n", + " -1.4674234e-04, -2.0285949e-04, -3.0548752e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_330.wav', 'Alter Verwalter!', 16, array([0.00058996, 0.00086262, 0.00074697, ..., 0.00030815, 0.00029123,\n", + " 0.00018931], dtype=float32), 1.8615833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_333.wav', 'Was will man mehr?', 18, array([-8.3821319e-04, -1.1214241e-03, -1.0474359e-03, ...,\n", + " -4.0887986e-05, 1.7188730e-05, 6.5576496e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_362.wav', 'Ganz der Papa!', 14, array([ 1.0614250e-06, 1.0387501e-04, 2.6466480e-05, ...,\n", + " -3.6802659e-05, 4.0980707e-05, 7.8629993e-05], dtype=float32), 1.3042291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_368.wav', 'Es geht schon, danke!', 21, array([-3.1714016e-04, -4.7203674e-04, -3.6235168e-04, ...,\n", + " 7.8341058e-05, 4.7649206e-05, 1.9486140e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_404.wav', 'Notieren Sie sich das.', 22, array([-3.1276091e-04, -4.1585916e-04, -4.4194568e-04, ...,\n", + " -1.9349645e-04, -6.0014678e-05, 2.7422161e-07], dtype=float32), 1.8153333333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_416.wav', 'Mach das Licht an!', 18, array([ 7.4020500e-04, 9.9551259e-04, 7.7506527e-04, ...,\n", + " -9.4190882e-06, -5.5277683e-06, 6.0646169e-05], dtype=float32), 1.273375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_418.wav', 'Gebt mir ein O!', 15, array([ 2.55384133e-04, 2.99102190e-04, 3.85188963e-04, ...,\n", + " -6.97520736e-05, -1.12780595e-04, -5.84875634e-05], dtype=float32), 1.5641875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_425.wav', 'Wir haben unsere Vorschriften.', 30, array([-0.0014397 , -0.00206455, -0.00194661, ..., 0.00017973,\n", + " 0.00031227, 0.00029818], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_428.wav', 'Dort spielt die Musik!', 22, array([0.00064248, 0.00109204, 0.00095334, ..., 0.00016345, 0.00021933,\n", + " 0.00016792], dtype=float32), 1.9386875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_439.wav', 'Runter von der Couch!', 21, array([0.00032077, 0.0003695 , 0.00031393, ..., 0.00016823, 0.00027614,\n", + " 0.00030219], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_440.wav', 'Geh, Martin. 
Geh!', 17, array([-0.0006147 , -0.00096355, -0.00084441, ..., -0.00019064,\n", + " -0.00014664, -0.0001376 ], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_442.wav', 'Dann ist doch alles paletti.', 28, array([-0.0003903 , -0.00051721, -0.00051659, ..., 0.00044963,\n", + " 0.00069829, 0.00057605], dtype=float32), 1.7915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_464.wav', 'Hören Sie erst einmal zu!', 26, array([-2.3209564e-03, -3.7553089e-03, -3.8581355e-03, ...,\n", + " 4.0617133e-06, 6.2217005e-05, 1.8342262e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_475.wav', 'Ich will die Hände sehen!', 26, array([-1.1517418e-03, -1.5774536e-03, -1.5022659e-03, ...,\n", + " 8.5659660e-05, 1.5909245e-04, 1.0823877e-04], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_481.wav', 'Du kennst doch Tessa.', 21, array([-9.7565542e-05, -8.4838466e-05, -2.1631434e-04, ...,\n", + " -9.0966016e-05, -9.0894253e-05, -1.5524645e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_487.wav', 'Angeber und Neidhammel.', 23, array([-4.2524905e-04, -5.5071624e-04, -4.9216941e-04, ...,\n", + " -9.1045105e-05, -3.0268184e-05, -1.0583480e-04], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_491.wav', 'Können diese Augen lügen?', 27, array([-1.04710832e-03, -1.57430710e-03, -1.43215503e-03, ...,\n", + " 1.43472225e-05, 1.20743534e-05, -1.07111417e-04], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_499.wav', 'Kann man hier denn nicht lüften?', 33, array([-9.1343711e-04, -1.1802320e-03, -9.9357730e-04, ...,\n", + " 7.8159035e-05, 2.3012167e-04, 3.3637294e-05], dtype=float32), 1.9871666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_500.wav', 'Der Mann ist vom Leben gezeichnet.', 34, array([ 1.06765685e-04, 2.15540877e-05, -9.11364405e-05, ...,\n", + " -5.42830057e-05, -9.09425871e-05, -3.43727625e-05], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_506.wav', 'Wollen Sie mich aushorchen?', 27, array([ 0.00060325, 0.00087957, 0.00074186, ..., -0.00021219,\n", + " -0.00024823, -0.00017538], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_531.wav', 'Je eher, desto besser.', 22, array([5.7826861e-04, 7.7570765e-04, 6.1795511e-04, ..., 8.9765228e-05,\n", + " 4.5600675e-05, 1.4581751e-04], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_538.wav', 'Och, komm schon her!', 20, array([-4.9066258e-04, -7.3491497e-04, -5.5824185e-04, ...,\n", + " 8.5976262e-06, 1.0786976e-04, 1.2791457e-04], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_542.wav', 'Nimm deine Maske endlich ab!', 28, array([ 5.4343470e-04, 7.2278164e-04, 7.2296784e-04, ...,\n", + " -3.4153378e-05, -3.6221893e-05, -8.8784982e-05], dtype=float32), 1.9386875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_574.wav', 'Wollt ihr mich ärgern?', 23, array([0.00089293, 0.00139316, 0.0012052 , ..., 0.00011375, 0.00022351,\n", + " 0.00014075], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_599.wav', 'Das ist knorke.', 15, array([-8.0439750e-06, -4.1563135e-06, -3.6478632e-05, ...,\n", + " -1.6141655e-04, -8.8675122e-05, -1.2264083e-04], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_603.wav', 'Suchst du Ã\\x84rger?', 17, array([ 1.8951594e-04, 3.2533749e-04, 2.3231433e-04, ...,\n", + " -1.0691231e-05, -6.9874281e-05, -4.5488341e-05], dtype=float32), 1.6038333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_615.wav', 'Hör nicht auf diese Schwätzer!', 32, array([ 0.00019477, 0.00020745, 0.00017311, ..., 0.00030501,\n", + " -0.00018354, 0.00024707], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_618.wav', \"Gleich geht's weiter!\", 21, array([-6.4648612e-04, -1.0017229e-03, -9.2825363e-04, ...,\n", + " -4.5593577e-05, -6.6424482e-06, 1.4339538e-05], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_641.wav', 'Herr, erbarme dich!', 19, array([ 9.9213721e-06, 1.8233144e-05, -3.5843041e-05, ...,\n", + " -5.0301041e-05, -1.3241796e-04, -2.0356404e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_649.wav', 'Sammelt Holz für das Feuer!', 28, array([-0.00024918, -0.00046716, -0.00041068, ..., 0.00016901,\n", + " 0.0001653 , 0.00017449], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_692.wav', 'Erst will ich noch duschen.', 27, array([ 2.7669812e-04, 5.0494721e-04, 5.6616898e-04, ...,\n", + " 4.0362014e-05, -7.8570345e-05, 6.2029525e-05], dtype=float32), 1.6082291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_703.wav', 'Was kommt als nächstes?', 24, array([ 5.5248733e-04, 8.9842337e-04, 6.7765010e-04, ...,\n", + " -1.3254551e-04, -9.5152573e-05, -2.1063161e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_718.wav', 'Setzen, sechs!', 14, array([-1.2044140e-04, -2.0982703e-04, -2.7291384e-04, ...,\n", + " 1.7828704e-04, 9.6640695e-05, 1.3019536e-05], dtype=float32), 1.2689791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_720.wav', 'Dann nehmen wir meinen Wagen.', 29, array([ 1.2858727e-04, 1.7004457e-04, -5.1648447e-05, ...,\n", + " 2.5735653e-04, 2.8828968e-04, 1.9113944e-04], dtype=float32), 1.9915833333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_724.wav', 'Lach mal wieder.', 16, array([2.5169516e-04, 3.1780155e-04, 2.4175562e-04, ..., 1.8466891e-04,\n", + " 9.4025556e-05, 1.4185447e-04], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_726.wav', 'Lass uns mal Fieber messen!', 27, array([ 4.6217057e-04, 7.1049004e-04, 5.8858085e-04, ...,\n", + " -2.7612457e-06, -4.4886579e-05, -1.3602876e-06], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_733.wav', 'Ja, du hast ja Recht!', 21, array([-0.00065709, -0.00095549, -0.00067059, ..., 0.00023162,\n", + " 0.00042249, 0.00021008], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_745.wav', 'Kannst du bitte das Licht anlassen?', 35, array([-4.5024044e-05, -6.6272514e-05, -1.4942518e-04, ...,\n", + " -1.0059726e-04, -8.9730158e-05, -4.9335773e-05], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_755.wav', 'Jetzt wird gefeiert!', 20, array([ 6.5074948e-04, 8.2373072e-04, 6.9322297e-04, ...,\n", + " 2.5613972e-05, -7.3600226e-05, 9.0847658e-05], dtype=float32), 1.4892708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_21.wav', 'Oh, ein Blechschaden!', 21, array([ 2.7968596e-05, 2.5622614e-05, 5.5850909e-05, ...,\n", + " -3.6388674e-06, -1.3192165e-05, -5.8324472e-06], dtype=float32), 1.7536458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_24.wav', 'Woran erkennt man sie?', 22, array([-1.6248678e-05, -2.0881544e-05, 2.2568598e-05, ...,\n", + " -1.0051125e-06, -4.4804568e-05, -3.8311518e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_57.wav', 'Wo hast Du den Ludenmantel her?', 31, array([ 9.4084098e-05, 6.2570427e-05, 8.1058839e-05, ...,\n", + " -3.1764132e-05, -4.2468575e-05, -3.3772998e-05], dtype=float32), 
1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_59.wav', 'Bingo!', 6, array([ 4.7897654e-05, 2.7239477e-05, 3.7255515e-05, ...,\n", + " -1.7023414e-05, -2.9687346e-05, -3.9503360e-05], dtype=float32), 1.1456041666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_64.wav', 'Schreibt man das so?', 20, array([-1.6650798e-04, -2.2954465e-04, -2.1082905e-04, ...,\n", + " 5.5576045e-05, 1.4893518e-05, 2.0421723e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_81.wav', 'Halt mal kurz mein Bier.', 24, array([-8.2688921e-06, -1.1980872e-05, -4.0169580e-06, ...,\n", + " 8.8575485e-05, 1.3926605e-04, 3.6588870e-05], dtype=float32), 1.8417708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_83.wav', 'Doch Hilfe naht bereits.', 24, array([ 4.9734876e-06, 5.2194659e-06, 1.2122488e-05, ...,\n", + " -1.8982364e-05, -4.2752654e-05, -8.2323677e-05], dtype=float32), 1.98275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_84.wav', 'Formation einnehmen!', 20, array([8.1898354e-05, 7.4887575e-05, 6.6653323e-05, ..., 7.7452451e-06,\n", + " 2.1070047e-05, 3.0395060e-05], dtype=float32), 1.8682083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_87.wav', 'Holt mich hier raus!', 20, array([ 6.53247334e-05, -2.15428197e-04, -5.42638707e-04, ...,\n", + " -1.15612675e-05, 2.72592151e-05, 1.50995202e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_88.wav', 'Da vorne kommt mein Ex.', 23, array([-2.0436737e-04, -7.5976342e-05, 9.7310134e-05, ...,\n", + " 8.3587765e-06, -3.2081423e-06, 1.7971579e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_113.wav', 'Glaube es mir einfach.', 22, array([-2.7944061e-05, 1.0844935e-05, -1.5047234e-05, ...,\n", + " -2.7743961e-05, 2.9569403e-06, -3.5605283e-06], dtype=float32), 
1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_115.wav', 'Was die Leute immer haben!', 26, array([-5.4329084e-05, -8.8018889e-05, -7.1306808e-05, ...,\n", + " 7.3982832e-05, 5.8832418e-05, 6.6730849e-05], dtype=float32), 1.9431041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_124.wav', 'BloÃ\\x9f nicht!', 12, array([1.01506448e-04, 1.75192414e-04, 1.12130554e-04, ...,\n", + " 3.55834927e-05, 4.65009398e-05, 5.75332670e-05], dtype=float32), 1.0310416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_126.wav', \"Ich tu' immer nur rein.\", 23, array([ 3.21958287e-05, 2.19840458e-05, 1.46883485e-05, ...,\n", + " -8.37586867e-06, -5.43750639e-06, -1.22217643e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_138.wav', 'Wann lief diese Sendung?', 24, array([-1.7348650e-05, 1.9956657e-05, 3.1632226e-05, ...,\n", + " 1.5858004e-05, 1.8046559e-05, -4.8364400e-05], dtype=float32), 1.9563333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_143.wav', 'Gehen Sie aus dem Weg!', 22, array([1.80967872e-05, 1.12411635e-05, 1.61865628e-05, ...,\n", + " 6.79703808e-05, 7.41552940e-05, 9.28417285e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_145.wav', 'Es geht drunter und drüber.', 28, array([ 5.3915655e-06, 8.5220972e-06, -3.3527529e-05, ...,\n", + " -1.0693114e-05, -6.3991156e-06, 1.2663132e-05], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_149.wav', 'Suchscheinwerfer einschalten!', 29, array([-4.3899286e-06, 1.1313143e-05, -7.2204307e-06, ...,\n", + " -3.3424400e-05, -1.3328722e-05, -2.6314769e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_152.wav', 'Ihre Uhr geht vor.', 18, array([ 1.1011517e-05, -3.0811309e-05, -2.2571772e-05, ...,\n", + " 8.1292972e-05, 
7.4179443e-05, 7.1086802e-06], dtype=float32), 1.3394791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_153.wav', 'Wir suchen noch Freiwillige.', 28, array([-5.6182507e-06, -3.0251003e-05, 5.1053936e-05, ...,\n", + " -5.0866500e-05, -1.7348602e-05, -4.6226152e-05], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_157.wav', 'Halten Sie sofort an!', 21, array([ 2.3082459e-04, 2.3086018e-04, -2.2280088e-05, ...,\n", + " -4.5649995e-05, -3.0157349e-05, -1.7121181e-05], dtype=float32), 1.6501041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_190.wav', 'Zeig uns mal, wo der Hammer hängt!', 35, array([ 4.6486733e-05, 5.3618060e-05, 4.0510302e-05, ...,\n", + " -1.0646369e-04, -7.5534314e-05, -1.2183484e-04], dtype=float32), 1.91225)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_196.wav', 'Da kann man auch parken.', 24, array([-1.0925556e-05, -3.7278984e-05, -1.0163063e-05, ...,\n", + " -6.9978710e-06, -3.4896555e-06, -6.6393928e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_204.wav', 'Eine gute Stunde ist rum.', 25, array([-2.2296244e-05, -5.8680125e-06, -5.0762057e-05, ...,\n", + " -4.8879232e-05, -8.5942098e-05, -6.8862631e-05], dtype=float32), 1.6214583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_205.wav', 'Oh ja, das fetzt!', 17, array([ 3.4871216e-06, -4.8185248e-06, 1.2310127e-05, ...,\n", + " -1.7998637e-04, -4.5437564e-04, -3.7538476e-04], dtype=float32), 1.5025)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_225.wav', 'Sag das Zauberwort!', 19, array([ 3.0607847e-05, 4.5160428e-05, 1.8997842e-05, ...,\n", + " -1.6968366e-05, 1.1446763e-05, -3.4663015e-05], dtype=float32), 1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_242.wav', 'Die Arme!', 9, array([4.2858810e-05, 7.1904920e-05, 2.9656387e-05, ..., 5.8210357e-05,\n", 
+ " 4.0901028e-05, 3.2474836e-05], dtype=float32), 0.8636041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_246.wav', 'Schläfst du schon?', 19, array([ 3.2191518e-05, 5.0761428e-05, 4.3220087e-05, ...,\n", + " -4.0423780e-07, 1.7892495e-05, 5.0407853e-06], dtype=float32), 1.1456041666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_247.wav', 'Ja da schau her!', 16, array([-3.9683233e-05, -9.2827155e-05, -5.1356539e-05, ...,\n", + " 8.5207663e-05, 5.3869204e-05, 8.1267404e-05], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_251.wav', 'Moment, das ging anders.', 24, array([-7.3496245e-05, -9.7117241e-05, -9.9846256e-05, ...,\n", + " -2.2075654e-05, -5.6377292e-05, -3.1324758e-05], dtype=float32), 1.9475208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_254.wav', 'Weiter zum nächsten Kapitel.', 29, array([ 2.7818656e-05, 2.9083269e-05, 2.7292099e-05, ...,\n", + " -1.4497251e-05, 1.6704771e-05, 1.8156856e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_255.wav', 'Holla die Waldfee!', 18, array([ 3.2722608e-05, -3.4862321e-06, 2.1344584e-05, ...,\n", + " -3.5852513e-06, -1.3345180e-05, 1.8042003e-06], dtype=float32), 1.2777916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_278.wav', 'Lass mich nicht allein.', 23, array([-6.2487576e-05, -5.1307488e-05, 3.3147335e-05, ...,\n", + " -1.3666711e-06, -1.6965050e-05, 1.0842440e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_283.wav', 'Warst du beim Frisör?', 22, array([-5.1554598e-05, -2.8181448e-05, -2.1276550e-05, ...,\n", + " 5.1014787e-05, 6.0253118e-05, 4.9681836e-05], dtype=float32), 1.60825)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_291.wav', 'Warum bin ich so fröhlich?', 27, array([-9.2893220e-05, -9.0468158e-05, -8.4269959e-05, ...,\n", + " 
5.6945123e-06, 2.3743269e-05, -1.5906717e-07], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_307.wav', 'Das klingt sehr gut.', 20, array([ 8.8375481e-07, 1.4093188e-06, -8.0541049e-06, ...,\n", + " -6.2088387e-05, -3.6809190e-05, -5.5097131e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_310.wav', 'Kann man das mitessen?', 22, array([-1.8419527e-05, -2.5431269e-05, -8.9255473e-06, ...,\n", + " 2.5581608e-05, 3.7564107e-05, 2.2521937e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_311.wav', 'Wo liegt das Problem?', 21, array([-1.7069402e-05, 2.2379625e-06, -8.6348446e-06, ...,\n", + " 2.4881610e-05, -2.6925150e-06, 1.8407424e-06], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_313.wav', 'Wo kann man sich ausloggen?', 27, array([-2.94713544e-07, -2.60781735e-06, 2.09315767e-05, ...,\n", + " -1.10319825e-05, -5.37709784e-05, -2.63888141e-05], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_319.wav', 'Wo kommt das nur her?', 21, array([-3.2170439e-05, -2.5212325e-05, -3.7200436e-05, ...,\n", + " -9.3722010e-06, -3.0964005e-05, -1.5780270e-05], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_337.wav', 'Und wenn man die nicht hat?', 27, array([-4.4960318e-05, 5.2144351e-05, -2.9507015e-05, ...,\n", + " -3.9032249e-05, 3.4188946e-05, -2.3692317e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_347.wav', 'Sag ihr das bloÃ\\x9f nicht!', 24, array([8.6986920e-06, 4.4441199e-06, 3.0283294e-05, ..., 9.9162316e-05,\n", + " 7.8216704e-05, 9.9542762e-05], dtype=float32), 1.6126458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_348.wav', 'Wo bleibst du?', 14, array([ 3.3125209e-05, 
5.7069548e-05, 3.6280937e-05, ...,\n", + " -2.4643228e-05, -2.7121812e-05, -1.5307731e-05], dtype=float32), 1.2998125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_352.wav', 'Jetzt oder nie!', 15, array([-1.7882267e-05, 1.5871639e-05, -7.5667369e-05, ...,\n", + " -3.7708491e-05, 7.9740630e-06, -7.9073770e-06], dtype=float32), 1.3747291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_368.wav', 'Hinsetzen und FüÃ\\x9fe hoch!', 26, array([-3.0999392e-05, -7.2621566e-05, -4.7179296e-05, ...,\n", + " -2.5928295e-05, -3.2266624e-05, 1.4868124e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_394.wav', 'Nimm dir mal eine Pause!', 24, array([-2.7986377e-04, -3.0645030e-04, -2.3860915e-04, ...,\n", + " -3.2176635e-05, -4.1073359e-05, -1.7371191e-05], dtype=float32), 1.9519166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_427.wav', 'Vom Kinde verschmäht.', 22, array([2.4927745e-05, 5.9401387e-05, 5.5517099e-05, ..., 8.8263223e-05,\n", + " 3.5481713e-05, 1.4234082e-05], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_435.wav', 'Kann ich dir helfen?', 20, array([-7.2613778e-04, 1.4254064e-03, 4.3165400e-03, ...,\n", + " 9.7870041e-05, 6.2070317e-06, 1.0954802e-04], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_449.wav', 'Woher will sie das wissen?', 26, array([ 6.6095758e-08, -2.7216944e-05, -1.6521408e-05, ...,\n", + " 3.0345358e-05, -5.6843191e-06, -4.2101074e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_468.wav', 'Das lass mal meine Sorge sein.', 30, array([-4.7126541e-05, -5.9281327e-05, -3.5599784e-05, ...,\n", + " 2.0367926e-05, 4.0726398e-05, 1.8718367e-05], dtype=float32), 1.9342916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_500.wav', 'Ich habe heute Geburtstag.', 26, 
array([-5.1001366e-06, 4.8161728e-05, 1.0626727e-05, ...,\n", + " -8.0793325e-05, -6.0714734e-05, -7.9644029e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_513.wav', 'Fertig werden!', 14, array([-2.4789182e-05, -1.4137984e-05, -4.8843711e-05, ...,\n", + " 2.4393246e-05, 2.7856760e-05, 6.9619755e-06], dtype=float32), 1.3615)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_525.wav', 'Jeder trauert anders.', 21, array([-6.3906264e-06, -2.4861220e-05, -3.1557371e-05, ...,\n", + " -5.3394677e-05, 5.5594451e-06, -4.3505042e-05], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_530.wav', 'Wir sprechen uns später.', 25, array([ 1.9607371e-05, 1.2742041e-05, 5.9507223e-05, ...,\n", + " -1.0580019e-06, -1.0849526e-05, -2.2735680e-05], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_534.wav', 'Den Schuss nicht hören.', 24, array([1.0162838e-04, 1.3316146e-04, 1.3368837e-04, ..., 5.8495625e-06,\n", + " 7.8353441e-05, 3.3752654e-05], dtype=float32), 1.8726041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_543.wav', 'Wer tut das nicht?', 18, array([-1.5056261e-05, -2.7894443e-05, -8.4756257e-06, ...,\n", + " -4.3981410e-05, -3.8667356e-05, -4.8794256e-05], dtype=float32), 1.5773958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_547.wav', 'Zu so später Stunde?', 21, array([-1.2750152e-04, 1.9311530e-05, -6.8482601e-05, ...,\n", + " -8.0274267e-06, 3.7486578e-05, -4.1844236e-05], dtype=float32), 1.6478958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_569.wav', 'Ein Zirkus ohne Tiere?', 22, array([-2.8725301e-05, -5.8967784e-05, -4.7625667e-06, ...,\n", + " 5.3123777e-06, -7.1301661e-06, -2.9527286e-05], dtype=float32), 1.8461666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_570.wav', 'Sag schon, was ist drin?', 
24, array([ 1.10985304e-04, 5.97430153e-05, 9.55062278e-05, ...,\n", + " 6.52888993e-05, -5.82730863e-05, 6.85385385e-05], dtype=float32), 1.8373541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_574.wav', 'Wir zählen auf dich.', 21, array([-7.5106524e-05, -9.9009638e-05, -7.9571801e-05, ...,\n", + " 3.8461326e-06, 8.2744657e-05, 5.6746823e-05], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_585.wav', 'Das funktioniert auch.', 22, array([ 5.4934342e-05, 1.7679840e-05, -5.7660582e-05, ...,\n", + " 4.9520886e-06, -2.5478117e-05, -6.3567706e-05], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_589.wav', 'Was drauf?', 10, array([ 1.5172187e-05, 3.5768371e-05, -4.6845405e-05, ...,\n", + " 2.3743922e-05, -3.8076912e-05, 2.2450782e-05], dtype=float32), 1.2072916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_594.wav', 'Einer zur Zeit!', 15, array([-1.9068037e-05, -2.0037192e-05, -8.8215660e-05, ...,\n", + " -1.8433493e-05, -3.3125831e-05, 3.5209345e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_606.wav', 'Okay, und nun?', 14, array([-1.2366170e-05, 2.3954278e-06, -1.8647337e-05, ...,\n", + " -2.4212586e-06, 6.3337334e-06, -2.5126603e-06], dtype=float32), 1.5597708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_607.wav', 'Das verstehst du noch nicht.', 28, array([ 1.6215906e-04, 2.5805720e-04, 2.2398161e-04, ...,\n", + " -5.9032095e-06, -1.2547288e-06, -1.8913257e-05], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_609.wav', 'Wie uncool!', 11, array([-1.1241895e-05, -3.2969092e-05, -5.8745212e-05, ...,\n", + " 8.5234688e-06, 1.9909365e-05, 1.7495377e-05], dtype=float32), 1.0927291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_624.wav', 'Setzt dich gerade!', 18, 
array([-1.7491520e-05, 6.7394591e-05, 5.0117076e-05, ...,\n", + " -2.1143003e-05, -1.6165326e-05, -1.6601503e-05], dtype=float32), 1.3835208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_638.wav', 'Nicht schlecht der Specht!', 26, array([-1.0250892e-05, 1.4861113e-05, -5.1604333e-05, ...,\n", + " 7.6938113e-06, 2.0211788e-05, 4.5162437e-06], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_647.wav', 'Was haben die vor?', 18, array([ 1.2927976e-06, -4.4330540e-05, -4.2087355e-05, ...,\n", + " 1.2652035e-04, -7.1286093e-05, -1.9011653e-06], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_648.wav', 'Ich habe gar nichts mitbekommen.', 32, array([ 1.6062468e-05, 4.4314598e-05, 1.1317232e-05, ...,\n", + " -8.4248430e-05, -4.8613791e-05, -4.1891144e-05], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_664.wav', 'Je mehr, desto besser.', 22, array([-1.0978662e-05, 2.8232571e-06, -2.7930673e-05, ...,\n", + " 5.0805535e-05, 3.9726485e-05, 6.7175766e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_677.wav', 'Da vorne links!', 15, array([ 3.8325859e-05, 3.2421449e-05, 1.5961947e-05, ...,\n", + " 2.6722651e-05, -3.3873417e-05, 3.2344939e-05], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_682.wav', 'Jetzt mal halblang!', 19, array([-2.0417360e-06, -1.3626728e-05, -2.8990502e-05, ...,\n", + " -2.2435464e-05, -3.3464916e-05, 2.5530893e-05], dtype=float32), 1.4892708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_687.wav', 'Gib uns ein Beispiel!', 21, array([-8.4907850e-05, -5.6986839e-05, 3.7472455e-06, ...,\n", + " -1.4217812e-05, -2.3697576e-05, -2.4605337e-05], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_689.wav', 'Von 
wegen!', 10, array([-2.7207323e-05, -6.9836324e-06, -9.1906164e-05, ...,\n", + " 6.5761873e-05, 5.3384709e-05, 3.5098144e-06], dtype=float32), 0.8547916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_691.wav', 'Das finde ich ziemlich doof.', 28, array([-2.9834633e-05, 5.6474819e-06, -2.5375591e-06, ...,\n", + " -3.2603730e-06, -5.9017879e-05, -9.6670803e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_697.wav', 'Das trifft sich gut.', 20, array([ 7.9529818e-06, 3.9593842e-06, 3.0517844e-05, ...,\n", + " -4.2052940e-05, -3.0681629e-05, -2.6093589e-05], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_704.wav', 'Jetzt gibt es Zoff.', 19, array([ 1.7251841e-05, 3.0525447e-05, 4.0081544e-05, ...,\n", + " -2.7181366e-05, -6.4996988e-05, -2.0187828e-05], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_708.wav', 'Liebe ist kein Verbrechen.', 26, array([ 1.3942296e-03, 2.0183886e-03, 1.7392144e-03, ...,\n", + " 4.2136421e-06, 1.5667934e-05, -1.1447505e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_725.wav', 'Auch das noch!', 14, array([ 6.9235853e-06, 1.0541713e-05, -6.9821567e-06, ...,\n", + " -6.0647875e-05, -3.7899004e-05, 1.4291401e-05], dtype=float32), 1.2337083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_728.wav', 'Toller Hengst!', 14, array([ 1.5415973e-05, 1.2052349e-05, 2.2745300e-05, ...,\n", + " -5.1455394e-05, -8.6221211e-05, -2.3398878e-05], dtype=float32), 1.1632291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_748.wav', 'Reine Gewöhungssache.', 22, array([-7.46887818e-05, 3.63702893e-05, 2.65028193e-05, ...,\n", + " 1.14920855e-04, 8.75776823e-05, 7.50372201e-05], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_764.wav', 
'Siehe weiter unten.', 19, array([ 1.0315010e-04, 1.2668683e-04, 1.3160890e-04, ...,\n", + " 3.5362529e-05, -4.0091851e-05, 3.1800329e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_778.wav', 'Hilfe ein Ã\\x9cberfall!', 20, array([-9.1011774e-05, -1.6054764e-04, -6.9503607e-05, ...,\n", + " -3.2605390e-06, -1.1628125e-05, -4.9398786e-05], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_804.wav', \"Wen wundert's?\", 14, array([ 1.8174978e-05, 1.0757233e-05, 1.4760263e-05, ...,\n", + " -4.7010188e-05, -6.0861544e-06, -1.5782018e-05], dtype=float32), 1.2601666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_19.wav', 'Bis die Schwarte kracht.', 24, array([ 3.15900324e-05, -1.30308879e-04, 3.94875406e-06, ...,\n", + " 3.35644108e-05, 1.02667604e-04, 4.54106703e-05], dtype=float32), 1.7536354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_26.wav', 'Auch das wäre möglich.', 24, array([-4.5410670e-05, 1.9743770e-06, -1.9743769e-05, ...,\n", + " 4.3436296e-05, -1.9743770e-06, 3.5538786e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_28.wav', 'Was geht gar nicht?', 19, array([ 1.9743770e-06, 9.8718847e-06, -3.3564411e-05, ...,\n", + " 1.2241138e-04, -4.5410670e-05, 0.0000000e+00], dtype=float32), 1.4065104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_33.wav', 'Die Geschichte geht anders.', 27, array([-3.3564411e-05, -7.7000703e-05, -8.2923834e-05, ...,\n", + " 3.3564411e-05, -3.9487541e-06, 3.1590032e-05], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_42.wav', 'Welche SchuhgröÃ\\x9fe?', 20, array([ 0.0000000e+00, -1.1846262e-05, -5.9231311e-06, ...,\n", + " 4.5410670e-05, -3.9487539e-05, 2.9615656e-05], dtype=float32), 1.5685833333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_51.wav', 'Mediathek aufrufen!', 19, array([-3.6328536e-04, 1.9941208e-04, -8.4898209e-05, ...,\n", + " 5.9231311e-06, -5.7256933e-05, -4.9359427e-05], dtype=float32), 1.9739479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_54.wav', 'Es tut ihr furchtbar leid.', 26, array([-3.3564411e-05, -4.9359427e-05, 1.1846262e-05, ...,\n", + " 5.1333802e-05, -8.8846966e-05, 5.7256933e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_62.wav', 'Noch mal von vorne, bitte.', 26, array([ 2.5666901e-05, -2.9615656e-05, -3.7513164e-05, ...,\n", + " 8.6872591e-05, -5.7256933e-05, 6.9103196e-05], dtype=float32), 1.8417604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_63.wav', 'Oh jemine!', 10, array([-1.7769393e-05, -6.9103196e-05, -3.7513164e-05, ...,\n", + " 5.7256933e-05, 5.1333802e-05, 3.9487539e-05], dtype=float32), 1.20728125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_84.wav', 'Reich mir den mal rüber.', 25, array([-1.5795016e-05, -9.8718847e-06, 6.7128822e-05, ...,\n", + " 0.0000000e+00, -1.2833450e-04, 3.3564411e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_101.wav', 'Findest du nicht auch?', 22, array([-7.3051953e-05, -9.8718847e-06, 5.9231311e-06, ...,\n", + " 2.5666901e-05, -5.3308180e-05, 1.1451387e-04], dtype=float32), 1.32184375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_113.wav', 'Alles korrekt.', 14, array([ 1.1846262e-05, 2.9615656e-05, 1.2833450e-04, ...,\n", + " -1.9743769e-05, 2.7641279e-05, -1.7769393e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_128.wav', 'Alles wird gut.', 15, array([-9.2795723e-05, -3.1590032e-05, 8.2923834e-05, ...,\n", + " 1.3820640e-05, -4.7385049e-05, 1.1846262e-05], dtype=float32), 1.6038333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_156.wav', 'Würde ich auch machen.', 23, array([-6.1205690e-05, -5.3308180e-05, -5.5282559e-05, ...,\n", + " -9.8718847e-06, -1.1648824e-04, -6.1205690e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_173.wav', 'Gib Gas!', 8, array([ 3.7513164e-05, 7.1077571e-05, -1.9743770e-06, ...,\n", + " 5.9231312e-05, -3.0405406e-04, 4.5410672e-04], dtype=float32), 1.03984375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_221.wav', 'Weil er es kann.', 16, array([ 3.9487541e-06, -3.1590032e-05, 2.1718148e-05, ...,\n", + " -9.6744472e-05, -3.9487539e-05, -6.3180065e-05], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_225.wav', 'Was ist der Sinn des Lebens?', 28, array([-4.7385049e-05, -9.0821341e-05, 8.6872591e-05, ...,\n", + " 7.8975081e-06, -1.3820640e-05, -2.0730958e-04], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_247.wav', 'Es ist kalt.', 12, array([-5.33081802e-05, -1.14513867e-04, 2.36925243e-05, ...,\n", + " -4.34362955e-05, 5.92313108e-06, -1.08590735e-04], dtype=float32), 1.17203125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_259.wav', 'ScheiÃ\\x9f drauf!', 14, array([ 9.8718854e-05, 3.9487541e-06, 5.9231312e-05, ...,\n", + " -3.3564411e-05, -1.7769393e-05, -1.1253949e-04], dtype=float32), 1.19846875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_271.wav', 'Katzen haben sieben Leben.', 26, array([-1.57950162e-05, 7.89750811e-06, -6.12056901e-05, ...,\n", + " -1.04641986e-04, -7.30519532e-05, -5.92313108e-06], dtype=float32), 1.8593854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_285.wav', 'Nicht so lasch!', 15, array([ 3.3564411e-05, 1.4412952e-04, -8.8846966e-05, ...,\n", + " 5.9231311e-06, -1.4610391e-04, -3.1590032e-05], dtype=float32), 1.4187708333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_309.wav', 'Ich gehe dann mal kicken.', 25, array([-7.8975077e-05, -5.1333802e-05, 2.1718148e-05, ...,\n", + " -9.8718847e-06, 0.0000000e+00, 1.7769393e-05], dtype=float32), 1.6743229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_313.wav', 'Versuchen Sie es später noch einmal!', 37, array([1.7769393e-04, 1.5597578e-04, 7.7000703e-05, ..., 1.7769393e-05,\n", + " 2.5666901e-05, 0.0000000e+00], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_327.wav', 'Ulrike muss es ja wissen.', 25, array([ 1.3820640e-05, -4.3436296e-05, -2.5666901e-05, ...,\n", + " 8.0949460e-05, 3.1590032e-05, -1.5795016e-05], dtype=float32), 1.5553645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_329.wav', 'So viele schon?', 15, array([-1.1451387e-04, -9.4770097e-05, 1.3820640e-05, ...,\n", + " -9.8718847e-06, 7.8975081e-06, 3.3564411e-05], dtype=float32), 1.32625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_339.wav', 'Noch nicht.', 11, array([-1.1569849e-03, -1.1234205e-03, -1.1056511e-03, ...,\n", + " -4.1461917e-05, -1.9743770e-06, -2.3692524e-05], dtype=float32), 0.9473229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_351.wav', 'So alt bin ich dann auch wieder nicht.', 38, array([ 1.0957792e-03, 8.6082838e-04, 5.8836438e-04, ...,\n", + " -7.7000703e-05, -1.0661636e-04, -5.3308180e-05], dtype=float32), 1.9100520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_352.wav', 'Diese Gelegenheit kann man nutzen.', 34, array([ 1.2043700e-04, 1.9743769e-05, 7.5026328e-05, ...,\n", + " 3.1590032e-05, 6.5154440e-05, -5.1333802e-05], dtype=float32), 1.9981770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_361.wav', 'Bist neidisch, was?', 19, array([ 3.9487539e-05, 1.7769393e-05, -2.9615656e-05, ...,\n", + " -5.9231311e-06, 1.9743770e-06, -3.1590032e-05], dtype=float32), 
1.4848645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_364.wav', 'Puh, das Quiz ist schwer!', 25, array([-5.9231312e-05, -6.9103196e-05, -8.2923834e-05, ...,\n", + " 1.3623202e-04, 1.3030888e-04, 2.1520710e-04], dtype=float32), 1.9497083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_371.wav', 'Alles klärchen!', 16, array([-3.9487541e-06, 3.3564411e-05, 1.5795016e-05, ...,\n", + " 5.1333802e-05, 6.1205690e-05, 3.5538786e-05], dtype=float32), 1.5068854166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_392.wav', 'Zeig mal dein Piercing!', 23, array([ 5.7256933e-05, 1.3820640e-05, 3.5538786e-05, ...,\n", + " -6.1205690e-05, -9.8718847e-06, 5.5282559e-05], dtype=float32), 1.7712604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_393.wav', 'Parlieren Sie doch im Park!', 27, array([-7.1077571e-05, -5.3308180e-05, -5.7256933e-05, ...,\n", + " 1.5795015e-04, 1.1253949e-04, 1.0069323e-04], dtype=float32), 1.8505729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_400.wav', 'Hä, was? 
400', 13, array([ 3.5538786e-04, 2.7443841e-04, 2.5469463e-04, ...,\n", + " -6.3180065e-05, -1.7769393e-05, -5.9231311e-06], dtype=float32), 0.98696875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_431.wav', 'Tun Sie nicht so überrascht!', 29, array([-2.2310461e-04, -2.6259213e-04, -3.0800281e-04, ...,\n", + " -7.7000703e-05, -1.0661636e-04, -1.1451387e-04], dtype=float32), 1.6743229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_438.wav', 'Was lernen wir daraus?', 22, array([ 3.35644108e-05, -7.70007027e-05, -7.30519532e-05, ...,\n", + " -1.02667604e-04, -8.68725911e-05, -2.76412793e-05], dtype=float32), 1.9078541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_439.wav', 'Was will sie denn noch?', 23, array([ 5.3308180e-05, 5.7256933e-05, -3.9487539e-05, ...,\n", + " -3.9487541e-06, 5.5282559e-05, 6.9103196e-05], dtype=float32), 1.7624479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_441.wav', 'Ich stecke fest.', 16, array([-1.6584767e-04, -1.5795015e-04, -1.3030888e-04, ...,\n", + " 9.2795723e-05, 7.5026328e-05, 7.5026328e-05], dtype=float32), 1.2976041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_451.wav', 'Es riecht nach Sonnencreme.', 27, array([-4.1461917e-05, -3.7513164e-05, 2.1718148e-05, ...,\n", + " -2.7641279e-05, -1.0661636e-04, -1.0069323e-04], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_452.wav', 'Da entlang!', 11, array([-1.7769393e-05, 5.9231311e-06, 1.7769393e-05, ...,\n", + " -7.8975081e-06, 7.8975081e-06, 0.0000000e+00], dtype=float32), 0.97815625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_453.wav', 'Tja, Thaddäus!', 15, array([-9.87188541e-05, -1.46103906e-04, -1.24385755e-04, ...,\n", + " 1.02667604e-04, 1.97437703e-06, -2.76412793e-05], dtype=float32), 1.6787395833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_458.wav', 'Das Leben 
ist voller Ã\\x9cberraschungen.', 37, array([-3.9487539e-05, 1.1846262e-05, -1.3820640e-05, ...,\n", + " 6.1205690e-05, 3.1590032e-05, 1.9743770e-06], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_460.wav', 'Danke der Nachfrage!460', 23, array([ 8.29238343e-05, 1.16488241e-04, 9.67444721e-05, ...,\n", + " -1.12539492e-04, -1.08590735e-04, -1.42155142e-04], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_465.wav', 'Ich will auch so ein Pferd.', 27, array([ 0.0000000e+00, 1.9743770e-06, -3.1590032e-05, ...,\n", + " 7.8975077e-05, -3.9487539e-05, -5.7256933e-05], dtype=float32), 1.5404791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_473.wav', 'Was hat das zu bedeuten?', 24, array([-1.26360130e-04, -1.08590735e-04, -1.16488241e-04, ...,\n", + " 8.29238343e-05, 2.36925243e-05, -1.57950162e-05], dtype=float32), 1.8241354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_492.wav', 'Die Narkose wirkt nicht.', 24, array([ 1.7571956e-04, 1.6782204e-04, 7.8975077e-05, ...,\n", + " 3.1590032e-05, -2.1718148e-05, -2.7641279e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_495.wav', 'Dein Bruder ist echt krass drauf.', 33, array([-7.10775712e-05, -3.35644108e-05, -2.17181478e-05, ...,\n", + " 1.16488241e-04, 1.02667604e-04, 7.89750775e-05], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_498.wav', 'Das behaupten alle.', 19, array([-1.1253949e-04, -1.1846262e-04, -9.8718854e-05, ...,\n", + " 5.5282559e-05, -1.1846262e-05, 4.5410670e-05], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_500.wav', 'So einfach ist es nicht.', 24, array([-0.00019349, -0.00019744, -0.00022113, ..., -0.00021521,\n", + " -0.0002231 , -0.00020534], dtype=float32), 1.7976979166666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_503.wav', 'Das gibt ihm den Rest.', 22, array([-3.94875406e-06, -6.71288217e-05, -1.20436998e-04, ...,\n", + " 1.04641986e-04, 1.24385755e-04, 1.14513867e-04], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_507.wav', 'Was fällt euch ein?', 20, array([9.8718854e-05, 9.0821341e-05, 6.7128822e-05, ..., 1.7374518e-04,\n", + " 2.0730958e-04, 1.5795015e-04], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_511.wav', 'Lass die Glucke in Ruhe!', 24, array([2.6851529e-04, 2.3692525e-04, 8.2923834e-05, ..., 9.2795723e-05,\n", + " 6.3180065e-05, 6.1205690e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_512.wav', 'Wieso denn das nicht?', 21, array([-3.9487541e-06, -2.5666901e-05, -6.9103196e-05, ...,\n", + " 3.1590032e-05, -1.9743770e-06, 1.3820640e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_542.wav', 'Na gut, ich komme mit.', 22, array([ 6.3180065e-05, -3.9487541e-06, 4.3436296e-05, ...,\n", + " -6.9103196e-05, -6.5154440e-05, 7.8975081e-06], dtype=float32), 1.8329479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_543.wav', 'Entschuldige dich bei ihr.', 26, array([-9.6744472e-05, -7.8975077e-05, -5.1333802e-05, ...,\n", + " -7.7000703e-05, -1.2241138e-04, -5.9231312e-05], dtype=float32), 1.7624479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_544.wav', 'Das reimt sich ja.', 18, array([ 1.1056512e-04, 8.4898209e-05, 1.1648824e-04, ...,\n", + " -9.0821341e-05, -1.1451387e-04, -1.1253949e-04], dtype=float32), 1.25134375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_549.wav', 'Sie kamen, um zu bleiben.', 25, array([ 3.9487539e-05, 7.8975081e-06, 3.3564411e-05, ...,\n", + " 2.1718148e-05, -2.7641279e-05, -9.6744472e-05], dtype=float32), 
1.8593854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_555.wav', 'Sie nimmt kein Blatt vor den Mund.', 34, array([-1.1569849e-03, -7.0287823e-04, -5.3308180e-05, ...,\n", + " 2.5666901e-05, 1.5795016e-05, -1.9743769e-05], dtype=float32), 1.8021041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_556.wav', 'Hoffentlich geht es ihm gut.', 28, array([1.0187785e-03, 1.1372411e-03, 1.2616270e-03, ..., 3.5538786e-05,\n", + " 7.8975081e-06, 5.9231312e-05], dtype=float32), 1.9342916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_557.wav', 'Vergiss deine Schoner nicht!', 28, array([ 3.9487539e-05, -5.5282559e-05, -2.0336083e-04, ...,\n", + " -6.9103196e-05, -7.1077571e-05, -7.1077571e-05], dtype=float32), 1.8461666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_590.wav', 'Wenn du schon so fragst!', 24, array([ 1.4610391e-04, 1.4807828e-04, 1.7966831e-04, ...,\n", + " 1.7769393e-05, -4.3436296e-05, -2.7641279e-05], dtype=float32), 1.8329479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_595.wav', 'Was muss ich einkaufen?', 23, array([ 0.00016387, 0.00012636, 0.00011254, ..., -0.00010464,\n", + " -0.00011649, -0.00010464], dtype=float32), 1.6038229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_599.wav', 'Der tut nichts!', 15, array([ 9.2795723e-05, 6.1205690e-05, 2.5666901e-05, ...,\n", + " -1.1648824e-04, -9.8718854e-05, -7.8975077e-05], dtype=float32), 1.16321875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_605.wav', 'Natürlich war es das.', 22, array([3.1590032e-05, 1.9743769e-05, 7.8975077e-05, ..., 1.4610391e-04,\n", + " 1.6782204e-04, 1.4412952e-04], dtype=float32), 1.5157083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_612.wav', 'Sprechen Sie deutsch?', 21, array([ 2.05335207e-04, 1.91514569e-04, 1.57950155e-04, ...,\n", + " 2.96156559e-05, -6.31800649e-05, -1.02667604e-04], 
dtype=float32), 1.5157083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_616.wav', 'Gleich hole ich Anne ab.', 24, array([6.3180065e-05, 7.1077571e-05, 1.2636013e-04, ..., 2.1718148e-05,\n", + " 3.1590032e-05, 1.3820640e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_646.wav', 'Ich bin ganz hin und weg!', 25, array([-3.5143911e-04, -2.5666901e-04, -1.6979642e-04, ...,\n", + " -4.3436296e-05, -6.1205690e-05, 4.3436296e-05], dtype=float32), 1.7800729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_647.wav', 'Frische Luft tut gut.', 21, array([ 1.04641986e-04, 1.97437703e-06, -7.89750811e-06, ...,\n", + " 8.48982090e-05, 1.38206397e-05, -7.89750811e-06], dtype=float32), 1.6655104166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_652.wav', 'Nein, du Genie!', 15, array([1.3623202e-04, 1.2833450e-04, 1.2833450e-04, ..., 2.7641279e-05,\n", + " 4.5410670e-05, 5.1333802e-05], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_675.wav', 'Das kann doch wohl nicht wahr sein!', 35, array([-1.9743769e-05, -3.3564411e-05, 3.1590032e-05, ...,\n", + " 9.2795723e-05, 9.6744472e-05, 1.2043700e-04], dtype=float32), 1.9827604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_684.wav', 'Jedes Kind weiÃ\\x9f das.', 21, array([-1.204370e-04, -1.382064e-04, -9.674447e-05, ..., -1.461039e-04,\n", + " -1.382064e-04, -8.489821e-05], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_711.wav', 'Die werden ja nicht schlecht.', 29, array([-1.3030888e-04, -1.0069323e-04, -8.2923834e-05, ...,\n", + " -1.3228325e-04, -1.1253949e-04, -9.6744472e-05], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_719.wav', 'Das Leben ist schön!', 21, array([ 2.5666901e-05, 4.7385049e-05, 2.9615656e-05, ...,\n", + " 
-3.3564411e-05, 3.3564411e-05, 7.8975077e-05], dtype=float32), 1.2953958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_720.wav', 'Was machst du jetzt?', 20, array([-1.5597578e-04, -1.2833450e-04, -1.3425764e-04, ...,\n", + " -5.3308180e-05, -3.9487541e-06, 3.9487541e-06], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_728.wav', 'Falsche Antwort.', 16, array([-5.1333802e-05, -1.1846262e-05, 9.8718847e-06, ...,\n", + " -9.8718847e-06, -4.7385049e-05, -5.3308180e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_743.wav', 'Oder wir gehen Burger essen.', 28, array([-2.9615656e-05, -4.7385049e-05, -3.1590032e-05, ...,\n", + " -8.2923834e-05, -5.1333802e-05, 6.3180065e-05], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_744.wav', 'Lasst mich allein!', 18, array([-0.00018757, -0.00018757, -0.00024877, ..., -0.00011846,\n", + " -0.00011057, -0.00013031], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_745.wav', 'Da sind wir wieder.', 19, array([7.2459638e-04, 7.7395578e-04, 8.3911023e-04, ..., 0.0000000e+00,\n", + " 1.3820640e-05, 1.7769393e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_754.wav', 'So weit, so gut.', 16, array([-1.7769393e-05, -3.5538786e-05, 3.5538786e-05, ...,\n", + " 9.8718847e-06, -5.3308180e-05, -4.3436296e-05], dtype=float32), 1.5245208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_756.wav', 'Alles war voller Qualm.', 23, array([ 6.3180065e-05, 2.9615656e-05, 3.7513164e-05, ...,\n", + " -3.1590032e-05, -3.3564411e-05, 2.1718148e-05], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_765.wav', 'Fick dich!', 10, array([-3.9487539e-05, -8.0949460e-05, -5.7256933e-05, ...,\n", + " 3.9487539e-05, 
7.8975077e-05, 9.4770097e-05], dtype=float32), 0.9076666666666666)\n" + ] + } + ], + "source": [ + "# print clips shorter than 2 sec\n", + "for item in data:\n", + " if item[-1] < 2:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sec_per_chars = []\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " sec_per_chars.append(sec_per_char)\n", + "# sec_per_char /= len(data)\n", + "# print(sec_per_char)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Average durations per char: 0.07641993439576344\n", + " > STD duration per char: 0.015251748851166484\n" + ] + } + ], + "source": [ + "mean = np.mean(sec_per_chars)\n", + "std = np.std(sec_per_chars)\n", + "print(\" > Average durations per char: \", mean)\n", + "print(\" > STD duration per char: \", std)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# fit a distribution\n", + "dist = norm(mean, std)\n", + "\n", + "# find irregular instances long or short voice durations\n", + "items =[]\n", + "pdfs = []\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " pdf = norm.pdf(sec_per_char)\n", + " pdfs.append(pdf)\n", + " items.append(item)\n", + "# if pdf < 0.395:\n", + "# print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA6gAAAOFCAYAAABnc8/AAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAgAElEQVR4nOzdd2AUZd4H8O+zm957QgqkUkJCKAmh9x4VEATECqJYALuCYANFLKfevbaznXp3WO70VMTeRVSqCCggQpDeOwRIMu8fWzK7O7M7u9nNzm6+n38gu7Ozz87Ozjy/p/weIUkSiIiIiIiIiPzN4O8CEBEREREREQEMUImIiIiIiEgnGKASERERERGRLjBAJSIiIiIiIl1ggEpERERERES6wACViIiIiIiIdCHE3wWwl5KSIuXm5vq7GEREREREROQDK1euPCBJUqrSc7oLUHNzc7FixQp/F4OIiIiIiIh8QAixTe05DvElIiIiIiIiXWCASkRERERERLrAAJWIiIiIiIh0gQEqERERERER6QIDVCIiIiIiItIFBqhERERERESkCwxQiYiIiIiISBcYoBIREREREZEuMEAlIiIiIiIiXWCASkRERERERLrAAJWIiIiIiIh0gQEqERERERER6QIDVCIiIiIiItIFBqhERERERESkCwxQiYiIiIiISBcYoBIREREREZEuMEAlIiIiIiIiXWCASkRERERERLrAAJWIiIiIiIh0gQEqERERERER6YKmAFUIMUwIsVEIsVkIMdPJdmOEEJIQolz22Czz6zYKIYZ6o9BEREREREQUfEJcbSCEMAJ4GsBgADsALBdCvC9J0q9228UCuBHAT7LHigFMANAeQCaAz4UQrSVJqvPeRyAiIiIiIqJgoKUHtSuAzZIkbZEk6SyANwCMVNhuHoCHAdTIHhsJ4A1Jks5IkrQVwGbz/oiIiIiIiIhsaAlQswBsl/29w/yYlRCiM4AcSZIWu/taIiIiIiIiIsALSZKEEAYAjwO4tRH7uEYIsUIIsWL//v2NLRIREREREREFIC0B6k4AObK/s82PWcQCKAHwtRCiGkA3AO+bEyW5ei0AQJKk5yVJKpckqTw1NdW9T0BERERERERBQUuAuhxAkRAiTwgRBlPSo/ctT0qSdFSSpBRJknIlScoF8COACyRJWmHeboIQIlwIkQegCMAyr38KIiIiIiIiCngus/hKklQrhJgG4BMARgAvS5K0XggxF8AKSZLed/La9UKItwD8CqAWwA3M4EtERERERERKhCRJ/i6DjfLycmnFihX+LgYRERERERH5gBBipSRJ5UrPNTpJEhEREREREZE3MEAlIiIiIiIiXWCASkRERERERLrAAJWIiIiIiIh0gQEqERERERER6QIDVCIiIiIiItIFBqhERERERESkCwxQiYiIiIh8rOZcHWrO1fm7GES6xwCViIiIiMjH2t79MbrM+8zfxSDSPQaoRERERERN4OTZ5tmD+vve4/jvyh3+LgYFCAaoRD6yctthnDpb6+9i6NqBE2eQO3Mx3lq+3d9FISIiIh8Z/MS3uO0/a/xdDAoQDFCbyI7Dp3DhM9/j8Mmzml+zovpQowKc1X8exvTXV6O+XvJ4H/6291gNTpxpOAZfbdiHpX8ccHs/l7+8DDe9sdqt19TW1WN59SEAwJLfDyB35mLkzVqM02fr8P6aXfhxy0HVMu85WoMxzy7FzW/+7HZZ/e273/djy/4TLrc7V1eP2rp6t/YtSZLN+fj4Z5sAAHe8/Yvqa87U1mHf8Rq33qep7bM7T4mIiPTkyw17cbbWvXs2+UbNuTos+b2hLnvk1FkcPXXOo329tGQrcmcuxoY9x7xVPF1ggOqBunoJy6sPofyBz9B6zkc4dbYW63YexaOfbFD98T/3zR9Y9ecRfLB2t+LzP245iDpzxX3P0Rq8tXw7xj73A255cw1OnKnFXf9bi3U7j2ou333vr8f
oZ5Zi0ZpdOHDyjMPzdS6C1u2HTuG9n3cqPrdu51HsPWYbMLy8ZCt+3n4Es975xa0LYG1dPSRJvSyV87/ABf+3xPr3pFeWY+ILP2HeB7/ik/V7sGHPMVzx8jK8urQa0xauQs05U/A48qklOHrqHL7ZtB8A8O2m/Xj35104cuos1u08ityZi/HQR78BMAVAM15fjRXVh/Dxuj3YdvAkvtq4D4WzP8JFz/2AO//7C95fYzoWkgTMeXcdZry+GhOe/xHTFq7Cwp/+tJavvl5C5fwvMPVfKwEAa3c0fGfbDp5EzTlTsLVl/wl8tXEfXvxuC06aA5tNe49DkiQcPX0O76yyHQbz/eYD+HWX84tPXb3kcCy3HzqFuYt+tQkKf9lxBC9+twUTnv8BNefqIEmm1207eBIb9xzHZS8tw4C/fAPAFHidqa2z7vf3vcfx2a97Me65H1A0+yMUzv4Im/fZBrNXv7YCT335u2IZ+z76NTrN+wy/7T6GI6fOYv/xhnNz0OPf4MnPN+HCZ763ec3Nb/6Mrg9+YS3DlFeX4+UlWwGYLvJfbdjndqDsbV3nf4Hz/vZdk76nJElevSH9tOUgPrS7Pj3+6UZcZz6XPXXwxBm88O0W1d/57qOnnTbErdx2GFNeXeHxzftsbT3+/dM267Xmm037NTXafbR2N3JnLsYf+09Yr5cHT5xBXb2ErQdO4qO1u/HLjiMuf5eBzlL56ffoV26/dsfhU/jnj9twrOYcth08CcB0jfxt9zGcqa3Dpr3H8cr3W1FXL2HxL7uxbudR/LH/BCrnf449R/XdKEUNmrIBseZcHd5asd1pvUGPjp46h0MaOijeWbVD9Vq3/dApXPbST9Y6gxb15nrB8upDmPzKCsx85xf8uOUgTpypxX9WbHe7A+PPg6dQW1ePpX8cwAvfbkHuzMXWpE/n6urx189/d9mx8s8ft+Hjdcp1YblzdfU4dPIsfvjjoE05D588i+M1nt0PLH7ddQxt7/4Ifx485dbrdh89jZ+3H3G6zbm6epyzq5Os23kUkiThxy0HceMbqzH6maW49KWfsH6XqY7Yce5nKJv7KbYdPInZ/1uLl5dsxeUvL8OBE2ew/dAp3PW/tXhpyVas33UUv+46hl1HTmPjnuMAgIc/3gAAGPbkd/hDQ+dCoBB6+5GXl5dLK1as8HcxnLr85WX41hz4AEDPwmR8v9nUmzaoXTqev6wLDAYBANh64CT+u3I7nv7qD+v2c6raITYiBHe+vRbZiZGYN7IEk15Zjj6tU232ay88xIDf5g7Dol924bwOmTAI4Np/rcT9F5TgndU7MKhdOlqnx2LZ1kMY9/cfrK9bdtdARIWH4Nhp0w+632NfQwDY+MBwnKurx/PfbsFVvfIQEWoEYGrJ6Tr/C5ytrUf1giqHcuTOXAwAWHf/UMSEh2DnkdPoueBL6/MzBhbhb1/8ju/u6I+cpCiH1//zx23o3DIB7TLikH/Xh5jcMw83DS7C2dp6pMSEAwBWbjuEgtQYdJxrSiYwtku2prkLseEhOG538b68eyu89sM2xe3X3DMEZXM/dblfV6b2zcffv9mC7vnJ+EGlZ1WLEaUZOHjiLH7aeghzqtrh1Nk6dGmViEte/AkAMLg4HS9cXo6xzy5FvSRh4dXdIEnAul1HcdFzDd95iEGgVnZBf39aT3TITgDQ8P3JGQTg7D4VFWbE1D4FeOLzTU7LP6hdGj7/bR8AoGteEspbJeJMbT1mDW+L4zW16KQxOUTvohTsPHwaT03sjBHmwG/xjF44evocJr5gOhaf3twHQ574FoDpuC35/QBendwVsREhqJeAf/6wDbOr2mHbwVNYt/MoxnTJBmBqCHjk443o1yYVY7tkIyLUiK827MPbq3bgqYmdsedoDQ6cOIOSrHhNZf1j/wkMNAf0lt/Lym2HcejkWQwuTgdgOubJ0WFYefdgSJKEt1Zsx95jZzB9QCG2HzqNrMRIGM3XDFe2HjiJd1fvRE5SFG7
7zxrMHdkex2tqcX2/Aghhu4+jp8/h0U82YPaIYkSGGRX3V3OuDgdOnEGvh00ByHs39ERZju25svWhEXjis024pFsrpMdFYM32IyjJiodBAF/8tg9xkaHompeEpX8cQJjRgPLcJOv+r/zHMny9cb91v3uP1eDJzzfh9WXb0TYjFhv2HEfHnAS8e0NPAEDV375DWmw4/n5ZOQwCKJz9kXVfmx8cjhCjY7vq8ZpzmPXOWkzs2hLd8pOx5cAJRIeH4NTZOlz92gps2W8KjrISIrHzyGnrd7V2x1EkRIUiIz4CdfWS9Rr48Mcb8OzXf9i8x8KrK63nnj3L977wpz/x75+2YfGM3tbnTp+tQ0SoATsOn8aqPw9jZMcsm9cePnkWB06cwT+WVmNAmzQYjQKb9hzHQx9twEc39ka7FnHWbc/U1uH3vSeQHBOGFvGRAICvN+5DcWYcwowG/LT1EIrSYnDdv1bhoTGl6NwyETXn6iAEEB5ixKGTZ3Hk1Fnkp8YAAE6drcXkV5bjxy2HEBcRgn9NqcTy6sNokx6LXkUpmPfBr3jJ3BgEAJGhRiybPRBCCIx8agkeu6gMnVom2nye8X//AeeVZeKRjzfgeI3ttXjL/BHIv+tDh+P34OgSzP7fOofHZw5vi9GdspAeF6F43L3hreXbUZmfhFbJ0YrPS5KE+xf9igldc9A2I05xm2DQbf4X2HOsBo+M6YBxFTn48+ApJEaHIjYiFOfq6rH3WA3++eM2/P2bLdgwb5j1t/LJ+j2Y+s+VWDilEj0KU9x6zw/X7sazX/+B88taoKpDJrISIm2e37zvBHKTo2x+85Zz8qUryjGwXbrHn3fVn4fRMTsBWw6cwLGaWnS2O4+VSJKEmnP1OFdfjzCjwXoMauvqYTQICCHw6tJqbNl/AvePLMHxmnPYdvAUUmPDUTn/C5t9bZg3DLX1EmLCQ6yPrdl+BCOfNjXQ9i5KwT+vqrR5jfze/dhFZRhrvqfJnTxTi192HEWoUSAi1Ijz/m8JJvfMQ4+CZEx5zbFu/dCFpRjYLg1xEaGY9c5a9GuTanON+nbTfuSlRCMnKcpa17PUdywsdb03l/+JO99ei6l98jFrRDtrI4IQApIkIW+W7W+/ekEV6uolfLVhn7Vsf53QETHhIdh//AxmvrPWuu2NA4tw8+DW2HusxnosHx9Xht1Ha3BFj1zU1UsIMQjUnKvDoMe/wU2DWmPPsRpU5iWhX5s01NWbGuEt174b31iN937eBQB4ZEwHjO6chcMnzyI2ItTmXmmpT1vuX/mzFqNeAq7pk4826bHWegUAVB84iSWbD2D+h78hPMSAJ8Z3xJX/WI6+rVPxzab9mDeqBHe/a3udm9a/ENf2K0DJvZ84fDdaVOQmYnn1Yevfj48rw6iOWVi9/Qi6tHJ9TvubEGKlJEnlis8xQHXP019txqOfbHS53fQBhTAaBJ78XLknyVc+mN4Le4/V4KpXG47hP66swKRXljtsO6pjJrq0SsTd761HUVoMPrulL37ZcQQXPNXQg/X5LX1RmBaDunoJry6txvllmah48HPr8/NHl6I4Mw6jnm54TVJ0mLWl8KUryvHp+r24bWgbGASw6s8juNp8Ifr8lj4Y9Pi3NmX6z7XdbQItcvTVbf3Q/7Gv3XpNbESIQ2WxuUmPC8eXt/ZDeyc3gv+7uBOmv24aCn5+WSZKMuPQITsB4aEGtE6PxZ6jp3Gmth61dRLKchJwprYOd/73F7xrvtHdOLAI1/YtQLt7PgYAfHN7P9z93nprw1P1giosWrPL+h4x4SE4caYWEytb4u6qYrz2QzXaZMTivZ93YWj7dNTWS5i2cDW+ub2ftQKt1MAgd+vg1qjMT8bSPw7gpSVbcbymFh1zEnDv+cXoZA5Ypv5zpXV0gRaW4KxHQTLKchLw7Nd/4PahbbD3WI218ef2oW2s18aY8BC8MqkC6XERGPbktzh5tg5PjC/DBWVZKFAIUABTg1e9JKHDfc4bjDY
+MAzhIaYKxMpth/HNpv34aO1u/G7uyb+uX4FDcKmkMi8JP201DeHPS4nG1gOmINZoEC5HmNhrmxGL8FAj1qi0rPdvk4qvNpqOd3mrRFzfvwC3vrUG1/crxIMf/uZ03wYBdGmViHmjSnDRcz9Yf8cXds7CT1sOWQNuJePKs/HWClPDXnxkKI6aGym/vb0/Bj3+Dc46GXlwfb8CPKNwHOXHKjEqFKvvGQLA1FN9+mydRw1+V/bIxStLqxWfi4sIwS/3DXV7n3JTXl2BgydNDUKTXzHdfzq3TMDb1/VA3qwPrZ/jeM05vPbDNlzXt8DayLxu51Gc939LkBYbjmWzBzWqHE3peM05xISH4N8//YmV2w5jaPt0ZMRHItQokBkfiTpJQlxEKMru/xTX9rVtfPzbxZ0ww3yNurp3Hl74bqvNvgvTYnDgxBmsvnsw5n/4G174bisGtUvHriOn8fZ1PSBBwsKf/kRyTBhOn61H/7apSIwKswZ0/125A3kpURjzbMO9PsxowNe390OPBV+ia14S5o5sj2FPfocre+QiOToMU3rnIzLMiBmvr8b7a3ZhXHk2xpXnoEurRJytq8cL327B5T1yERcRat3nb7uP4fd9J9C3KBWb9h1HRW4STp2txbwPfsPry/60OZefu7QzhpW0cHpMX/uhGve8tx4A0K5FHD66sTcOnzyLTvM+w4yBRbi8eyuUP/C59RiFGg34bbfnIyyeHN8Re4/VYPO+E9i074Ti9WVQu3QkRYdixsAiPP7ZJryzSnnkm5r8lGhsOXASpVnxWGsepffKpAr8tPUQ9h6twTurdyLUKFCQGoO+bVJtAlOL1ukxuPf89tiy/wTuNh8fua0PjUC7ez5GzTnH643adUaJJ3UfoKHTBAA+vqk3QgwGDHr8G5ttIkIN1vJ9c3s/3PW/tchNjsa/ZSPknPni1r7Whmq9iA4zYv3cYf4uhlMMUL3IVeUwGG1+cLhNLwYRAavuHozOTbRcwLOXdMbw0hbYfugUej/i/lBLiyt75OLDtbux7/gZ1xu7EGoUOFfn3v3DncqImpeuKMeGPcfx1y9+53wqHXjn+h7o3DLRp/fGNfcMQXxUqMPjlt6Uf0+pRE9Z790Hv+xC78JUfLlxL9pnxltHWth7dGwH3P5f0/z39plxWG8ert06PQYlmfEIMQprgG8JUGvO1eHQybOolyTsPVaDLq2SFPf99cZ96FmYglCFHv8dh08hMtSIZPOIITU15+oQZjRYg2Wt9h2rQVe7Hjs9sIw08PRc+fmewZi2cDWWbG6Yu/f0xM7Ysv8E/mLOZ2AZxZSfGm0dOWHxxPgy3PymcpKe7vnJeP2abli38yheXVqNR8Z2sPb8/e2LzRhemuFwHslH8ZApKLef8kP+tXLOIJfXGX9igOpFzTFAJSL/+/LWvta5wUTNyfllmfi/izs5PP7h2t24/t+rEB1mRN82qZg5rB1uWLjK2hPkbfKeGAulaTA/bTmI8c//iKl98zFreDsAwFNf/o6zdRKGFKfjPHNeBflrtx08icOnzqGjeXi9ZUjkxV1b4qELS63bbT90CkaDQKZsOOzOI6dxoqYWbTJisX7XUTy4+Dcs/cPzqSa+Ih9l4S1T++Tj79869uo11oWds/D4uI6s81FAmzuyPS7vnuvvYqhyFqCGKD1IRET6wuCUmquP1u4GFAJUi5Nn6/Dh2j34cO0en5bDPjhVc+CEaYrL9kOnzMlRgMc+3aS6D/nUoWn9C9G2Rax1OY7Xl/2J+aNLrPPLLSMotswfAYPB1MMnzwGhZ94OTgH4JDgFgHdW7cRXG/b5ZN9ETUU+xznQBG7JiYiIKOgZhPIQ121uZuD0hRe/24LxFTmINc99PHGmFhvN2bVdBc2vLq1G6/RYm8Dtqa82O2z32KcbEWo0oGtew3Din3ccQXZCJJ7UGDST+w57mDmciBqPASoRERHpllIypyW/H7Aur+BPDyz+DQ8s/g2RoUZM6pnr1hzre99
fj6cndna5nXwVAIspr67QtGwJEVEg4jqoFFQKUpWXCmiMGQMKvb5PIiLSzrJe4Lurd2L7oVO6W5T+9Lk6jxKA3bBwlUfvx+CUiIIZA1SdaJ8ZvOurNaUPb+yt+HiKkyxm4SHOfwa3DGnTqDJpNagRa7o9dlGZF0tCRKQvVX9bgnU7j+KmN3/G6Ge+xwOLnS/PQ0TU3A1om+bvIniMAapO9C5KtVmQnTyjNldpUs9c1df8dUInLJxSqfq8mkXTeilmcLQYUZph/X9OUqTqdhbTFHpq/62xXG3SYzVt522PXVSGR8Z28Mt7E1HzYsl+a0lCREQU6MIUOkm+u6O/V/adEBXmlf34AwNUnbiuXwEeH+e8FywpOnBPNE90yI53+zVK4WnvohRU5CqvVWcioYdsDT0A6Ncm1ebvq3rl2fxdvaAKpS7K540VnORr+7VKjlJ/L3j2ZkqB80cqvdD2vrujP8Z2MS2Wbu+vEzpq2sfknnmuNyIiIqJmxZM6YCDqU5Tq8FhOknp9r7lggOpn1QuqUL2gCvGRoUiPi3C6rdKatUotL8HCkwAvxGjA9f0KbB4bqHGIw2c397H+f/7oUpvn7j6v2O2y1Ms+wFU981CaFY/iRvSSPzq2DO9P66n4nCQBg4vdHyL8wTTHYFRrT778AvrjrIG4c1hb3Ht+Me4Y1gYjO2Zp2od9h/cH03spbrfsroGICjM6PJ4a2zB0W22ucFyE/3PBhRqVe/ZJ34a2d/83JR85Qdr87eJO+GB6L873JyKrzHjXI888MaeqnU/2q9UQu7pap5YJ+OdVXf1UGv0K3ujGT+aNbO+0gr/sroHW/2cn2v74kqLDbJL82M+bvKG/7c179d2DsfruwS7LFIiV495FKa43UnFx15YOj6mM/LVRJBsmK18E3VPyALtFQiQWTe+FitxEm23u0RD4lmaZWhG75iU57UV/aqL6OoGqZYSE5y/rgoVXm4YSv3eDcgAMACvmDLL+X34eA0BGfASu61eAST3zcH0/03n65jXdcGEnbYGqRduMWPRt7diaGBsRal3AXs0Ehe8dAIwG2y9/at98t8qkRzcNKvJ3EZqF8BDHRhGLhKhQxcefuaSLW+/xv+t74LwOLTCuPNut1wWTC8oyUZIVj8vsFpSXN0CRb2m5RxI1pY4tG+759tOpehQke7zfKb211QF+sqvneMvzl5c7PNY9X/3z9CpMwbT+rhvv7Nc8DfR7CgPURrKvqF/WPRcLp1QiIlT50KbJekmVbgh5KQ0Bqn0wY59EJzE6DNEqi/C2zWgItr68tZ/iNno1ojQDr07qqnrDfPu67tj60Aj0Mg9/fVHhx27Pvje2qkML2V/u3Zn/c213TdvJe10t72802J4Xk3rmukyQtWh6L2x9aITNfpTIK9PvT+uJPgqBnr16CRjSPgM9ClJQvaAKZeYg0L4HuWVSlE2DSZqL3n4AqMxPtgn6ldh/HqNB4InxjsODhVAOQOXfnNr8Y4MQNje3WcPbOSTG0nKsGsMbw73lbhrU2rs7bAT74e/BxP6USo0NR89CU0ViYFv3e1dLshx/651aJuKpiZ1RlOZ6HrnaaIBvbu/ndlkCQVenUzNIK2fTQyw2PziiCUriyBeNEME2isGujRXPXuJ8aSL7KUp6dHVv5fvGpd0a7vMdsuLx0hXlWHPPEACmnkYLrfk5hpdkYOtDIzCxUrkBW03bjFikx0U4zTPiichQx0bPzIQICIX6S765w+pfUypxpZM8KmoeGRvYyTMZoDaSfe8MYAocf5jZ+JYX+ZDNt6Z2t1bAc5Iivf6j8cSs4W1dbiPvddPKaDDAoHBcJ/XMRfWCKnRplQQhBJ65tDNev7obBhWn49e5Q/HDrAE228eEh2B0pyxcJJsjmZ0YiRsHFuGWwfIKfkP0MK1/ofVzrZgzyKEBIj81WnU+66JpvfC/63tY/85JirIGz5b5qvZzRYUQiucQAJvhvEoXL9v9NPw/MtSIDtkJmnovlYa
NA8DEypa4tFtLPDG+DNf1K8Arkypc7stVuZSkxNr2CAshVHuJXTUj2B/GEJXjCgDLZtuel+1a+CfJVFPR0vrqKW8H3wBwcVfHec3e5KxRK1al0Q8Aylsl4qUrKvDpzX1Ue1ABoLOsItWzMBnnl2XikTEd0LqRycwu7OzYIp4RF4FWyd5fXssfHK4X7NVrtE0PDNeURM9oEEh0ck77ii++YhFkJ84lla2w5l5TkPb61d3Q3UXvYYSTkR/uktdptJjcMw+fyqZLKYkND8G1fQscHv/lviF4YFSp9dpqNAgMbJeOePPfl3dvZd3Wvk4kv+bK1dVLEEIgPtL23HaV3ba/wvPPuGgYUNIi3rYx31KXekd2XEd1zFI8Y9+5rgcWzzBNe5I///ktfXD70KZZXcKfGKB6gX23OqDemyOXpTiMtOF1F3bOxsc39cZ3d/Q3De+MMVXcJ1Q4bwma1DMXGeYfxUVdfNfFf35ZpuLj1Quq8ODoEozpnI2UmHC3x9bHRypXEO8cZhsQx0WEWi/UUWEhaGE3XyEhKhRPjO9o08ucEReBmwe3RkFqjMM8AAC4bWgbTDVfOFNiwm16CpfNHohF05TnSAKmILRTS7te7+J0VC+oUvmuneuQ7XjBVTqthhSnoyTTFAC/fGW59eYwqlMWvr29P567VH3IYb2T4OKBUaUY3Skbdw5ri/zUGADAuzf0xFe39dP8GS5QOUe+nzkAj4ztgOxE7YkABheno3PLBJsETPLjIb9hfTijN941D1dWOmbxkaG284olU2Knz2/pq7k8Wl3Xz/FGLO91vHNYW8X5td4yp6odEn2YYG1C1xzVaQTuNGx0adXw28lPiVHdTmsSLzXVC6owqDjd/H07VqRszimF10eEGtE6PdZpYP60rCJTmBqD/7u4E8ZV5LgVzGu9dgfT0Ez7j+KsscBdtw5ujekDCjG8JLh611wRAppT6Dm7H6hp6aVkLk8qjJzxBft6RCBIiQlHfGQoqhdUoXtBssvriNr9xNWSekrs6zSAadpYsso9JcQoXDbEdc1zPjKitcpoktGdsvHjrIHWYN1ZI6Glfjrc3Jtufx+2X30gLyUa388cgOoFVfhh1gDcprC84PCSDFzWrVNBo9EAACAASURBVJXD40osPaUVuUk2wbGlbt5ZdlzVOiASosLQPtMxUVR2YpTDlL9gxADVQ5aTTwLw5a2mSq28w8bg5Mj2aZ2KLq0S8azCXKUrephO/revM7WutM2IsyajiQkPwZb5IxySANlrmRSFEHMBBmlMnPPcpe63DGXERai2KF1S2Qp/MWcldjaPS8ldI5QnsEcoDI1wl/J1QFsNLy02QnVItVbe7m2KDQ/B85eXW3ucB7RNt0le1DI5ylrxT4oOw9MTbb8vdyu3HXMSbIahu5KZEGmdQyuXlRCpmP1XjRCm7/+d63vaJGB6bXKlzTYWxZlxsqBM+UNO7plr7UmXAIzsmIXCNMfAyNkSRa5kxkfgzmFtHSqI8uC4VXKUR5W8R8Z0sF4nXFHrKfeG1umx+F1leGC/NtrXYEuJ0RZEe2s5LtP37VgRklcWXI1cUBNmVL4BuPM9KLXgO6N1vTv59A9nVt09WNMoGS2U8gIosT/eUxV6WTw1tjwbtw5pg/svaO+1fQYCAd+McrCQ92p5wjLE19V9pbyVY6CkRmkElsU1ffJd5jPwFlfv89rkrhjUzvXv1j64cvV19m6dgikKUy+8de3cOG84VqrkP6k3t3K8Nrkr/n6ZeuO45bfubiNURnyENeD72Tz0V8nfJnTE7w8Ox+hOpoa+uAjbYNb+DPnvtd2tHQkt4iMVR7YJITBvVImmKUGWMiZEhWLNvUPwyqQK5CZHWQNUx3273KVT9jlrggEDVA/JW6gsww9CZFGpfQ/qVb3ysGHeMACmH+7b1/VQ7NXoXZRqHsaqfDE2GIRblSZfbGl9hQBGlLbAA6NKGt2rIRcV5rusq764Uc8a3lZzL7GrlkN3RbrR82Y
QDUOMO7dMwLxRJQFzUVMbstUmI9baWGT/m3N1RgshrC3K8sBhah/bBAr3nt/eYUi9u3P+6l2ceFpGXNgbV5Gjep3QakqvPF1MF7Dnj15By7ng7L21lkveEyW/XlsefmK867lB8lPGUrEUwrT0gnxEhmW7v1zk3flGSdFhqtMPXLFPthanMioGsJ2DaP9u6XGeXZ+UpglYPktqbDhu1tEcbl8zXVuUrz/28+E8acgqa2Sw9/CYDnhwdInLJUX+q7ExDgAursjBZd1aKZ4/RoPA3JHOGynGKAyl98S7TpIOAqb6wItXmAKX6SoZrOMjQx1Wa3D2PQ1rn4Gh7TMw57xim9Fbb17Tzc3Sq3PWAGC59vVpnYqh7ZVHKwjR8BlCZKNv7PfamOqaEAKhKg2FluctSrPikexGXeiBkSXW/88fXaqp979fmzR8fXt/1Q4bTxpDLflQpg8oxMc39dbc+BgoGKB6yHIuSRJgNP/RRnZy2Fc427WI80oPoDPuTgKXs59Ur6ViYPlBXdqtlVstc4lRodZMqvZDA2cM9Ed20sZFrVP7FqC3bB2ry7q1wg39lVv+R5S2wPLZ7s/LtWe5P2i5pjXczIT15tEiIVLzUJXG8nWwYQm67e+ZQsMxkv+OLWaNaOcyaHNnaLL9/qtKWzg874+AbEJFDqZ78HtTKr8WK+YMcjo8XN4I0djAGzD1yDwwqsT1hmaWQClJtrC5/dciD3ymKCT5sCRuU6tAWh7WOkfuv9d2x8NjSjHWPNxXkoD3p/XC9zMHOGybGB2muiyOliQ53uRO0CI/EvLfQXpcuMeNlc4afIQQuLEZZcE2GIRqw6zRIPDO9T2s+Q582dOqJj4yFJdUtoIQwmG+nqe6FyRj3qgSh3mHWj12UQdsme/7pFGW0/Tr2/vb5cUw+eb2fvha4ZrpbCj2c5d1UfzdKK1MkO/GaCgLpXLKqa3JLm+Uk49EUwzMPLgf5rr5WRpzy5WvH5+XEo1RCvk+lOoW9v5xZQX+72L3V1+w7LsiNwm/3DcEtw5pg5SYcM3ZiQMFA1QPWS58BmFae/ONa7rhtckNvWiWE8hoEFg0rRfGdHZvuQ1PhHrY2g2Yhk9Gy3rjFk3rpZplLSsh0iGjsDuGlWQg0VwJnNyz4T0qchNtLn6WH/ZVvfKczv3Um3mjSnD7UPWhcd7IWtgQcrr+zq3bylotA23amv09TD6szHKztr/RaTk2Y7uYeiGvUjnXVcvj4nl3so86Sy/vrlEdlef82i9pBQALxnRQrcDNHdker1/tvRZ3wDQESevwcKV5T3JaslS+N60nRrgRTFuuN9eah9MpDc+bPaJhaHZmQqTD57HMPVWrk8h/i64IAZTnJmG8i5wD8gqh2jmvVMnVorG9Y1YaAx9vJLe5ZXBrh8Yqb+072AhhmgunlO9AK28GtZZ1sMfazb+2ny/oiqveKPsy29+ThRCKvYSWuY9yagmB7KdiXdgpC0nRYTajreTnpFKZWyVHK460S4oOQ2VeEh66sNThOXe4us5ayO+3alNzLEkJ1ZITWobaAsDckdobDl2xrMhQlBaLZbOdJyd98fJy3He+6RouP9xqQbUaIYTD6gv2X9/oTlno1yYV05ys7dy/bZpqLhdn5L2w8qHLaquHBKrg+jRN6LWrKnH/Be2twwK65SfbXEjkLbil2fEez2XyhNo7RYYasWhaL6y9T3nc/rr7hzb0EDgp7qB2aXjxCtdLu6hxd/7HBWWZ1iy4vuHfiosn7265wWppk2jotQGGFGdgULt0zPTS3DJf+fmewVglm+Ni/zHnjixp6OV0cSycHaKk6DC8fV0PhwRbrjj7OY/smIkHR9vegOepDCnrXZSCxOgwr/SgVi+owl0qC5APbZ+Bt6Z2t1teqUFhWozNcM7Lu+eqZoq8po//W2kvqXTd+x8XEerWb8vSeGMZ6ms0CIeh2fZD6v93fQ98clNDBdVSQVCrtFv2p+V+oLQPVy9Te97T+49axnJPVbq
a4qBQzEfdDE5mDCzyeGhysFKrfnuSNMeXkmPCseTO/g7LnNkHRa6yxLpiGVEgn6plCVycUZov2To9VnHEjX3w9/j4jlh192D0Lkq1BnGeXveNBoE3p3a3jthwRv4e9nUvrYGZloByXEUOru6dp2kUXHxkqNN3tgw91/I7zpb1DKfFOu+BH1ScjitlnSJaKSVWs/QCW47vb3OH2TwfExGCVyZ1RbqGJfncobbKAQD0KUrFHcOCJ7uvvq5OAeDCzlnWOUBX9MhV3c7yu/JkCIW7Xp3c1dry6ExyTBhKs+NVhxoLoa2NubHB9qB26ZpaXX0R09sn0gGgOmm9qbjTdmeZi2LpuclS6Blz3L+lUmyqYL94RbnbQ1S9SUurb0JUmM2F2Nk51/D5lLfxzXkkVIePVpW2sAYyljJd1j3XYSjPmnuG4KUrTFluQ5xlVXOD2u9KCOF0/vPnt/S1zmdx1YDktV41D1gSVmn9SuXf/dKZA1Qb54CGCoelcVGSJNS6SGuaEBVmM7XDcgWVzxmzOf9kjUXOyuopbT2z/gveXCV+sunVMB8rT+YDKi1h4W4vSTD4x5Wm64vakPN/2a0l6WquvJz9Ou3ekp0Y5TDn0p7m37/KlglRYaheUGUzssBZfc66P41vnJscZbNMoD3L0nfOhqJrGaWmpTyW3s/E6DCvNEyrvWeY0YDZVcWIjXBvWLXS7h67qAw3DSrSlhhLYQdqS87I2czxz3A+Re3ZS7uoTvux7MfXU/i0MBgEru8XPNl9GaC66fFxHfG+huGmIUYD/nFlBRZ6eZickr6tU1GSFW8dg98iIdI6/CAjviHzrOXHHmIQqsN3nbEsEt/Y+o39LdDTeSJqws29GJalUWzeW/bm0wcU4b0bejZZRj9vSIuNwNMTO+Pt63rg6Ymd8ffLXPdkuzvvzdvs39WT4eHOSq7Wm9zYCuljF5U5TaZyabdWNq2ma+4ZghkDizDQ1eczlzM+qiH5hSfzUJS4U8G054/5Z3JCOK4ZZ8/d36r8nM9MiHRaeXr5ygrcOaytTRkGasiwafN+5rdLUpkP2i3f1EhgWYC9MT6c0RuZ8RF4fJxs2SUPfuNFCpmr5RbP6KVpTWVvaGzg3slcMb2iR67j+o125/fa+4bgCnPlPVjXFLQ0CKj9tAvs7pFK2ykto6dFQlSoLhOwyVl66JKjwxQbbu6V9ao+e0lnTY073fKT8NKVzpfXemBUCdbfP1S1h3DZXQPx9CWu7wn2Aa587XSLa/oUoHpBFWLCQ1STBvUuSvFZg4Ma5REips+TGhuOmwa11nS8LcvdWJacWTFnkKZ6t/zt57mRqyAQvHtDT69P0fEH36VLJbeXCWisK7rnok16LHoUpmBQu3QMbJdmnVvy8U29kWtezF0IgdlVxfjf6p04cOKszT6UWrABU89FVkIkHlj8m9uVoLKcePRvk4rV24/gyKlzCDUYbN7n+cu6YPzzP7r3YZ1Ii43AK5Mq0NlF65vRIPzaG+QpyzBNteGa9tyZ99YUvN2TYdmbJ5lwnbGfB6VEPucjPirUIYGEYs+FwkNKc0Q90Zie2FhzA1SuQkKd6gVVOFNb51Cp2PTAcJf7/fyWvvht9zFNZVg0vRd2HTmt+rxlXV3LV92/TSq+2rhffYcaTolLKluif5s05CRF4bp+BVhRfcj63OhO2RjaPgPF93yiqfzyt7ttSBt8sn4vzuvQMMfo0m6tMLg4wyujNooz47B0lvM5V0rsz0lXP5v2mfEYVpKBd1bvtD42uWceXv5+q/b31Lid7RI/jo+54k4gHRsRisiwhkbX+y9oj3vfX++wXVWHFlj8y27N+71tSGuUZifgipeXaX6NhUG4Xou0Z2Eyvt98UPG5e88vxv2LfnV4vDGNT+vuH4o1249g5NPf2zx+TZ8CLK9egdbpzhs4fMHSmLX7aI3L7ZxJjgnHggtLNS2HZRlxNaEiBxd0zMTEF35S3O6Na7o7PGY/X9FoEKpL1hWmxdi
sv+6M5fOlxIRhxRzlZV+8zdvVCCFMnSauRquomdwzDwlRYRhrHmmhdWUCy3UwMSq0Ub2f8nNs3f1D8cjHG/DaD9s82tfCqysRajRomrrlTCB1ujjDHtQgYjAI9DDPSTAahE3ig7YZnmcRntonH3Oqiht64tz88YSHGPGPSV3xzW398cqkCsRHhdrcMNUqIJbW3ZgI99tR+rVJs5k8blnmwBs9F4EmwtxL56010BrNg/uQs3OuYV6fh+XRASGEw7wrJdFhRsxwknTBZQIuJ8e+dXosXri8HPNVhmCHhxg9uoYUpsVoTgSREhPuNGGL5Vqh9bvWst2Do0udrhcdFRaCb2/vb13v2vn7NbxhkXlumjwjsRDCGpx2zU1ymKvcWFd6sGavpcHx4TGlePlK5yMyOrVMQLsWcR6vDezq62jMTzg7MRKXOslMrnTqW6cHQFh7XwHlnih5g5XSEGKLK3vmeTwqSMtw5n9Pcb9nRO1nb3+81UZgKDXkDi5OR/WCKiREKc+Jc/Zdall33VnGb0C4Pc94dKcs9C5ynLM5oWtLtxqMFozpgB4Frud+Wqy7fyjese/NV/HGNd3cWg7G16Oibhviu+WYwrxULwkxGjCuPMfp0jdKGhruG3cM5T+ZmPCQRi2T2KMgBRW5SejSynE6jqVBw9k1LtiwB5VUWX53KTHhNglDPG3diY8KVWypVJsfM390KUZ3ynIYhuSJthlxeHVyV9dJOoJQckw43rimG0qyfJloyrXU2HDsP35GdR0wZ5zOQVUZwqz0+Hkae52VvHlNN4eefiEEHh5Tih2HlXv95OW23IiHqCwFMrGyJSZWtkTuzMU2jxekRuOP/ScBAOvtEjG4y1XvtdK8KWdJGfTeKOCt4rXUuEyLO9fGt6517GkBTD3Zx2tqUeRBz5SWpEZl2QnYsOe49e9hJRnYuPc4+rROdZksLDk6DC9e4Xz4oru8dQ6lxIRruk7YPuhYhtKseGTIerAs9yd5T+HM4W3x3Dd/KL6Pp0NiAVPwc0HHTFz2kvu9r86o3WN9OR/Z2b61JKVzlvFbvuuK3EQsrz5s/VseDMrL8ISGtSp9wZ3zoZubGd3d/focNpedFkrTH6YNcJ7w6IdZA9D9oS/dK4RZfGQo/nNtd7TNiEWnuZ95tI/GkCePDAQRoUZsfcj3Sx/pCQNUUmXfY2oZgeHtoZRqIsOM6NPa9XISWvX14r4Cjbs3Pq8yny9/Hd8R0eEhiI/S3rsQZjTgbF29pm1Vs/iaH9/0wHDVFPhadFRJvKC0DIglCG+Z1BDYFKbFYMO8YW73Qn58Ux8Uzf7IrdeocXeo31tTuzfZGprDFDIluuKLabONubx5o8J/UZcc3DS4yGYEiFqA4YmreufhzRXbrX9P7ZuPq3rn2byfOuefb9G0Xta5YK7IG16se2/E4fvLuDLXG9lpWK5L/bdRmBYLYI91ioyvTOtfCKNBOJ0n7Wp94MaeJt6ch95UFf+h7TOsAapBmJbNaQq9ClOwZPOBJnkvNd6sij12URn+s2I7Hvpog4s3bfivvKHBk2uUt7OEu8OSwdnTJRN7FaZg2dZDyExQ7n33RU4Hfya48wcGqKRZuXkSvdryE7cObu3+cFzR/H50zVVkmNHtOb+LpvfC1xv3adrWvuHE/v7gKjOkK+4Mp0qNDceLl5dbfzMWngyR9eavw91hUM4y/wLeK9vUvvkY2VH7/EHLd+HvxE6+oi1Y9Iz9d2YQwpoAr7FyU6IUAyylyuu/p3RDt4e+sCubZ2fUgLZpLkfaKI0ekDRMD7ioSzYGyfI5+EpPDUuGpMSoj2aYU9VO9d5s8drkrrjcPDc21CgcE8t58fd093mul23xlNrX9Te7ZHONvT6N7ZKtOKcXMK2eUCebN/n61d2sU4mairu/F4fzXPZ3UnQYpvYtcB2g2nnsojLc9p81yElqfENmU9YEo8ND8OOsgUh28ptyZlr/Qozpko2sBO/kjyBHDFBJs4rcJPw6d6j
qGPvpGta/ouZncs9c3PjGz057IPq3SVVca61NRqzNMh7OqK4BqenV3udsXqO/pJvXiZvswVpwSrzVuHR591w331j9qTuGtcGS3/3bs+Ft3jjOcREhGF7iOMRdy661xi1qo2uU8hekxIShNCset8ky6Hr6MR/X0HvqLGuos4q+EPB5cDqhIschuCzLjseaHUe176NrS9XhpPLj/94NPZGZEOl6vnojXaBx3rk7jAaBunpJ9ffg7TUnYyNC0TIpCn8eOqVYFvk8WFeNA7rkST4Iu9/KmM5ZSI0NR2+VBpZr+xaoDof3t8YkqzMYBINTH2OSJHJLYyaAy1nm4fniJkb6MrJjFqoXVCHRyVzG6PAQj1tgp/QyBVv2lRbLIvSF6doCXFf81dHvzREGF3Y29VJe1t3/iRZev7obFl5diXvOK/b4Rq9Uv7q+X2GTLO/lbZas7xd09M018Zf7huLhsR0cjpk7UzZcberyeVnlNsRowKLpvbwy9UIpUY/99UTpXLmhfyEmVOTgkm4Nw/SFsN3WnXOzLNv5PP9IlREUvYt8O/1EngyqLCdBNTj1Vob16gVVqqM1OmlYo1KNJdu5AHCZOVmMsx4wb1w6g2n9XPt7idIn07o6gHyffVunqn7fM4e31f1yQ6RPDFDJIQW6hS8vzAWpMaheUIX2mfFenV9Fzc+c84oVb4ApMeH411WVeGqid9YYDQYlWfGoXlDlNAGJOxpT/+tekIweBSmY3Mv93lzL+7q6dgTS9AHLNdHXSwTYHzKvDiFXOd6WrLZqS2tYy+LFwqTEhKN6QRUynfSSxEeGYsGYDqoNr6mxzhMv2XvPxRrpavdUb9xrnZVS6y3WMqT/yh65jS6PkkXTeuHVyV09fr38c0zta1rfU/7dxXppqHpzIz/HG5OngcibGKASXpnk/IYRSJU8IrleRSlem8/nr19Bc/31PTG+Ydjm+9N64sMZva1/N8U1SU/NZt5sxLMPhuyPZZjR+9WCqX0LcO/5xRhfkeN0O18um+HqGPrq+76mT771/x4u9eiRhVdXWv+vdYm4RHNPdGN6Oe29fV1DVt3S7PhGXY+tPcGyz2H5bOEhBrTNsG1sZ9XFfZbkQWp4TJ3LiDONTkjVuB4rqWOA2owVpZmGPsorJPJ7eFN1bDIANuFR0Dd/nad6/nk4K1uiG9malYzu1LAmZIfsBBSrjPSgxrH/Cr+7sz8Wz7DtCdR6L1DrQQ0LMWBSzzyXa1f64lx/fHxHVOYl2Swd47QMKv/31BD5XHSV4+iNe63l2FmWUpMH+w1DfJueq8zD7lBbUgxQztLvzQYPX685qhezRrTz23vr+V6n1eXdc/HMJZ1xUbnrNY3JOY6HaMaeu6wL1u44qrjsh/xi7OtrBof4OvrnVZ4Pg6Lm5d7zi/HWih3+LoaDXC8NI1bSMMTXZ2+hK95sHLEcs8SoUDw6tsxh7lh6XIRqshlXpWhsMX1xr+mWn4w3pyqvOSvXFPchpaG8iVGh6OHjBDvWjxZgAcDUvvlon9kwr7eqtAX+/u0Wt5Yqc8e8USU4XnPOJ/sOFK56uAPsFGpyBoPAiFLP11unBgxQm7H4yFD0KnKd2p58y5LgI1TWk+3rpBnkvqa+MQthqli6Ck4m9czDJC9l5XWXs7L5MgGa5W1dzd0LlsqUL4Kn9LgIr2eabuwa2f4cTZNmDsp7F6WqNnykxYbjkkrPE4yFGg04V1dn89jqe4Z4vD8lSkVviE9d9WB7dvzzU6Kx5cBJ1xu6adZw2968O4a1xfX9Cq1zmoGG5e+uUpjLPr4iB2t3as+EbEm8RET+xwCVXPJ1naG5D/F9cnxHLFqzC+1aeCfbLAWHRdN64dP1e/xdDEVtM2KxYc9x1ee/nznAaXKaxlKqaIcaBTrleG84IclpC5AbeyWXv35M56YdIpeVEImlMwcgPS4C+4+fMZXH7gMtmz1I077UGhRSY8Ox7aDjkiXuWn33YJytq0fl/IZ1ZO1/E0q3VVe
3Wk8bQj6+qQ/q6iW0u+djj16vldEgHHpPLcmwlIztko05767zaZmIyDcYoJKNZh4r+kVSdBiu8FHWRPKepv5tlGTFoyTL+bIV/rLw6m7YtFc5QH1kTIcmWx9OXp/+/cERXtprcF8EvdUZmxYbjn3mQM6i0UN8Za+/bUgb9Q19JNPH562WY68lSEyMDkNtXb0bb2z6x1dndlgI05n4Q6Q5oVH3Au0j4f53fQ+MfmYpgIZzzZ3zojEdCn+d0BHLqw95/HpqXnhVIVWcG0rk6Ib+Bf4uglssS0d4U1J0GLrlK8+bG+ciU6s3WIf4+uAS1dK8fuaFTdyDp4U3ErV4a/kwpd5Eb46GUVtXsSkkRYchNiIEc6qKvbpfXy7dZj30Cm/RkP3W8yG+eSnRTdbw5C1eWQdVx9WgmPAQfHFrXzw6toPm13RqmejVxFXuGNkxCw+MKlV9vrkkoiJtGKCSKss8yE4tOWyOSAiB6gVVuH1oW38XRbM5Ve3QIdu362r6gy+rMamx4dj60AhcWtlS82v+clEZ+rXx/bxxbwY47gSTasuUXNqtJUZ29N5cYyEEuuUn4blLu3htn54ICzFg7X1Dcb6TedT/udZ14iWLCRU5LpfvsND6vbgMNmX/1xpkOWuU/uq2flg03fk6r9T0ClJjEBGq7dyyaExDA0NIaioc4kuqBhWn47e5w6zDSIh8xVvV7qrSFoiLDOzL2oSKHHy0Tp9zT/XGZZIkD2tT7vYEjumSjTFd9NfjqsSbPUKW3pD3ft7ltX2+cY32wM+fshO1V/IXjOmABWM6oNfDX7rc1lcJsQDXa1z6A6cVEZGSwK7Jkc8xOCVf8nbd5OlLOnt5j03PUpklsufddR2pqXkz9lT7/pQabR4aU4oBbdNczml31TDji+BZz0No5YIpkG7Igk6kXwxQifwkKTrM30XwO94gySM+nINKjrQuU6LFxV1zsH7XsUbvx98aeyx6FaYgQWkNclkkdOewtmibEYsztXW49l+rPH6vuIjQgOnh15vpAwpx59trkRIT7u+i+Iw7wXcwBeqkbwxQifxg1d2DmflQhvc8cgeTaQSuhy5svqMD2mfGYeeR0wCA4aUZimuqynspr+tnSsj20drdLvcdqIGD3ss9vqIlxldon48eCPTasKf3c4GaFmvIRH6QFB2GmHC2DxE1hqt6FgNZYGrffJu/1RIekXvsh9J+clMfl695ckJHtE6PAdD4c1Pt+2v4fgPjC/ZlsMTfv/fxmFJTYYBKREQBxdmSGsFoaPsMAMC4CveHac4a3s7m74YlR9wvR4DEPH7RJiPW5TZRYSHobM6Kr3YslQJLLce9KQIHVz+3RdN64ZVJFT4vBzUOf8cUCNiFQzb8MfSjfWYcMuMjAmr5DiLyn+ZWv8pJikL1giqv7pM9IY3j7Pi1zYjzeL+BvP54abbzJExKGCz5TwCfatQMMEAlRU1504gOD8HSWQOb7g2JKCh4c13Q5sKTSikrss5d2SPX5m+vZL+X3YSVjr/aEF5rQqsACfx4bvmf/amSmxyFonSVEQEBcl5R4GOASkREAcVSOWfl1n2NCWDUXpOXEo2tB056XKbmRPM56+bJHSgBKenf17f393cRiDgHlZSx4kekH1kJkYjS2CszuDg96CurWtfxC/bj0FSyEyMBAB2yExSf/2B6LyybrTwK5vLujplqm5OcpCibv61zgN3YR1Odx5ae4O75yYrP+6JewN9oYOH3RU2FPahkgxcfIv357o7+mgezvnB5OeYu+hUvf7/Vp2XyJ16mmlZZTgI+vbkPClNjFJ+PDg9BtEpW8vsvaO/LouleelyE4uMu77Uuhvg6bG7+Nz7StLZqqNH9/ocurRK9PtfZFZ9m8eWFgihgMUAlItI5g4E1LSWBnFDGXyzHzN0zqrXanDQXAmW5E3f5fP6zwrmtJbHVYxeV4Z1VO1DmQcIial549SQ9Y4BKREQBResQX3LCx4HjnKp26NIq0afvEYga06Z
iHxQnRoXi8KlzABoaApKiwzCld77Da/UqSNsvAoo7jUi+/Lp4LpAcA1QiHbn3dUbqwgAAIABJREFU/GKsqD7s72I0OQYa3qXn7LZvXtMN5+oaV74Qg2n4YmSoF7KlBjGluYRNdWYEUpDkicYu0+Py9bLaulrFffU9Q5A/azHqJUvPuG9r+Hq+rhBRcGGASqQjk3rmYVLPPH8Xo8mwwbT5qVRJwOKODtnxuGVwa0yoyEHX+V94oVTB6eo+jtcSSw8ef3v+4UmIF8wj2dXm6JI+BeuQfdIfBqhEREGmsb07eieEwIyBRf4uhq4tuLAU/dukOTweYp7PHB3O3me/8mSZHycvCsTAoTQr3qNkTkQU/HhlICIiCjITurZUDFo6ZMfjtiGt8cT4jn4oFTWG0hDbwcXpAAKzR9zXMXUgHhMiMmEPKhH5TRCPXPOr5JgwAECceckJIgshBKYNCKze54VTKjHxxZ/8XQyv8PZw3b9d3AkHT5xtmkzfvGAHFU+yoPs0SRKbFEiGPahE5He8LXnXNX3y8fCYUoztnO3vovhVAI56JAU9ClMwpZdpPu2VPXKtjxe3iMMH03v5qVSNo3ZqGs2BZniIY/VMqQIfHmJEZkKkN4tGzQwvk6RH7EElG8GcjIGouQg1GjC+oqW/i0HkdWGywC0nKRIlWYG13qerTLilWfGYMaAQEytbNVGJqLmZXdUOIQYDBrRNwwe/7PZ3cYgUMUAlRex5ICIicu3qPm4sqWPJoqxykxVC4JYhbVRe2nQtyJd2a4my7IQmez9qOmmxEfjLuDJ8/utet1/LuiE1FQaoREREFHD0MGctPS4cWUE4xPaBUaUOj1l6r3OSgu/zEpG+cA4qKeJQXyIKdHoIYMg7zivLBAD0a51qfawpexT1wN/nc0JUGF6ZVIEPpvX2azm0CsSld5qSJ78eX56D/LpIjj2oZIMXCCIKJH1bp+KbTfv9XQzysY45CaheUIV9x2v8XRSvCNRbbT+FtXVdWT57EE6drfVBacgbWO8jPWKASkREAeuVSRUc8dGMyHtw/N2j6AlPTtXIMCMAIDE6zLuFaSKpseEAwv1dDCIKIAxQiYgoYAkh2ANAAcedc7Zv61TMG1WCCztl+a5ARBrwWktNhQEqERERBZwbBxX57b0tCYNaJUX7/L2EELisG5edIf8RgrlJqGkxQCUiv+N9j3yhObb2f3FrX/y665i/i+FzkaFGtGsR57f3T4oOwwuXl6MiN9Gt10ms5ZNO8FwkPWOASkR+0wzjByKfKkiNQUFqjL+L0SwMLk53+zWWkKA5Np40NR5irXikSH+4zAwREREFlEAP8AIxwRMRUVNhDyoRERGRF62cMwiGQI+iiezwlKamwgCViPyGM2DIldgIz29TrEuRvyTHcFkVCh4CvF9T02KASjY4Z578gYEEKfny1r5IiArMtR/JNwK9B4f3WApkHJpOTYUBKikK9EoAEQW+fCb7oSDDJEmkF2wrIT1jkiQiIiIiomaIjSWkRwxQSRGHIRERkV6xTk2uMPDyPh5TaioMUMkGLz5EFCwEL2ikMxJbfykA+fJaOrGyJQAw6zXZYIBKRERE1ITYeEKByBdn7X3nt8eGecNgNPA3QQ2YJImIiIgCSqAGeOw/Jb3wpDPfF+evwSAQYTD6YM8UyNiDSkRERNSEAjO8pmDEc5H0iAEqEfkdexXIF1jxCl6BOpdzWv9C5CRFondRir+LQkSkWwxQichvGEAQkTsC/ZrRrkUcvrtjABKiwvxdlKAXqMPAm0qvohSUZcfj9qFtXG7bJj0WQOD//ihwcA4qERERBRQGH0SNExMegvem9dK07b+mVOK33ccQYmS/FjUNnmlERERERKQoKToMPQs5LJ2aDgNUIiIiIiIi0gUGqGQjQPNOEBE54CjQ4MWvlogoeDFAJUWs2FFTYHsIEREREckxQCUiv2N7CBEREREBDFBJBYf6ElGwKEiN9ncRiIiISCMuM0M2OLSXiIK
FEAKvTu6K4hZx/i4KeQmXlyEiCn4MUImIKGj1bZ3q7yIQERGRGzjEl4iIiAILO1KJiIKWpgBVCDFMCLFRCLFZCDFT4flrhRBrhRA/CyGWCCGKzY+HCSH+YX5ujRCin5fLT0RERESkaGj7dH8XgYjc5HKIrxDCCOBpAIMB7ACwXAjxviRJv8o2WyhJ0nPm7S8A8DiAYQCuBgBJkkqFEGkAPhJCVEiSVO/lz0FEREREZLVs9kDER4b6uxhE5CYtPahdAWyWJGmLJElnAbwBYKR8A0mSjsn+jEbD8obFAL40b7MPwBEA5Y0tNBEFFyaNJiJ3cIQvaZEWG4HwEKO/i0FEbtISoGYB2C77e4f5MRtCiBuEEH8AeATADPPDawBcIIQIEULkAegCIKdxRSaiYMFKJhERERHJeS1JkiRJT0uSVADgTgBzzA+/DFNAuwLAkwCWAqizf60Q4hohxAohxIr9+/d7q0hEREREREQUQLQEqDth2+uZbX5MzRsARgGAJEm1kiTdLElSR0mSRgJIALDJ/gWSJD0vSVK5JEnlqalcEoCIiIiIiKg50hKgLgdQJITIE0KEAZgA4H35BkKIItmfVQB+Nz8eJYSINv9/MIBau+RKRERERJpwWgARUfBzmcVXkqRaIcQ0AJ8AMAJ4WZKk9UKIuQBWSJL0PoBpQohBAM4BOAzgCvPL0wB8IoSoh6nX9TJffAjyHonZaoiIiIiIyE9cBqgAIEnShwA+tHvsHtn/b1R5XTWANo0oH/mJYDM1ERERERE1Ma8lSaLgwp5Uago8zYjIE4KtqEREQYsBKtngPZ/8gacdEREREQEMUImIiIiIiEgnGKASERFRQOFoHyKi4MUAlYiIiAIK8yQQEQUvBqhERERERESkCwxQiYiIKKBwiC8RUfBigEpEfsfRekREREQEMEAlIj9iJwgRuYONWUREwY8BKhEREQUUNm4REQUvBqhERERERESkCwxQyQZT9xMRERERkb8wQCUiIqKAILEVlYgo6DFAJRtM3U9ERERERP7CAJWI/IZ9IUTkDsFWVCKioMcAlYj8jlVOItKCQ3wDX1J0GACgLDvBzyUhIr0K8XcBiIiIiNzBntTAlZMUhQ9n9EZhWoy/i0JEOsUAlYiIiIiaTHFmnL+LQEQ6xiG+REREFBA4wJeIKPgxQCUiIqKAwgG+RETBiwEqEfkde0WIiIiICGCASkR+xF4QIiIiIpJjgEpERERERES6wACViIiIiIiIdIEBKtngGuhERKRXvEcREQU/BqhEREQUUAQnsBMRBS0GqGSDN30iIiIiIvIXBqhERERERESkCwxQiYiIKCBIXDWZiCjoMUAlIr9hVZOIPMP5KEREwYoBKhH5HauaROQeNm8REQUrBqhERERERESkCwxQiYiIKMBw3AURUbBigEpERERERES6wACViPyOs8mISBNeLIiIgh4DVCLyGw7SIyJPCF48iIiCFgNUIiIiIiIi0gUGqGRD4vApIiLSKd6iiIiCHwNUIiIiIiIi0gUGqGSD83qIiEiveIsiIgp+DFCJiIiIiIhIFxigEhERUUDgHFQiouDHAJWIiIgCCof6EhEFLwaoROQ37A0hIiIiIjkGqETkd+wNISItuBQaEVHwY4BKREREAYUZ54mIghcDVCIiIiIiItIFBqhE5HcctUdEWsRGhAAARnbM8nNJiIjIV0L8XQAiar44So+I3BEdHoK19w1BdBirL0REwYpXeCIiIgoYsRGh/i4CERH5EIf4kg1mSCQiIiIiIn9hgEpERERERES6wACVbDB1PxERERER+QsDVCIiIiIiItIFBqhERERERESkCwxQiYiIiIiISBcYoBIREREREZEuMEAlIiIiIiIiXWCASkR+w2V3iYiIiEiOASoR+R1XNyIiIiIigAEqERERERER6QQDVCLyOw71JSIiIiKAASoR+RGH9hIRERGRHANUsiGxK4uIiIiIiPyEASoRERERERHpAgNUsiE45pK
IiIiIiPyEASoRERERERHpAgNUIiIiIiIi0gUGqERERERERKQLDFCJiIiIiIhIFxigEhERERERkS4wQCUiIiIiIiJdYIBKREREREREusAAlYj8RvJ3AYiIiIhIVxigEpHfCX8XgIiIiIh0gQEqEfkde1KJiIiICGCASkR+xJ5TIiIiIpJjgEpERERERES6wACViIiIiIiIdIEBKhEREREREekCA1QiIiIiIiLSBQaoREREREREpAsMUImIiIiIiEgXGKASERERERGRLjBAJSIiIiIiIl1ggEpERERERES6wACViIiIiIiIdIEBKhH5jeTvAhARERGRrjBAJSIiIiIiIl1ggEpEfiP8XQAiIiIi0hUGqGTjjqFtAQDxkaF+LgkRERERETU3If4uAOnLuIocjKvI8XcxiIiIiIioGWIPKhEREREREekCA1QiIiIiIiLSBQaoREREREREpAsMUImIiIiIiEgXGKASERERERGRLjBAJSIiIiIiIl1ggEpERERERES6wACViIiIiIiIdIEBKhEREREREekCA1QiIiIiIiLSBQaoROQ37TPjAAAjSlr4uSREREREpAch/i4AETVf+akx2DJ/BAwG4e+iEBEREZEOsAeViPyKwSkRERERWWgKUIUQw4QQG4UQm4UQMxWev1YIsVYI8bMQYokQotj8eKgQ4lXzc78JIWZ5+wMQERERERFRcHAZoAohjACeBjAcQDGAiy0BqMxCSZJKJUnqCOARAI+bH78IQLgkSaUAugCYKoTI9VLZiYiIiIiIKIho6UHtCmCzJElbJEk6C+ANACPlG0iSdEz2ZzQAyfIUgGghRAiASABnAci3JSIiIiIiIgKgLUlSFv6/vfsPtvyu6zv+enfTiMQqULdWkyhLZm0nSifQbaBTpRZSDMYxccZpg4zG2g6TSoZU7ECsTEbzn2knto47Ykbj1B9xtWjbHbs2ta2tQ6fSrBoIAWM2AUkyWFaIpJTWJPLuH+e7cLjdm72wd7nve/fxmDmz3+/n+z2Xz+GT7+597jn3u8mja/uPJXnZxpOq6g1J3pTkwiSvXIbfnlXMfijJc5N8b3d/9GwmDAAAwN60bTdJ6u7D3X1ZkrckeesyfGWSP03yFUkOJPm+qnrRxudW1eur6nhVHT958uR2TQkAAIBdZCuB+niSS9f2L1nGNnMkyXXL9rcn+ffd/XR3fzjJf0tyaOMTuvvO7j7U3Yf279+/tZkDAACwp2wlUO9NcrCqDlTVhUmuT3J0/YSqOri2e02Sh5btD2b5uG9VXZTk5Ul+72wnDQAAwN5zxp9B7e5nquqmJPck2Zfkru5+oKpuS3K8u48muamqrkrydJInktywPP1wkp+uqgeSVJKf7u53n4sXAgAAwO62lZskpbuPJTm2YezWte2bN3nex7P6p2YAAADgWW3bTZIAAADgbAhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAAB
gBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBG2FKhVdXVVPVhVJ6rqltMcv7Gq7q+q+6rqHVV1+TL+umXs1OOTVXXFdr8IAAAAdr8zBmpV7UtyOMlrklye5LWnAnTN3d394u6+IsntSe5Iku7++e6+Yhn/jiTv7+77tvUVAAAAsCds5R3UK5Oc6O5HuvupJEeSXLt+Qnc/ubZ7UZI+zdd57fJcAAAA+P9csIVzLk7y6Nr+Y0letvGkqnpDkjcluTDJK0/zdf5uNoQtAAAAnLJtN0nq7sPdfVmStyR56/qxqnpZkk9093tO99yqen1VHa+q4ydPntyuKQEAALCLbCVQH09y6dr+JcvYZo4kuW7D2PVJfmGzJ3T3nd19qLsP7d+/fwtTAgAAYK/ZSqDem+RgVR2oqguzis2j6ydU1cG13WuSPLR27M8k+Tvx86cAAAA8izP+DGp3P1NVNyW5J8m+JHd19wNVdVuS4919NMlNVXVVkqeTPJHkhrUv8Yokj3b3I9s/fQAAAPaKrdwkKd19LMmxDWO3rm3f/CzP/S9JXv45zg8AAIDzxLbdJAkAAADOhkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMMKWArWqrq6qB6vqRFXdcprjN1bV/VV1X1W9o6ouXzv2V6rqv1fVA8s5z9nOFwAAAMDecMZArap9SQ4neU2Sy5O8dj1AF3d394u7+4oktye5Y3nuBUl+LsmN3f01Sb4hydPbN30AAAD2iq28g3plkhPd/Uh3P5XkSJJr10/o7ifXdi9K0sv2q5O8u7vftZz3ke7+07OfNgAAAHvNVgL14iSPru0/tox9hqp6Q1U9nNU7qG9chr86SVfVPVX1O1X15rOdMAAAAHvTtt0kqbsPd/dlSd6S5K3L8AVJvi7J65Zfv7WqXrXxuVX1+qo6XlXHT548uV1TAgA
AYBfZSqA+nuTStf1LlrHNHEly3bL9WJLf7O4/6u5PJDmW5KUbn9Ddd3b3oe4+tH///q3NHAAAgD1lK4F6b5KDVXWgqi5Mcn2So+snVNXBtd1rkjy0bN+T5MVV9dzlhkl/M8l7z37aAAAA7DUXnOmE7n6mqm7KKjb3Jbmrux+oqtuSHO/uo0luqqqrsrpD7xNJblie+0RV3ZFV5HaSY939787RawEAAGAXO2OgJkl3H8vq47nrY7eubd/8LM/9uaz+qRkAAADY1LbdJAkAAADOhkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGGFLgVpVV1fVg1V1oqpuOc3xG6vq/qq6r6reUVWXL+MvrKr/s4zfV1Vv2+4XAAAAwN5wwZlOqKp9SQ4n+dtJHktyb1Ud7e73rp12d3e/bTn/W5LckeTq5djD3X3F9k4bAACAvWYr76BemeREdz/S3U8lOZLk2vUTuvvJtd2LkvT2TREAAIDzwVYC9eIkj67tP7aMfYaqekNVPZzk9iRvXDt0oKp+t6r+a1V9/VnNFgAAgD1r226S1N2Hu/uyJG9J8tZl+ENJvrK7X5LkTUnurqov3vjcqnp9VR2vquMnT57crikBAACwi2wlUB9Pcuna/iXL2GaOJLkuSbr7T7r7I8v2byd5OMlXb3xCd9/Z3Ye6+9D+/fu3OncAAAD2kK0E6r1JDlbVgaq6MMn1SY6un1BVB9d2r0ny0DK+f7nJUqrqRUkOJnlkOyYOAADA3nLGu/h29zNVdVOSe5LsS3JXdz9QVbclOd7dR5PcVFVXJXk6yRNJblie/ookt1XV00k+meTG7v7ouXghAAAA7G5nDNQk6e5jSY5tGLt1bfvmTZ73y0l++WwmCAAAwPlh226SBAAAAGdDoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAA
AIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYYUuBWlVXV9WDVXWiqm45zfEbq+r+qrqvqt5RVZdvOP6VVfXxqvrH2zVxAAAA9pYzBmpV7UtyOMlrklye5LUbAzTJ3d394u6+IsntSe7YcPyOJL+2DfMFAABgj9rKO6hXJjnR3Y9091NJjiS5dv2E7n5ybfeiJH1qp6quS/L+JA+c/XQBAADYq7YSqBcneXRt/7Fl7DNU1Ruq6uGs3kF94zL2RUnekuSHnu1/oKpeX1XHq+r4yZMntzp3AAAA9pBtu0lSdx/u7suyCtK3LsM/mORHuvvjZ3jund19qLsP7d+/f7umBAAAwC5ywRbOeTzJpWv7lyxjmzmS5MeX7Zcl+baquj3J85J8sqr+b3f/2OcyWQAAAPaurQTqvUkOVtWBrML0+iTfvn5CVR3s7oeW3WuSPJQk3f31a+f8YJKPi1MAAABO54yB2t3PVNVNSe5Jsi+9xAB9AAAMgklEQVTJXd39QFXdluR4dx9NclNVXZXk6SRPJLnhXE4aAACAvWcr76Cmu48lObZh7Na17Zu38DV+8LOdHAAAAOePbbtJEgAAAJwNgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwwpYCtaqurqoHq+pEVd1ymuM3VtX9VXVfVb2jqi5fxq9cxu6rqnd
V1bdu9wsAAABgbzhjoFbVviSHk7wmyeVJXnsqQNfc3d0v7u4rktye5I5l/D1JDi3jVyf5iaq6YNtmDwAAwJ6xlXdQr0xyorsf6e6nkhxJcu36Cd395NruRUl6Gf9Edz+zjD/n1DgAAABstJV3My9O8uja/mNJXrbxpKp6Q5I3JbkwySvXxl+W5K4kX5XkO9aCFQAAAD5l226S1N2Hu/uyJG9J8ta18Xd299ck+WtJvr+qnrPxuVX1+qo6XlXHT548uV1TAgAAYBfZSqA+nuTStf1LlrHNHEly3cbB7n5fko8n+drTHLuzuw9196H9+/dvYUoAAADsNVsJ1HuTHKyqA1V1YZLrkxxdP6GqDq7tXpPkoWX8wKmbIlXVVyX5y0k+sA3zBgAAYI8548+gdvczVXVTknuS7EtyV3c/UFW3JTne3UeT3FRVVyV5OskTSW5Ynv51SW6pqqeTfDLJ93T3H52LFwIAAMDutqV/8qW7jyU5tmHs1rXtmzd53s8m+dmzmSAAAADnh227SRIAAACcDYEKAADACAIVAACAEQQqAAAAIwhUAAAARtjSXXwB4Nl85Quem6+9+It3ehoAwC4nUAE4a7/55r+101MAAPYAH/EFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAIAhUAAIARBCoAAAAjCFQAAABGEKgAAACMIFABAAAYQaACAAAwgkAFAABgBIEKAADACAIVAACAEQQqAAAAIwhUAAAARhCoAAAAjCBQAQAAGEGgAgAAMIJABQAAYASBCgAAwAgCFQAAgBEEKgAAACMIVAAAAEYQqAAAAIxQ3b3Tc/gMVXUyyR/s9DzO4EuT/NFOT4LPibXbvazd7mXtdi9rt3tZu93N+u1e1m5rvqq795/uwLhA3Q2q6nh3H9rpefDZs3a7l7Xbvazd7mXtdi9rt7tZv93L2p09H/EFAABgBIEKAADACAL1c3PnTk+Az5m1272s3e5l7XYva7d7WbvdzfrtXtbuLPkZVAAAAEbwDioAAAAjCNTPQlVdXVUPVtWJqrplp+dDUlWXVtVvVNV7q+qBqrp5GX9BVf16VT20/Pr8Zbyq6keXNXx3Vb107WvdsJz/UFXdsFOv6XxTVfuq6ner6leX/QNV9c5ljX6xqi5cxr9g2T+xHH/h2tf4/mX8war6xp15JeeXqnpeVb29qn6vqt5XVX/ddbc7VNX3Lr9fvqeqfqGqnuO6m6uq7qqqD1fVe9bGtu1aq6q/WlX3L8/50aqqz+8r3Ls2Wbt/uvy++e6q+tdV9by1Y6e9pjb7/nOz65azd7q1Wzv2fVXVVfWly77rbrt1t8cWHkn2JXk4yYuSXJjkXUku3+l5ne+PJF+e5KXL9p9L8vtJLk9ye5JblvFbkvzwsv1NSX4tSSV5eZJ3LuMvSPLI8uvzl+3n7/TrOx8eSd6U5O4kv7rs/1KS65fttyX5h8v29yR527J9fZJfXLYvX67HL0hyYLlO9+3069rrjyT/Msk/WLYvTPI81938R5KLk7w/yRcu+7+U5Ltcd3MfSV6R5KVJ3rM2tm3XWpL/sZxby3Nfs9Ovea88Nlm7Vye5YNn+4bW1O+01lWf5/nOz69bj3KzdMn5pknuS/EGSL13GXHfb/PAO6tZdmeREdz/S3U8lOZLk2h2e03mvuz/U3b+zbP+vJO/L6huwa7P6BjrLr9ct29cm+Zle+a0kz6uqL0/yjUl+vbs/2t1PJPn1JFd/Hl/KeamqLklyTZKfXPYrySuTvH05ZePanVrTtyd51XL+tUmOdPefdPf7k5zI6nrlHKmqL8nqD++fSpLufqq7/ziuu93igiRfWFUXJHlukg/FdTdWd/9mko9uGN6Wa2059sXd/Vu9+q75Z9a+FmfpdGvX3f+
hu59Zdn8rySXL9mbX1Gm//zzDn5ecpU2uuyT5kSRvTrJ+Ex/X3TYTqFt3cZJH1/YfW8YYYvno2UuSvDPJl3X3h5ZDf5jky5btzdbR+u6Mf57Vb/SfXPb/fJI/XvvDe30dPrVGy/GPLedbu8+/A0lOJvnpWn08+yer6qK47sbr7seT/LMkH8wqTD+W5Lfjuttttutau3jZ3jjO58d3Z/XuWfLZr92z/XnJOVBV1yZ5vLvfteGQ626bCVT2hKr6oiS/nOQfdfeT68eWv51yu+phquqbk3y4u397p+fCZ+2CrD769OPd/ZIk/zurjxl+iutupuVnFa/N6i8ZviLJRfGu9a7mWtudquoHkjyT5Od3ei6cWVU9N8k/SXLrTs/lfCBQt+7xrD53fsolyxg7rKr+bFZx+vPd/SvL8P9cPkKR5dcPL+ObraP1/fz7G0m+pao+kNVHll6Z5F9k9dGYC5Zz1tfhU2u0HP+SJB+JtdsJjyV5rLvfuey/Patgdd3Nd1WS93f3ye5+OsmvZHUtuu52l+261h7Ppz9iuj7OOVRV35Xkm5O8bvkLhuSzX7uPZPPrlu13WVZ/sfeu5fuWS5L8TlX9xbjutp1A3bp7kxxc7ph2YVY3izi6w3M67y0/g/FTSd7X3XesHTqa5NTd0m5I8m/Xxr9zuePay5N8bPmY1D1JXl1Vz1/eYXj1MsY50t3f392XdPcLs7qe/nN3vy7JbyT5tuW0jWt3ak2/bTm/l/Hra3W30QNJDmZ18wHOke7+wySPVtVfWoZeleS9cd3tBh9M8vKqeu7y++eptXPd7S7bcq0tx56sqpcv/z1859rX4hyoqquz+tGWb+nuT6wd2uyaOu33n8t1uNl1yzbr7vu7+y909wuX71sey+omnX8Y1932O9d3YdpLj6zu0vX7Wd1N7Qd2ej4enSRfl9VHm96d5L7l8U1Z/WzGf0ryUJL/mOQFy/mV5PCyhvcnObT2tb47q5sSnEjy93b6tZ1PjyTfkE/fxfdFWf2hfCLJv0ryBcv4c5b9E8vxF609/weWNX0w7oT3+VqzK5IcX669f5PVHQpdd7vgkeSHkvxekvck+dms7hrquhv6SPILWf288NNZfVP897fzWktyaPlv4eEkP5akdvo175XHJmt3IqufSzz1Pcvb1s4/7TWVTb7/3Oy69Tg3a7fh+Afy6bv4uu62+VHL/0kAAACwo3zEFwAAgBEEKgAAACMIVAAAAEYQqAAAAIwgUAEAABhBoAIAADCCQAUAAGAEgQoAAMAI/w+a7XNaKksz5AAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot pdf values too see outliers\n", + "plt.figure(figsize=[16,16])\n", + "plt.plot(pdfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_227.wav', 'Q-R-S-T-U-V-W-X-Y-Z macht es komplett!', 38, array([-4.0032621e-04, -3.3042193e-04, -3.4537757e-04, ...,\n", + " 7.7704317e-06, 2.7401828e-05, 7.1041533e-05], dtype=float32), 11.323739583333333) 0.38161673291429454\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_496.wav', 'Ist der Kuli blau?', 18, array([ 1.2363373e-05, -3.6298752e-05, 2.1456377e-05, ...,\n", + " 3.9692618e-06, -6.7328816e-05, -9.5399046e-05], dtype=float32), 5.530666666666667) 0.38054811432758695\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_426.wav', 'H-I-J-K-L-M-N-O-P!', 18, array([ 4.7872534e-05, -3.4164757e-05, -2.1835160e-04, ...,\n", + " -4.3899294e-05, -7.5021897e-05, -3.4489829e-05], dtype=float32), 11.167979166666667) 0.32909346861901806\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav', 'Kann ich mich irgendwie revanchieren?', 37, array([-5.1586820e-05, -9.1837741e-05, -9.9342957e-05, ...,\n", + " -1.4234778e-04, -1.2327779e-04, -1.4810068e-04], dtype=float32), 9.728) 0.3853891360487213\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_41.wav', 'Ja, eben.', 9, array([ 8.6438486e-05, 1.5554321e-04, 1.1511238e-04, ...,\n", + " -1.3761004e-05, -2.3534812e-05, -5.6318945e-06], dtype=float32), 2.1033333333333335) 0.38819509492217963\n" + ] + } + ], + "source": [ + "# print outliers\n", + "threshold = 0.39\n", + "for item, pdf in zip(items, pdfs):\n", + " if pdf < threshold:\n", + " print(item, pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + 
"metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Audio\n", + "Audio(\"/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Plot Dataset Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAEICAYAAABGaK+TAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAd00lEQVR4nO3dfZRcdZ3n8fcnnQYqQekgGaQbMIyyYZERoj2zODoji2gQETIehoeVGRBczuyZGcTBsERnFWedhTkRhTnj4GRUUEEEMUZkxQzrw3F1EDexwYCQgeEpdHholQaBRjrJd/+4t5LqSlV1Pde9XZ/XOTnpulV977dvuj518/397r2KCMzMLH/m9boAMzNrjgPczCynHOBmZjnlADczyykHuJlZTjnAzcxyygE+x0g6RtJjPdr2JZKu7cW25zJJSySFpPnp41slndWmdYek17RjXXVur221mwO8KyQ9LOm4rK2rxTp69kHR7yLiHRHxhV7XMZtKH+h5qT0vHOBm1rDi/wastxzgHSbpS8DBwDclPSfponT50ZL+VdKkpLskHZMu/31Jv5B0UPr4SElPSzqs2rpm2f6wpK9JmpD0kKTzS567RNKNkr4o6deS7pE0WvL86yWNpc99VdINkj4uaSFwKzCc1vGcpOH02/aotr6yuq6S9ImyZd+Q9Ffp1/9d0ni6ns2S3lplPddI+sf0v+bPSfqRpFdKuiLdb/dJWlbn/vg9Sben/yaPS/oHSXuUPB+S/kzS/elrPi1JVeqquq7ylki67PuS3pd+PSDpE+nvwYPAO8vWXfraeZL+WtIjkp5K9/0+lWpKX78yrWerpHOqrTd9fLakH5b9/H8u6X7g/nTZlZK2SHpW0kZJf5AuPx74EHBa+u9yVyO1l+yjsyQ9mu6LD1f7ufpWRPhPh/8ADwPHlTweAX4JnEDyIfq29PHi9Pm/Bb4LFIBNwF9UW1eFbR0DPJZ+PQ/YCHwE2AP4beBBYHn6/CXAi2kdA8ClwI/T5/YAHgHeDwwC7wZeAj5evp2SbVddX4U6/xDYAih9vAiYAoaBpelzw+lzS4BXV1nPNcAvgDcAe6X77SHgT9MaPg58r8798QbgaGB+us17gQtKthXALcAQyQfpBHB8lbqqrit9HMD8ktd/H3hf+vWfAfcBBwH7At8rfX3Za88BHkh/lr2BtcCXqtR0PPAkc
ASwEPhyut7XlK83fXw28MOyn/+2tKZCuuxM4BXpz3kh8ASwV8nvw7VlNdRVe8k++meS98GRwG+A/9jr93OW/vgIvDfOBL4VEd+KiB0RcRuwgST4IPnF3wf4CTAOfLrJ7fwuyYfC30TESxHxIMkb4vSS1/wwrWM78CWSNwrsCp+/j4jpiFib1jObausr939J3qB/kD4+Bbg9IrYC24E9gcMlDUbEwxHx7zW2+fWI2BgRLwJfB16MiC+mNdwAFI/Aa+6PdB0/johtEfEw8E/AW8q2dVlETEbEoyTBelSlgupcVzWnAldExJaI+BXJB2E17wE+GREPRsRzwCrgdFVucZwKXB0Rd0fE8yS/Z426NCJ+FRFTABFxbUT8Mv05Lyf5d1ta57rqqf1jETEVEXcBd1H996kvOcB741XAH6f/vZ6UNAm8GTgAICKmSY4sjwAuj/SQpMntDJdt50PA/iWveaLk6xeAvdI30DAwXrbtLXVss9r6ZkjX+xXgjHTRfwGuS597ALiAJGCekvSVkhZNJU+WfD1V4fHe6dc194ek/yDpFklPSHoW+F/AfrP8fHtTQZ3rqmaYmfv6kVleW/r8IyQfvPtXeW29661mxu+ApA9KulfSM+n+3IfGfs7Zaq9rf/crB3h3lAfwFpL/Kg6V/FkYEZcBSBoBPgpcDVwuac8a66plC/BQ2XZeFhEnzPqd8DgwUtbjPajJOqq5HjhF0quA/wR8befKI74cEW8mCd0A/q4N25ttf1xF0ro4NCJeThLuFXvcdai1rufTvxeUvP6VJV8/zsx9fXCN7Wwl2Uelr93GzA+xetf7fI2ainb+u6f97otIjuwXRcQQ8Ay7fs7Zfkcaqd0qcIB3x5Mkfb6ia4F3SVqeDljtpWRa3oFpYF4DfA44l+RN9z9rrKuWnwC/VjIgWEi3dYSk363je28naWX8haT5kk4Gfq+sjlfUGjCbTUSMkfSvPwusj4hJAElLJR2bfnC9SHIUvaPZ7ZSYbX+8DHgWeE7SYcB/a2FbVdcVERMkrbEz0xrOAV5d8r03Auenvw+LgItrbOd64AOSDpG0N8mR/g0Rsa3Ca28EzpZ0uKQFJAcJpe4E3i1pgZK54efW8TNuIxkLmC/pI8DLS55/ElgiqVrONFK7VeAA745Lgb9O/9v+wYjYApxMclQ2QXJkuJLk3+N84LeA/5G2Gd4LvLc4ul++rlobTXvAJ5L0aR9iV1jOGroR8RLJwOW5wCRJ3/4WkoEkIuI+kjfgg2kttVoctXwZOC79u2hP4LK03idI9seqJte/Ux3744MkrZxfk/TGb2hhc7Ot67+S/Jv/Engt8K8lz/0zsJ6k5/tTksG9aj5PMtbwA5Kf6UXgLyu9MCJuBa4gGeh9IP271KdIBqqfBL5A2tKqYT3wbeDfSNofLzKzxfLV9O9fSvppK7VbZcUZAGazknQH8JmIuLrXtZiZj8CtBklvUTKner6S059fR3LEZWYZ4LOprJalJH3ThSTzpU+JiMd7W5KZFbmFYmaWU26hmJnlVFdbKPvtt18sWbKkm5s0M8u9jRs3/iIiFpcv72qAL1myhA0bNnRzk2ZmuSep4lmzbqGYmeWUA9zMLKcc4GZmOeUANzPLKQe4mVlO+UxMM+updWPjrF6/ma2TUwwPFVi5fCkrlo30uqxccICbWc+sGxtn1dpNTE1vB2B8copVazcBOMTr4BaKmfXM6vWbd4Z30dT0dlav39yjivLFAW5mPbN1cqqh5TaTA9zMemZ4qNDQcpvJAW5mPbNy+VIKgwMzlhUGB1i5vN4b2/c3D2KaWc8UByo9C6U5DnAz66kVy0Yc2E2atYUi6fOSnpJ0d8my1ZLuk/QzSV+XNNTZMs3MrFw9PfBrgOPLlt0GHBERryO5I3XLdww3M7PGzBrgEfED4Fdly/4lIralD38MHNiB2szMrIZ2zEI5B7i1DesxM7MGtBTgkj4MbAOuq/Ga8yRtkLRhYmKilc2ZmVmJpgNc0
tnAicB7osat7SNiTUSMRsTo4sW73dLNzMya1NQ0QknHAxcBb4mIF9pbkpmZ1aOeaYTXA7cDSyU9Julc4B+AlwG3SbpT0mc6XKeZmZWZ9Qg8Is6osPhzHajFzMwa4GuhmJnllAPczCynHOBmZjnlADczyykHuJlZTjnAzcxyygFuZpZTDnAzs5xygJuZ5ZQD3MwspxzgZmY55QA3M8spB7iZWU45wM3McsoBbmaWU03dkcfMzOqzbmyc1es3s3VyiuGhAiuXL2XFspG2rNsBbmbWIevGxlm1dhNT09sBGJ+cYtXaTQBtCXG3UMzMOmT1+s07w7toano7q9dvbsv6HeBmZh2ydXKqoeWNcoCbmXXI8FChoeWNcoCbmXXIyuVLKQwOzFhWGBxg5fKlbVm/BzHNzDqkOFDpWShmZjm0YtlI2wK7nFsoZmY55QA3M8spB7iZWU7NGuCSPi/pKUl3lyzbV9Jtku5P/17U2TLNzKxcPUfg1wDHly27GPhORBwKfCd9bGZmXTRrgEfED4BflS0+GfhC+vUXgBVtrsvMzGbRbA98/4h4PP36CWD/ai+UdJ6kDZI2TExMNLk5MzMr1/IgZkQEEDWeXxMRoxExunjx4lY3Z2ZmqWYD/ElJBwCkfz/VvpLMzKwezQb4zcBZ6ddnAd9oTzlmZlaveqYRXg/cDiyV9Jikc4HLgLdJuh84Ln1sZmZdNOu1UCLijCpPvbXNtZiZWQN8JqaZWU45wM3McsoBbmaWUw5wM7Oc8g0dzKyr1o2Nd+wONf3GAW5mXbNubJxVazcxNb0dgPHJKVat3QTgEG+CA9zM2ma2o+vV6zfvDO+iqentrF6/2QHeBAe4mbVFPUfXWyenKn5vteVWmwcxzawtah1dFw0PFSp+b7XlVpsD3Mzaop6j65XLl1IYHJjxfGFwgJXLl9a1jXVj47zpsu9yyMX/mzdd9l3WjY03X/Ac4BaKmbXF8FCB8QohXnp0XWylNDMLxQOgu3OAm1lbrFy+dEbAQuWj6xXLRpoKXA+A7s4BbmZtUevouh1zvz0AujsHuJm1TaWj63a1Pupp0fQbD2KaWUfVMzulHq0OgM5FPgI3s45qV+ujlQHQucoBbmYd1c7WR7MDoHOVWyhm1lGdaH14PnjCR+Bm1lHtbn14PvguDnAz67h2tj48H3wXt1DMLFc8H3wXH4GbWSbUe7KP54Pv4gA3s6YUA3d8cooBie0RjDTZ326kr13vKfv9cOcfB7iZNWTd2DiX3HwPk1PTO5dtjwCaH1BspK9dz6Bovwx0OsDNrG7lwVhJvQOKpUfIUeU11frasw2K9stAZ0sBLukDwPuAADYB742IF9tRmJllT6VgrKQYvNXaGPV8EEDzfe1+GehsOsAljQDnA4dHxJSkG4HTgWvaVJuZZUy9ATg8VKjZxqjng6CVk336ZaCz1WmE84GCpPnAAmBr6yWZWVbVE4DF4K3Vxqj1QSBgZKjApe/+nabbHf1y4aumAzwixoFPAI8CjwPPRMS/lL9O0nmSNkjaMDEx0XylZtZzlYIRYJ6Sv0uDt1Ybo9oHwchQgYcueyc/uvjYlnrVK5aNcOm7f4eRoUJbPhCyqpUWyiLgZOAQYBL4qqQzI+La0tdFxBpgDcDo6Gi1sQozy4FGTouv1caodypgq7W2M7CzOC2xlUHM44CHImICQNJa4PeBa2t+l5nlWj3BuG5snOd/s2235cWQztulYbM6LbGVAH8UOFrSAmAKeCuwoS1VmVnm1HsEWm2GyaIFg3z0Xa/d+T15ujRsVqclNh3gEXGHpJuAnwLbgDHSVomZZU8rLYBGjkCrzTBZsMf83AR2uaxOS2xpFkpEfDQiDouIIyLiTyLiN+0qzMzapxjA4+lJM8UArvc62o3cFi2rYdeKaoOuvZ6W6KsRmvWBVu9L2UgoZzXsWpHVaYkOcLM+0OpRcSOhnNWwa0VWpyX6WihmfaDeMxMr9cmBmjNKyuVthkm9sjjoqojuT
c0eHR2NDRs8UcWs2+qZGVLpNYMDgoDpHVH1+6zzJG2MiNHy5T4CN+sDxaAtvwzs0y9M84Eb7uSCG+7ceU3vUtPbKx/g5XlGyVziHrhZjjVyd/YVy0ZYuOfux2zFiC4P71ryPKNkLvERuFlO1Ts3u57rbjcqzzNK5hIHuFlOzXZ24LqxcT72zXt4+oXpKmuYXaUeeN5nlMwlDnCznKo1NbDeGybMZnp7sGjBIBHwzNT0nJlRMlc4wM1yqtrUwHkSH/vmPXWFt2DWtsrTL0xTGBzgU6cd5eDOGA9imuVUtWtzb4+YtW0yMlTg4cveyadOO4oBadZtNXLWpnWPA9wsp4pnB9YTwKVKe9grlo1w+alHVvwgKOeZJ9njADfLsRXLRtjRwPS/ocLgbqeAl58mXu0DwTNPssc9cLMcKp0aOK/CCTjlBiQuP/XIqj3s0tPEKw2AeuZJNjnAzXKmPGDrOQFnR0TdA5Bz9Vomc5ED3Cxnqt0wodKp8EWNtj+yeOEm250D3CzjStsl+xQGZ1zLpNSOCK447Si3P/qIA9wsY8oD+/mXtu28qFS18IbkKNvtj/7iADfLkPL+dq3ALlU+NdCB3R8c4GYZsW5snAtvvKuhqwIWZeHuMNZ9DnCzHls3Nr7bdbobMVLSOrH+4gA364Fin3t8cqqu65FU4wHK/uYAN+uy8j53o+G9aMEgky/4yoDmADfrqlb63JC0S3508bFtrsryygFu1gXtuLmC2yVWrqWLWUkaknSTpPsk3Svpje0qzGyuKLZMGgnvRQsGOfPog3deYGpkqOCZJrabVo/ArwS+HRGnSNoDWNCGmsxyrfREnOGhAs//ZtusN1coDmSOuK9tDWg6wCXtA/whcDZARLwEvNSesszyqdKNhmcz25UCzapppYVyCDABXC1pTNJnJS0sf5Gk8yRtkLRhYmKihc2ZZV+1C01VUxgccHhb01oJ8PnA64GrImIZ8DxwcfmLImJNRIxGxOjixYtb2JxZtq0bG6/riLuo0s0VzBrRSg/8MeCxiLgjfXwTFQLcbK6acTKOoJGZgVf4BsHWBk0HeEQ8IWmLpKURsRl4K/Dz9pVmlh3lVwh8adt2XpjesfP5RsLbp75bu7Q6C+UvgevSGSgPAu9tvSSzbGn2CoGVeC63tVNLAR4RdwKjbarFLDMavedkNUOFQRbuOd/X5raO8JmYZmWauedkJYXBAS456bUObOsYB7hZmUanAlayaMEgH32Xw9s6ywFufa/ee07WUpyF4jMprZsc4NbXmh2g9CVdLQsc4NbXmmmX+JKulhUtXY3QLO+2NnDmJHgaoGWLj8Ct75SeQVmPAYkdEW6XWOY4wK1vNHPz4MLggK9XYpnlALc5r5G74QgY8gCl5YQD3Oa08lkm9Rj7yNs7WJFZ+zjAbU4o7WsPpKe+DzRxCvzwUKFDFZq1nwPccq/aqe+NhrdnmFjeOMAt19aNjXPhjXc1fb2SeYIdPoPScsoBbrkz40YKJDcDbtRQYdAXmrLcc4BbrpS3SxoNbx9p21ziALfcaKVd4vncNhc5wC2zWm2VFGeh+Kjb5ioHuGVSs60SH2lbP3GAW+Y02iopHp37SNv6jQPcMqHZdsmAxOWnHunQtr7kALeec7vErDkOcOuJRi/pWs73nDRzgFuXNXJlwErc5zbbxQFuHdfqdEC3Sswqc4BbRzXb3/bMErPZtRzgkgaADcB4RJzYekk2VzR75qRnlpjVpx1H4O8H7gVe3oZ12RzQSp/b7RKz+rUU4JIOBN4J/C3wV22pyHKrmXtOAkgQvqSrWcNaPQK/ArgIeFm1F0g6DzgP4OCDD25xc5ZVjdy6zP1ts/ZoOsAlnQg8FREbJR1T7XURsQZYAzA6OtrcVfctc4ozS7ZOTu28CXA9/7jub5u1TytH4G8CTpJ0ArAX8HJJ10bEme0pzbKoUn+73l63+9tm7TWv2W+MiFURcWBELAFOB77r8J7bim2SZgYnhwqDDm+zNvM8c
KvJp7ybZVdbAjwivg98vx3rst5rxz0n3es26zwfgdsMrd5zEtzrNusWB7jNsHr95rqmApZaMDiPPeYP8MzUNMOeGmjWNQ5w22nd2HhDvW63Scx6ywHex2b0utOzIevlNolZ7znA+0TpiTf7FAaZ3r6D51/a1SqpFd7FgUzf5d0sWxzgfaB8YLLRa5V86rSjHNZmGdT0iTyWH80MTBaNDBUc3mYZ5QCf4xodmCxVGBxg5fKlba7IzNrFLZQ5qtlLuxYNFQa55CSfQWmWZQ7wOWC2Acp6+JrcZvnjAM+5ZgcoF6WXgPWJN2b55QDPsWbvOTkyVOBHFx/boarMrFsc4DnUSn/bA5Nmc4cDPEdaHZj0pV3N5hYHeA60cpd38IwSs7nKAZ4xpdcnKZ663sg1uQU771HpAUqzuc0BnhGVjrKLg5P1hrcvMGXWXxzgGVA+FbAZ7m+b9R8HeI81OxWwyMFt1r8c4D3QjntOemDSzBzgXVSpz93I4GTgU93NbBcHeBc0Ow3QoW1mtTjAO6CVW5WB7zVpZvVxgLdJtb52o+HtqYBmVi8HeIsqnd7e6KCkWyVm1oymA1zSQcAXgf1J8mdNRFzZrsLyoB3ztz2bxMya1coR+Dbgwoj4qaSXARsl3RYRP29TbZnW6vxtB7eZtarpAI+Ix4HH069/LeleYATIfYCX3uGm/HoirV5YyifemFm7KJo8gpyxEmkJ8APgiIh4tuy584DzAA4++OA3PPLIIy1vr5MqtUUG54m995rP0y9M133iTfF1xQtSub9tZs2StDEiRsuXtzyIKWlv4GvABeXhDRARa4A1AKOjo61/WnTY6vWbd+tpT++InUfc9fwAPso2s25oKcAlDZKE93URsbY9JfXW1smppr/X87fNrJtamYUi4HPAvRHxyfaV1H2lPe95acujUZ6/bWbd1soR+JuAPwE2SbozXfahiPhW62V1XrUTb5oJb88oMbNeaGUWyg9Jxupyp3yg0ifemFke9eWZmJUGKuvlAUozy4q+CfDSlkmjBL6/pJllzpwP8ErXKmnEyFCBH118bJurMjNr3ZwO8FavVVIYHGDl8qVtrsrMrD3mdIA30uu+4rSjdn5PpVPozcyyJpcBXn6tkv982GK+d98E45NTO09dH2hgPvfIUGFnUDuwzSwvchfg5W2R8ckprv3xozufL4Z2veHtNomZ5VXuAryVKYBFxduceR63meVZ7gK8lWuVOLDNbC7JfICX97v3KQw2NSXQ0wHNbK7JdIBX6nc3w31uM5uLMh3gzfS7fSMFM+sXmQ7w2frdQ4VBFu453/O2zawvZTrAh4cKNdsmz0xNc+dH397FiszMsmNerwuoZeXypRQGB6o+PzxU6GI1ZmbZkukj8GI7pNJd4D0waWb9LtNH4JCE+NhH3s4Vpx3FyFABkUwJ9O3LzKzfZfoIvNSKZSMObDOzEpk/Ajczs8oc4GZmOeUANzPLKQe4mVlOOcDNzHJKUeeND9qyMWkCeKRk0X7AL7pWQOvyVi/kr+a81Qv5qzlv9UL+am53va+KiMXlC7sa4LttXNoQEaM9K6BBeasX8ldz3uqF/NWct3ohfzV3q163UMzMcsoBbmaWU70O8DU93n6j8lYv5K/mvNUL+as5b/VC/mruSr097YGbmVnzen0EbmZmTXKAm5nlVM8CXNLxkjZLekDSxb2qoxpJB0n6nqSfS7pH0vvT5ftKuk3S/enfi3pdaylJA5LGJN2SPj5E0h3pfr5B0h69rrGUpCFJN0m6T9K9kt6Y5X0s6QPp78Pdkq6XtFfW9rGkz0t6StLdJcsq7lMl/j6t/WeSXp+RelenvxM/k/R1SUMlz61K690saXm3661Wc8lzF0oKSfuljzu2j3sS4JIGgE8D7wAOB86QdHgvaqlhG3BhRBwOHA38eVrjxcB3IuJQ4Dvp4yx5P3BvyeO/Az4VEa8BngbO7UlV1V0JfDsiDgOOJKk9k/tY0ghwPjAaEUcAA8DpZ
G8fXwMcX7as2j59B3Bo+uc84Kou1VjqGnav9zbgiIh4HfBvwCqA9D14OvDa9Hv+Mc2TbruG3WtG0kHA24FHSxZ3bh9HRNf/AG8E1pc8XgWs6kUtDdT8DeBtwGbggHTZAcDmXtdWUuOBJG/OY4FbAJGcDTa/0n7v9R9gH+Ah0sH0kuWZ3MfACLAF2JfkWvq3AMuzuI+BJcDds+1T4J+AMyq9rpf1lj33R8B16dczsgJYD7wxC/s4XXYTyYHIw8B+nd7HvWqhFN8IRY+lyzJJ0hJgGXAHsH9EPJ4+9QSwf4/KquQK4CJgR/r4FcBkRGxLH2dtPx8CTABXp22fz0paSEb3cUSMA58gObp6HHgG2Ei293FRtX2ah/fiOcCt6deZrVfSycB4RNxV9lTHavYg5iwk7Q18DbggIp4tfS6Sj9NMzMOUdCLwVERs7HUtDZgPvB64KiKWAc9T1i7J2D5eBJxM8sEzDCykwn+jsy5L+3Q2kj5M0s68rte11CJpAfAh4CPd3G6vAnwcOKjk8YHpskyRNEgS3tdFxNp08ZOSDkifPwB4qlf1lXkTcJKkh4GvkLRRrgSGJBVvnZe1/fwY8FhE3JE+vokk0LO6j48DHoqIiYiYBtaS7Pcs7+Oiavs0s+9FSWcDJwLvST90ILv1vprkg/2u9D14IPBTSa+kgzX3KsD/H3BoOnq/B8mgxM09qqUiSQI+B9wbEZ8seepm4Kz067NIeuM9FxGrIuLAiFhCsj+/GxHvAb4HnJK+LDP1AkTEE8AWSUvTRW8Ffk5G9zFJ6+RoSQvS349ivZndxyWq7dObgT9NZ0ocDTxT0mrpGUnHk7QDT4qIF0qeuhk4XdKekg4hGRj8SS9qLBURmyLityJiSfoefAx4ffo73rl93Ivmf/phegLJ6PK/Ax/uVR016nszyX8zfwbcmf45gaSv/B3gfuD/APv2utYKtR8D3JJ+/dskv+APAF8F9ux1fWW1HgVsSPfzOmBRlvcx8DHgPuBu4EvAnlnbx8D1JD366TRIzq22T0kGuj+dvg83kcywyUK9D5D0jYvvvc+UvP7Dab2bgXdkZR+XPf8wuwYxO7aPfSq9mVlOeRDTzCynHOBmZjnlADczyykHuJlZTjnAzcxyygFuZpZTDnAzs5z6//Sn0SdRnW0yAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs mean audio duration\")\n", + "plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAEICAYAAABGaK+TAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAepElEQVR4nO3dfZRcdZ3n8fcnnQYqoHSQiKQhhlE3jqAQbWdxcUYXdYIOAuM4iCsrCG7O7M74tBg30RmBWWdhTkTBM44O4wMqiKDEiPgQWdHj6CBuYsCAkIEBJHR4iEqjkI50ku/+cW8l1ZV6rttd91Z9Xuf0SdW9t+/91i9V3/r19/e79yoiMDOz4pnT6wDMzKwzTuBmZgXlBG5mVlBO4GZmBeUEbmZWUE7gZmYF5QReUJJeKenBHh37AklX9uLYM6G6LSXdIemVPQypKUmLJYWkuenzb0k6K6N9h6TnZrGvFo+XWeyDxgk8Q5Lul/TqvO2ryzh69kXRKxFxdER8v9dxtCMiXhsRn+t1HM3U+vIvSux55ARuZpko/zVgs8cJPCOSvgAsAr4u6QlJ70uXHy/pXyVNSLqt/Ke5pP8k6ZeSjkyfHyvpMUnPr7evJsdfKOk6Sdsk3SfpnRXrLpB0raTPS/ptWiIYq1j/Ykkb03VflnSNpA9JOhD4FrAwjeMJSQvTX9uv3v6q4vqEpA9XLfuapP+ZPv5fksbT/WyW9Ko6+7lC0j+mf24/IelHkp4l6dK03e6StLTF9iil+3tM0s+Bl1Yda89fP5L+QNLN6f/fQ5L+QdJ+FduGpL+QdHe6zcclqc5rqLuv6pJIuuz7kt6ePh6S9OH0PXMv8CdV+67cdo6kv5b0C0mPpv9PB9eKKd1+RRrPVknn1Ntv+vxsST+sev1/Kelu4O502WWStkj6jaQNkv4wXX4S8H7gTen/4W3txF7RRmdJeiBtiw/Ue10DISL8k9EPcD/w6orno8CvgNeRfFm+Jn2+IF3/d8BNQAnYBPxVvX3VONYrgQfTx3OADcAHgf2A3wPuBZal6y8AdqRxDAEXAT9O1+0H/AJ4FzAMvAF4CvhQ9XEqjl13fzXi/CNgC6D0+XxgElgILEnXLUzXLQaeU2c/VwC/BF4CHJC2233AW9MYPgR8r8X2uBj4F+AQ4Ejg9srXWNn26fGOB+am8d0JvLti2wBuAEZIvnS3ASfVeQ1195U+D2BuxfbfB96ePv4L4K403kOA71VuX7XtOcA96es+CFgDfKFOTCcBjwDHAAcCX0z3+9zq/abPzwZ+WPX6b0xjKqXLzgSekb7O84CHgQMq3jtXVsXQUuwVbfTPJJ+ZY4HfAb/f689+r37cA59ZZwLfjIhvRsTuiLgRWE+S+CB5Mx8M/AQYBz7e4XFeSvKl8LcR8VRE3EvyJj+jYpsfpnHsAr5A8uaHvQnlYxExFRFr0niaqbe/av
9C8qH7w/T5G4GbI2IrsAvYH3iBpOGIuD8i/r3BMb8aERsiYgfwVWBHRHw+jeEaoNwDb9YepwN/FxG/jogtwMfqHTA93o8jYmdE3A/8E/CKqs0ujoiJiHiAJLEe18W+6jkduDQitkTEr0m+NOt5C/CRiLg3Ip4AVgFnqHaJ43TgsxFxe0Q8SfKebNdFaVtOAkTElRHxq/R1XkLyf7ykxX21EvuFETEZEbcBt1H/vdf3nMBn1rOBP0//ZJ6QNAG8HDgcICKmSHqWxwCXRNrN6PA4C6uO837gsIptHq54vB04IP1QLATGq469pYVj1tvfNOl+vwS8OV30X4Cr0nX3AO8mSRqPSvpSRYmmlkcqHk/WeH5Q+rhZeyxk+mv8Rb0DSvoPkm6Q9LCk3wD/Bzi0arPqtjiIGlrcVz0tx5xuW7n+FyRf0ofV2bbV/dYz7f0i6b2S7pT0eNr2B9Pe62wWe0vtPQicwLNVnYC3kPz5N1Lxc2BEXAwgaRQ4H/gscImk/Rvsq5EtwH1Vx3laRLyu6W/CQ8BoVd32yA7jqOdq4I2Sng38R+C6PTuP+GJEvJwk6Qbw9xkcr1l7PMT017iowb4+QVK6eF5EPJ3ki6BmjbsFjfb1ZPrvvIrtn1XxuJ2Yt5K0Z+W2O5n+hdfqfp9sEFPZnvdIWu9+H0nPfn5EjACPs/d1Nns/tRP7wHMCz9YjJLW7siuB10talg5CHaBkWt4RacK8Avg0cC7JB+l/N9hXIz8BfqtkQLCUHusYSS9t+ptwM0kp468kzZV0KvAHVXE8o9EgWDMRsZGkfv0pYF1ETABIWiLpxPSLawdJL3p3p8ep0Kw9rgVWSZov6QjgHQ329TTgN8ATkp4P/Pcu4qq7r4jYRlJGOzON9xzgORW/ey3wzvS9Mx9Y2eA4VwPvkXSUpINIevrXRMTOGtteC5wt6QWS5pF0KCrdCrxB0jwlc8PPbeE17iQZC5gr6YPA0yvWPwIsllQv97QT+8BzAs/WRcBfp3+2vzetr55K0tPaRtIzXEHS7u8Engn8TVpmeBvwtvKIffW+Gh00rQGfTFJ7vY+9ybJp0o2Ip0gGLs8FJkjq9jeQDA4REXeRfKjuTWNpVOJo5IvAq9N/y/YnGVD8Jcmfxc8kqXl2pYX2uJDkT/P7gO+Q1PDreS9J2ee3JHX0a7oIrdm+/hvJ++NXwNHAv1as+2dgHUnN96ckg3v1fIbkNf2A5DXuoM6XVER8C7iUZFD4nvTfSh8lGdR+BPgcafmrgXXAt4F/I2njHUwvsXw5/fdXkn7aTey2d2aA2R6SbgE+GRGf7XUsZlafe+CGpFcomVM9V8kpzS8i6UWZWY75zCmDZIrXtSTzgO8F3hgRD/U2JDNrxiUUM7OCcgnFzKygZrWEcuihh8bixYtn85BmZoW3YcOGX0bEgurls5rAFy9ezPr162fzkGZmhSep5hmyLqGYmRWUE7iZWUE5gZuZFZQTuJlZQTmBm5kVlM/ENLOeWrtxnNXrNrN1YpKFIyVWLFvCaUtHex1WITiBm1nPrN04zqo1m5ic2gXA+MQkq9ZsAnASb4FLKGbWM6vXbd6TvMsmp3axet3mHkVULE7gZtYzWycm21pu0zmBm1nPLBwptbXcpnMCN7OeWbFsCaXhoWnLSsNDrFjW6k3sB5sHMc2sZ8oDlZ6F0hkncDPrqdOWjjphd6hpCUXSZyQ9Kun2imWrJd0l6WeSvippZGbDNDOzaq3UwK8ATqpadiNwTES8iOTu013fSdzMzNrTNIFHxA+AX1ct+05E7Eyf/hg4YgZiMzOzBrKYhXIO8K0M9mNmZm3oKoFL+gCwE7iqwTbLJa2XtH7btm3dHM7MzCp0nMAlnQ2cDLwlGtzaPiIuj4ixiBhbsGCfW7qZmVmHOppGKOkk4H3AKyJie7YhmZlZK1qZRng1cDOwRNKDks4F/gF4GnCjpFslfXKG4zQzsypNe+AR8eYaiz89A7GYmVkbfC0UM7OCcgI3My
soJ3Azs4JyAjczKygncDOzgnICNzMrKCdwM7OCcgI3MysoJ3Azs4JyAjczKygncDOzgnICNzMrKCdwM7OCcgI3MysoJ3Azs4Lq6I48ZmbWmrUbx1m9bjNbJyZZOFJixbIlnLZ0NJN9O4Gbmc2QtRvHWbVmE5NTuwAYn5hk1ZpNAJkkcZdQzMxmyOp1m/ck77LJqV2sXrc5k/07gZuZzZCtE5NtLW+XE7iZ2QxZOFJqa3m7nMDNzGbIimVLKA0PTVtWGh5ixbIlmezfg5hmZjOkPFDpWShmZgV02tLRzBJ2NZdQzMwKygnczKygmiZwSZ+R9Kik2yuWHSLpRkl3p//On9kwzcysWis98CuAk6qWrQS+GxHPA76bPjczs1nUNIFHxA+AX1ctPhX4XPr4c8BpGcdlZmZNdFoDPywiHkofPwwcllE8ZmbWoq4HMSMigKi3XtJySeslrd+2bVu3hzMzs1SnCfwRSYcDpP8+Wm/DiLg8IsYiYmzBggUdHs7MzKp1msCvB85KH58FfC2bcMzMrFWtTCO8GrgZWCLpQUnnAhcDr5F0N/Dq9LmZmc2ipqfSR8Sb66x6VcaxmJlZG3wmpplZQTmBm5kVlBO4mVlBOYGbmRWUE7iZWUH5hg5mNqvWbhyfsTvUDBoncDObNWs3jrNqzSYmp3YBMD4xyao1mwCcxDvgBG5mmWnWu169bvOe5F02ObWL1es2O4F3wAnczDLRSu9668Rkzd+tt9wa8yCmmWWiUe+6bOFIqebv1ltujTmBm1kmWuldr1i2hNLw0LT1peEhVixb0tIx1m4c54SLb+Kold/ghItvYu3G8c4D7gMuoZhZJhaOlBivkcQre9flUkons1A8ALovJ3Azy8SKZUumJVio3bs+beloRwnXA6D7cgI3s0w06l1nMffbA6D7cgI3s8zU6l1nVfpopUQzaDyIaWYzqpXZKa3odgC0H7kHbmYzKqvSRzcDoP3KCdzMZlSWpY9OB0D7lUsoZjajZqL04fngCffAzWxGZV368HzwvZzAzWzGZVn68HzwvZzAzSwXWp0r7vnge7kGbmY9Vy6LjE9MEuwti9SqbfuCWHs5gZtZV7IYUGxnrnirg6KDMNDpEoqZdSyrAcV2yiKtDIoOykBnVwlc0nuAtwMBbALeFhE7sgjMzPIvqwHFdueKNxsUHZSBzo5LKJJGgXcCYxFxDDAEnJFVYGaWf1kNKGY9V3xQBjq7LaHMBUqSpoB5wNbuQzKzomjWc251ZknWc8UH5cJXHSfwiBiX9GHgAWAS+E5EfCezyMwslyqT8sGlYYaHxNSu2LO+3HNuVoeuldx/tPLETGJs9drkRddxApc0HzgVOAqYAL4s6cyIuLJqu+XAcoBFixZ1EaqZ9dLajeNc+PU7eGz71J5lE5NTDM8R8+cNM7F9alrP+YSLb2o4s2QmBxkH5cJXiojmW9X6RenPgZMi4tz0+VuB4yPif9T7nbGxsVi/fn1HxzOz3qnuTVcbkrjk9GOnJcijVn6DWtlF1C9xjI6UMuuFZy2Lm1J0StKGiBirXt5NDfwB4HhJ80hKKK8CnJ3N+lCtWR2VdkXsUx6ZI7GrRgdx4UipcIOMeZ2W2PEslIi4BfgK8FOSKYRzgMszisvMcqSVxFouj5STXa3kXa5DF+1syqxuSpG1rmahRMT5wPkZxWJmPVavTFCv5FFt68Rk3d76kMRFb3jhnh5rkQYZ8/oXg0+lNzOg8fVIas3TrqVReWR3xJ7kfdrSUS56wwsZHSkhktp3ZXLPm7z+xeBT6c0MaFwmKA8slnvnI/OGeWLHTqZ27zt9cPW6zS3NwS7S3XXyOi3RCdzMgPrlgPGJSU64+KY9ZZWPvum4uvO4i1geaUVepyU6gZsZUH9qn2DP8urZF7NxVmVe5PEvho7ngXfC88DNeqfZPOZac70FNedy53m+dj+qNw/cg5hmA6DZDRPKyX1yahdDEpAk6Xrdu17PvrCEE7jZAGg0QFmZ3C
E5Kadcsx7N6ewLSziBmxVYq3edaTRAecH1d9RN7llf5tWy5QRuVlBZ3EcSkgtS1TI+Mcl7rrmVA4bnMFIaLsR87UHjBG5WUN3eR7IVATy2fYrf7dzNR990HD9aeaKTd454GqFZQTU7vbs8MDk+MclQnQtLtaofb0fWD9wDNyuoRqd31xqY7JZnnuSPE7hZQdUrizz5u51c+PV9Bya75Zkn+eMSilkBVc7bniOouCRJ3UHJVo2Uhvndzt19dSp8v3IP3KxgqssjuzuojoyOlLj0TcfVnCJ4wSlHF+pKgYPMPXCzgml2d5xmyr3pZtcsccLOPydws4JpZzCxPPuk/O9ojSTtRF1cTuBmBdPq3XFg72nxLoH0J9fAzXKm2enxtWafDA+JkdJwzf3l4d6NNjPcAzfLkVp3P1/x5du48Ot3MLF9ioNLw0jsuWpgdVnkqJXfqHkFQc/h7k/ugZvlSK0ByqndwWPbpwiSKYKPbU+mCVZeNbBcHsnrvRttZjiBm/VAvTJJuz3l6vKIrx44WFxCMZtltcok77nmVt59za0d7a8y6ffr7cysNidws1lWq0zSzZVKiny3d+uOSyhmsyzLAUWXRwZbVz1wSSPAp4BjSDoR50TEzVkEZtZPKi/t2o3S8BwOGB5iYvuUyyPWdQnlMuDbEfFGSfsB8zKIyayv1Lrbe6uGJHZHOFlbTR0ncEkHA38EnA0QEU8BT2UTlln/6PTaJT6D0prppgd+FLAN+KykY4ENwLsi4slMIjMrqHK5pDwLpJWyyWi6Xb1rlpjV0k0Cnwu8GHhHRNwi6TJgJfA3lRtJWg4sB1i0aFEXhzPLv1pTBEXjWSajIyV+tPLEWYnP+ks3s1AeBB6MiFvS518hSejTRMTlETEWEWMLFizo4nBm+bZ24zjnXXtbW1MEPYvEutFxDzwiHpa0RdKSiNgMvAr4eXahmRXD2o3jXHD9HS3fCafcI3eZxLrV7SyUdwBXpTNQ7gXe1n1IZvlWWeMemTfMEzt2MtXGbXEWumRiGekqgUfErcBYRrGY5V51jbt8Yal2+MqAlhWfSm9WQ/VMknKpo9vbmYGvDGjZcQI3q9Lomtzt9LiH5wgEU7v2llc8aGlZcgI3q9Lomtytmj9vmPNff/Se/fnKgDYTnMDNqnRTox6SuOT0Y6claSdsmylO4DbwKuvdB5eGaXrmTR0+9d1mmxO4DbTqenerc7kBRkrDHLj/XJdHrGecwG2gdXOhqQtOOdoJ23rKCdwGWif17vIApZO39ZoTuA2cypr3nPTqf434mtyWV07gNjBqXbOkWfL2wKTlmRO4DYRW74rjW5ZZkTiBW1+pvPdk+eYI8+cNM7F9qqWZgYccuL8vNGWF4QRufWHtxvF9TnUvl0faOYPSF5qyInECt8Lr5qbB1XyhKSuSbu7IY5YLWVwhEHyhKSseJ3ArtLUbx1u6aXC1IYkzj1/E6EgJkdwdx7NNrGhcQrHCqDVAqQ7246mB1i+cwK0Qquvc5QHKRjNLJIhIzpyMgMcnPTXQ+osTuOVOrZ72UAtnTFa69E3HOUlb33MCt1yp19NuJ3mPjpScvG0geBDTcqXbGSWeSWKDxD1wy4XKskm7yvdfGHV92waME7j1XDcn4jhp2yBzArdZVWuAshOjIyVfs8QGnhO4zZp6A5SN1Lo9pevcZomuE7ikIWA9MB4RJ3cfkvWbTuvb5V525Q0YPI/bbK8seuDvAu4Enp7BvqzPdFrfruxln7Z01AnbrIauphFKOgL4E+BT2YRj/WTtxnHOu/a2tpP3kORT3c1a0G0P/FLgfcDTMojF+kSta3O3ytcpMWtdxwlc0snAoxGxQdIrG2y3HFgOsGjRok4PZwVQ656TjVSfJu8pgWbt6aYHfgJwiqTXAQcAT5d0ZUScWblRRFwOXA4wNjbW2Zwxy712at3uZZtlo+MEHhGrgFUAaQ/8vdXJ2/pPt/ecdH3bLDueB24tqzePu9Vat3veZtnKJIFHxP
eB72exL8uXyjnYc7o4c3KkNMwFpxzt5G2WIffAra5OzpysNn/eMOe/3onbbCY4gds+urkyYNmQxCWnH+vEbTaDnMBtmm6uDFjmWrfZ7HACtz3KZ062WyrxPSfNesMJ3Do+c9KXdDXrLSfwAdbumZOVfElXs95zAh8QldMBDy4NM7VrN08+1VqduzQ8xJ+9ZJTv3bXNl3Q1yxEn8AFQPTDZTo/bZ06a5ZcTeJ/rdGASPJvELO+cwPtUN/Vt8JmTZkXgBN5nurkWN/jMSbMicQLvA5VnTta6CXAr3OM2Kx4n8AKrVSZpJXkLGEkvAesZJWbF5QReQN3O3/bApFl/cAIvkG4HJl3fNusvTuAF0O3ApOvbZv3JCTznOrk6YHkg0zcJNutvTuA5U+uek+1wmcRscDiB50StMkk7yduJ22zwOIH3mE+8MbNOOYH3gE+8MbMsOIHPolq97XaTtwcmzazMCXwWdFsmAZ+AY2b7cgKfAdNKJIIOruQKeDqgmTXmBJ6x6nnbnSRv17fNrBUdJ3BJRwKfBw4j6SheHhGXZRVY0VT2utvlnraZdaKbHvhO4LyI+KmkpwEbJN0YET/PKLZC8I0TzKxXOk7gEfEQ8FD6+LeS7gRGgYFI4L4+iZn1WiY1cEmLgaXALVnsL8984o2Z5UXXCVzSQcB1wLsj4jc11i8HlgMsWrSo28P1RDcn3pSvZ+L6tpllrasELmmYJHlfFRFram0TEZcDlwOMjY11OKGud/aZVdLi73netpnNtG5moQj4NHBnRHwku5B6r5sZJeAyiZnNjm564CcA/xXYJOnWdNn7I+Kb3YfVO51cf7vMA5NmNpu6mYXyQ5IpzH1j7cZxzrv2travwe3EbWa9MLBnYpbLJFsnJjm4NMzUrt08+VRrvW6feGNmeTCQCby6TNLOSThDEpecfqyTtpn13MAk8Moe95wOblUGnlliZvkyEAm8usfdSfJ2ucTM8qaQCbyyN72wSWLtdGAS3OM2s3wrXAKv7k2PT0yyas2mPesrE/t/fv4Crtsw3lHy9swSM8u7wiXw1es27zNHe3JqFxd+/Q52TO2eltiv+vEDLZ85OX/eMBPbp5r26M3M8qJwCXxrnbMja11cqtXkPTpS4kcrT+wiKjOz2Ten1wG0a+FIKdP9lYaHWLFsSab7NDObDbnvgVcPWJbr2p2c6g4wPCQO3G8uj0+6XGJmxZbrBF5rwPK6DeP82UtGufLHD7S9P08FNLN+kusEXm/A8upbtjB/3nBbN1VwndvM+k2ua+D1Bix3RfDEjp0MD7V2LS3Xuc2sH+U6gTcasJzaHRy431xG022GlCTz0ZESZx6/iNGREkqf+2QcM+tHuS6hrFi2pOG1uR+fnOLW8/94lqMyM8uHXPfAT1s6ykVveOGe3nW1rKcUmpkVSa4TOCRJ/JLTj6U0PDRtuevaZjbocl1CKSvXr1u9gJWZ2SAoRAKHJIk7YZuZ7ZX7EoqZmdXmBG5mVlBO4GZmBeUEbmZWUE7gZmYFpejgdmMdH0zaBvyiYtGhwC9nLYDuFS1eKF7MRYsXihdz0eKF4sWcdbzPjogF1QtnNYHvc3BpfUSM9SyANhUtXihezEWLF4oXc9HiheLFPFvxuoRiZlZQTuBmZgXV6wR+eY+P366ixQvFi7lo8ULxYi5avFC8mGcl3p7WwM3MrHO97oGbmVmHnMDNzAqqZwlc0kmSNku6R9LKXsVRj6QjJX1P0s8l3SHpXenyQyTdKOnu9N/5vY61kqQhSRsl3ZA+P0rSLWk7XyNpv17HWEnSiKSvSLpL0p2SXpbnNpb0nvT9cLukqyUdkLc2lvQZSY9Kur1iWc02VeJjaew/k/TinMS7On1P/EzSVyWNVKxblca7WdKy2Y63XswV686TFJIOTZ/PWBv3JIFLGgI+DrwWeAHwZkkv6EUsDewEzouIFwDHA3+ZxrgS+G5EPA/4bvo8T94F3Fnx/O+Bj0bEc4HHgHN7ElV9lwHfjo
jnA8eSxJ7LNpY0CrwTGIuIY4Ah4Azy18ZXACdVLavXpq8Fnpf+LAc+MUsxVrqCfeO9ETgmIl4E/BuwCiD9DJ4BHJ3+zj+m+WS2XcG+MSPpSOCPgQcqFs9cG0fErP8ALwPWVTxfBazqRSxtxPw14DXAZuDwdNnhwOZex1YR4xEkH84TgRsAkZwNNrdWu/f6BzgYuI90ML1ieS7bGBgFtgCHkFxL/wZgWR7bGFgM3N6sTYF/At5ca7texlu17k+Bq9LH03IFsA54WR7aOF32FZKOyP3AoTPdxr0qoZQ/CGUPpstySdJiYClwC3BYRDyUrnoYOKxHYdVyKfA+YHf6/BnARETsTJ/nrZ2PArYBn03LPp+SdCA5beOIGAc+TNK7egh4HNhAvtu4rF6bFuGzeA7wrfRxbuOVdCowHhG3Va2asZg9iNmEpIOA64B3R8RvKtdF8nWai3mYkk4GHo2IDb2OpQ1zgRcDn4iIpcCTVJVLctbG84FTSb54FgIHUuPP6LzLU5s2I+kDJOXMq3odSyOS5gHvBz44m8ftVQIfB46seH5EuixXJA2TJO+rImJNuvgRSYen6w8HHu1VfFVOAE6RdD/wJZIyymXAiKTyrfPy1s4PAg9GxC3p86+QJPS8tvGrgfsiYltETAFrSNo9z21cVq9Nc/tZlHQ2cDLwlvRLB/Ib73NIvthvSz+DRwA/lfQsZjDmXiXw/wc8Lx29349kUOL6HsVSkyQBnwbujIiPVKy6HjgrfXwWSW285yJiVUQcERGLSdrzpoh4C/A94I3pZrmJFyAiHga2SFqSLnoV8HNy2sYkpZPjJc1L3x/leHPbxhXqten1wFvTmRLHA49XlFp6RtJJJOXAUyJie8Wq64EzJO0v6SiSgcGf9CLGShGxKSKeGRGL08/gg8CL0/f4zLVxL4r/6Zfp60hGl/8d+ECv4mgQ38tJ/sz8GXBr+vM6krryd4G7gf8LHNLrWGvE/krghvTx75G8we8Bvgzs3+v4qmI9DliftvNaYH6e2xi4ELgLuB34ArB/3toYuJqkRj+VJpJz67UpyUD3x9PP4SaSGTZ5iPcekrpx+bP3yYrtP5DGuxl4bV7auGr9/ewdxJyxNvap9GZmBeVBTDOzgnICNzMrKCdwM7OCcgI3MysoJ3Azs4JyAjczKygncDOzgvr/34J4JquheD8AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs median audio duration\")\n", + "plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAgAElEQVR4nO3df5QlZX3n8fdnepqhZ3RtlEnC9DAOybKwIgra/lqSFX8giAYImhWM2ZhgOJsNG00MyRATRZMcxiWJmrNEQ4xhYwxo1IwTQckP8CRrlNCTARUEMwF0psEwKk0SZpSeme/+UXXHO7er6tb9Xbfu53XOHPreqlv13KLv9z79ref5PooIzMxs/K0adQPMzKw/HNDNzGrCAd3MrCYc0M3MasIB3cysJhzQzcxqwgHdxoKkMyXtGdG5r5T0J6M4t1knHNCta5IekPTSqh2rx3aM7IujpR0bJX1M0jckPSrpS5JeL+mHJP17+u8xSdH0+N8lbZL0GUnflvRvkv5V0g5JWyStGfX7ssFyQDerpg8Cu4GnAk8Bfhz4l4j4u4h4QkQ8ATgl3Xe28VxEfC197rKIeCJwHPBm4CLgJkka7tuwYXJAt65I+iCwCfiLtGf4S+nzz5f095KWJN0p6cz0+f+S9jaPTx8/U9Ijkk7OO1ab829Ie7B7Jd0v6eeatl0p6SOS/jjtpd4lab5p+7Mk7Uy3/ZmkD0v6DUnrgE8BG5p6vBvSlx2Vd7yWdr1X0m+1PPcJSb+Q/vzLkhbT49wr6SU5b/E5wHUR8VhEHIiInRHxqXbXpVX6+s8A5wEvAF7R6TFsjESE//lfV/+AB4CXNj2eA74JnEvSWTgrfbw+3f6bwC3ADPBFkl5k5rEyznUmsCf9eRWwA3grcBTw/cB9wNnp9iuBb6ftmAKuAj6fbjsK+CrwRmAauBB4HPiN1vM0nTv3eBnt/K8kPWulj48B9gMbgJPSbRvSbZuBH8g5zl8DnyXpWW/K2WczEMDqluc/A7whY/+/Bd456t8b/xvcP/fQrZ9eB9wUETdFxKGI+CtggSQQQhIYnwT8A7AIXNPleZ5D8iXxjoh4PCLuA/6AJPg1/L+0HQdJ0hfPTJ9/PrAa+N2IWI6Ij6ftaSfveK3+jiTI/lD6+NXA5yLiQeAgsAZ4mqTpiHggIv455zg/mh7r14D7Jd0h6Tkl2lnkQeDJPR7DKswB3frpqcCPpumWJUlLwA+S5HGJiGXgOuDpwG9HRLeV4Z5KkhZpPs+vAN/btM/Xm37eBxwtaTVJT3mx5dy7S5wz73hHSI97A3Bx+tRrgQ+l23YBbyL5YntY0g1NKZ3W4zwSEVsi4pT0fd0BbOsxBz4HfKuH11vFOaBbL1oD8m7ggxEx2/RvXURsBZA0B7wN+CPgt1tGXXQS3HcD97ec54kRcW7bV8JDwFxLYDy+y3bkuR54ta
SnAs8DPnb44BF/GhE/SPKlFMA72x0sIr4B/BbJl1FXPez03sWzSXr9VlMO6NaLfyHJXzf8CfDDks6WNCXp6HQY4MY0gF4H/CFwCUlg/fWCYxX5B+Df0huMM+m5nl4yJfE5ktTHZZJWSzofeG5LO54i6Ukl27JCROwEvgG8H7g5IpYAJJ0k6cXpF9m3SXLrh7KOIemd6XtaLemJwM8AuyLim520RdJaSS8EPkFy3W7q9n1Z9TmgWy+uAn41TXv8YkTsBs4nSX/sJelJX07ye/ZzwPcAv5amJX4S+ElJP5R1rKKTpnnsVwKnAffz3eDZNghHxOMkN0IvAZZI8v6fBL6Tbr+HpId9X9qWzJRICX8KvDT9b8MaYGva3q+TXI8rcl6/FvjztI33kfToz+vg/P9H0r+RfEG9m+SvhHMiIvMLxOpB0XUa06weJN0GvC8i/mjUbTHrhXvoNnEkvVDS96XpjJ8AngF8etTtMuvVirv0ZhPgJOAjwDqSdMarI+Kh0TbJrHdOuZiZ1YRTLmZmNTGylMuxxx4bmzdvHtXpzczG0o4dO74REeuzto0soG/evJmFhYVRnd7MbCxJ+mreNqdczMxqwgHdzKwmHNDNzGrCAd3MrCYc0M3MasIzRc0m0Ladi1x98708uLSfDbMzXH72SVxw+tyom2U9ckA3mzDbdi5yxce/yP7lgwAsLu3nio9/EcBBfcw55WI2Ya6++d7Dwbxh//JBrr753hG1yPrFAd1swjy4tL+j5218OKCbTZgNszMdPW/jwwHdbMJcfvZJzExPHfHczPQUl5990ohaZP3im6JmE6Zx49OjXOrHAd1sAl1w+pwDeA21TblI+oCkhyV9KWf7j0n6gqQvSvp7Sc/sfzPNzKydMjn064BzCrbfD7wwIk4Ffh24tg/tMjOzDrVNuUTE30raXLD975sefh7Y2HuzzMysU/0e5XIJ8Km8jZIulbQgaWHv3r19PrWZ2WTrW0CX9CKSgP7LeftExLURMR8R8+vXZ66gZGZmXerLKBdJzwDeD7w8Ir7Zj2OamVlneu6hS9oEfBz48Yj4Su9NMjOzbrTtoUu6HjgTOFbSHuBtwDRARLwPeCvwFOD3JAEciIj5QTV42Fxm1Kw9f06qocwol4vbbH8D8Ia+tahCXGbUrD1/TqrDtVwKuMyoWXv+nFSHA3oBlxk1a8+fk+pwQC/gMqNm7flzUh0O6AVcZtSsPX9OqsPVFgu4zKhZe/6cVIciYiQnnp+fj4WFhZGc28xsXEnakTc03CkXM7OacMrFzLrmCUXV4oBuZl3xhKLqccrFzLriCUXV44BuZl3xhKLqcUA3s654QlH1OKCbWVc8oah6fFPUzLriCUXV44BuZl274PQ5B/AKcUA3qwmPCTcHdLMa8JhwA98UNasFjwk3cEA3qwWPCTdwQDerBY8JN3BAN6sFjwk38E1Rs1rwmHCDEgFd0geAVwIPR8TTM7YLeA9wLrAPeH1E/GO/G2pmxTwm3MqkXK4DzinY/nLgxPTfpcB7e2+WmZl1qm1Aj4i/Bb5VsMv5wB9H4vPArKTj+tVAMzMrpx83ReeA3U2P96TPrSDpUkkLkhb27t3bh1ObmVnDUEe5RMS1ETEfEfPr168f5qnNzGqvHwF9ETi+6fHG9DkzMxuifgxb3A5cJukG4HnAoxHxUB+Oa2ZdcqGuyVRm2OL1wJnAsZL2AG8DpgEi4n3ATSRDFneRDFv8yUE11szac6GuydU2oEfExW22B/CzfWuRmfWkqFCXA3q9eeq/Wc24UNfkckA3qxkX6ppcDuhmNeNCXZPLxbnMasaFuiaXA7pZDblQ12RyysXMrCYc0M3MasIB3cysJhzQzcxqwgHdzKwmPMrFbMhcOMsGxQHdbIhcOMsGySkXsyEqKpxl1isHdLMhcuEsGyQHdLMhcuEsGyQHdLMhcuEsGyTfFDUbIhfOskFyQDcbsKxhip/d8uJRN2sFD6ccfw7oZgM0LsMUx6WdVs
wB3awP8nq3VVvfc1zaad1xQDfrUVHvtuwwxbxA2880SD/aadXmgG7WheZAu0riYMQR2xu92w2zMyxmBMXmYYp5gXbhq9/iYzsW+5YGKeqFl2mnVZ+HLZp1qBGAF5f2E7AimDc8uLS/1DDFvEB7/W27+zqrtKgX7uGU9VAqoEs6R9K9knZJ2pKxfZOkWyXtlPQFSef2v6lm1ZAVgLNsmJ3hgtPnuOrCU5mbnUHA3OwMV1146hE97LxAW/RF0Y2iSU1l2mnV1zblImkKuAY4C9gD3C5pe0Tc3bTbrwIfiYj3SnoacBOweQDtNRu5MgG1uXfbbn3PvHTHVEYqp7E/dD7M8PKzTzoitdNpO636yvTQnwvsioj7IuJx4Abg/JZ9AvgP6c9PAh7sXxPNqiWvpzslddW7zUp3TK8SR0+v/Hg2AnBr2qeRX9+2czH3PO6F11+Zm6JzwO6mx3uA57XscyXwl5L+F7AOeGnWgSRdClwKsGnTpk7balYJeT3dboNj6+zRJ81M89jjB3js8SPTOrMz01x53ilccPocZ2y9pathhu6F11u/bopeDFwXERuBc4EPSlpx7Ii4NiLmI2J+/fr1fTq12XANoqd7welzfHbLi7l/6ytYt2Y1ywdXplrWrVl9+BweZmhZyvTQF4Hjmx5vTJ9rdglwDkBEfE7S0cCxwMP9aKRZ1Qyyp9suWG/buZg5VBI8zHDSlemh3w6cKOkESUcBFwHbW/b5GvASAEn/GTga2NvPhpqN2radi5yx9RZO2HIjZ2y9pTBf3Yui0SiN3HlWMPcwQ2sb0CPiAHAZcDPwZZLRLHdJeoek89Ld3gz8tKQ7geuB10fkjLkyG0Pd3ITsVtGY8Lwhk1OSb3BauZmiEXETyVDE5ufe2vTz3cAZ/W2aWXUMs9ZJUYndn//wHZmvORSR2Q5XUJwsnvpvVsKwb0Lm5eg7maLvCoqTx1P/zUqoytJxeWPW9z1+YEVu3wtSTx4HdLMSqlLrpHXI5OzMNAge2be8IrfvoY2TxykXsxKqtHRcczrmjK23sLR/+YjtrqA4uRzQzUqq4izLol74u15zWmHtlgbfOK0PB3SzMVbUCy/zV4VvnNaLA7rZGOu1gqKXnqsXB3SzMdZrbt83TuvFAd1szPWS2/eN03pxQLeJNsobglW4GdkuZWPjRaMquTI/Px8LCwsjOXfdVSFQ9GoY76H1hmDDMWunedsPnzLQa5Z17l5qqvfalnH/fZkkknZExHzWNvfQa6YOoxaG9R7yCl09sm954Nds2Dcji4J2mZSNg/548EzRmqnDdO9hvYeiG3+DvmbDvBnZa6XIYVaatN44oNdMHUYtDOs9tLvxt7i0f2D1z4dZG6bXL8g6dBImhVMuNVOHUQudvodu0gHbdi7y2HcOFO4jONyOsmmforY0b5tdO830KrF86Lv3sGamp3jRyes5Y+stfU1t9PoFWYdOwqRwD71mqlJEqhedvIesdMCbPnwHp739L3N71I3XtNZAaSagdbhAu15pUWqiddsj+5ZBSXGtxrqkr3r2HB/bsdj31Eavfw1UpdKkteeAXjODWMB42Dp5D3k3Npf2L+cGw6JVfxrnyxv7VdQrLUpNZG1bPhisW7Oa+7e+gs9ueTG33rN3IKmNXr/k69BJmBROudRQFYtIdarseyhzY7P1OHmvORTB/VtfASRVDDtNXXWTmmjeNqjURq+zSatUadKKOaDbWMvLtzdkBcMyOfpuJty0O267cw7y/kevX/J16CRMAqdcbKxs27l4xMiTF528fkU6oFlWMCyTQugmdVV03DLndGrDeuUeuo2NrAlHH9uxyKuePceNX3goudHYYt/jB9i2c/GIQFw2hdBpr7TMcYu2ObVhvfLUfxsbeXntudkZPrvlxWzbuciV2+9aMXqleUr9uM54HNd2W/956r9VWtlg1e6m4QWnz3H1zffmLskGjGVZhDqUc7DhKJVDl3SOpHsl7ZK0JWef/ybpbkl3SfrT/j
bTxk1rrrvdmPAyY6/LjIcuCvrjOuNxXNttw9e2hy5pCrgGOAvYA9wuaXtE3N20z4nAFcAZEfGIpO8ZVIOtmlpnQf77tw8cngVZ1KPspEhVmZEnRSNFyg4LrFp6wzM1rawyPfTnArsi4r6IeBy4ATi/ZZ+fBq6JiEcAIuLh/jbTqixrFmTzlHbI71F2EqzKjDzJGikC8Nh3DjC7djrzXM09/Ky/GC7/6J2c9va/HEhNlzJGNVOz7F9ZVh1lcuhzwO6mx3uA57Xs858AJH0WmAKujIhPtx5I0qXApQCbNm3qpr1WYFQ9y7yZl626HRMOK9/bu15zWuZ7azz39r+464hRL0v7l5leJaanxPLBI+unNPfw82Z0NvLyo8hfj2IRCuftx1O/xqGvBk4EzgQuBv5A0mzrThFxbUTMR8T8+vXr+3Rqg9GWOC37p38jSDd6fpu33MhDj658bWuw6vS9XXD6HGuPWtlXWT4UrDtqdWEPv8x7GXb+ehTlHNrl7d17r6YyPfRF4PimxxvT55rtAW6LiGXgfklfIQnwt/elldbWKFdvbzdbE74bpFt7fi2ZGWZnprnyvFNWjN3u9L3lBeZH9y9zx9te1tN7KTr+oAx7pmZRKsy99+oq00O/HThR0gmSjgIuAra37LONpHeOpGNJUjD39bGd1sYob5xl5a2np3REJcFGj7JdembdmtVHBIVtOxdzA+zi0v7cHmJefjmgsEeZl4NvVfdKg0V5e4+6qa62PfSIOCDpMuBmkvz4ByLiLknvABYiYnu67WWS7gYOApdHxDcH2XA70ijroHcyw7HdF0zz9kZPsEhzCqa5LVl554aiHmXre2kdsQNHpoSqNiKmX4ry9j//4TsyX+NRN6PnmaI1UaVFh4vkzfZsmJI4FMGG2Rke+86BwprlrRozRhsawTbvfK3758kL2uNyzbuV977bzdi1wSqaKeqAXiPj0FvMCoL9Ijhc/rbZCVtuzKxvnrd/WZMa2Or+RVZ1nvo/ITq5cTao4N963BedvJ5b79l7xHmuuvDUwz3nKYmDEYf/W0bevkV530GkoyZ1wo+LiFWXA/oE6nSUQnPqohFM5zI+xFnH/ZPPf+3w9sZ5rrrw1BU92BO23Fiq7TPTU4eXais7LntQ47iLvijG4a+lXrg+ejU55TKBOkkVFKVIpleJJxy9mqV9yx3lvLPOk9emY9ZOs/ao1Zn5604C5iACbF7qIe8LxykJ6wenXOwInaQKioYZLh+Kw7Mxy4zdblhMxzK3TtnPCo5v++FTMoNgN7XK+x1M81IPo5wTMGh1/8tj3DmgT6BOcsqDyge3pnjGNS+b9UVR12F9nlBUfQ7oE6iTnHLZmZOdyuqx1iUvO8o5AYNU57886sJrik6grNogr3p2MouzdeZl2ZmTDcesnT7iuK97fn4RtnHvseap69qgkzqqZ5y4hz6hmnvDZf6ULpqg05CX8771nr217LHmGdf0UTt1/cujThzQLfdP6Td/5E6gOPg3HLN2OvcG5ijKv45aXdJHzSbx/+O4cUC33D+ZD0b05eZlXXusk8b/H6vP49CtbX2Vuk9lNxsnRePQfVN0zAxiYYF2Nz5908tsPLiHXmFZdVFaZyBC9qIQnZ6j3Q3PrKn+ZjZ8nilacVmz74AVI08+9PmvZVYNXNq/3NUEj04qH3oSiVn1uYc+YllBdXpVUgCrdXm2dvJy3Z3WtS6qfOh8utlouYdeYZmrzHcayVNZue6sMeZv+vAdvP0v7jpch6XVoQgEmX8NOJ9uVl0O6CPWzwCZNcEjr7jWI/uWc4N24zieRDKZXIBrfHmUy4j1K0DmTfAo+sIIklV7so5T1+nrVqzxF93i0v4j1mvtx2gqGzz30Iesufczu3aa7/SwFFvz+ptZvahtOxdZ1WYloCDJi+f1xtxTmywuwDXeHNAHrDWAN68gn5XDnplexYFDwfLB9nn0QxG5a2I2elrtlnUruslZx+nrVswFuMabA/oAtd6QzL
sJ2ezJ69YcXiSh0TPOWwmoaKmzooUpGpxCsVajKsDlvH1/lAroks4B3gNMAe+PiK05+70K+CjwnIiYuDGJrb+U+x4/0PHq9g8u7V/RM85b6uxFJ6/PrZJY1KMS+ENjmUZRgMsLZ/RP24AuaQq4BjgL2APcLml7RNzdst8TgTcCtw2ioVWX9UvZjayeUDdLneX1tDyO3IqMogCX8/b9U6aH/lxgV0TcByDpBuB84O6W/X4deCdweV9bOCbKpDjaKeoJdbrU2btec5pLnVpXhn3vxHn7/ikzbHEO2N30eE/63GGSngUcHxE39rFtAzOIAldlf/mmp8TszDQiqSHe+Llo1aA8eXnNDbMzmasSedV5q6Ki32PrTM83RSWtAn4HeH2JfS8FLgXYtCl/abJBGlS+Li/FMTszzbo1q1lc2s+UxPLBYN2a1SuKaXXTrnb5To9SsXHghTP6p0xAXwSOb3q8MX2u4YnA04HPSAL4PmC7pPNab4xGxLXAtZDUcumh3V3rR74uqwrivscPrNhvZnqKK887BVhZaCtribdO2+UFB6wO/HvcP22Lc0laDXwFeAlJIL8deG1E3JWz/2eAX2w3ymVQxbmKhj9t27nIm3LyzoLcMd2txy9ToXBmehVHT0+xtG85d3JP8w3KE7bcmDkNv2y7zGwy9FScKyIOSLoMuJlk2OIHIuIuSe8AFiJie3+b272itAVwxM+tGvm6duNhy978/PbyIfYvHwLIndzTnHf3Arxm1qtalc/NKwc7V1BsCpLUyFUXngqQ2ftuXkAiryfdjeYeet5Yc9/INLNmE1M+t9vhT41g/uaP3JnZm25eQCKvJ92p1ps+ziOaWa9qFdDbpS2Keu/t6p40blBm3ZFvlVeWtl0xLY9KMbNe1Cqgtxv+lLetbF68MS0f4Mrtd2XWV5mdmeaVzzxuxdqfTp+Y2aDVKqCXSVtkbcubcdmq0dNv9KSLbqDOP/XJTp+Y2VDV6qZot/JupjZzD9vMqqDopqhXLILM1XmmV4lj1k572ryZjY1apVy6VXaEiWs2m1mVOaCn2o0wcc1mM6s6p1xKKqq1YmZWBQ7oJblms5lV3USnXDrJibvWiplV3cT20Bs58cWl/QTfzYnnLSqRNRLGNZvNrEomNqB3mhP3CkBmVnUTkXLJSq10kxN3rRUzq7LaB/S84Yaza6d5ZN/KWizOiZvZuKp9yiUvtRKBc+JmVitjH9C37VzkjK23cMKWGzlj6y0rbmrmpVAe3b/snLiZ1cpYp1zazd7ctnMxdz3PDbMzzombWa2MdUBvN1Ilb9EKp1bMrI7GOuVSNFIlb9GKKcmpFTOrpbEO6HkjUjbMzuQG+0MRDuZmVktjHdCLZm8WBXszG752Axisd2OdQ29Xx7xofVEzGx6Xnx6OUgFd0jnAe4Ap4P0RsbVl+y8AbwAOAHuBn4qIr/a5rZnyRqqUXbTCzAavaACDP5P90zagS5oCrgHOAvYAt0vaHhF3N+22E5iPiH2Sfgb438BrBtHgTnhYotnobdu5mLtmr8tP91eZHvpzgV0RcR+ApBuA84HDAT0ibm3a//PA6/rZyCxeDs6s+hqpljy+p9VfZW6KzgG7mx7vSZ/LcwnwqawNki6VtCBpYe/eveVb2aLT0rdmNhp5w4fB97QGoa+jXCS9DpgHrs7aHhHXRsR8RMyvX7++6/N4OTiz8VCUUvF8kP4rE9AXgeObHm9MnzuCpJcCbwHOi4jv9Kd52fJ+SRaX9ntIlFmF5KVU5tLSG9ZfZQL67cCJkk6QdBRwEbC9eQdJpwO/TxLMH+5/M49UlHdzCsasOrzS13C1DegRcQC4DLgZ+DLwkYi4S9I7JJ2X7nY18ATgzyTdIWl7zuH6IuuXpJVTMGaj55W+hkuRUbxqGObn52NhYaHr1zePcsl7BwLu3/qKrs9hZqPhUWz5JO2IiPmsbWM7U7R5jPkZW2/JHOfqIVFm48ezSrs31rVcGpynM6sPj2Lr3tj20J
t5mr9ZfXSzgLslahHQwdP8zepiw+yMU6hdqkXKxczqwynU7tWmh25m9eAUavcc0M2scpxC7Y5TLmZmNeGAbmZWEw7oZmY14YBuZlYTY3VT1PUdzMzyjU1Ad30HM7NiY5NycX0HM7NiYxPQXd/BzKzY2AT0vDoOru9gZpYYm4Du+g5mZsXG5qao6zuYmRUbm4AOru9gZlZkbFIuZmZWzAHdzKwmHNDNzGrCAd3MrCZK3RSVdA7wHmAKeH9EbG3Zvgb4Y+DZwDeB10TEA/1tqpmNg0bNpcWl/UxJHIxgrmVUWt4+Lzp5Pbfes7fUSLbm2k5PmplGgqV9y5mv63cdqG6PN+h6VIqI4h2kKeArwFnAHuB24OKIuLtpn/8JPCMi/oeki4AfiYjXFB13fn4+FhYWem2/mVVIa82lZjPTU1x14akAufvkvaY16BWdp/V1WfvmHbeMbo/Xr3ZI2hER81nbyqRcngvsioj7IuJx4Abg/JZ9zgf+b/rzR4GXSFLpFppZLWTVXGpo1F4q2ifvNZ2cp/V1/a4D1e3xhlGPqkxAnwN2Nz3ekz6XuU9EHAAeBZ7SeiBJl0pakLSwd+/e7lpsZpXVrrbSg0v7O66/lLV/mWM09ul3HahujzeMelRDvSkaEddGxHxEzK9fv36YpzazIWhXW2nD7EzH9Zey9i9zjMY+/a4D1e3xhlGPqkxAXwSOb3q8MX0ucx9Jq4EnkdwcNbMJklVzqaFRe6lon7zXdHKe1tf1uw5Ut8cbRj2qMqNcbgdOlHQCSeC+CHhtyz7bgZ8APge8Grgl2t1tNbPaaa65VDTKJW+fsqNcWms7FY1y6XcdqG6PN4x6VG1HuQBIOhd4N8mwxQ9ExG9KegewEBHbJR0NfBA4HfgWcFFE3Fd0TI9yMTPrXNEol1Lj0CPiJuCmlufe2vTzt4Ef7aWRZmbWG88UNTOrCQd0M7OacEA3M6sJB3Qzs5ooNcplICeW9gJfbXrqWOAbI2lM98atzePWXhi/Nru9gzdube53e58aEZkzM0cW0FtJWsgbilNV49bmcWsvjF+b3d7BG7c2D7O9TrmYmdWEA7qZWU1UKaBfO+oGdGHc2jxu7YXxa7PbO3jj1uahtbcyOXQzM+tNlXroZmbWAwd0M7OaqERAl3SOpHsl7ZK0ZdTtaSXpeEm3Srpb0l2S3pg+/2RJfyXpn9L/HjPqtjaTNCVpp6RPpo9PkHRbep0/LOmoUbexmaRZSR+VdI+kL0t6QZWvsaSfT38fviTpeklHV+0aS/qApIclfanpucxrqsTvpm3/gqRnVajNV6e/F1+Q9OeSZpu2XZG2+V5JZ1ehvU3b3iwpJB2bPh7oNR55QE8Xob4GeDnwNOBiSU8bbatWOAC8OSKeBjwf+Nm0jVuAv4mIE4G/SR9XyRuBLzc9fifwroj4j8AjwCUjaVW+9wCfjoiTgWeStL2S11jSHPBzwHxEPJ2ktPRFVO8aXwec0/Jc3jV9OXBi+u9S4L1DamOr61jZ5r8Cnh4RzyBZtP4KgPRzeBFwSvqa30tjyjBdx8r2Iul44GXA15qeHuw1joiR/gNeANzc9PgK4IpRt6tNmz8BnAXcCxyXPncccO+o29bUxo0kH9YXA58ERDJbbXXWdR/1P5JVru4nvVHf9Hwlr7yx4c4AAALeSURBVDHfXUf3ySRlqD8JnF3FawxsBr7U7poCvw9cnLXfqNvcsu1HgA+lPx8RL4CbgRdUob3AR0k6Jg8Axw7jGo+8h065RagrQ9JmkoU8bgO+NyIeSjd9HfjeETUry7uBXwIOpY+fAixFsog3VO86nwDsBf4oTRO9X9I6KnqNI2IR+C2S3tdDJAuj76Da17gh75qOy2fxp4BPpT9Xss2SzgcWI+LOlk0DbW8VAvrYkPQE4GPAmyLiX5u3RfJ1W4kxoJJeCTwcETtG3ZYOrAaeBbw3Ik4HHqMlvVKxa3wMcD7JF9EGYB0Zf3ZXXZWuaRmS3kKSAv3QqNuSR9
Ja4FeAt7bbt9+qENDLLEI9cpKmSYL5hyLi4+nT/yLpuHT7ccDDo2pfizOA8yQ9ANxAknZ5DzCbLuIN1bvOe4A9EXFb+vijJAG+qtf4pcD9EbE3IpaBj5Nc9ypf44a8a1rpz6Kk1wOvBH4s/SKCarb5B0i+6O9MP4MbgX+U9H0MuL1VCOiHF6FORwRcRLLodGVIEvCHwJcj4neaNjUWxyb97yeG3bYsEXFFRGyMiM0k1/OWiPgx4FaSRbyhQu0FiIivA7slNZZAfwlwNxW9xiSpludLWpv+fjTaW9lr3CTvmm4H/ns6EuP5wKNNqZmRknQOSQrxvIjY17RpO3CRpDVKFrI/EfiHUbSxISK+GBHfExGb08/gHuBZ6e/4YK/xKG54ZNxQOJfkzvU/A28ZdXsy2veDJH+WfgG4I/13Lkle+m+AfwL+GnjyqNua0fYzgU+mP38/yS/7LuDPgDWjbl9LW08DFtLrvA04psrXGHg7cA/wJZJF0tdU7RoD15Pk+JdJAssledeU5Mb5Nenn8IskI3iq0uZdJLnnxufvfU37vyVt873Ay6vQ3pbtD/Ddm6IDvcae+m9mVhNVSLmYmVkfOKCbmdWEA7qZWU04oJuZ1YQDuplZTTigm5nVhAO6mVlN/H+kdrVI+Iu3mgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs STD\")\n", + "plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEICAYAAACktLTqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAgAElEQVR4nO3df5xcdX3v8dc7mwUX8LIgW4QlkVQpCCIJbBUb2wL+iICVFBWwatHSm95efVRaSg3aW/FHH8SiYn3cFi+KgooCFRop0CIl2F65oiaG35AaJZisAYKygGaFTfK5f5wzydnJObuzs/PjzMz7+XjsY2fOOXPmu2dmP/Od7/fz/X4VEZiZWXeZ0+4CmJlZ4zm4m5l1IQd3M7Mu5OBuZtaFHNzNzLqQg7uZWRdycLeGk3SCpE1teu4LJX2lHc89U5L+VdLZ7S6HdScH9x4haYOk15btXLMsR9s+RIpI+jtJy9LbGyTtW3RsRJwcEVfO8vneJenbszmHdScHd7PGOg5YLWkImIiIp9pdIOtNDu49QNKXgfnAv0j6haS/SrcfL+n/SRqTdLekE9LtvyXpCUnz0vvHSHpS0hFF55rm+Q+WdJ2kLZIelvRnmX0XSrpW0pckPSPpfkkjmf3HSlqb7vsnSddI+pikvYF/BQ5Oy/ELSQenD9uj6HxV5bpU0ieqtn1D0l+kt98vaTQ9zzpJr5nm7xRwFHAfMAKsneb4b0n64/T2uyR9W9In0mv9sKSTM8e+S9KP07I8LOntkl4KfBZ4Vfr3j6XHnppes6clbZR0YeY8h0oKSWdL+kn6On8ws79P0gck/Sh9rjWZ98ERkm6V9PP0epyRedwpkh5IHzMq6S+n+tutBSLCPz3wA2wAXpu5Pwz8DDiF5EP+den9oXT/3wKrgAHgXuC9RefKea4TgE3p7TnAGuBvgD2AXwd+DCxJ918I/CotRx9wEXBnum8P4BHgfUA/cDrwHPCx6ufJPHfh+XLK+TvARkDp/f2AceBg4PB038HpvkOBFxec5zBgDHga2Jbe/lV6rjHgnQWP+xbwx+ntdwETwH9Py/2nwE8BAXun5z48PfYg4KjM476dc/2PTq/9y4HHgKWZvyOAz6Wv7THAs8BL0/3np6/34elzHwO8IC3DRuDdwFxgEfAEcGT6uM3Ab2eu47Htfs/3+o9r7r3rHcDNEXFzROyIiFuB1SRBEZIguS/wPWAU+Ic6n+c3ST4wPhIRz0XEj0kCy1mZY76dlmM78GWSgAJwPEkg+UxETETE9Wl5plN0vmr/lyTQ/XZ6/y3AdyLip8B2YE/gSEn9EbEhIn6Ud5KI+GFEDAKfAc4jCW7/BbwkIgYj4ss1lBngkYj4XFruK0mC+IHpvh3AyyQNRMTmiLi/6CQR8a2IuDd9Xe8Bvgb8btVhH46I8Yi4G7ibXdfoj4G/joh1kbg7In4GvBHYEBFfjIhtEbEWuA54a/
q4ifRa/beIeDIiflDj32xN4uDeu14EvDVtkhlLv9K/miSgEBETwBXAy4BPRkS9M8y9iKTpJPs8H2BX0AJ4NHN7K/A8SXNJatCjVc+9sYbnLDrfJOl5rwbelm76A+CqdN964FySD7nHJV2dafaZpNK0BVwAfISklv1S4H5JX6+hvLuVOyK2pjf3iYhfAmcC/wPYLOkmSUcUnUTSKyXdnjaDPZU+7oCi5yK5Rvukt+cBeR9iLwJeWfU6vh14Ybr/zSQVg0ck/YekV9XyB1vzOLj3jurgvBH4clqzrPzsHRErACQNAx8Cvgh8UtKeU5xrKhuBh6ue5/kRccq0j0y+6g+nbdkV8+osR5GvAW+R9CLglSS10eTkEV+NiFeTBLYAPp53goj4LeAI4IcRsS/w18DH07/1LQ0oIxFxS0S8juTD9yGSbz+Qfw2+CtwAzEvL81mSJpZabAReXLD9P6pex30i4k/T8n0/Ik4Dfg1YCVxb699mzeHg3jseI2nvrvgK8HuSlqSdaM9Tklp4SBpMrwAuB84hCbIfneJcU/ke8EzaOTmQPtfLJP1mDY/9DknzyHslzZV0GvCKqnK8QFOkG04nbV54Avg8cEtEVDolD5d0UvqhVmk/3zHFqY5jVwfqsSRNXA0h6UBJp6WdyM8Cv8iU5THgEEl7ZB7yfODnEfErSa8g+UZSq88DH5V0mBIvl/QC4EbgNyS9U1J/+vObkl4qaY+0g3ff9Bvf00x9rawFHNx7x0XAX6dfqf8yIjYCp5E0kWwhqZmdT/Ke+DOSGtj/Spsu3g28W9Jv551rqidN24/fCCwEHmZXIJ02IEfEcySdqOeQdEy+gyTIPJvuf4ik5v3jtCy5zSY1+Crw2vR3xZ7AirS8j5JcjwumOMdxQKWd+ViSTuRGmQP8BUkH689J2s//NN23CrgfeFTSE+m2/wl8RNIzJB3ZM6lFfyo9/pskQfpyYCAingFeT9JX8lOSa/JxkusE8E5gg6SnSZqB3j7zP9MaqZIlYNYRJH0X+GxEfLHdZTErM9fcrdQk/a6kF6bNMmeTpPb9W7vLZVZ2u2UQmJXM4STNBHuT5Me/JSI2t7dIZuU3bbOMpOcB/0nStjYX+HpEfEjSApI0sheQtC++MyKeSzugvkTSBvkz4MyI2NC8P8HMzKrV0izzLHBSRBxD0in2BknHk3SmXBIRLwGeJOn0Iv39ZLr9EgrSx8zMrHlm1KEqaS/g2yQ99TcBL4yIbemAhQsjYomkW9Lb30kHjjxKMkKx8IkOOOCAOPTQQ2fzd5iZ9Zw1a9Y8ERFDeftqanOX1EfS9PISkmHoPwLGImJbesgmkrlKSH9vBEgD/1MkTTdPVJ1zGbAMYP78+axe3bC0YDOzniDpkaJ9NWXLRMT2iFgIHEIyiKRw6HOtIuKyiBiJiJGhodwPHjMzq9OMUiHT0Xu3A68CBjPzdRxCMrkU6e/KFKFzSQar/KwhpTUzs5pMG9wlDUkaTG8PkEwN+yBJkK/Mm3E28I309g3pfdL9q2Yx6ZSZmdWhljb3g4Ar03b3OcC1EXGjpAeAqyV9jGROjcvT4y8HvixpPclQ6bPyTmpmZs0zbXBP54NelLP9x0yexKmy/VfsmuPZzMzawCNUbaeVa0e5+JZ1/HRsnIMHBzh/yeEsXTQ8/QPNrHQc3A1IAvsF19/L+MR2AEbHxrng+nsBHODNOpCDe48pqp1ffMu6nYG9YnxiOxffss7B3awDObj3iJVrR/nwv9zPk1sndm7L1s5/Ojae+7ii7WZWbp7ytwdUmlyygb2iUjs/eHAg97FF282s3Bzce0Bek0vWT8fGOX/J4Qz0903aPtDfx/lLDm928cysCdws06UqbeujNTSrzEnXn77o9KOdLWPWJRzcu1B15st0tkdwwfX3ctHpR3PH8pMaWg5/WJi1h5tlutB0zTB5Km3vjVL5gBkdGyfY1Xm7cu3otI81s9lzcO9C9Wa4NDIzZqrUSjNrPgf3LjRdhk
tf2sY+08fNhFMrzdrLwb0L5WW+VAz09/G2V85remaMUyvN2svBvQstXTTMRacfzXAaSCs19eHBAS46/Wg+tvTonfuV2d7Izs6iD5hfPrvN7e5mLTCjNVSbZWRkJLzMXnnMNsslm4Y5R7Cj6i020N+324eJM2vMZk7SmogYydvnmrtNMtssl+zjYffADrt3rDqzxqzxnOduk0w3gVh1DfvEI4a4/aEtO+9vfW5bTWmY2Y5VT1pm1ngO7jZJUTbL6Ng4iz7yzd0mHvvKnT+ZdL9W2Y5VZ9aYNZ6bZWySomwWQe7EY/WozsxxZo1Z4zm42yRFWS6z6Xbv7xODA/2FmTmetMys8dwsY5NUgu6FN9zP2Hh9NfXBgX723nNuzZkvlX3OljFrHKdCdpFGphMuXrFqRm3oFXlpjmbWHE6F7AGNTiespTNzcKCfdxw/f9JgqUqWi9MYzdrLwb0LrFw7ynnX3t3Qibqm6swcHhzg02cu5K4PvZ6PLT16Z5v59vRboPPUzdrPbe4drlJj317QvFZLDTyvOef8JYfvNid8UZOL89TNysc19w433dzt06UTFjXnADXPP+M8dbPycc29w00VQGtJJ5yq1n3H8pNqqnkfPDiQ2/nqPHWz9pm25i5pnqTbJT0g6X5J70u3XyhpVNJd6c8pmcdcIGm9pHWSljTzD+h1RQG0T6opa6URtW7nqZuVTy01923AeRHxA0nPB9ZIujXdd0lEfCJ7sKQjgbOAo4CDgX+X9BsRMbN132ySvHZxgK3Pbdvt2JmkIzai1u08dbPymTa4R8RmYHN6+xlJDwJT/deeBlwdEc8CD0taD7wC+E4DytuTqhe8Hh0b5/x/uhsEE9snd6QODvRz4ZuOqjmwFnWczrTWvXTRsIO5WYnMqENV0qHAIuC76ab3SrpH0hck7ZduGwY2Zh62iZwPA0nLJK2WtHrLli0zLngvyWsXn9gRuwV2gL33nDujIJtd2KNZC3eYWevV3KEqaR/gOuDciHha0qXAR0mmHfko8Engj2o9X0RcBlwGyQjVmRS618yk/bueDBXXus26T03BXVI/SWC/KiKuB4iIxzL7PwfcmN4dBeZlHn5Ius3qsHLtKHOkwjz2ap2QoeJVl8yar5ZsGQGXAw9GxKcy2w/KHPb7wH3p7RuAsyTtKWkBcBjwvcYVuXdMNUCpf47o79OkbZ2QoeJVl8xao5aa+2LgncC9ku5Kt30AeJukhSTNMhuAPwGIiPslXQs8QJJp8x5nytSnaIBSn8TFbz1m5zGdVAMuyqs/95q7uPiWdTtXdhodG6cv/cYy3CF/m1mZeFbIEluw/KbCedSHBwc6KqhXTPU3TcWzTZrtzrNCdqipVkVqd7PGyrWjLF6xigXLb2LxilU1PX+l/6Aes5kEzawXObiXWN7IT7H7qkitDnz1tJtPN8FZLTxXjVntHNxLLC8HvSg0tjLwTTUfzUweM1OdkAlkVhaeOKxk8tIE71h+0s79RSsktTLw1TMfzWw/fDohE8isTFxzL5FamjvKMElX0QfJVB8wRfsGB/onfTOpXtkJPGrWrB6uuZdILYtelGGSrnrmoyl6zEzmwTGz2jm4l0itzR3tni6gng+YMnwomfUSB/cS6aRFL+r5gGnWh5KnMzDbndvcS6QM7emdxtMZmOVzzb1E3HRRm2xNPW9SNS/ObebgXjrtbk8vu+qFS4oGRXnAk/U6N8tYR6l1MFQZ+ynMWsnB3TpKLTVy91OYObhbh6mlRu4BT2YO7qVVz6yLvSAvoyhreHDAgd0Md6iWUnWnYSW9D+jJwFWdx/7m44a58e7NjI1PTDrOzTFmu7jmXkL1zLrYrfLy2K9bM8qFbzqKT5+5cNK8NG6OMdvFNfcSqmfWxW411QfdHctPcjA3K+DgXkKdNA1Bs9X7QecpCazXuVmmhDwNwS5FH2hzpMJOZk9JYObgXkp5KzD1antyUXbM9ojCgO0+CzM3y5SWpyFIVK7BedfeXdMcMi
vXjuY2aUFv9llY73LN3Upv6aJhdtQwh0ylOaZIL/ZZWO9ycLeOUMvSflPNO9OrfRbWuxzcrSPU0sk8VbNLr/ZZWO+aNrhLmifpdkkPSLpf0vvS7ftLulXSD9Pf+6XbJekzktZLukfSsc3+I6z71dLJXFS795QE1otq6VDdBpwXET+Q9HxgjaRbgXcBt0XECknLgeXA+4GTgcPSn1cCl6a/zWZluk7mehbuNutW09bcI2JzRPwgvf0M8CAwDJwGXJkediWwNL19GvClSNwJDEo6qOElN6viFFKzXWaUCinpUGAR8F3gwIjYnO56FDgwvT0MbMw8bFO6bXNmG5KWAcsA5s+fP8Nim+VzCqlZouYOVUn7ANcB50bE09l9ERFAfq5agYi4LCJGImJkaGhoJg81M7Np1BTcJfWTBParIuL6dPNjleaW9Pfj6fZRYF7m4Yek28zMrEVqyZYRcDnwYER8KrPrBuDs9PbZwDcy2/8wzZo5Hngq03xjZmYtUEub+2LgncC9ku5Kt30AWAFcK+kc4BHgjHTfzcApwHpgK/DuhpbYzMymNW1wj4hvAyrY/Zqc4wN4zyzLZdYQnvrXepUnDmsjB57m8nKF1ss8/UCbeM7x5vPUv9bLHNzbpCjwnHvNXSxescpBvgG8XKH1Mgf3NpkqwLgW3xi1zCRp1q0c3NtkugDj5oPZ83KF1ssc3NukaPm4LDcfzI7nmrFe5myZNqkEmItvWVe4LJybD2bPc81Yr3LNvY2WLhrmjuUn8ekzF7r5wMwayjX3EsjW4p3z3hweU2C9xsG9JNx80DwezGS9yMG9DVyLbK2pBjP5ulu3cnBvMdciW8+DmawXuUO1xTwkvvU8mMl6kYN7i7kW2XpFg5lOPGKIxStWsWD5TZ7ywbqOm2Va7ODBgdy8dtcimycvG+nEI4a4bs2om8esazm4t9j5Sw6f1OYOzmlvhepspMUrVrmT1bqag3sLVGfHvPm4YW5/aIuzZdpk5drRwlHBbh6zbuHg3mR52THXrRn1HCdtUnk9isyRWLD8Jn/oWsdzh2qTOTumXPJej6ztEV48xbqCg3uTOTumXGZy3f0hbJ3Mwb3JnGNdLjO97v4Qtk7l4N5kXjCiXIpej8GB/tzj/SFsncodqk1WPW97nzTp67477FqrKOf9xrs373Zs9kPY8wFZp3Fwb4FKEPCcMuWQzXmvzmaq2G+vfj70e0exdNGw5wOyjuRmmRZx1kw5FWXP7LXH3Em1fL921mkc3FvEWTPlVMvr4tfOOtG0wV3SFyQ9Lum+zLYLJY1Kuiv9OSWz7wJJ6yWtk7SkWQXvNM6aKadaXhe/dtaJaqm5XwG8IWf7JRGxMP25GUDSkcBZwFHpY/5RUl/OY7veyrWjk2YcPPGIIWfNlFAt2UzOeLJONG1wj4j/BH5e4/lOA66OiGcj4mFgPfCKWZSvI1U64EbHxneOdrxuzShvPm6Y4cEBBAwPDngKghJYumiYi04/esrXpZZjzMpmNtky75X0h8Bq4LyIeBIYBu7MHLMp3bYbScuAZQDz58+fRTHKp6gD7vaHtnDH8pPaVCorUj1jZOVbV3Xao4O5dZJ6O1QvBV4MLAQ2A5+c6Qki4rKIGImIkaGhoTqLUU7ugOtced+6PMeMdaK6gntEPBYR2yNiB/A5djW9jALzMocekm7rKe6A61xOe7RuUVdwl3RQ5u7vA5VMmhuAsyTtKWkBcBjwvdkVsfO4A65z+VuXdYtp29wlfQ04AThA0ibgQ8AJkhYCAWwA/gQgIu6XdC3wALANeE9EFM+v2qXyhrh7uHpn8DKI1i0UEe0uAyMjI7F69ep2F8MsdzqCgf4+Z8dYKUlaExEjefs8t4xZhr91WbdwcDer4rRH6wYO7mZ18BTAVnYO7mYz5CmArRN4VkizGXIuvHUCB/cGq54wzCMbu09Rzvvo2LhfcysNB/cG8tD13jBVzrtfcysLB/cG8tf13pA3AjnLr7mVgTtUG8hD17
tPdVbMiUcMcftDWxif2E6fxPaCQYB+za3dHNwbyEPXu0teVsxX7vzJzv3bIxDJHBzV/Jpbu7lZpoE8YVh3KVo8OysAVW3za25l4Jp7A3noeneptWklSFZn8mtuZeLg3mAeut49iprZqg0PDuSusOVRrNZObpYxKzBdVgwUN8E4LdbazcG9QTx4qfvkLYz9juPn17RQttNird3cLNMAnmuke9XbzOa0WGs319wbwLU0q+Z1dK3dHNwbwLU0q+a0WGs3B/cGcC3NquW113upPmslt7nXKZvmtu9AP/19YmL7rrGKrqX1ruoUyEvOXOigbi3n4F6H6g7UsfEJ+ueI/fbqZ2zrhHOae5g7160sHNzrkNeBOrEj2GuPuaz9m9e3qVRWBlN1rju4Wyu5zb0O7kC1In5vWFk4uNfBHahWxO8NKwsH9zo4zc3yrFw7yi+f3bbbdr83rB2mDe6SviDpcUn3ZbbtL+lWST9Mf++Xbpekz0haL+keScc2s/Dt4jQ3q1bpSB0bn5i0fb+9+v3esLaopUP1CuB/A1/KbFsO3BYRKyQtT++/HzgZOCz9eSVwafq763j2R8sqmvt9rz3m+n1ibTFtzT0i/hP4edXm04Ar09tXAksz278UiTuBQUkHNaqwZmXljlQrm3pTIQ+MiM3p7UeBA9Pbw8DGzHGb0m2bqSJpGbAMYP78+XUWo/08Z7eBl1i08pl1h2pEBPnLSE73uMsiYiQiRoaGhmZbjLbwnN1W4U52K5t6g/tjleaW9Pfj6fZRYF7muEPSbV3Js0FahTvZrWzqbZa5ATgbWJH+/kZm+3slXU3SkfpUpvmm67id1bKKOtnddGftMG1wl/Q14ATgAEmbgA+RBPVrJZ0DPAKckR5+M3AKsB7YCry7CWUuDbez2nQ814y1y7TBPSLeVrDrNTnHBvCe2RaqU5y/5PBJ/7jgdlabzHPNWLt44rBZqPxz+iu3FXHTnbWLg/sseTCTTcVNd9YunltmBlauHWXxilUsWH4Ti1escsqjTcspktYuDu41ck671aM6RXJwoJ/n9c/hz6+5yxUEayoH9xo5p93qtXTRMHcsP4lLzlzIs9t28OTWCVcQrOkc3GvkjjGbLVcQrJUc3GvkRRhstlxBsFZycK+RO8ZstooqAnMkd9JbwzkVcgrVw8bffNwwtz+0xTntVpe8QW8A2yOZd8+jV62RHNwL5A0bv27NqCeDsrpVD3qbI+0M7BUevWqN4maZAu78smaoZM48vOJUdkT+TNlug7dGcHAv4M4vazZ30lszObgX8D+eNdPKtaNsfW7bbtvdSW+N4uBewNkx1iyV/pwnt05M2j440O8+HWsYd6gW8IyP1ix5/TkAe+85d9L7y4t82GwoCjp1WmlkZCRWr17d7mKYtcSC5TcVLjoskqa/E48Y4ro1o7utFeCavWVJWhMRI3n73Cxj1mJT9dtU5py56s6fOFvLZsXB3azF8vpzqhXV7J2tZbVym7tZi1X358ykYdTZWlYrB3ezNsiu4LV4xarc1ZqqOVvLZsLNMjm84pK1Ui3NNIA7U21GXHOvkjenjCdzsmbKNtMU1eCHBwf8/rMZcc29iueUsXaozDnz6TMXevCcNYRr7lU8p4y1Uy2D5zy4yWrh4F7l4MGB3K/GzlKwVsl2tlZzs6HVysG9St6CCv1zxNbntrFg+U2uKVnLZWvqngPeajWr4C5pA/AMsB3YFhEjkvYHrgEOBTYAZ0TEk7MrZutUfy3ed6CfXz63beckT64pWStV19SrA3uFmw2tWiM6VE+MiIWZ+Q2WA7dFxGHAben90sumP158yzrOX3I4D684lb33nMvE9vyaklmzFU0yVs3NhlatGc0ypwEnpLevBL4FvL8Jz9MwU7VjuoPV2qmW95mzaSzPbGvuAXxT0hpJy9JtB0bE5vT2o8CBeQ+UtEzSakmrt2zZMstizM5U6Y9etMPaabr32fDggAc3Wa7ZBvdXR8SxwMnAeyT9TnZnJP
MJ5zYSRsRlETESESNDQ0OzLMbsTFU796Id1k5TjV6tvA8d2C3PrJplImI0/f24pH8GXgE8JumgiNgs6SDg8QaUs6mmSn/0oh3WTpX32XnX3p2bJXPhDff7vWm56l6sQ9LewJyIeCa9fSvwEeA1wM8iYoWk5cD+EfFXU52r3Yt1VLe5gxdGsHKZaoGPrP45Yp/nzWVs64SDfQ+YarGO2dTcDwT+WVLlPF+NiH+T9H3gWknnAI8AZ8ziOVrCtXMru6Jvl9UmdoTTdg3wMntmHSHv22WthgcHuGP5SU0olbVbs2ruZtYied8ut2YG103Fabu9ycHdrENUzzlTa23eabu9qeeDu2fYs05VNFVGdkS103Z7V88F92wwr/5ncAeUdZpKbb7yvh4bn6AvnVxs2JWVntZTi3VUvsaOposSj41PeN4Y63jZ9zUkk4t5gJP1VHCvdRImd0BZJ/HqYZanp4J7rUHbHVDWSTy5neXpqeBeS9B2B5R1Gk9uZ3l6JrivXDvK1ue2TXlMn+QpB6zj5E0ull09bPGKVaxcO9qm0lm79ES2TK35wDsiHNit43j1MMvTE9MPLF6xqqZ5OTxM27rBVO/34cEBTjxiiNsf2uKxHV1gqukHeqJZxqvZWC+Z6v0+OjbOV+78yc504NGxcc695i4WfvibbrrpMj0R3Is6lvokhFezse5ST0fq2PgEF1x/rwN8F+mJ4F60mtInzziGh1ecyh3LT3Jgt64x1epNU3FufHfpiQ5Vz9duvST7fq+lrynLufHdoyc6VM161Uznge+T+OQZx7ji0yF6bj53z/RolphqDdY82yOcNtkluq7mXlRTGRzo58I3HeU3rPWkvP+L/jnJ7JE7ckLAfnv1s9cec11BKrmeSoUsmhzM2QDWy5YuGuai049meHBgZ4bYxW89hqK63ZNbJyalS2b/d1auHWXxilUe/VpyXdcsM1WHUCUbwDUQ60XVKzlB7Z2u2Uya7DcAj34tr66ruU+X4+tsALNdZpI2WRnwlDe98LnX3MWhrsmXStcF9+nerJ4pz2yXbHNNI4yOjXP+1+9m4Ye/6WabNuu64F55s+63V/9u+zzFgNnuli4a5o7lJ6EGnW9iezA2PpHbXm+t0xVt7pXUx9Gx8UnrR5768oM8QZJZjQ4eHJjxoKdazLSvy6nMjdGxqZDZgC4g768Y6O/znDFmNZrpgKeZEPDwilMLK2KVAJ6bstkn9t5jLk+NTzjYV5kqFbJpwV3SG4C/B/qAz0fEiqJjZxrcZ/Im9DS+ZrXL1poH9+rnF7/axkROInx/nyDI3VdkjsjNqQcKK2h58ipt2XLvO9CPBGNb8z8MGv3NoN7zNaIcLR+hKqkP+AfgdcAm4PuSboiIBxpx/loXugZnx5jNRHW65FQ1bZjZ/DVTfQ7MpIpZ3cxTXdkbG5/YeWx1qmb1sbNN5az3fI0uR55mtbm/AlgfET8GkHQ1cBrQkOA+k4Dt7Biz+uXlxlfvh+Y26eTJxoDpKnvZD4O8Y2cz/qXe8zW6HHmalS0zDGzM3N+UbmuIWgO2s2PMWiNvBGwzZWNALZW9yjFFx9b7Db/e8zW6HHnalgopaZmk1ZJWb9myZUaPzctlr6Rx9Sm55QU4zFqrkkFBxWIAAAWuSURBVFJZWSOhWQG+utJWS2WvckzRsfV+w6/3fI0uR55mBfdRYF7m/iHptp0i4rKIGImIkaGhoRmdPK+WcMmZC9mw4lR+dNEpbPACHGZtN92AQlX9rsXgQP9ulbbpnif7YVC0cE+93/DrPV+jy5GnWW3u3wcOk7SAJKifBfxBI59gurZAM2uv6kVD8lIfobjTttaFvKsX45kqW6bRC/fUe75WLCDUzFTIU4BPk6RCfiEi/rboWC/WYWY2c21ZrCMibgZubtb5zcysWNfNLWNmZg7uZmZdycHdzKwLObibmXWhUswKKWkL8Ehm0wHAE20qTr06rcydVl7ovDK7vM3XaWVudHlfFBG5A4VKEd
yrSVpdlN5TVp1W5k4rL3RemV3e5uu0MreyvG6WMTPrQg7uZmZdqKzB/bJ2F6AOnVbmTisvdF6ZXd7m67Qyt6y8pWxzNzOz2Slrzd3MzGbBwd3MrAuVLrhLeoOkdZLWS1re7vJUkzRP0u2SHpB0v6T3pdv3l3SrpB+mv/drd1mzJPVJWivpxvT+AknfTa/zNZL2aHcZsyQNSvq6pIckPSjpVWW+xpL+PH0/3Cfpa5KeV7ZrLOkLkh6XdF9mW+41VeIzadnvkXRsicp8cfq+uEfSP0sazOy7IC3zOklLylDezL7zJIWkA9L7Tb3GpQrumYW1TwaOBN4m6cj2lmo324DzIuJI4HjgPWkZlwO3RcRhwG3p/TJ5H/Bg5v7HgUsi4iXAk8A5bSlVsb8H/i0ijgCOISl7Ka+xpGHgz4CRiHgZyTTXZ1G+a3wF8IaqbUXX9GTgsPRnGXBpi8pY7Qp2L/OtwMsi4uXAfwEXAKT/h2cBR6WP+cc0prTSFexeXiTNA14P/CSzubnXOCJK8wO8Crglc/8C4IJ2l2uaMn8DeB2wDjgo3XYQsK7dZcuU8RCSf9yTgBtJFr95Apibd93b/QPsCzxM2uGf2V7Ka8yuNYP3J5lG+0ZgSRmvMXAocN901xT4P8Db8o5rd5mr9v0+cFV6e1K8AG4BXlWG8gJfJ6mkbAAOaMU1LlXNnSYvrN1okg4FFgHfBQ6MiM3prkeBA9tUrDyfBv4K2JHefwEwFhHb0vtlu84LgC3AF9OmpM9L2puSXuOIGAU+QVIr2ww8Bayh3Ne4ouiadsr/4h8B/5reLmWZJZ0GjEbE3VW7mlresgX3jiFpH+A64NyIeDq7L5KP4VLkmEp6I/B4RKxpd1lmYC5wLHBpRCwCfklVE0zJrvF+wGkkH0oHA3uT89W87Mp0TWsh6YMkzaRXtbssRSTtBXwA+JtWP3fZgvu0C2uXgaR+ksB+VURcn25+TNJB6f6DgMfbVb4qi4E3SdoAXE3SNPP3wKCkykpcZbvOm4BNEfHd9P7XSYJ9Wa/xa4GHI2JLREwA15Nc9zJf44qia1rq/0VJ7wLeCLw9/VCCcpb5xSQf+nen/4OHAD+Q9EKaXN6yBfedC2unmQVnATe0uUyTSBJwOfBgRHwqs+sG4Oz09tkkbfFtFxEXRMQhEXEoyfVcFRFvB24H3pIeVpryAkTEo8BGSZWl4F8DPEBJrzFJc8zxkvZK3x+V8pb2GmcUXdMbgD9MMzqOB57KNN+0laQ3kDQzvikitmZ23QCcJWlPSQtIOiq/144yVkTEvRHxaxFxaPo/uAk4Nn2PN/cat6ODZJrOiFNIesB/BHyw3eXJKd+rSb663gPclf6cQtKOfRvwQ+Dfgf3bXdacsp8A3Jje/nWSN/564J+APdtdvqqyLgRWp9d5JbBfma8x8GHgIeA+4MvAnmW7xsDXSPoEJkiCzDlF15Sk0/0f0v/De0kygcpS5vUkbdWV/7/PZo7/YFrmdcDJZShv1f4N7OpQbeo19vQDZmZdqGzNMmZm1gAO7mZmXcjB3cysCzm4m5l1IQd3M7Mu5OBuZtaFHNzNzLrQ/wfXr4XGNVPn1wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs # instances\")\n", + "plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Check words frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "w_count_df = pd.DataFrame.from_dict(w_count, orient='index')\n", + "w_count_df.sort_values(0, ascending=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
die3066
der2362
das1794
ist1767
nicht1467
......
wertvollsten,1
blutgruppe1
gelenkschmerzen1
entgeltbefreiung1
anrã¼cken.1
\n", + "

27102 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " 0\n", + "die 3066\n", + "der 2362\n", + "das 1794\n", + "ist 1767\n", + "nicht 1467\n", + "... ...\n", + "wertvollsten, 1\n", + "blutgruppe 1\n", + "gelenkschmerzen 1\n", + "entgeltbefreiung 1\n", + "anrã¼cken. 1\n", + "\n", + "[27102 rows x 1 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w_count_df" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "18" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check a certain word\n", + "w_count_df.at['auto', 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:214: RuntimeWarning: Glyph 159 missing from current font.\n", + " font.set_text(s, 0.0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:214: RuntimeWarning: Glyph 156 missing from current font.\n", + " font.set_text(s, 0.0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:183: RuntimeWarning: Glyph 159 missing from current font.\n", + " font.set_text(s, 0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:183: RuntimeWarning: Glyph 156 missing from current font.\n", + " font.set_text(s, 0, flags=flags)\n" + ] 
+ }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAG5CAYAAACDRzPnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy9h23ruAAAgAElEQVR4nOy9e7hdVXX3/xm5h6tcAkKCXCOCFblEtGq13kCxBa0Vwb6Vqr9ivdVq1R+2r0Xb+mp98V61pRVFCwK2KgiIIoIKKCQgJBAICYRAQkhC7vecnDPeP8aYrnV29t5nn+SEc5L9/TzPfvbea80111xzzTnGnGPMi7k7QgghupdRw50AIYQQw4sUgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5Y4Y7Ae048MAD/YgjjhjuZAghxC7FXXfd9ZS7T+o0/IhWBEcccQQzZswY7mQIIcQuhZktGEx4mYaEEKLLkSIQQoguR4pACCG6nBHtIxBCiOGip6eHhQsXsmnTpuFOSksmTJjAlClTGDt27A7FI0UghBBNWLhwIXvvvTdHHHEEZjbcydkGd2f58uUsXLiQI488cofikmlICCGasGnTJg444IARqQQAzIwDDjhgSHosUgRCCNGCkaoECkOVPikCIYQYwdxwww0ce+yxHHPMMXzmM5/ZKfeQj0AIITrgiAuuG9L4Hv3M6wcM09vby3vf+15uvPFGpkyZwgte8ALOPPNMjj/++CFNy4A9AjObYGZ3mtm9Zna/mX0yjx9pZneY2Twzu9LMxuXx8fl/Xp4/ohbXx/L4HDM7fUifRAghdjPuvPNOjjnmGI466ijGjRvHOeecw9VXXz3k9+nENLQZeKW7Px84EXitmb0I+BfgC+5+DLASeGeGfyewMo9/IcNhZscD5wDPBV4LfM3MRg/lwwghxO7EokWLOOyww373f8qUKSxatGjI7zOgIvBgXf4dmx8HXgn8dx6/FHhD/j4r/5PnX2Xh0TgLuMLdN7v7fGAecOqQPIUQQojtpiNnsZmNNrN7gKXAjcDDwCp335pBFgKT8/dk4HGAPL8aOKB+vMk1QgghGpg8eTKPP16JzYULFzJ58tCLzY4Ugbv3uvuJwBSiFf+cIU9JYmbnm9kMM5uxbNmynXUbIYQY8bzgBS9g7ty5zJ8/ny1btnDFFVdw5plnDvl9BjV81N1XATcDvw88w8zKqKMpQDFcLQIOA8jz+wLL68ebXFO/x8XuPs3dp02a1PFy2kIIsdsxZswY/vVf/5XTTz+d4447jrPPPpvnPve5Q3+fgQKY2SSgx91XmdlE4DWEA/hm4E+BK4DzgOLKvib//zrP/9zd3cyuAS43s88DhwJTgTuH+HmEEGKn0Mlwz53BGWecwRlnnLFT79HJPIJDgEtzhM8o4Cp3v9bMZgNXmNk/A78FvpHhvwF8x8zmASuIkUK4+/1mdhUwG9gKvNfde4f2cYQQQgyWARWBu88ETmpy/BGajPpx903Am1vE9SngU4NPphBCiJ2FlpgQQoguR4pACCFa4O7DnYS2DFX6pAiEEKIJEyZMYPny5SNWGZT9CCZMmLDDcWnROSGEaMKUKVNYuHAhI3k+U9mhbEeRIhBCiCaMHTt2h3f+2lWQaUgIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILkeKQAghuhwpAiGE6HKkCIQQosuRIhBCiC5HikAIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILmdARWBmh5nZzWY228zuN7MP5PFPmNkiM7snP2fUrvmYmc0zszlmdnrt+Gvz2Dwzu2DnPJIQQojBMKaDMFuBv3X3u81sb+AuM7sxz33B3S+qBzaz44FzgOcChwI/M7Nn5+mvAq8BFgLTzew
ad589FA8ihBBi+xhQEbj7YmBx/l5rZg8Ak9tcchZwhbtvBuab2Tzg1Dw3z90fATCzKzKsFIEQQgwjg/IRmNkRwEnAHXnofWY208wuMbP98thk4PHaZQvzWKvjjfc438xmmNmMZcuWDSZ5QgghtoOOFYGZ7QX8D/A37r4G+DpwNHAi0WP43FAkyN0vdvdp7j5t0qRJQxGlEEKINnTiI8DMxhJK4DJ3/z6Auy+pnf8P4Nr8uwg4rHb5lDxGm+NCCCGGiU5GDRnwDeABd/987fghtWBvBO7L39cA55jZeDM7EpgK3AlMB6aa2ZFmNo5wKF8zNI8hhBBie+mkR/AS4M+BWWZ2Tx77O+BcMzsRcOBR4F0A7n6/mV1FOIG3Au91914AM3sf8BNgNHCJu98/hM8ihBBiOzB3H+40tGTatGk+Y8aM4U6GEELsUpjZXe4+rdPwmlkshBBdjhSBEEJ0OVIEQgjR5UgRCCFElyNFIIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNHlSBEIIUSXI0UghBBdjhSBEEJ0OVIEQgjR5UgRCCFElyNFIIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNHlSBEIIUSXI0UghBBdzoCKwMwOM7ObzWy2md1vZh/I4/ub2Y1mNje/98vjZmZfNrN5ZjbTzE6uxXVehp9rZuftvMcSQgjRKZ30CLYCf+vuxwMvAt5rZscDFwA3uftU4Kb8D/A6YGp+zge+DqE4gAuBFwKnAhcW5SGEEGL4GFARuPtid787f68FHgAmA2cBl2awS4E35O+zgG978BvgGWZ2CHA6cKO7r3D3lcCNwGuH9GmEEEIMmkH5CMzsCOAk4A7gYHdfnKeeBA7O35OBx2uXLcxjrY433uN8M5thZjOWLVs2mOQJIYTYDjpWBGa2F/A/wN+4+5r6OXd3wIciQe5+sbtPc/dpkyZNGooohRBCtKEjRWBmYwklcJm7fz8PL0mTD/m9NI8vAg6rXT4lj7U6LoQQYhjpZNSQAd8AHnD3z9dOXQOUkT/nAVfXjr8tRw+9CFidJqSfAKeZ2X7pJD4tjwkhhBhGxnQQ5iXAnwOzzOyePPZ3wGeAq8zsncAC4Ow8dz1wBjAP2AC8HcDdV5jZPwHTM9w/uvuKIXkKIYQQ242FeX9kMm3aNJ8xY8ZwJ0MIIXYpzOwud5/WaXjNLBZCiC5HikAIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILkeKQAghuhwpAiGE6HKkCIQQosuRIhBCiC5HikAIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILkeKQAghuhwpAiGE6HKkCIQQossZUBGY2SVmttTM7qsd+4SZLTKze/JzRu3cx8xsnpnNMbPTa8dfm8fmmdkFQ/8oQgghtodOegTfAl7b5PgX3P3E/FwPYGbHA+cAz81rvmZmo81sNPBV4HXA8cC5GVYIIcQwM2agAO7+SzM7osP4zgKucPfNwHwzmwecmufmufsjAGZ2RYadPegUCyGEGFJ2xEfwPjObmaaj/fLYZODxWpiFeazVcSGEEMPM9iqCrwNHAycCi4HPDVWCzOx8M5thZjOWLVs2VNEKIYRowXYpAndf4u697t4H/AeV+WcRcFgt6JQ81up4s7gvdvdp7j5t0qRJ25M8IYQQg2C7FIGZHVL7+0agjCi6BjjHzMab2ZHAVOBOYDow1cyONLNxhEP5mu1PthBCiKFiQGexmX0X+EPgQDNbCFwI/KGZnQg48CjwLgB3v9/MriKcwFuB97p7b8bzPuAnwGjgEne/f8ifRgghxKAxdx/uNLRk2rRpPmPGjOFOhhBC7FKY2V3uPq3T8JpZLIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNH
lSBEIIUSXI0UghBBdjhSBEEJ0OVIEQgjR5UgRCCFElyNFIIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNHlSBEIIUSXI0UghBBdjhSBEEJ0OVIEQgjR5UgRCCFElyNFIIQQXc6AisDMLjGzpWZ2X+3Y/mZ2o5nNze/98riZ2ZfNbJ6ZzTSzk2vXnJfh55rZeTvncYQQQgyWTnoE3wJe23DsAuAmd58K3JT/AV4HTM3P+cDXIRQHcCHwQuBU4MKiPIQQQgwvAyoCd/8lsKLh8FnApfn7UuANtePf9uA3wDPM7BDgdOBGd1/h7iuBG9lWuQghhBgGttdHcLC7L87fTwIH5+/JwOO1cAvzWKvjQgghhpkddha7uwM+BGkBwMzON7MZZjZj2bJlQxWtEEKIFmyvIliSJh/ye2keXwQcVgs3JY+1Or4N7n6xu09z92mTJk3azuQJIYTolO1VBNcAZeTPecDVteNvy9FDLwJWpwnpJ8BpZrZfOolPy2NCCCGGmTEDBTCz7wJ/CBxoZguJ0T+fAa4ys3cCC4CzM/j1wBnAPGAD8HYAd19hZv8ETM9w/+jujQ5oIYQQw4CFiX9kMm3aNJ8xY8ZwJ0MIIXYpzOwud5/WaXjNLBZCiC5HikAIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILkeKQAghuhwpAiGE6HKkCIQQosuRIhBCiC5HikAIIbocKQIhhOhypAiEEKLLkSIQQoguR4pACCG6HCkCIYTocqQIhBCiy5EiEEKILkeKQAghuhwpAiGE6HKkCIQQosuRIhBCiC5nhxSBmT1qZrPM7B4zm5HH9jezG81sbn7vl8fNzL5sZvPMbKaZnTwUDyCEEGLHGIoewSvc/UR3n5b/LwBucvepwE35H+B1wNT8nA98fQjuLYQQYgfZGaahs4BL8/elwBtqx7/twW+AZ5jZITvh/kIIIQbBjioCB35qZneZ2fl57GB3X5y/nwQOzt+Tgcdr1y7MY/0ws/PNbIaZzVi2bNkOJk8IIcRAjNnB61/q7ovM7CDgRjN7sH7S3d3MfDARuvvFwMUA06ZNG9S1QgghBs8O9QjcfVF+LwV+AJwKLCkmn/xemsEXAYfVLp+Sx4QQQgwj260IzGxPM9u7/AZOA+4DrgHOy2DnAVfn72uAt+XooRcBq2smJCGEEMPEjpiGDgZ+YGYlnsvd/QYzmw5cZWbvBBYAZ2f464EzgHnABuDtO3BvIYQQQ8R2KwJ3fwR4fpPjy4FXNTnuwHu3935CCCF2DppZLIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNHlSBEIIUSXI0UghBBdjhSBEEJ0OVIEQgjR5UgRCCFElyNFIIQQXY4UgRBCdDlSBEII0eVIEQghRJcjRSCEEF2OFIEQQnQ5UgRCCNHlSBEIIUSXI0UghBBdjhSBEEJ0OVIEQgjR5UgRCCFEl7NLKIIjLrhuuJMghBC7LU+7IjCz15rZHDObZ2YXdHqdlIEQQuwcnlZFYGajga8CrwOOB841s+M7vV7KQAghhp4xT/P9TgXmufsjAGZ2BXAWMHuwEdWVwqOfef1QpU8IIbqOp1sRTAYer/1fCLywHsDMzgfOz79uZtbv/L9sG2mzY0II0cWcMpjAT7ciGBB3vxi4GMDMfJiTI4QQuz1Pt7N4EXBY7f+UPCaEEGKYeLoVwXRgqpkdaWbjgHOAa57mNAghhKjxtJqG3H2rmb0P+AkwGrjE3e9vc8l84IinI21CCNGtmLvM8EII0c3sEjOLhRBC7DykCIQQossZUYrAzEaZ2VvM7CIzG187foCZfbYcy3D7mNkz8/9oM/tBk/g+UDv/rfz9cTP7oJm9xMzenJ978twX8/vN+T3RzI7N33uZ2a8ybaeZ2Yvz+J/l90sGeLb9zez325z/gybHLjSz59avy/SObgg
32swuG+D+ozPt1i5ck+tGlWvy99n5e4/8HmNmV5rZ3h3Gd2QnxzqMa4yZXdTk+MkWHFY79uZ8n79XP5bf4xvjaIjvnWZ2Qu3/S8xsekMcn8x7HFkL93Yzu87Mjhoo3xuuG21ml+V1Jb/NzJ5lZmeXslbqQUlTu/gb0v7mJsf+LO/7wVoaLjKzPzazUbWwe9TLxGBpqNejsy7Wjz2vsZ43XlcLW8+zcs3RpUy0uKZep0fV0jDRzF7eLO52x9o85wc6PPa7PG9yblSRM7Vje3SahsEwYnwEWZA/DpwG1AuZN/xvRz1sebDtKrA7SOO9G5+hMZ3bm8Y+Qpk3ix/ax9vqvtub362OO9ADjAO25vHRQG+mvTHdfU2O9+X3ZmB8/m8c6FCu6zSNzcKVsCuB/Qd5fWPYZtf1EXkwpnbOamF783wzxdTJ89Xj6QEmUOXp6CbXNtKb4QZbd3qAsXkfa7hfPU31NNTff3m/xrbloXzq1/ZQ5eFG4jlLPKU+NNL4LI3vp1zbmM/lfW2pnR9dC1N/f63ibleG6ufq+b857zGuxXWFkhcOLAOuAw4C9gZWATe7+5cGiGNEKYIngYOHOx1CCLGbMdbdt7YLMJJMQ1ICQggx9Kwxs9PaBRhJiuAXw50AIYTYDfk94LPtAowYReDuf0jYxYQQQgwRZbXndowYRZA+grajN4QQQgwOM/slcHi7MCNCEVisO3TAcKdDCCF2Qw4Ezm0XYEQoAvoPxxJCCDF07APc0i7AiBC+7r6RaiyxEEKIoWMP4D/aBRgRiiD5zXAnQAghdkP2BU5uF2Ak7VC2L9UsPiGEEEPDLJrPtv4dI6JHYGYTgGOREhBCiKFmErCgXYCRInjHUq1HI4QQYuj4EbC4XYAR0SNw97XATcOdDiGE2A2Z4u6fbBdgJC06V1bRE0IIMXQ48Bfu/u1WAUZEjyC5gQEcGkIIIQZND/C1dgFGkiK4meHZO0AIIXZnFjPAfhQjSRGcM9wJEEKI3ZCVDGBtGRE2eTPbF9hvuNMhhBC7IR8jdj9ryUhyFi8htlgTQggxdKxw97aLeo4k09BHCKeGEEKIoWP/gQKMpB5BH7HZskxEQggxhLh724E4I6lH8Cix3pAQQoiho9fMXt0uwIhQBGb2DGLBuRGRHiGE2I3oBT7fLsBIEbzrgGuADcOdECGE2M3YAuzdLsCIUATuvhX4BAOskCeEEGLQ7AHc2i7AiHEWA5jZr4EXMMAsOCGEEB3jwGhvI+xHRI+gxumMvDQJIcSuzkfanRwxPQIzmwhcD9wLfGCYkyOEELsTy939wFYnR5IiGAc8CeyDTENCCDFUrAT2dPfxrQKMGDOMu28B7ieGkQohhBgabiVGDrVkRCw6B2Bm1xHp0XaVQggxdLwaeFu7ACPJNDTe3Teb2chIkBBC7CbsSktM3Gxm/wBcOdwJEUKI3Qkzm9nu/IgxDRHdlwV0sFKeEEKIQfHZdidHTI/A3TcAp7n7aLR3sRBCDCUfbndyxPgICma2J7H2kBBCiKFhk7tPbHVyxPQIajyf2KCm7dZqQgghOmbX2KqyYGaLgYOBtl5uIYQQHbEW2MPdW/qER2KP4HI0qUwIIYaKHw8UYCQqghOAh4Y7EUIIsZvwcuD2dgFGlCIws08DXybWzxZCCLHjrHD3l7ULMKIUAfAyYC4webgTIoQQuwnPHijASFME04Fvoy0rhRBiqBhtZh9tF2BEjRoys1OAvYCby6FhTI4QQuwO9AEL3P2oVgFGlCIomNkCYghpy/WzhRBCdMQWYIu7t9zAfqSZhgrjgLHDnQghhNgNeCcDjMQcSYvOYWb3EzuUwchVUkIIsStxKtCyNwAjT9ieAXwJmIgWnhNCiKHgjUDL/Yph5PoI9gBWED4CR05jIYTYXrYCK939oFYBRlqPoHAH1Qb2UgJCCLH9fB/Y2C7ASFUEfwTMAG5DcwqEEGJHeCNwUbsAI8o0ZGbPILQXhINjFTGCaF+0qb0QQmwP69o
NHYWR1yNYB9zt7q8kViH9OnA3MeO4bzgTJoQQuyh3DxRgRPUIAMxsL3dfZ2azgWOA+cAhDDD8SQghRFMedvdj2gUYUfMIANy9bFP5uvy+H61GKoQQ28vSgQKMNNNQnaOJLSvPJXwFy5F5SAghBst+ZjazXYAR1yOo8XLCZ/AxYD9iLOwGYlG6Qh8jW5kJIcRw8yFgdrsAI85H0IiZzSJGDD0bCX4hhBgsM4DvuftnWwUY0ULVzPYnzEP3AouAn+apHkIpbAWWDU/qhBBil+Cvgb9oF2DE9QjM7Dp3f33+ng8cRKxEup5wGo8jFEAxa20CJgxDUoUQYldgPTDX3U9qFWAk9gj+BMDM1gIHAL3EchNjiOUmHiOGk0KsQyQlIIQQrXEG8BGMREVwE4C77+3u+7j7PsCP87OSWFf7LKJX8BXgKuAp4mFHVvdGCCGGn72AF7ULMOJMQ80ws38mhP8hRK+gl1Bie2YQ9QqEEGJbnBh9ucLdj2gVaCT2CJpxFrHcxJ7EXgVPEUNKx9NcCTSutNebHyGE6DZuJqwpLdlVFMEm4DJgHnAdMeu4BziSmDX3FGEqghhN1LjN5WiqZa1BJiQhRHewnmg0/6xdoF1FEUwnTEJfBU4EfggsAVYTD7ov1bOMYuCJcmWPAykEIcTuzCpgX3f/SLtAu4oi2Ad4M3AL8BrgPHc/HHgl8HFiN7NWz9JO2GvTGyHE7swedNDg3VWcxa8A/iA/RwO/BZ4D3Am8mvAT7E+YhXqAu4AXE4K+CHvNShZCdBtrgbe4+4/bBdolFAGAmY0GXgC8Avgrwm/wb8A/EopgDNWcg3b0Eo6TA6kmpmlSmhBid2W+ux/VLsAuoQjM7CZixNCvgV8BtxI7mY1z91MzzCrCV1AfHVSUQpljUHoEnfYOHJmPhBC7Ln3uPlDjeJcxlcwEtgC/B5yQ368BbjOzfzWzPwDWEC38rYQyqO91bPR/1k6fW0pACLHbs0v0CApmtjexeNKHgWcCt+cpB16Yv8cRSmMCIcgXAIcTyqE+mmgtMQ9hoL2QG3sTQgixK7E2V2hoyS6hCMzsfYSj+BTgUcI89CvgJCrzzX75+53AecBnCYdysf2vJMxLRfCvo9rb4Ang0NotZRISQuwObHT3AXd43FUUwYcJwX+Xu2+tHb8cmAZcQwjuPwImAQ8Tgv0uYt7BZOBJ+gv7VtQzpCiDZk7ozUSPQgghRio97j6Q1WNE71D2O9z9ohanpgAnl32OzexC4D5i/4JTgIXA6wnhXl+xtAj4+nLW0F/gb6DaK7luFirXj29yTAghRhKrOgm0q9u9DyJa5oUewndwJbFhzbOJdYeMqqVfF9hFCZRzjaOK+ppcU1ci5fvp7FaN/C6cEGKksNfAQXaRHkEbLgPuMLOr8/8fAw8Q8wJ+RuxdMJfY+P6DxIJ1fUTLfxxVS76xNT+Kbc0+9SGn9Z7E052H6nkIITphEzDRzC5397e2C7hL+AjaYWbTgJfk39uAHxF+ggXEOkQHlqD5vQ/Ri9hA+Ax6gfuB51ONENpAKIp2trUetl3cTgghRgpluf757n50u4C7vCJoxMzeDVwPfJL+ZpQzCIF/ALFq6U157EFCWRRH8iZildN9gMNKtC1u11c7tzNb6s38GloyQwjRCQPOLN7VTUPN2NvdF5jZzfRXBM8jdjM7AvgI0dp/DHguIVDXE0NNy/aXU4hWf9kis+5ILoK5mJk2E4qjkaFyIjfzawxWCcihLUT3sYBYpbktu6MieCC/ZwN/B7y2du6LxAzkrwA/B24g9jMoO52NJpTAMfm/+BG2ED2FvTL8MwgfQlEMrZRA6ZoNZct9c6ZrsEJdSkCI7mMFA6+/tvuZhgpmNodo+a+gGv2zFbi7YS7CQmBvQmjvSSiKvQhB3zgbuU4fIew3Am8CPk30LsYzuNb3YFvqq/Pe+w3iGiFE93Kvu5/YLsDu2CMoLHP3a1qdNLNZhBBeS/gHijAuzuU+YjbyXsRoo/V
UPQeoWvkTieWwR9N8uOlA1MM2KoX6/6XEcNm7gZcNEGdZTbU+6a3E1ckKrUKIXZ8fEX7Q9w0UcHfuEbwKOJdwCv9uroG7fz/PH14L/lfExjfPohKYZURQH+E8nkAojF8DpxLDUa8Hrnb3E83sMcL09DlCEG8iTEiNDLYHUBzSmzMNg3ESl2ub3a+s1gph+tIsaSF2P3oJc7m7+wmtAu3OPYK3E2sNjaVqqTuxfDXuvqAENLO3EstQrCWEYx9hUlpIOJdXAfcQtvl5xGii+4nF7Caa2bOIzC7bwZWhp8VH0GxCWjvqyqII8gmEwL6DGC7bbke2ZhPoGkce7Z2/pQSE2H1xYumdtuzOiuAF7n5sh2EPJDa5OYFYu2gGIWzXEstM/B4xN+Ea4C+J3dCK0F9B7I+wlGoWX+OSFHW2UPU2jMqMU2+9N1Mcvfm5kViArxVGNdqpmL72bghTzsHAq68KIXZdZtcbva3YnU1D3wT+r7vP7iDsauAthK3/48ROaC+m6klsBE4GrgVOA45195/Wrj+Q6CnsRWv7e7PF7LaHeou/WTybCUUzKn+PIhTDE8RoqGamJQ0tFWL3w4FNu83qo9uDmT1A7G88nxCIRgs7mZkto3IS1ymKYIG7H2VmPwbe7O7rzOw57v6gmZ2cYZ4JnAX8L8KB3ChY68K2leCtz1uoX1f2QyhDWevDR9s5mLdmnBObhFtG9HLKPIgBC4sQYpeh1Pct7j6g6Xd3VgSHNzverJuUYf+BEPzPI1r2KwkT0Z2EKWYr4Ud4PuGAfgkxF+FPiPWMAH6fEOTtRuWUrTQHO3KntOQHGvWzIcMWM1Wjz6AMid2S4SZ0EKcQYtejD/ixuw/oI9htFcFgMbM/aTg0AXglcCbhE3hni0tPBT5G9AQ+SSxhUZzEm6h2SxtHzFHYk4HXKGrWY3iI6HXs3eRc47XFPPRzYvnt4+g//HV0httE5YgWQuw+OLAYmAM8udsvOjdUpE8BYqTRMwmBvg5YQmTk61pcN9PdTzCz7xC9hB5gKtVksx6qSWatzC+9hKDeg+h5bCCUT2/GMZYQ2vuyrYmpcQntwmZi8tkkYBbwVeDrVMtnj6O/2UmIoUI+p5HBEsLkvWCgRed251FDg8Ld3w6td0PLc1OJGcTHU7WiJ+f3QrPXMxIAACAASURBVOBgwky0AfgNMcz02Xl+U7kV21aS0VTLVIyhEsyjqUw844mu3nqqUUBGCPs985oNhNIYk+EPynAnAP+ev+ujhAbyWQixPagsjQyuJSwVWwcKqJbgtnwHOJ+YlYeZHW9mxSz0TapW9SuAbwMLzOzfiVFHDxPDO/ckegcHUy1rXRapW1e7V2nNb6X/Bjt1Yd1LmJe2UPUaoHJk70tlappA9CCKH+QpKlPRBsJxPq/JM9/eIi+EELsmfcA7CNlw/kCBpQi25VvAT6iWpX4I+Jv8PdHdbyJMagvc/ROEYP8JcDoh0I8nhPEoYp/kNXm85PXehFCHquU0mhDWRWj/kMqZ+9UMdx/R+l/REF+99TWK6FlMyf+fJ0xTVwCvy99lOdrNtetfwshjMDbLkWLfHCnpeDroGzhIS56ufOqm99GKX7j7LwYKJEWwLQe6+1VkQU/zUBnps9nMRgFzzex9ZvZGYE93/767zyWcMzPz2wjTzL6EqWYsoSCKk7aOEQpiNdEb+MP83gK8nxD+Mwkh/yuit7KW/j4Car+Lye/v8/d5wC8IM1V556UXsSG/yzPuCI3Cwdk2jZ2yves1PV300X/G+nClY7jYEdnxdOXT7vQ+BluHyhL5t3QSWIpgW9ab2QFkxpvZi6g2vv8hYZr5a+AU4M8JIVv4lrt/jBhtNBV4EbHkdSmQvyWUwFwqAdmT59bmuaVET2IrIfyvI4awHg2sdfc3ld8ZZiGx/tEyYktOJ5bEmE0sUPckoVB+UEvnKqp3X0xNQzF8tLE8NZsp/XSyoy3Cnjbn6suL14fnis7ZnfNse56t3TWDrUNlPtI
fdxJYo4YayAliXyGWlbiPGHUzmphL8GOitd7vpbj7irz2bnc/2czWEkJiLNsOFXVCgJfj9+a9lhKK4nTCnDSRykFc1ityYnbwbJrvSdBHCP35RG/kv4lexoeBy4lF+OpzERonp42U+QR9hOlqYgdhmzm6Oz3Wjq15zWia71U92LjLexwo7K4wkqsuNNot1Q7tF0nUIIUqf5YRPf7B7DXSKm+LX9GAXncfcAN7KYIGzOzNhM3/MGKfgRcSC8z9MWFfX0T/JR4ceC+x3OvZwJVEL2ED0dqeQwi1l9L6xdV3PNtE1UpfS3TtZhF7HbyM6E30EGsiFSE1qiGOIuy3UAnTIuR/TUx8683wjesi7eyK2ek9vksoru2Jvz5p7+naQnQgOl011mvfRSHU3+tg4xss21sG7gIeJ+bdjGQltrNozLed8X4GG6cTimDAvdW78YUNxMfdfQ2x8csrgK8BL3f344BL3P0odz+y/k2s4zODEOJ3EYJ8NSGQ7so4oNrMBirH8FrCB7A4w3+cUDbLiF7DR4FHqfZanpBhZtG/terEqKL1wBuJQvmjvLYsWAeVEniMalG6Yop6OlpnnZhRnFDCW6kc64OJfwzbLtWxMyhKp5PWVLO61sz0VExpo+ivyDptsXnDd6fsqJ/jFOANPP0ypfSwh7tF26x3Plga/X2Nz1TWDWsM2yqeNZ2mQ4pgW4rAfD3wH+5+HTmc093f3ewCd7/X3S8Fjs7vLYS5p5doyV+Svx8i7PhLiKGdPYT55ivE3gWriPH+vyD8Edfl7/9FzGx+F7EHwr3ESqhPAT8lXvw6qq02f0a10NwzqBRMb97rvXmfh4GbCWU1Ou//dDCQacSI9Gxix0anNIsbIh86ibdeGcs6T71EPhUF0Ec1MuxxmlfgVulo11JrjKfRZNeq7lrDd6cMNvw8KiVdxql3mqdQCbTVwMX5+/4Orq3nSRke/XQo/XY0e9+l/NaVVDMlXT/WuOpws2cazNyvvQgz8YBoQtm2LMp5Aa8B/sXMxtO5wpxrZk7Y9lcQZpkTCeHsxNDS1Rnf/lRDRo+l0vbLqVq15b6TgAfz9wJiOGvZ7+BYokKuIGZEjyVa+b1EC62siPoiwlx1NaFQNhE9jF/muXVET2Rfhr5SlS5tJ/bi8j0buI0ws62i+SY/regh8qG02Es5rw/XbYcTFehI+gvWMht7Qu05yogvCDPe2Wy7v0OzSt7MlEAt3nuIvTBabUm6s8x49Z5BK58IhK9qFZHP9xHlvOzn0Y7y7KOIsvxrorG0lXje+q56rdLXS7zjA/N7uBu0zd5DWbplC9uWv2ZloY+By+VKojz0EY24fYi5Ss3uXfLproGTLx/BNpjZHsSG97Pcfa6ZHQI8r77sdJtrD8if5xGt8fMJH0MxVYwiWo9OCOh1xEY5LyZ2Nvt7YpIawJ8Ro5L64e63m9kXiJFLG6jMPp5x307lq5hJtLIuJwrZyrzmMKJ38pxMw5G1dO1IpSp7LZSCvi6fs7FydyLESpgyZ6LTdPUSSm4DUQle2+F1jfcuprJ2jaWSxuIP2kBU/vqzFZ9MWeupkwrfjMZ82ESVpyuJhkVx/jdLYyNFWQ50zzFU/qVWgwk2UjV2BlN+bgReTeWvGgkDFQainp/lvZd8X0u19W1p9LRTplCZi9fRXInW73cfYWloPN6M5YS14adlV8Z2SBHsZHJv5D8lRhsdRfgLjiFs/5OJoal7EzOazyQmfznhfD4FeDnRUtqT0P6PufvnzWwesfLpG4D/ndc8CTxC7JzWQ7RqDyO24BxNCPu1ed+LCXPTeOAi4AL6j2ZqpF7wdqZTeR6hmLZXKBST2KH096FA/9ZuvaXWTHhtJnpIhxOCahz9K3V9I6GiAK8EzmmTti2Ekv7hYB6oIf07I9+LqaxZfrW6X8njucRQ6RKeQaZxNZGve1H1uiDeY/GVNI6cazX6ql16hyrv6rsOlvJQGjqbCKVY78WtIJR0ncZyNCqvK72I+vH
tVeRFufy3u79joIeSIhhCzGw9/U0co6kK9np338fMfo9ooc8lTB//P2FbLgvTvYlo7Z9LzB/YRDiGnehh/Duh7d9FVMBFxFDRJwjfwzcJJfImYrnsVYQwO4oYRno7MeP4P/K6LxIFc0+iV/LRIcqORjNQo5mmFaWb/FtiM6DSEq63TBvDNxMIQzEEczGhqFvdA6oK3LjnQyfpaqzs9WuaPWuz/Spa3assZ7IPndEYR7OeWBEue1P19hYTZfllxMCEKbRv3W+l6l1sIebB9BJzY8p7hupZy33LSLhSr9YQdabUm0abepnwN4b2e3c30uxdDMRG+g913h6lU8w+7Si9/9G07kH0C+/uHZn/pQiGEDO7mUpAbCUqxkXA/yUcxl8jWvWja+GeICrRb6jWCPoJMVzV6hvp1FY6LQvjQdj3/5IYpnoSITw3Ei2tMwjfwlsJJTARuCHj/zKhFDYQDul/Ilo2+9NfiHdSqJu1cEpFLbb6jcS+zi9gWwFYnzdQhOXjhJA4jNgK9NwO0lEo8ylWEIJ8oFZi/RkXET2mOYQCfhPhlCyVrq6U+gjH/XTCXv2GvK9Ttdg2Zfh9iXc9imqC4mCFTjFZjSEaAwfSX8h1Gk/ZHnVjXr8nzffXbhdfMYtsploZt6yF1eqaXqLXeijVmliH0/mmSFuJ+TaHUvnXRjOwiWtnUAYPlPLQmG8LiJ78dELJ7U/7XkzJ/3Z7iC8j6shiotdcGlZF6RY/U1EWa4C3dGTWliIYOsxsAiE43kQICIgX/WpiHsFGojCXzWCeJJygE/P3rFp0BxNmnM8Ro4DeA7zC3d/Q5L7N/Bp/Tpgpvk+08svKpM8nHMRHEIrqL4nexZt3PAf6KZBiNy29pAEntdTiKEJ0M/33Smg2TrveSi7HlhDDbcewbRe6KJqSrn0Iod+4v/Vy4n3tT1Sq0vWvT/Ir6ZiX9z+E8L0cR7zPQ4je38mEku4l3v94+ptUSrrKMxZ/zyhCSHfamm3GYCcJ1udgbCLyrlxfemujiFnr5Zn6qIRXD/1NTPX7l1FgexCCchyhzFYRAyLq76quJIvA66FSNosIBX10xlPyiYbrW9HqfDPHbXFO05AfjQ7+TuJvpN6IaudPKI2AsVS9qjW07xX0Afe6+8ltwgDD723f3fgh0ZI/nljXZx1RcJcTL20C0a0em7+PJLqDEwgzzedqnzKz+HpCqF4EvNrM1jTe1N031NY7wt0XE0tP9xHCZjPhmNubWFRvMlEpDyWcTzOIgvaaDp+zsfWwlcrpVSru2Py+mVBCxYfxVJs45xPO8u9mnNOpRhC1a9WX1uETxKiof6PqcTS2FkvlLRsEOeGzqdOb6TyIqJjFvDeXaJE+AXyDqkV6WH7GEsJxD8IUN5Hww5yU148iykIv8V7qcyTqwr6MRBpH/1Z6MY040Vsqz1LMIGuoBHndDFWf67A5w24lHNklXHl/92Uaewjh/ET+LjNVy9yGU4gWav0ZHq09Z0lDXaBOoDKhHE6VvyX991GVjx6qjZO2UCmlfTNfjiR6l/tnnGXE3YeotmjdQjQKmtFOSDcqzjIse0LDuWI6K0vDLCJ8fCX+kueN63jV/y/K7/WZ7tV53VNUjaIS38b8vTjP3dAk7f+T331Ez7+zBoS76zNEH+C+/J5ICN5zgUuBLxGV7seEGebmfNHFRroyX+CXidVG/5vwD8wnBLptR1rm5PfXM/6VhBCYmYXuRqoCXIRDEdiNn77ad1nMqlH41z9l7HRPFtoN+XknMaLqU2yrOEq8PYRA21JL88oW9/GM/+NUlWYzlf15HZWZqPF5yn3KM2/OdH8kwxdhUn/+xus3Ek73OYSS+HHt/IYMsyzj2pr5fXst7zZQKbF2nx76K4HGT31Ow6aGdzDQZzPVSrfrqBYz7CXKylaiHNbfkxOK6j7C+d1Ti6uTZ5mZcd1GNdJmQX7X87r+DKXsbSb8R4cTkyLvA/6LUChrCBNsKQOlTHWSD63u2Sz966j
mjrSKZwOhhJbRf4HInoZwZW7K1hbne4k66sAdRO+pVb1rfI6jiH1VBpYXwy08d6cPIfRfQrRSDs8C+6/A24mW1X8Rrd31VL2FYqe9nxCS7yCEypeyoowjnL7LgP/VYTquI5zGxxNbaX6DMC+VAlQqfhGGfVnA7iQqZCcFrR5mC2HT31o7NzcLcVkcrwicJR1UtA1Ey6i0Er+acTSr1MVGfQkxYqqPqjXVScVfmfe8jMpctzHjPTzT/8uGexeBu5Qw2T2R/2/Oc2Xxv7oy6clneopKSC3LclAEfRHi9SHBj2V66kqrE0H/WyoFVE93T+34glq66vct915SS2sflTmtj+hZLsjyNpMwf30gP+sJk9g1+U7qaS4t/TKrvjz75syLgZTJpszvPyTq21PEBMkzmzzrYBRA4zWl19R4bAPV/iB1YV3P33LtZpqX2Xqc5Z2uYtsGS+NnI1FmNhOm5162fc7SCNpM9NBOkCJ4+hVBeZHlRWypv5QMMzlf0MOEsJxLmGe+X4vnXmJC0bcIBTEd+DuiG/mhDtIxnnDMNlbu3ixMC7MC/RVhtnhV3uszRCWut6RaVah2FW15pnddfkrh3DDAdfWKUQTjcirFWVc0vYRwXEQs1X028J/EKqv35LlbGipIqTibMv/r5+qtyOIEnkXl1ymC7Kk8dxvhiP90xlUEe6PgaOz5rG3431iJByO0Sr6WXk/ZGrUo0Mb7t/usBi7Mz370F0zNekR9VMuTrCHK6QoqZdZ43eZafKUXsIRKOZRGya1UyvmJjOMjVA2mFQ1xNvZWW+X9YJRAp+FLubiaqnw8mzBb/SNVb7a8q8Z7lF6rA5+oPWOrtJTysSw/q6h6vSXM3cA/A1cMRnbJWTyEmNnhLU5dSzhL96PaVrLYEFcQ9v+PEwV/H8IOOpowoZxJ2MjnAe8Gvu7unxwgHbcC/4dqyv51xJIZuPuCDHM3Uem/S1TASRm2k+Fm9Zm7sK0dsiw5MIdKYHzN3S8zs15C2LdyHpc4H8l4DyME7zGEY30OUSFmEQp0TyKvJuZ9Smv8eCKfixOzbrcuTrfi+Kx/l7AriEr2PcKWfRSx0N94Ir8WEEMlP0n4hiYBZxGK6V+JSj2aMKscmun7dMb/nlo8k/M5S56WvCytzz2pRpLcRtjFN1NNHjw50zaf8E1MyLw7iOhNjqOalV1a38WJ/0vCsf0sQoAvzjQcCvwtMcrtPwnz5EH0n8U6NvNnccZTllf/AnAVsd7VaqLMjyF6NvtluIMy7DMzXx7PNCym8pk1c3B7Xj+RaOnOMrOZmdcHEGWlNBBuJObZlJFN7WYrl7g7safXnd/lmvq1fYTwH081abTRkf1wPsPBxDt8IsM8r/YMZQBEme/SS+ThEXmPUn4bZykXpXMPcJm7f6mDZ1KPYCf1DN4LPKP2/0uEAFtCdJ2XEb6CXxDmmNIiXZT/pxHj+x8kuvhjCUFzxyDS8C/EjNZvEa3YbxLOpSuIoacPEDNv9yW69n15/01EBf4G0bVfSrVNZml91FthzVoumzL++wjzyrWEoPoV/Vs2axriW0FUou/ntZ/Mc0szz+r3LC2sLUSrtI+YmV1atbMJAdjMP1COfZAQPu8kFOJaQtiuye9ZtTTX710cs/cRe0/sn+94//zMzWfZTFTIUjmX5bOsperhFL9G8RcUE1gPIVQ35rnFVK3/0hqd3pDnddPSPKLnuSzf3axMc8nHHqKsradyPm4lhNJG4LZaL7feoi0OzYcz304kBOMeGX4d1bDkqZkXfYRp8hEqQTaDqidQ8nURlc28/p7L4oNriDLck/f6YL6D0uscqBW/vsmxxjL8JNU+v8U8eX2mrZiz6qYfpxoMUt5DvbdweS1t5f38Re2+xanttf+NaVxKKOV35fm7897FzNqqF7EVmNmRvBhuobk7foB7Gv5/mKiU8wmH8Bvz+F3EzOE5REtoL+DNee5IQgj9PD+3EN3mn3eYhldm3L8lKuXdRIWfQwiDR6h
svL9tKKjNPqVgL6VqWT5MZXJ5gFA4XyQq69w8dlJWoj8gfCXtbNu9hK3/o4QweoLKRFR6F3WzWw8xI/uGPPZ5oiX7t8TokSmEw76P/pWlnobG9JT/6+gvWOs247WZf72Zf/Pz/vOpxseXLUXLKKHNwMcIBVXS8SAxG/ma2r03UpkIBnJINirfhwiz33eJVnpdILVynPble9pC+LBWZtq/m8+2kuiZFgFV4iiKq4foNd2e4RcRAmoJoWiK/+FvMl9+TOVILQ71RwghXRpCRZGtyDx4Mo/dQgyq2EDUjY2EAv6PfOb78/l/QKVAyjtrZn9vNNu1E6rtPqUhsoFo+NxLKMNm5ascK/l5f957CWFiWk9/E2VRDguJ8lLiWEW1gnAJ20tlOruVqIOHSxEMnyKYRW2kD+Ht30o1euc6QriXnsHniUln12Thfg9ROU+pfV6S4T7bYRo+md8TiJnK9dbyPxCjPh4nBNParHitbJmlkmzJOBcSFbw4HuuVqFxfWk831yplKeTtWm8ljk1sO1qmXjmKgLmJUEhFOcwhejhlNnbdL7CB/itCNj5nSeO1tXw8g+jRFKd3uWZN5uO7814b8r4bCYFWWrufyOs+lPl9WaZ1I2EGLGWiL8+vJlaILb6aPkLAX5Z5+UA+w54ZR/ETPEmYEFbk8dKybifA+qiUThG0PcD/1J6/5NdWqrWtnFAOy6ls3MUkM5cY9TadEFxX1c5tzLy8gqoHWvK9Pqy1lUBeT/SwFmRebCKE/keI3skjmcb78n19jaoc1Rs5j1C15utCt/TOyv+fUI2qa8y34kPpyTz/CtGLPJyo10fndf9G855I+byLqgFQfE2t3ls5/gRhZrwq/9fTV2+wzQKOlCIYPkXwf/MlvSo/d2SleZjK67+CaPk+loX63jxX1l7pIRRD4+fOQablBkLx3EE1zHEuoYj2ICpy3SlYhG8vlbCtV5hS0FZQCdjSdS+CcnlWiDdlGh7N8/fU4i2t5CJUPSvDrIzjM4SAnM62grtUll6il/MkocgOb/i8NuNbTn+hUir6U5mGH2bFOY5w9j1F2N33z3e0Ko9tJCr79cCt+Wz7Eo7q2wi7+AJCcK+l6nGtp3IMfpxqtNgTGXcxfT1GrA1Dprvc82ai5bgu428mJMron2JqOZooe4cRyv4uKsHvGddnMvzKPL420/YDKj9EXaHUhekTRIPg5xnm17Uyd3FDGVyZz7CUMFGW99k4XLTRJOSZB6uJ+lFMb/tnvDcR9ew/M19KD2pNPt+PqXpm9bJbf47GkUYba8eLue0XVIMYiqlscy2e5+V9rs53WJR7iWdtvt8Sd13Ql5WCN9TSWTcR9tau2Zz5sSTv9yGi3Jd0baBqHJTG121SBMOnCEYRLcX/zs+7CPPQP2clm58V6W8JQXYDscTEd4nRBk8RwuuPap8ziZbPnEGm5b6sGO8mhpL+rpfRJFwRBqWlX8bAr8yK9SSh5H5FCM8ioJZmBf00IXDGZZzjieUtHqQaAVKEUGOFKP//rpaOxopa/5ThcbcRgm4LoWgfyPs9QDgg98r4fkCldIrdfm1DnHVhURbtW57xfzLveSfRynu4lndfyWuuJ2zfvXntbwmFUhTBJiql1+5T8qf0FLZkni/M9K0jJt5tJATPo5neC/O5S/5ton1r9Of5PPcQ5e4ewub/G2B2Ptvn6b95UX2Y68ba769n+KJIf5rxF8VeRtAVk8j1RJl5kOjtbqJS4D1UvZN2reOi0Mtw36eohuOWYcD1BkSzMfoljjLIoD7yq1nZK2XkCqr5Fc/Kd30e1YS2LfRXcvXP43mf0ogqfoL6/UrPb3PtviuJelSE/hVE2ewhTIzlmUq8r0Q+gmFXBhOBY/P3ewgt3peV7JtEq/GKLLhPZcG4gBA46wgn0/za56GsXC8dZDoupqrUawlBVD71nscWolV1E1XLsC6s1xGVdD7RtS62+sbKUlpKxbF3JWGWup4wCxS7by/bDn0rn/UZZjV
RmRdTtdDqlXQj0aJ+OSEkixKYXztfegTz89p3U5ldSk9oKf1ndD6ZcZbPDwjH8y15j+JEfCjzdU7G+az8f2d+353fexJmo1Op7LzFXHEb1Qzd+vDjYnPuJey9lxF25xUZ/t8yX/4i39kKqtblxrxPL+GkLYKkKKjybotZozinHyUWQdwIrM60FxPf1tp76CXMJg/lsdIy/1Hm92NUjY7HM92Npp7SEykzgjcRo+felM8/m6gz66iUa18+b5lQ9jEq5XFCfsqw6dX0V7rF51Js95syzlVUCsOJhs1Gohy0m5cwn0rhPk7Uif8i6tBtee6SPFaff1HvOfcSI5v+JK/bTPTWWynucs2VTfKzMZ13keVOimD4lMCZhHCYn/+/kQVuZi3M3oRD7x5iCOAz8vgBWYHuIlrpKwiBsIpYemJCh2mYRQif2VkwluTv2VnYy0iXy4llMXqzMBbhW1pI12UB3Qxcn3G/kGiRPkA1M3ox0etZCMzNcGWm9YWEkHiIMEeVESxrM94FmV+LiVb94VmJbshK9mCt4NcVQVFS15GCq/b8J2cl/Aoh4L5Ppdgeo+rp9BGmhUvoPwFodubDTQ3xvpFYAfY3+Qw/I4RGGQ30RMZbhOet+buYeT5B5excSTWMs4wCuTTTPJ3wQSyj6uZvoBL2izP/y5o778383ZDXXFh71kZhsZDK9FPMgXU/wHLCrFJa86vYtkVdhjSvJYYm30EI6Y3Awlp+lfkWxxCC+lNUPaQLCQG6Jo/Np5rdfCeVw7uYu64G3kY1J+cWws8ylUpBLsl8WFFLd8mHJVTKtm5yqQvUK/Md3kxV3i6hUtSl3H2DmOg4h1gv6Y+IMlAmAJbeT+Pks2ZKpf5p5UMrDbYNRK+rPhm0+ImeomoA/BWxmsH7pQiGTxGUYZm/rR3bTI77zv/jCQH1a2Is+YuJVUhfRjgoNxNd6/8k9k5+P1FBv9dhGg6vfR5r8nkEmF4L/yjVshdP1gpgGdroWfBKy3o2oTTKsL4yumN1hr+Q6I08j1B2RlTULxMjP4ppppdqiYEbiAp8FdVwyVLgGytRXTm8HJjXJA82UY3+aZzoVLr4ZbmApVROzd5M53PIiX5Er+4WQol+Ajg+j7+SEPYLiYr66/zMzvc9J+P8ArAhr3mAqhW+Nd9FcZjWR400Co2S7i1Ei3wN0VtZnvEeSAihR4kGxRO1PK6/zyI41ub7fGnmQ/FbrKd/2WlspfZlfq3I93YN0ejYn1CStxML7u1P9FxWE42aZUR5LmamtxLK8EuEqfFrRCPqAUJxP5b3KApoKVXZLO+pCMOtmddbiZ5z6ck8VHvWPqpRS6XnVfJ6HtFzXU/l6G8mvIugLmuI/ZxodJxMDMq4PdN9EdUSKfUeQeOnNIhuyXyaSSXUi99nXv6ek++2+PWaLdVSnN2XEr3fkzuRF5pQthMws9+4+4vM7LfuflIeKzMwDyRe/AHES/sRUSE2lcuJCT9LgbXufnwt3ruJHsHvju1gOh8ATnf3x/L3sVTmkYOIAngA0ZJq3OnrA8Q4boiC17gbU09efwxRQOcSgnVrXvvveV19ZcrRRKW5khhq+hqqpblnE+a2MqGoJ+/1CDFE9H+IoatkWk4GTifMFi8jhhjuRYwuWUK1GN3kjOMVREU7lFBCB7j7ZjO7n6h8M4Er3f2eJvl4Xv48gJgkNJt4h1fk86xx973MbBmhJE7O+5SFxv4P1QqxbyMmWn2TEDIQyuY3+Xta5ueivNcKYC93n2xmRpSpuzPsC/Pb2XayVBmyuCrTcQwhxE4keiPXEg2a7xDv/ge1fDXCeXsQsXrtZqJXtqjhPnsQLfYJVO/r8bxuPNVS1qXX8QliItitRFnYO6/ZCuDuY83secC33P2U3PTpNOK9vYp4x8fl804n/DOH0pw1GX99Qlgf/ZeELpP8GvdkKIpw/wz/Wyrz0osIZbUq03AO1eq1pW7cQjReykx7iB6QEQ3CUVSrjEK1mmtRgHvk9xI
i7ycR5RiqOvUoOS/D3V/ZIg9qTzUCWtC724foNr6VEB5Tia7+vxGV6IOEU/VP82XPyTClBTY5C8wXidbMe7LQfI0w41xLh1oe+C81+AAAIABJREFUeGV+/0mLzxlUo5Z+TeXku4QQ4m+gsr/u3/CZTWUnrY/i6SFaIk/VnumfCRv3Y/n7DqLCl5ZNWavocULQP0LYhbdm2m4hhNVHqSYolQr5eN7z51TDY/+e2OpzBiEUV2RezqcaTlh6A57Hv5vpOCjT/kvCFHE9MD7zsdHHspYQ8hACsbQu6xPGyj3WEC3Ib2aa+6gcm8X8Ny+fcb/M13dQtWSLeWFzhtuY170vz92W+fsQIaQ/TOVXqdukS0u3OK+btXqL/f5+qtEsywmlegOhKP8MODGf/RDgtCbl75v5KXtll+dfSuWDOCfvtZyqNbyVGEhxUB47mjBTHU6UvYeIunQJIexmE2X5n6hmSP+YULBrqWz5ZV7DzVSjxcrAjOkZbgZVj6GYR8tAibpJaR7VXIeHiUloP8v7vJVYBv5B+vvSiu+rbtbcROUTqs95qA9VLvd9J/17x8XHVsrdKkKJPkrlL/tZR7JiuIXm7vghNPansnBNJ4TfeGLN/70zzP8mur+3ES26xjhupupSlklGTjXDd0AnENVcglIhL8nPN4FL8tx4Yo+CS/Ke78hPEWBzqIT1FsJ0sDgL8BVUQ2HvyMK4gGjVrMr4X0pU6tcQSy9cnIX0U1mA78uKdB/RW1pNZT7aRFTWVfQXYOV7PaE0jiXWXa8/+7lEJV+Vleq6DL+KUCwbMkwRlvUhs7PzvZxJtOxvHSCfp2a8xR9Tuv29hND6JDEEcRmhEDcBryN6SLdSrdA6i/4V+1GqkTNlNMt6ouU+O5+72NwfJBoKE/J9FMX2F2zrUymmqDKAYSHR6n95vqtvEAr0TkKwzSaUy/8QvYStmbYFefyUWl40zqq/jBD038v3cQlRth4nhPV/EUp3FZVZZQuhhD6U6X2UylRS/FWzar9LftVt8cW3UBzb8wnl8e9Uzvk1tWuL4P0QYeralO+mlLW6UC6r+Ja1qJ5DKMgyj2U11UTOMnelvIPN+exFQVxKlI1ZVMPLSz0vDvzZVAMPHqJa/6r4QMpnLdELLD6k2+jQWSzT0E7AzKYRrdIj6L/dHh47jL2UaL1cRHS9VxCtrc0lDnf/6zZrF5UwC9qdr6WnbJjTLz3u/o9m9uI8/hlijLu7+7fN7K2EAPs0Ufgez+vWE63v6UShO4howRqVo3IclY18GtFaHG9mhxICYU93P9HMbiQq2EsJ5bmVavr8Nwi7ell/p48w7ZQ16pcQJpSxhLJ4KdHqLM84nmg9TiZGT8wmZmt/guhdPUqYO/6CUGinZRrK2Gzc/RcN+fisFll8OWEW6iHMEcXHcFa+74uJES5vILYbPYV4578mysCThCJ8hDDROJVJpLQmV+fzjyXMZhuIET5vAn7h7s+rpXNB5tGhVPv+QmUCeZRoXT9EtSHPIxnnmMzDBZnX46i27Pw6MZx0jLtb3uulxDpSJ+T/e9z9xFpa3kT0iPcm3sPmjLP07H5N9Ar2yPsV89KTxDt7G/BZQrDeTZiavkc0Xp6VaXomMaqr9LyOpjI5rqbaM3gL/ddzKnlazC5FAI8jGm0fJMxyZUMiqPYqfpCoA79PmKTOyDTfTSjTku9b8z39F80pSmJDPsN1RKNpPdV6XvtSreU0Nc/fkvnyd1TmpTISbz+iPL3F3We3uG8/pAh2AmY2h+ialxUgCz9095PM7NPEbmKXZ6X9hybRlBESR1BbCM7d/3o70nMD0Xq4mygsEAXsJKLS3ENMivo+sCmV0L8Q5qPbCbvnXURL/p8JIVEccGS8p2XcBxGF8TmEwjiJaM291t2n5yJhawnn9+cJAb6VqGhbCEFQFMarCaH1XCIvTyFMbEcRS1ZcQ7Wz2tiM867aM44jxqj/FSGox1L5F8qwwUOInsr9wN+7e+MmNfV8nFX
7WzYWmkNU0puI97V/xjse+Jy7/31uJnQO8Z6/mul/P9ESfB0h8J4i5mi8n/Ch/DXRev5upn0eIUh/QSirItRKi7XH3ffJdP4X0cB4F2Gu+AqVsCvDhA/OZyj29IOJFuSfEkLsImKE1FFEb+9lxEinE4D93P13G/6Y2d2eu2BlHp3g7m5mH8q4Pp3P/S6iwfEPhEIwqgX8DiCU/18RPcM7M/oTqPxCqwiFcAKx+N7VxNyaqwilcBPh6/kloQgeI0Y0jSbKz5S833F5bo8MeyZVuRtHlKupxOicxi1Yi09hVb7jiUTjZ1/C33cSoaz68twYQgmW3smYvPZRwh9T3smDhHLbmNcZ1b7QRYGXPcn/mFDgvZmGJYSfoPg1NgOT3L34HQdEimAnYGa3uvtLmxy/lihwr6HaW/hOQtA+y93n1MLeTjgIi7kAAHe/dDvSc5+7/16T4w8Qo1/czM4hFqq7mSh0LwMucPcrzew/M80HEsJiJeGcK5O29iNauKeSE4Hc/flmdqe7n2pmm6hWoDwk86D0dsZkPowjWk4QyuYponCXSUKNW0kW+gjb6fvd/ZSG57uKqCjHEgp1HFXPxQlBWJZr6KNqfRvRM2q78buZnUz4cI4nFNN0QgH2EMLgqTx2eubZVnd/Zl77INGavYCo2K8nlPFmwk9zdn6uy/RdD/yxu0/M9/ZVondxIGFiWU+8h5UZ3+FUO40dQn+HcXGGGtHK3J8QOMsyjjcQAvJZVKaIssex57Ufy9/HZR6WFu+5ee9/J+zk04jGxkJCcX6PUIqrCQWwimg87EP0oqcS770I4D/JtD1JtTIp+X88IXzPy7TdTjQeZmW6yraWj2X+QSi6FVRbXJYd3NYQLfLiLB5NNfihbn55FWHu+lzG91FCES3MfH8HoTw/mvlwDP0FevHTFBPgtURj5jv5HJ73XkqUnT8glMTRRMOgjyhvJ+fzHUm1a1xP5sMP3P0dDAIpgp2Amb2KqBD9zD2EHbFxb+H/j3A0jXP3I83sRGKW5xTvYK/RDtNzMfAVd5/VcPx7RMtzCVFBfkW0tCAmRT2Z4e5295PNbC7hnPwpUeDGUE28KcMeJwLvcvefmtmHiYr9OkJwnU21ounJefyZhDDYk2jxf4RoiTph9riWGKP9OqLLXcbSlxbQAsLcM5XwX5TlESAEwzpC8HyPEEqPEXMg/ohKAFp+xrt72Zay07ydRVT+BwhBci/Ru5ibz/kWYpmHPyUc2RMzH76b6djP3T9Wi+9OQjAelvkyJuN7Zj7LW4hRVe8hlMgqokfn+UzfIlreEC3OhwjBsZJomW7IePbNeL9J9BomEgrzr939VbX0/JgQ0L8g7OInUinUtXnvghOC+F2EwIQQ5BDlY0+q0UOlh1Ja2lsIRXQr8c4PJ1rqhxAKbS+iETI/07+cKA/fIfwkVxHmmf2Iclha7kVxQZTZsoXqhMyT/ajmUCwl/CCXEsL+OKq9m0fTf2/h6zIvipJyQjGd4DHabE/CLHk/VcOnzAgel2HL1qUH0l9Rl9/r839ZnG9i5tflhBzZQrWmUSnH38k4p7r7mXSIFMFOILvmzyEKQWnNu7u/I22qU939m2Y2iTCfvBS4xauhpvcR3eR1RKWo+w5WDCIdZeG1MYSgfCTjOiKP300U5jsJe/8vgRe6+8EN8dxBOICnp0KYRPQcXt/ktj8sz5HXvoYwGxnhtDsSeF/azssyugdRCZa9iZEbz6b/ngWlQj9GNfSzJ68t++A+2pCWA4nJTcelSeovCWfm84i9DO4BfuTu/5hpvbud8k1TR2EUobj+iKq3s5lQLr2EolpCjP76PKGwLJ9p37z+caJcHFW7x+FEQ6CYvDYRef1xQnmMJswjhxKtxS8T5ogJRC9ufKbpBsIcUnwlDxNCowji0YSy2kooACPKwaeA17v7ezI9FxADG8q+CGOInuo5wBfd/U2t8iuvv49olc8n7Okr89k3EILeCPPaRMJE1ksovcuJRsENee/
JmY+/JNY12mhmRxGt5JcSQn4OIXCfQSjICXmvPagGA8yjcubfkO/jnHwnxUewnjCVFXNO8Q+UljdUvar1RFmCaNCdTTRGIHpWZRTWC4h3WUw/pTeyJb/vIt7jDZkv5LV1ikJcR7yD9XmPYiIsebmWWBb8eXRKJx5lfQY9aqjpekDUZtjm/0OJuQLQf/LZTEJgFVvi/Pw8Msh0HN7iU8wOL699LiOcqOcQgqM+VLK+INansrCV5bJHAxfV7vme2u+PA4c1pGkBURlmUg3jLBWuzMadT7V2/maq2c4biC74FYQQK0spPEb/tX/KrOqy3MCjVOsKPUm02tcRQu1r+U72JUdmNMnH7+T3KrYdovpsQhA9SUyW+gQhdNdkOr5N9Jr+tBbfKeSorXq+EWa1tYQA/yLRuv800fI9hFCodxMK7KNULcHf5vPuRfSCZmacxQRWH0lTHKp3Z1m4g4ZRV3n/D+Vncd6r7A39BaKF+3lCMH0Z+HLt2qnE+lqziYZHGb65iOgh/zzT/WA+898SPZyHM72/JBy07waem3FekO/4/VRbqRZT0vIsEwuJFn1xsP8mv9+cv+cR5bw805pM1/H5Pu+o5ePniCGgpR6UGfCzCSU9vVk5ybSup5rAWJbpLqO1+ohycipR9zcSo+j68jmWZRn4EOHXKYsXrsr0f46wIDxC1JFbCTNX6Wlsolr0sKPRQr9L93ALzd3xQ+4X3OR4mWFbF/oraD7n4BHgwJ2cziPJJSuolE0Zx18+o4jewHMI5fQ+4LiGeH7TIv6lRK/oFbVjG1IglGGURVAsJFqeS4hu96sIG/q9+XmUsIVflelclNdcmpW5Pj/iXbXPiwlh/VxCeN1ICKClhAA6N/N8AfBUi+eYnRW3zJ7t96mFmU206GYSPZUy5G9SVuSvEj2eSVmpvw+clGE+TfTO7hvgnd2Rcd0HbMxj91MJ/3tK+co4/4xqGGpZCuOrmaf7EkKxjJ3/XB67l0rhLSeU7gb674P8EKF0zgPOq6Xv1nx3MwlF8xRVq7jMr9hC9D6WEQL5AULxlTkTZfRLKRtX5jv/WaZpbi0vlhOO7bKI241Ue0CUoaHr8vxPqIbMlvMzqRbP25DPtZ5w3BZFWobtbiHqxLWE0j+XWrlrV9/pv4xHfVG6shFQmR9QFFl9efIy8ez/tXfmYXdNZ///3ImQkZhpQ6oxU1SpkIgfqq2XVovQmkoHnVSat1qdNIYOvLQutIYOtF7VkpcihhapmlJBBkkFRaKmKk0lISKm+/fHd63sffazz3nOeXLOeU5kfa/rXM+z91l7r7X32Xvd656+90LkN5if2x9J6WI+wmPhnl/Y0FzQ25Pm2/FDVugjz40/i3IystmU5xzcTKj61MJx3k/GFDoArfxihbDxwIDw3YxuznMBsv0fSeWEPAM5HKcCXw9to6ngQFTacUnu8yqajP6OVoz9c33MINBIoAnum+G8xVyJ/OfikrHG7+4utL2OoKmVHHM8GRtkXkjGWPA4mcwPv/mS8ELODMefHF70pWH7gPBS70eh6hyBlqPGvd4a+XJuDeM5FwmxryAfyF/RRHVRGMdQssS2BWHcMeM5UmEcTFYRbja5erdoUhmGzB2PkPkYRlUZ37Twd3b4Ozz0tRWZRrpV6PtItBCKAnRUOGZNpHlcjMKsd0Ehq7GP2UhDehGZwy4P43+QjIb6FSSo4qr8X2QaxVIkWP4TxvNg+LwTPZebIS3iK2Ra+f+F3/focNz9aBFS61lb9r6T1d9eFH77PAdUPkEyLo5eINN2bw3b/8y1dbKExChcXkCa1fnUyUkWP/XUp01oHEU6hoixZnYRMNTMPoecjDehl3uV8DkA2QgfB2aa2W0U8guaOM5V3P218P9vyLJfz0Naym+QCWlyiAe/2sOTXUD/cFw+ld3DeJ80s7HArcHGvhD5KNZEWkG0Cz+NVp7boUgLgD+Z2Vh3fx45jh8zsy3QZHYUyk9YGPo5Jj8gM7vS3Q/J+Um
KmAGMd/cFof2aZJEgFXD3c4FzzewCd/9iro/huWbbovDPIeil3AA408yORqr+BWhiAUWJLXT3G8zs+4XuRgNHm9k8shBLd/lU+qIQ192D7+XZcB+eQr/TOORMnosiWMaQhSFG5+PaZOGJmyOz0DlIS/kiegb/X2480WRxNRIuhyCn7NwQaXZzuEfRd7XUzPoAj5rZcUhzWwUJ8dfDfXscaVhrIQ1lW3d/y8xmIA3pRWC8mX0I5Uzsjyb6SLlwKlrdgwTKhUhQxwCC6ch/EvNzPoies7FIMM4mcxZvGM7zpLs/E64TMxuI3ssYifN8OPYS9DytigIe3EMORQmuDBF3a4VxxboZQ5AW81O0IBmAzIWxPkUMKd4W2f/7h2OuQeGxB4Vr3g+ZAyehxeOGSAB+BrjJzDZz959UGVslWrniTJ/SFdM+aMI4M/z/CHppN6HSjv+psk+Tx3IL8NHw/xwkhCbnvs/TV8fMzQpqhW7O/4vw9yY0gTyNJqlV0MpnAZn6mydHi7xMp6IXfLcwtpfJVo8DCn2tjxzsN4XtMeGFGF7l00XLKdvXg3u6BjJxHYkEy4/JTBkL0WQ6j8yRW8yILh1v7vu7yLS4o8L3uyEt6QY0OY0jK585l2wVGTn7Y23qvxJoSML5RpErMBP2fQeZil4Nv0vk+X+CbGU9N9d+ZzQ5DUOT5lXhmqeTmZueDNd/L1rwHIME0u/DvRsTPr8M59wLFWIv3uuTwnmfR9pMpOqIZpzxaOJ8Nlz3rmiBM4cs0/r1cI53Fc59RrhPD4axfgMJlWhC2rv425SML0Y1LQi/wX/ITLAPoBDcf6P3Y0b47X6GNOJjySiz/xGu6X6Ug3FxuIap6D15NXz/aLi/eyFBVarhlo61NyfFlfGDVoofRZP/BnRDX9DisYxAKvBT4UGaBYwI3+0CXNrN8d8If88jOA3zn1y7+8LfvG8khljOCC/aGWg1/Gh4wf4WxhZjrpeSTUbR5Jan9Y7C5oGwvQrBPFFl7A+g0M24vVat9nXcy1iE59vIIfp8eIGPC9cWBcD54Zq/QhWOnm76uZTM+R05gKIzvcLUFCaN1ZGWkDc/RAruHXL3NRak362kzx2RED8z/B6fLv6eYbsicKDw3U5IQI0Ddgr7BpOZayLVdhxf5ITqsuCgki7lWrIKeJH1M9KKRMEQaxM8SsZdNRUJrTWrjHdeGEOsYjc3jDM+j9Ee/xDB9Ftyjgm56/Iqn5fDOaajXIw5ZH6NWH/4tXCN8b5ND20OD/cg+koWhXGejzSGf9T7XKXw0TbCzD6LXs4/I9V8DxTfviZdcw5mo1Xe1mRJNHgu1LCJ4xpMFrL5ZNi9MRmtryN1dThUmBPXcPdJOfbNIh5D9vFRaGKMyWSHocnxKeA6d78yjGMX4MvuflRkbjWzrZBP4RRkchuR78ADzYaZ3efuOxcYX2PlrTPIQlRjfPlxaNKeGE41FviBu/9vvfctj5C9vZDKzOYN0W/6NfSCR4bJ29HKPl7Dk9QJM5uAfBa/RBrBhSh2/wJXpnr++me5TEoxSS5mnkKWn3A6mui3QuG2m6EJZIdct/lcktvQJDQDCbHtCu3ucfeR3VzDWch0E6Oc7iLLYVkV1dk91cyOQNrU1MLx+ZDV/ijv5FkUvrk6eobfRWZK6ocmyRhsMBEJkDztw3c9FzpsZoeEc30UvafPoWfocbRo2DNsX4BMNcuexcJY90LC5PfoeXga3fPRKELvMmSSew4J0jWREIiV7V5Fmt5ryIR0BBJmQ9H7M5mMofcU5Gf8YO6aziuOqQxJELQRgXpiN3efH7bXRhL8UQo5B2hSnoCcZh9B6nMfdy+jo+jpeNYIfYxBL+A9aDX/UqHpN8nslssoKrybhJWQPTsePcQ/RJPNQ+hhPxhFg2yBVjOrILNKjIh4Admfx4TT3Y4ccg9W6esv6IW8JUxYI5EAeCfKyH2o5Jityfwaf/Y6eVmq9N8le9vMNkAC5iQkJCKvjJGjqHD
3bRrs64/IqXklJZnq7r59aBeF6Rtoonw99Ps0ioq5J4zr5TCWd6AV9P+6+48Kfd7g7vuZ2R9QBNYr4RznQea7MrML0D2fSJYQhbtfnTvXW8CR7v7b3L6fk0X57OXK/VgTuNndY5JjtfvRH2mI1yBn7oBwrjvRZD0O+bvWDocMRhN6rDq3FPkm8vkvUYgehhYzk8O5B6JgjwphW2NspyBh8Ti6/6uF//+JfDEbICHwJnoH4/MRieUGIEH3vXCer5Fl8sdCNB9EPobbwrPfH+jr7svuf3dIzuL2Yj6Vk+xLyN67U7GhmU1z98lmZmGlcbKZTaOcl6inuBiZYA4J20cCp7r7gflGYVWzhbsvpQSWkewVNYaF7n5TaDOKLHHpEXd/3cw+jF7QlyhQaaAV037IARgJ57ahuiM+skaOMLO7w3EHo5VyFyEAECb+Hk/+BUwxs/e4+2wz+xK6p+uiCTFOtNHW6+hFfz8KZa0bZrZ5OMdz4bwj0QRzKtIgv55r/kwITjD03JyLVsfno3DgdZCgvQ/4obtfW61fd4/Jg7FW9cIqTasFDlyd234CONbMRnhI5kOmI8JENiP8/6KZLcv0NrOPAYe4+2GFPqcg4bMqSrr7RLi2tYGvonvye/SsrYo08kfQQuR7SDvcoXDOuOB5D3By0LaOcnc3Mw/jGVTlHsTxRgr5g8M9eB4tTkDCaSF6ZvuQJXXORVnlq5IVVjod2NTdfx2CH76MEslWQabCsej9GRQEF0hAxLoU3SJpBG1ALiN1B/RgXYsejANQNMABxdWoiWtoNApb+zNa+Z3u7tU4d3oyrgqmyBr7bkIJZC9XOc8j6GUrTuZfROruArSijxENGwLfd/fpZSvpcM7FiIDsd2HXJ4CL3L3LyxeiaY5Hq9OisDkHrbquoTL66urieZYHZjYHRTvNQyu2hSjOfzurwiLr7v8ws9neQAaomcX4/ynIqRhf4Ce8wEMVsnoHo5V+P7LCK0ZG470I2f7HIHPgo4jN9Fc1xlAzA7uOa5iOhOC5aNKL5o7XkeP738ivcRaa4LdyRY7FiKP1ctft6B2KHP4RkVDvdbJcGFA46jvQ3LeDme2BTEavufvQ3BjLeMHmoQidfZDQ/TRweTXzi4lc8gp3n2ni27oFOawdaWP7IG3jcHTfXwzn/xYSpu9Fz/VM4FqvLFL1PbSIuQg9D6+QFfsB+W+6L0gTkDSC9mBI+Pt4+ERci1YjM4vhgmjCHIgehNPQCquaLb6nWGJmo939Lli2al8SvzSz88gocmeaWReq7PDvC+5+XfHkweYPsv1+AD2wm6AX/wK0Alq2ki4Z3zx3fyOca161i3D3N83sk+5+NjKx5bF6GP8H84dQuUJtBvatMb7ox6igqDCR1j3bYD9vAC+WrIrLEFfxe6NnZwP0zg8L++cjW/q4sH8wMjHsgSKwusDMNgM2CYKvi+/KRPtwDtJUHEUmfdXd87+fhd/1SyG89i4kPE9E+QpLwpgHh+3bg3/NEHHfEDNbC/kz+iOm37+gRVPE73L/vwf5VBaE/V8EXjWzd5GFShcJBg9B2udZ7r7AxAs2KYx1ETLdfs/dbym7T+GefMvMtjSFT/dD2uEl4esNURjupkhDOg+FU38EmU83QhrkF5Gg+Grh9MOCH+UGAmswut93Ii24buZRSBpBryHEWg9GL0AXlDmeWjCGHZC6vEbY9SIKUZ0Vvq8peOIK1KqQ7MVVd85WnaffjvvyK+mlZGyP96IXMYaU9kGhoZ+tci2xdsEVVNqm61aPlwdWpVZB3hEcHL0RsbjLVfW8tGHiAy0MRqJ7neehWuBVeKhKNLZY1vAZNLmtipzc96IIryk1xnFXaLsnJb4rM7sHhUDmNbmvuPsuuXN83t0vym2/DwUJfNrMtkRCwFAo80NmNgYlWq6CVsubIeE1DK2W9winegwJhmhOOj/8vREJtvic9yXTjn5NVn/i0mrXHcY5AQmI/6DnbKK7/6ubYw5AZpwjkYB
7LvS7LkowXK/Qfo43UIrWxLC7CAkRkBb1uLsfUv2okvMkQdA+mNnlyNzxJrLLrg6c4+5n5tpMojwBCoDuHLQNjmc1ZL8cgRxnC9XFMrttbDcI1Sl4M2z3RUydr4TtUpI95CiegFToWP91O5Tlea+LqrpoNrkz/H0TZV7HmrFvhbGtRQlM0SyxX5CZ48kwpi7305ubmJcn+FsuR3CN88/LnX9YSZOnvEpEmVWhRQ/frevuL9Q5htHAZe7+riDA93P3ecGf9b7Qpovz1MweiA7sbs6/v7tfX0e72UjLvCeYd/ZCgm4AmmD/SabJgrScSKd9FyIfHI4EwPOhzYx6nwkz2w6Zrg5CpIYfqOOYJ8N47kXO+seQlnaiu98T2oxB2tR97n5s0L62qHVPioLDeljXPJmG2out3X2RmR2O7JLfRKurM3Ntzgp/D0SqfJ7nvebqowe4lqxgzTM12k1Gpp3oIxiAInqi3XXnMt+FmV2FnNG7oVXblohT5stIQEDXCKXon7icrsV0uiBnbrmeSirfrZEQiHVnW4qind+yWgX5feuixKRtqDSrdGvLdfdNlmN4E0wZrmUaWx8z+xXwDnff1xRJtWvRRxBWwzsB6wdt9gmULX4ClSyxN5kYS3+P7vuhwI1Ro6mmtQSMCH19jXLhHbNkX3X3V82sT1jMzESC19EzHRHNnPPQYmQO0qaGIadt/l6cUmNcRTyPVvbzkb+iHkQ68IPQ+xPzCKYEIeHITPc82Xv1DDIn1RKO081sZBQmaPE0tUb7UiRB0F70M7N+SFX8aXBmVjTwUB7RzH7sldFEk8zs/iaPZ5i7V4vCyaN/3lHs7i+bUvAjppjZ1kWHN0pOOwiWmTZeRQLhDZRIdA0ShPGlz/Oxx9XbpvnVUcnYov9lC7KqVYbi1FdDK6xv07VsaE0zwPIiOMJ3Kez+LTIp7I80w0+hMNmGYGbbIsH8IeRkfQ7Y0t2/XeWQY5AQjg5jyPwkv0Z26++E/X8PYyz6CD6OnJcPI9/VsShk80gqfVfRJPF5Kn8zZJJpAAAWCklEQVTXT4TtWnkw88PfwTXaADxtZkORML0FrfgNPVdHEOpbeyW99/lk9a8nhev7V7UAiDJY12iwz5U889UwEC3qFiKBOgY5uPM5F5OQue0adAGvWHGCyMYSNdB+VAqT4dRYOFVDEgTtxUVoJfUAcEcwi1QLwxtkZu9297kAZrYJkvbNRC1HbR6LzWzHaG83hYsuyX0/knKH9+JgTtgbPfzbo1VO5E8BrQIPJ0si2hg50i5HiTs1V0dxJWdmdwA7uvtLYftklIhzGeURTU1F0RGMIk2KjuC13f1XZjYuCPzbzey+BvuZgOLPR6JJ8BRkbhiCBF4ZSjW2gHXc/Uoz+xaAu79hZmUTyWvu7ma2OCwEHJlFDiy0OxH4Y9B8T0L34bQ6fTWDgtlxUXD8l8LdPx7+3dwU9TMU+dr+B63QnwLWMeVPgGzooGduADIbHogcxktRctxLdZhdN0KO75ndtCvDdkgY90eC6k7gC+7+dGxgZi8jjSCGp8aiM2XYHwnbi9AzUKFBmdm5jZg/kyBoLya5CMyAZXbDaiXlxgN/MbO5aGIdTlZ5arlglQVrjgl9VBCcFQ4ZB0w0szixbYhU/ohqWsW6aOW1FZoU1wF+7u7Pmllcyf+MkESEYuFvQmq+Ia3hTVNRGSMjoyvD+miFFfFa2Fca0dQCDMn9HytYXVVoE7N6/2lm+6F7UurzKMKU/HcZupebIztz9DfdS2XoZBHVNDaQsF6bbPIZSfniZJCZPQGsa8rY3hjlKVwHFb6r7wbBMhr9pmeRRYjVRHQgm9knUSJlt3D3201JbJGmeWfke5uE4u9BwQMfRqbV7ZA2+GDY3gH5DE6uo69vddemBt5APoLpKPR3NRSosXeuzQTEo7WRmf0WZeQfXWUs/zCzKeHvbWVtGoL3Es/NyvihpPAJgba3SvvV0Cp6e+ScbdY4htf6lLQfi16ubVE
Czg1o9V127mPz14tC9+4Px89D9stBZPz5kZJ7Rm5sc5DJaHPkYxiOTD0za1xTJEc7OXxmogiTvamDO75Nv//+KHJlW2TSmUYg/avz+DWRGehipOFsgwTkw8D7axxXSosevtsRMWAuCH//jsotFs+xB8pqnRb6OwmZMS4Hzs61y9dCOCy/r4HrPBsxc+4exrdjtecttI903/l6EX+jsl5EHzQBP1ByfJd9LfjtY/2DSLkegx+K7dZGYb/70009ErKCSeOWd3xJI2gDQkjcNsAaZpZXpVcn5zQsHHNUYdf2ZoZ3E+JWD7zx0NST3H1isMvuSe1V3hcQpz4oImki4lOfhJzfZ4ftX4Q2rwdzQFRtY7GNb4Y26yN6iqqro3BNPzAlvu0edh3j7jNyEU1l9vGmwZTxewKVvgg85wj2LPpjIbqPDcGVaXs1EmwXoWpeTyPG0HtrHFrLDzQHxerHWtDXIGFQ7Duasj7p7lvmvir6rmI28z7AGcGZ24fGEAMGogM3mhqrOdWHh2doC7To6IO01uh/ejcKOV0PmB3MQVF77A+sYmaL3L2YS9BMPEbGJTQj9F/mL9kDJZJG+/8fStpEvM/M3gF82swuJfOxAd065iuQBEF7sAWS8EPRKiriJaSWliHPr9IfrWyn02InZxVEm/F+iFq6jEc/YtnD6O5zg5p/DVLPL0QmhXwizrnoYV/PzH6AhMd33f2WEAo3MpxznLv/u9YgXXbooi26ln28mZiIru+XVHHWhaihz9FVWFQzD5ZhdeQcPRI5HI8C7jYz87A8LKIbwX8psqH/MGwfhqiTx4YxxxKfEYMK+7an0ndVloiVp72oB8UIMAcWmdkOXm6fPw49Qy8gbWcUEvrroKioRUiTOhFl6Z9CNtneiUK566/v2zO8jITTGsg8dR8F+39waG9KloPxeTP7gLt/uco5L0SRYO9GQi8vCLpzzFcg5RG0EWa2q7v/tYfHDkWVo+qJ8mkqrDzdfhm5WaHtMGTnzz9Y66FV8FIA7xpnnk8imuvuN4bwyy7wBhPEzOwS4ExfDkK5OvtZFktfo80UNPHkGUpx96IvoezYPijscVdEz/B+NKHESmWruPuh1c9Q9bxdEpiq7DsNxef/EwnvJWSJW5939z/RJJjybXZCPDyGFlGzkACd6O7/U3LMlmSa6DvdfYQVSOuC1nCru3fRxizH2toKWMbemmeHrcivMJE0bhUFevjNH3T3rbo5d0XBpJ4gaQRtgJl9Izy8h4UVcgW8Pu/+YuRE7Q3UXOUF9f8gspXuzeGrfAjhoFz7l5B9dHX0z8PI7oyJhfJGKquF5YVK3fwpAaURTUVh1FNYlvE7KYQX/oHK+PS8ej7Q3U/sYVdroolxOjJ9LETCOUZw9XRFWxGHbgp5LQtT/qhnrKY3InPblcifUC2ypacYhnwCL4f+JiC/1BgkRLsIAnd/2MwGe460DoVsDjUlakW8ZaK3jslmfdA9bIiSoQfoa6Jo2RIgRDutW2jzGNKYowa3UdhXCjNb3d0XAd/JPYfLkExDnYfIfll3HoBVZhj3QckoVzZ5XHXBlUF8dW47rgwjriXj4l9KoHhw9+7iwcv6Ojb8ewElYYg9GH6rNahoh45qed4MUlTPrzez/3L3GxvtxN3nm9mFyGQzCHH4/B0RET5f8+ASWO049IdLDllsSoSMc8ZI9Fwe2izfVQ7rUWk2eR1Y392XBPt+NSzzN5nZGShAYA2y38SRieZCFLEVC+EsQgSQrcSq6F73zZlAI49WfNeHAA+Z2b1hexcUEVYNlyNtqfgMQjINdT7MbGCYXGu12SO3+QYqFvJ0tfa9CavCILqc54x88KORADgL+Ra6DUPsVARNaBByFL5Gpp3U7aQ0cSq9D02UdyOH8V/dfUnNA7uep0jtUYGiX8FE0HYOGXnfs2TUydPd/eBG+u9mbCehBLaYJfwRpA39GIUfH17luMNRWPOOKF9gPqrxPDHX5lMlh3qTBVnZ2O5Dfp1
bUFbzZOB3njGgVkVw1Nc692WI3ffOoF03Pr4kCNoHM9sVxdUPdveNzWx7ZF9tiJO+0xDMOed594lpjZyzKlFds/poBsxsL3f/cyEabBm8yXTXuX6HoCiqE4AN3H01M/s4MMrdT2hFn1XG0RLflSlpcVTYvNvd69Kmc/6mLwFHufu0wvfjkA/rAqRlbBu0h5fcvVoAxHIjRLQdh3wcO5rZwcBn3H3f8H1V/0Ud594TRcvtjhI0pyOhcE7d50iCoH0ws6lIJbwu5zCqxsc/ElHTboXUyr7A4haHuDUEq0xM2wwV1WiKHb4RB3VvwsxOcfcJwSldhOcjgszMyLKoTzOzjYANuwn9LPZ3HEq0G4SeiweBM9z9j+H8j7p7rcS7HqFaxBPyA/2tTZFZ3cIy6vR3ooimYgnY0Sha7+uovsV7g0+hX7O12sK43o2c2bshlt95wOF5zSv4EA5092psA7XO3xdFGu6JQriXFMJ8ax+fBEH7YGZT3X2XWpEDubb3I36WiciZdRSwuS9fdmNT0ah5ocFzD0T2/dnu/mhwUL/H3W/u5tCOhWUZsA2VYiyc4wQUJvkntEjYF8WeH4Rs9n/x1tS1jhFPe1PJIbQWcKW7f7PZffYEVUw/IBPWrkiQga5lD7JC8Gt7oSBTk8cVKUgGoN9pMcGvFkNizexaxOd0C5VU6jWDSYIAiX6jO4G7GvUbJWdxe/GUme2GnFn9EHVDaRlFAHd/zMz6uuifLwkrl44RBJ4VXBmJwtwiz8/qaJLqsSCow0HdUTCzmPhWi8VzF69RirEeuPtZuc3ngNtCJNrdSGssFjBpFga6+4md7rvyrEZGkTr9DpSpfjISYL9DETxfQ7H7x7R4aDtRGRJ7BAqJ/YKZxZDYq+lZouMs5DfaFgmXBWbWmN/IW5xanT4VKeHrIAbKf6FV3WVoJVLW9g6k+l+KwuXG04ZU+B5e1wyCdhm2+1BCp/F2/iC78yHxN0KLrNmFNlPRZB1pNdalAfoFatAstOH6vg/8V2/f5wbGew/yxcXtwYhiYl8UTXMrCiF9BoVoDm/xeO4oGc/tSEOY06Q+hiCz4T9Q0Zu6j00aQRvhyowtjXgowZFoQj0OCYGNkPrfiajIanX3t8xsZXu26mHxLM2ibqCPQ1Hcf1sd0wHjgG+ZWSwwFGkj3qLDfFcB1ajTT0KZxZcgvqcPoYzfTyHSw1ah25BYE9X6j1CoeJcyoNUQ/Ea7I63gCcRFdWetY4pY2V7WXoWV13Md74FqOg/P7OuvknGudCrmmtnxKBIDFLHR5Zre5uiWxdPdf2tm08iyqD/m7lVNgyWI2bsfKfmu6fxJBayBFjGnI9v6Nch5vCUyuXQaqlGnfxRpAPej32Ao4rRqmMO/QfwWmBr8ABAI+4IJK2a9X4IYSM9GTt9jqI+nqT/wE+RvqMVCWxXJWdxGWB31XHNtRyF75nAqeWma7ghcXpjZemi1uxeakCYj3vaGE51WVJgoMc5Ddtq/IbPPwe4+qyzrMw+vMwM0Rigt92B7gJyj+9PuPsDMHgRecfedOzSsdydUYKeCOt3dp5nZQ2hFPg1dT1smwe5CYi3QlJjZbA8V76wO6pKmjC0JgvbBGqjnauIdGU9XXpr5xbYJvYsQunc8EgRboJXmI+7+evh+HuWZnzHMtiHhHmL3j6IreV1T6zAX+oxcOS+jSKFfoqiuHwFHlz3DvQkzG4s0qFiE5ntIu34LraCdUMmMBpP6WoUQmTUa+D9EjvcMyhxveWhuMg21AbkVYWk91yqHLXT3m9oxvuWFmfUHPkPXWryNsGqusHD3N030zGejuP7i983miLoROUNbWnWtgEjf8AQyV0xAOR6d6rsqUqcfQMhMN7M5KFLoGULeS9kirRcwDvEjHY+y6feksgxoy5A0gjagyoowonRFaGanowiTq6kkMWuIfbMdMLOJiJ/mMORwOxx4yN3H9erA2ggT9UM/ZI7Ix4BPtypMqvk2DfY13d1rnrPZsEr
6ht+Q0YVPrHlgL6FaZjoa//FIgJ2HTEbroWivHoc7NwNmtjswxUPIa9i3Yzve+SQI2gQTpeyu7n53ne1vK9ntnit00inIvXSRH6gfSnEf2e3BbxPkfq98spW7+15VfsuIhn9TMxuPyNOupzrTadNhom/4HKpL0Z+cNtJpvqtqmekohHS5kvpaBTN7BdUpGBv9a+0S+sk01CaEkMqfoszBeto3zDnSi4i1eBeY2bYo0Wm9XhxP25DLGC0WUyFst+K3fA04E5XnjIKnIbbJnsBF9bwfJb6rDkQ16vTTfTmT+lqIR9DveruZfcbdp1BuRWg6kiBoLyab2UHA1d1FKpiKlU9AHOyg5JNTvQc8JG3Az8PK6rsoc3IwitdeGRCL1m+BuF6uRS/vRyhQCIc49v8GNnb3Y0Pc+BaelbCsF18DNvVuKra1CCuE78qrZKab2WmWK41q4lBql5+lO7i7X29mjwBXmNnFVNbiaBmSaaiNsIyG+A0UwVCVhtjMrkJhiL8Ju44Etnf30mSi3oRVFqbpF3a7u7cyQaejECgM9vOMZmMIcIO7j8m1uQKtpI9ysV4ORDbhhjhuzOxmlINQk8q8FViRfFdl6GRfh1VykA1CeQUHunvLF+xJI2gj3H1IiCDajCpF63MY4e75aIxTzKysXmsnoFiYZmXE+mQF0Qn/r19oM8LdDw3cQLj7K2bWE9V/Maq6dhuVk3HLwkdziDkvO+X21Sos31FoQlJfSxC0lGU1Edx9MXCImW3cjv6TIGgjzOyzKERsGDATZRhPQQ9lEUvMbLS73xWOHYUcXp2IYd4LtZQ7DJcC95rZH8L2x4BfF9q8ZmYDyMwSI+iZ4LwmfNqOFcx3tQyFpL7nyZI6MbO1Wu1o7w4xBBllFef3P9mO/pNpqI0w8ffvDNzjqky0JfDDMnOPqWjNpSi1H8Rh/il3n9W2AdcJa0FhmhURIUx097B5h7vPKHy/D/KjbI3qOo9CyVh/aec4lwcrmO9qGQoh3PlJr0dJfa1ArRDklvedBEH7YGb3hZT8mYiSeKmZPeju2+Ta/Hf+ELKi74vRA/uTNg65LuQSdFpSIP7tAlNJwVlIs5sLTO2Jwzc3qVWgHZPZiuS7KkMI447FgU4NppcN3X1qLw+tV0PGk2movXg6ZDpeA9xiZi/SlbO/WhTKEdQuZN2b2Le3B7CC4FdIY9gHlRScYWZ3eAMlBQPy9vn+qKB9TT6jJmJF8l2V4WeEPAKU/PgScBV613oVvWl2SxpBL8FU4GMN4I/u/lrJ991GoSSseLDlLClY47ztIScz+yvw9YLv6ix337XVfTcDOc6kbqsE9gZCnkaRqqXl0XdJI+gluPvt3TSpJwolYQWCdS0puLP3gKG1QFnRB2kI7XqXvwBcGnwFEHxXbeq7GYicSR2XR2BmFyKuoT0Rqd/BtMkKkARB56KeKJSEFQvLX1JQ+DGZj+ANRAQ3tlmDLEPBd3Uplb6rD6BrWxGwvMWBWondAkXLLHc/xcx+jCrftRzJNNTB6C4KJWHFRDDzHQ2cAGzg7qvVeVycjGPkS8xBiFQWLQskMLNYB6E0g9rdj2hV381GiNaLeQSTOyGPAMDMprrYUe9B1NnzUS3wTVvdd9IIOhghbGyFyNhM6B62/CUF66azaDbc/RRY5rvaMee7Ohm4oZV9Nxvu/jBiy+00XB+CSc5E772j6mktR9IIEhLaBDM7AU38PS4pGM7Ta4EEgQdnO3ePdXZXA2Z5G4qnrEwI97V/u/IzkkaQkNAmuPtZTTpVbwYSJN9Vi2Bmd6EEvTtRKcu2JekljSAhYQWDmX0H0SznJ+Mr3P1Hbeo/+a5aADPbBN3X3RH9zFJU12N8y/tOgiAhYcVDmozfngh1E/ZAv+2ewJPt4PFKgiAhISGhA2BmjwP/Bi5H5qGZ7t6WHIckCBISEhI6AGY2DhiN6ik/jPwFd7j74y3vOwmChISEhM6BmQ0GjkF5JsPcvW/L+0yCICEhIaH3ETK
JR6NSr1OAu5CzeG7L+06CICEhIaH3YWYHo4n/X23vOwmChISEhM6Amb0TGE4ux8vd72h1vymhLCEhIaEDYGanA58A5gBvht0OtFwQJI0gISEhoQNQpO9oJ/q0u8OEhISEhFLMRTWL245kGkpISEjoRZjZecgE9AowMxQwWqYVuPvxrR5DEgQJCQkJvYv7w99pwHW9MYDkI0hISEjoAJjZIOBVd38zbPcFVnP3V1rdd/IRJCQkJHQGJgMDctsDgFvb0XESBAkJCQmdgf7u/nLcCP8PbEfHSRAkJCQkdAYWB3pxAMxsJ2BJOzpOPoKEhISEDkCY+K8Ang27NgQOdfdpre47RQ0lJCQkdAY2Ad4LbAwcCOyCwkpbjmQaSkhISOgMnOTui4ChqDrZ+cAF7eg4CYKEhISEzkDkF9oP+IW73wCs2o6OkyBISEhI6Aw8Y2YXAYcCN5rZarRpjk7O4oSEhIQOgJkNBD4MzHb3R0Mh+/e4+80t7zsJgoSEhISVG8k0lJCQkLCSIwmChISEhJUcSRAkJCQkrORIgiAhISFhJUcSBAkJCQkrOf4/rkQyWNNX7+8AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# fequency bar plot - it takes time!!\n", + "w_count_df.plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb similarity index 99% rename from dataset_analysis/AnalyzeDataset.ipynb rename to notebooks/dataset_analysis/AnalyzeDataset.ipynb index 3ed54ded..e7848fab 100644 --- a/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -27,7 +27,7 @@ "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", - "from TTS.datasets.preprocess import *\n", + "from mozilla_voice_tts.tts.datasets.preprocess import *\n", "%matplotlib inline" ] }, diff --git a/dataset_analysis/CheckDatasetSNR.ipynb b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb similarity index 100% rename from dataset_analysis/CheckDatasetSNR.ipynb rename to notebooks/dataset_analysis/CheckDatasetSNR.ipynb diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb new file mode 100644 index 00000000..af00deaf --- /dev/null +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "# Jupyter Notbook for phoneme 
coverage analysis\n", + "\n", + "This jupyter notebook checks dataset configured in config.json for phoneme coverage.\n", + "As mentioned here https://github.com/mozilla/TTS/wiki/Dataset#what-makes-a-good-dataset a good phoneme coverage is recommended.\n", + "\n", + "Most parameters will be taken from config.json file in mozilla tts repo so please ensure it's configured correctly for your dataset.\n", + "This notebook used lots of existring code from the TTS repo to ensure future compatibility.\n", + "\n", + "Many thanks to Neil Stoker supporting me on this topic :-).\n", + "\n", + "I provide this notebook without any warrenty but it's hopefully useful for your dataset analysis.\n", + "\n", + "Happy TTS'ing :-)\n", + "\n", + "Thorsten Müller\n", + "\n", + "* https://github.com/thorstenMueller/deep-learning-german-tts\n", + "* https://discourse.mozilla.org/t/contributing-my-german-voice-for-tts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# set some vars\n", + "# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n", + "CONFIG_FILE = \"/path/to/config/config.json\"\n", + "CHARS_TO_REMOVE = \".,:!?'\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# import stuff\n", + "from TTS.utils.io import load_config\n", + "from TTS.tts.datasets.preprocess import load_meta_data\n", + "from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n", + "from tqdm import tqdm\n", + "from matplotlib import pylab as plt\n", + "from multiprocessing import Pool, cpu_count\n", + "\n", + "# extra imports that might not be included in requirements.txt\n", + "import collections\n", + "import operator\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "# Load config.json 
properties\n", + "CONFIG = load_config(CONFIG_FILE)\n", + "\n", + "# Load some properties from config.json\n", + "CONFIG_METADATA = sorted(load_meta_data(CONFIG.datasets)[0])\n", + "CONFIG_METADATA = CONFIG_METADATA\n", + "CONFIG_DATASET = CONFIG.datasets[0]\n", + "CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n", + "CONFIG_TEXT_CLEANER = CONFIG.text_cleaner\n", + "CONFIG_ENABLE_EOS_BOS_CHARS = CONFIG.enable_eos_bos_chars\n", + "\n", + "# Will be printed on generated output graph\n", + "CONFIG_RUN_NAME = CONFIG.run_name\n", + "CONFIG_RUN_DESC = CONFIG.run_description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "# print some debug information on loaded config values\n", + "print(\" > Run name: \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\")\n", + "print(\" > Dataset files: \" + str(len(CONFIG_METADATA)))\n", + "print(\" > Phoneme language: \" + CONFIG_PHONEME_LANGUAGE)\n", + "print(\" > Used text cleaner: \" + CONFIG_TEXT_CLEANER)\n", + "print(\" > Enable eos bos chars: \" + str(CONFIG_ENABLE_EOS_BOS_CHARS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_phoneme_from_sequence(text):\n", + " temp_list = []\n", + " if len(text[0]) > 0:\n", + " temp_text = text[0].rstrip('\\n')\n", + " for rm_bad_chars in CHARS_TO_REMOVE:\n", + " temp_text = temp_text.replace(rm_bad_chars,\"\")\n", + " seq = phoneme_to_sequence(temp_text, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n", + " text = sequence_to_phoneme(seq)\n", + " text = text.replace(\" \",\"\")\n", + " temp_list.append(text)\n", + " return temp_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "# Get phonemes from metadata\n", + "phonemes = []\n", + "\n", + "with Pool(cpu_count()-1) 
as p:\n", + " \n", + " phonemes = list(tqdm(p.imap(get_phoneme_from_sequence, CONFIG_METADATA), total=len(CONFIG_METADATA)))\n", + " phonemes = [i for sub in phonemes for i in sub]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "s = \"\"\n", + "phonemeString = s.join(phonemes)\n", + "\n", + "d = {}\n", + "collections._count_elements(d, phonemeString)\n", + "sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))\n", + "\n", + "# remove useless keys\n", + "sorted_d.pop(' ', None)\n", + "sorted_d.pop('ˈ', None)\n", + "\n", + "phonemesSum = len(phonemeString.replace(\" \",\"\"))\n", + "\n", + "print(\"Dataset contains \" + str(len(sorted_d)) + \" different ipa phonemes.\")\n", + "print(\"Dataset consists of \" + str(phonemesSum) + \" phonemes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"5 rarest phonemes\")\n", + "\n", + "rareList = dict(sorted(sorted_d.items(), key=operator.itemgetter(1), reverse=False)[:5])\n", + "for key, value in rareList.items():\n", + " print(key + \" --> \" + str(value) + \" occurrences\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# create plot from analysis result\n", + "\n", + "x = []\n", + "y = []\n", + "\n", + "for key, value in sorted_d.items():\n", + " x.append(key)\n", + " y.append(value)\n", + "\n", + "plt.figure(figsize=(50,50))\n", + "plt.title(\"Phoneme coverage for \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\", fontsize=50)\n", + "plt.xticks(fontsize=50)\n", + "plt.yticks(fontsize=50)\n", + "plt.barh(x,y, align='center', alpha=1.0)\n", + "plt.gca().invert_yaxis()\n", + "plt.ylabel('phoneme', fontsize=50)\n", + "plt.xlabel('occurrences', fontsize=50)\n", + "\n", + "for i, v in 
enumerate(y):\n", + " plt.text(v + 2, i - .2, str(v), fontsize=20)\n", + " plt.text(v + 2, i + .2, \"(\" + str(round(100/phonemesSum * v,2)) + \"%)\", fontsize=20)\n", + " \n", + " \n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9-final" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/dataset_analysis/README.md b/notebooks/dataset_analysis/README.md similarity index 100% rename from dataset_analysis/README.md rename to notebooks/dataset_analysis/README.md diff --git a/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py similarity index 95% rename from dataset_analysis/analyze.py rename to notebooks/dataset_analysis/analyze.py index f34605dd..161e2ae3 100644 --- a/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -71,7 +71,7 @@ def process_meta_data(path): def get_data_points(meta_data): - x = [char_cnt for char_cnt in meta_data] + x = meta_data y_avg = [meta_data[d]['mean'] for d in meta_data] y_mode = [meta_data[d]['mode'] for d in meta_data] y_median = [meta_data[d]['median'] for d in meta_data] diff --git a/requirements.txt b/requirements.txt index 862cb229..fdec4c57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,24 @@ -numpy>=1.16.0 torch>=1.5 -librosa>=0.5.1 -Unidecode>=0.4.20 -tensorboard +tensorflow==2.3.0 +numpy>=1.16.0 +scipy>=0.19.0 +numba==0.48 +librosa==0.7.2 +phonemizer>=2.2.0 +unidecode==0.4.20 +attrdict tensorboardX matplotlib Pillow flask -scipy tqdm -soundfile -phonemizer 
+inflect bokeh==1.4.0 +pysbd +pyworld +soundfile +nose==1.3.7 +cardboardlint==1.3.0 +pylint==2.5.3 +fuzzywuzzy +gdown diff --git a/requirements_tests.txt b/requirements_tests.txt index 5aacdb56..f37cda19 100644 --- a/requirements_tests.txt +++ b/requirements_tests.txt @@ -1,16 +1,20 @@ +torch>=1.5 +tensorflow==2.3.0 numpy>=1.16.0 +scipy>=0.19.0 numba==0.48 -torch>=0.4.1 -tensorflow>=2.2 -librosa>=0.5.1 -Unidecode>=0.4.20 -tensorboard +librosa==0.7.2 +phonemizer>=2.2.0 +unidecode==0.4.20 +attrdict tensorboardX matplotlib Pillow flask -scipy tqdm -soundfile -phonemizer +inflect +pysbd bokeh==1.4.0 +soundfile +nose==1.3.7 +cardboardlint==1.3.0 diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 00000000..27f54b24 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,12 @@ +TF_CPP_MIN_LOG_LEVEL=3 + +# tests +nosetests tests -x &&\ + +# runtime tests +./tests/test_server_package.sh && \ +./tests/test_tts_train.sh && \ +./tests/test_vocoder_train.sh && \ + +# linter check +cardboardlinter --refspec master \ No newline at end of file diff --git a/setup.py b/setup.py index edffc801..c40f77e6 100644 --- a/setup.py +++ b/setup.py @@ -19,12 +19,12 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.3' +version = '0.0.4' # Adapted from https://github.com/pytorch/pytorch cwd = os.path.dirname(os.path.abspath(__file__)) -if os.getenv('TTS_PYTORCH_BUILD_VERSION'): - version = os.getenv('TTS_PYTORCH_BUILD_VERSION') +if os.getenv('MOZILLA_VOICE_TTS_PYTORCH_BUILD_VERSION'): + version = os.getenv('MOZILLA_VOICE_TTS_PYTORCH_BUILD_VERSION') else: try: sha = subprocess.check_output( @@ -36,7 +36,7 @@ else: pass -class build_py(setuptools.command.build_py.build_py): +class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors def run(self): self.create_version_file() setuptools.command.build_py.build_py.run(self) @@ -56,11 +56,11 
@@ class develop(setuptools.command.develop.develop): # The documentation for this feature is in server/README.md -package_data = ['server/templates/*'] +package_data = ['mozilla_voice_tts/server/templates/*'] if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: print('Embedding model in wheel file...') - model_dir = os.path.join('server', 'model') + model_dir = os.path.join('mozilla_voice_tts', 'server', 'model') tts_dir = os.path.join(model_dir, 'tts') os.makedirs(tts_dir, exist_ok=True) embedded_checkpoint_path = os.path.join(tts_dir, 'checkpoint.pth.tar') @@ -69,17 +69,37 @@ if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config: shutil.copy(args.model_config, embedded_config_path) package_data.extend([embedded_checkpoint_path, embedded_config_path]) + +def pip_install(package_name): + subprocess.call( + [sys.executable, '-m', 'pip', 'install', package_name] + ) + + +reqs_from_file = open('requirements.txt').readlines() +reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')] +tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')] + +requirements = { + 'install_requires': reqs_without_tf, + 'pip_install': tf_req +} + + setup( - name='TTS', + name='mozilla_voice_tts', version=version, url='https://github.com/mozilla/TTS', + author='Eren Gölge', + author_email='egolge@mozilla.com', description='Text to Speech with Deep Learning', license='MPL-2.0', - package_dir={'': 'tts_namespace'}, - packages=find_packages('tts_namespace'), - package_data={ - 'TTS': package_data, + entry_points={ + 'console_scripts': [ + 'tts-server = mozilla_voice_tts.server.server:main' + ] }, + packages=find_packages(include=['TTS*']), project_urls={ 'Documentation': 'https://github.com/mozilla/TTS/wiki', 'Tracker': 'https://github.com/mozilla/TTS/issues', @@ -90,25 +110,24 @@ setup( 'build_py': build_py, 'develop': develop, }, - install_requires=[ - "scipy>=0.19.0", - "torch>=1.5", - "numpy>=1.16.0", - 
"numba==0.48.0", - "librosa==0.6.2", - "unidecode==0.4.20", - "attrdict", - "tensorboardX", - "matplotlib", - "Pillow", - "flask", - # "lws", - "tqdm", - "bokeh==1.4.0", - "soundfile", - "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master", - ], - dependency_links=[ - "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1" + install_requires=requirements['install_requires'], + python_requires='>=3.6.0', + classifiers=[ + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + 'Development Status :: 3 - Alpha', + "Intended Audience :: Science/Research :: Developers", + "Operating System :: POSIX :: Linux", + 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', + "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence", ] ) + +# for some reason having tensorflow in 'install_requires' +# breaks some of the dependencies. +if 'bdist_wheel' not in unknown_args: + for module in requirements['pip_install']: + pip_install(module) diff --git a/speaker_encoder/config.json b/speaker_encoder/config.json deleted file mode 100644 index 0d0f8f68..00000000 --- a/speaker_encoder/config.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "run_name": "libritts_360-half", - "run_description": "train speaker encoder for libritts 360", - "audio": { - // Audio processing parameters - "num_mels": 40, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. 
If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - }, - "reinit_layers": [], - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. - "output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
- "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "model": { - "input_dim": 40, - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 - }, - "datasets": - [ - { - "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-360/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-100/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb b/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb deleted file mode 100644 index 159f040c..00000000 --- a/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb +++ /dev/null @@ -1,325 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Overview\n", - "\n", - "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", - "\n", - "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import glob\n", - "import random\n", - "import numpy as np\n", - "import torch\n", - "import umap\n", - "\n", - "from TTS.speaker_encoder.model import SpeakerEncoder\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config\n", - "\n", - "from bokeh.io import output_notebook, show\n", - "from bokeh.plotting import figure\n", - "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", - "from bokeh.transform import factor_cmap, factor_mark\n", - "from bokeh.palettes import Category10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", - "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", - "\n", - "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = 
\"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", - "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls -1 $MODEL_RUN_PATH" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check that we did indeed find an embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embed_files[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " 
speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up the embeddings\n", - "\n", - "Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embeds = []\n", - "labels = []\n", - "locations = []\n", - "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", - "\n", - "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", - "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", - "\n", - "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", - " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load embeddings with UMAP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = umap.UMAP()\n", - "projection = model.fit_transform(embeds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Interactively charting the data in Bokeh\n", - "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, 
with reset setting it back to normal\n", - "\n", - "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", - "\n", - "File location in the tooltip is given relative to **AUDIO_PATH**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", - " )\n", - "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", - "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", - "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", - "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", - "\n", - "factors = list(set(labels))\n", - "pal_size = max(len(factors), 3)\n", - "pal = Category10[pal_size]\n", - "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", - "\n", - "url = \"http://localhost:8000/@desc\"\n", - "taptool = p.select(type=TapTool)\n", - "taptool.callback = OpenURL(url=url)\n", - "\n", - "show(p)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Local server to serve wav files from corpus\n", - "\n", - "This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n", - "\n", - "There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n", - "\n", - "The server will continue to run until stopped. 
To stop it simply interupt the kernel (ie square button or under Kernel menu)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%cd $AUDIO_PATH\n", - "%pwd\n", - "!python -m http.server" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/synthesize.py b/synthesize.py deleted file mode 100644 index 18048c2f..00000000 --- a/synthesize.py +++ /dev/null @@ -1,182 +0,0 @@ -# pylint: disable=redefined-outer-name, unused-argument -import os -import time -import argparse -import torch -import json -import string - -from TTS.utils.synthesis import synthesis -from TTS.utils.generic_utils import setup_model -from TTS.utils.io import load_config -from TTS.utils.text.symbols import make_symbols, symbols, phonemes -from TTS.utils.audio import AudioProcessor - - -def tts(model, - vocoder_model, - C, - VC, - text, - ap, - ap_vocoder, - use_cuda, - batched_vocoder, - speaker_id=None, - figures=False): - t_1 = time.time() - use_vocoder_model = vocoder_model is not None - waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis( - model, text, C, use_cuda, ap, speaker_id, style_wav=False, - truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, - use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) - - if C.model == "Tacotron" and use_vocoder_model: - postnet_output = ap.out_linear_to_mel(postnet_output.T).T - # correct if there is a scale difference b/w two models - if use_vocoder_model: - postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) - vocoder_input 
= torch.FloatTensor(postnet_output.T).unsqueeze(0) - waveform = vocoder_model.generate( - vocoder_input.cuda() if use_cuda else vocoder_input, - batched=batched_vocoder, - target=8000, - overlap=400) - print(" > Run-time: {}".format(time.time() - t_1)) - return alignment, postnet_output, stop_tokens, waveform - - -if __name__ == "__main__": - - global symbols, phonemes - - parser = argparse.ArgumentParser() - parser.add_argument('text', type=str, help='Text to generate speech.') - parser.add_argument('config_path', - type=str, - help='Path to model config file.') - parser.add_argument( - 'model_path', - type=str, - help='Path to model file.', - ) - parser.add_argument( - 'out_path', - type=str, - help='Path to save final wav file. Wav file will be names as the text given.', - ) - parser.add_argument('--use_cuda', - type=bool, - help='Run model on CUDA.', - default=False) - parser.add_argument( - '--vocoder_path', - type=str, - help= - 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).', - default="", - ) - parser.add_argument('--vocoder_config_path', - type=str, - help='Path to vocoder model config file.', - default="") - parser.add_argument( - '--batched_vocoder', - type=bool, - help="If True, vocoder model uses faster batch processing.", - default=True) - parser.add_argument('--speakers_json', - type=str, - help="JSON file for multi-speaker model.", - default="") - parser.add_argument( - '--speaker_id', - type=int, - help="target speaker_id if the model is multi-speaker.", - default=None) - args = parser.parse_args() - - if args.vocoder_path != "": - assert args.use_cuda, " [!] Enable cuda for vocoder." 
- from WaveRNN.models.wavernn import Model as VocoderModel - - # load the config - C = load_config(args.config_path) - C.forward_attn_mask = True - - # load the audio processor - ap = AudioProcessor(**C.audio) - - # if the vocabulary was passed, replace the default - if 'characters' in C.keys(): - symbols, phonemes = make_symbols(**C.characters) - - # load speakers - if args.speakers_json != '': - speakers = json.load(open(args.speakers_json, 'r')) - num_speakers = len(speakers) - else: - num_speakers = 0 - - # load the model - num_chars = len(phonemes) if C.use_phonemes else len(symbols) - model = setup_model(num_chars, num_speakers, C) - cp = torch.load(args.model_path) - model.load_state_dict(cp['model']) - model.eval() - if args.use_cuda: - model.cuda() - model.decoder.set_r(cp['r']) - - # load vocoder model - if args.vocoder_path != "": - VC = load_config(args.vocoder_config_path) - ap_vocoder = AudioProcessor(**VC.audio) - bits = 10 - vocoder_model = VocoderModel(rnn_dims=512, - fc_dims=512, - mode=VC.mode, - mulaw=VC.mulaw, - pad=VC.pad, - upsample_factors=VC.upsample_factors, - feat_dims=VC.audio["num_mels"], - compute_dims=128, - res_out_dims=128, - res_blocks=10, - hop_length=ap.hop_length, - sample_rate=ap.sample_rate, - use_aux_net=True, - use_upsample_net=True) - - check = torch.load(args.vocoder_path) - vocoder_model.load_state_dict(check['model']) - vocoder_model.eval() - if args.use_cuda: - vocoder_model.cuda() - else: - vocoder_model = None - VC = None - ap_vocoder = None - - # synthesize voice - print(" > Text: {}".format(args.text)) - _, _, _, wav = tts(model, - vocoder_model, - C, - VC, - args.text, - ap, - ap_vocoder, - args.use_cuda, - args.batched_vocoder, - speaker_id=args.speaker_id, - figures=False) - - # save the results - file_name = args.text.replace(" ", "_") - file_name = file_name.translate( - str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' - out_path = os.path.join(args.out_path, file_name) - print(" > Saving 
output to {}".format(out_path)) - ap.save_wav(wav, out_path) diff --git a/tests/data/ljspeech/wavs/LJ001-0001.wav b/tests/data/ljspeech/wavs/LJ001-0001.wav index a4662ab0..a274be89 100644 Binary files a/tests/data/ljspeech/wavs/LJ001-0001.wav and b/tests/data/ljspeech/wavs/LJ001-0001.wav differ diff --git a/tests/generic_utils_text.py b/tests/generic_utils_text.py deleted file mode 100644 index 228df2df..00000000 --- a/tests/generic_utils_text.py +++ /dev/null @@ -1,35 +0,0 @@ -import unittest -import torch as T - -from TTS.utils.generic_utils import save_checkpoint, save_best_model -from TTS.layers.tacotron import Prenet - -OUT_PATH = '/tmp/test.pth.tar' - - -class ModelSavingTests(unittest.TestCase): - def save_checkpoint_test(self): - # create a dummy model - model = Prenet(128, out_features=[256, 128]) - model = T.nn.DataParallel(layer) #FIXME: undefined variable layer - - # save the model - save_checkpoint(model, None, 100, OUT_PATH, 1, 1) - - # load the model to CPU - model_dict = T.load( - MODEL_PATH, map_location=lambda storage, loc: storage) #FIXME: undefined variable MODEL_PATH - model.load_state_dict(model_dict['model']) - - def save_best_model_test(self): - # create a dummy model - model = Prenet(256, out_features=[256, 256]) - model = T.nn.DataParallel(layer) - - # save the model - save_best_model(model, None, 0, 100, OUT_PATH, 10, 1) - - # load the model to CPU - model_dict = T.load( - MODEL_PATH, map_location=lambda storage, loc: storage) - model.load_state_dict(model_dict['model']) diff --git a/tests/inputs/scale_stats.npy b/tests/inputs/scale_stats.npy index 5368ecb2..10015de5 100644 Binary files a/tests/inputs/scale_stats.npy and b/tests/inputs/scale_stats.npy differ diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 7f5a60fb..0cb9b948 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -5,10 +5,9 @@ "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. 
If this is null, model uses GL for speech synthesis. "wavernn_file": null, // wavernn checkpoint file name "wavernn_config": null, // wavernn config file - "pwgan_lib_path": null, - "pwgan_file": null, - "pwgan_config": null, - "is_wavernn_batched":true, + "vocoder_config":null, + "vocoder_checkpoint": null, + "is_wavernn_batched":true, "port": 5002, "use_cuda": false, "debug": true diff --git a/tests/test_config.json b/tests/inputs/test_config.json similarity index 68% rename from tests/test_config.json rename to tests/inputs/test_config.json index e9cd48cf..b2bba154 100644 --- a/tests/test_config.json +++ b/tests/inputs/test_config.json @@ -1,8 +1,8 @@ { "audio":{ "audio_processor": "audio", // to use dictate different audio processors, if available. - "num_mels": 80, // size of the mel spec frame. - "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. "frame_length_ms": null, // stft window length in ms. "frame_shift_ms": null, // stft window hop-lengh in ms. @@ -19,7 +19,8 @@ "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false + "do_trim_silence": false, + "spec_gain": 20 }, "characters":{ @@ -50,5 +51,18 @@ "output_path": "result", "min_seq_len": 0, "max_seq_len": 300, - "log_dir": "tests/outputs/" - } + "log_dir": "tests/outputs/", + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
+ "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + } +} diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json new file mode 100644 index 00000000..bea4cbb7 --- /dev/null +++ b/tests/inputs/test_train_config.json @@ -0,0 +1,311 @@ +<<<<<<< HEAD:tests/inputs/test_train_config.json +{ + "model": "Tacotron2", + "run_name": "test_sample_dataset_run", + "run_description": "sample dataset test run", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. 
Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 7, // Number of decoder frames to predict per iteration. 
Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "apex_amp_level": null, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. 
+ "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
+ "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. + "use_gst": false, // TACOTRON ONLY: use global style tokens + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for training. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] + +} + +======= +{ + "model": "Tacotron2", + "run_name": "ljspeech-ddc-bn", + "run_description": "tacotron2 with ddc and batch-normalization", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. 
+ "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? 
", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. 
Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step:": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. 
+ "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/", + + // PHONEMES + "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "datasets": // List of datasets. 
They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/home/erogol/Data/LJSpeech-1.1/", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ] +} + +>>>>>>> Added support for Tacotron2 GST + abbility to condition style input with wav or tokens:config.json diff --git a/tests/inputs/test_vocoder_audio_config.json b/tests/inputs/test_vocoder_audio_config.json new file mode 100644 index 00000000..08acc48c --- /dev/null +++ b/tests/inputs/test_vocoder_audio_config.json @@ -0,0 +1,24 @@ +{ + "audio":{ + "num_mels": 80, // size of the mel spec frame. + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. + "frame_length_ms": null, // stft window length in ms. + "frame_shift_ms": null, // stft window hop-lengh in ms. + "hop_length": 256, + "win_length": 1024, + "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 30,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "clip_norm": true, // clip normalized values into the range. + "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!! 
+ "do_trim_silence": false + } +} + diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json new file mode 100644 index 00000000..c0f552a4 --- /dev/null +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -0,0 +1,144 @@ +{ + "run_name": "multiband-melgan", + "run_description": "multiband melgan mean-var scaling", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. 
+ "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_pqmf": true, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": true, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + // subband multiscale stft loss parameters + "subband_stft_loss_params":{ + "n_ffts": [384, 683, 171], + "hop_lengths": [30, 60, 10], + "win_lengths": [150, 300, 60] + }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "melgan_multiscale_discriminator", + "discriminator_model_params":{ + "base_channels": 16, + "max_channels":512, + "downsample_factors":[4, 4, 4] + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN training. + + // GENERATOR + "generator_model": "multiband_melgan_generator", + "generator_model_params": { + "upsample_factors":[8, 4, 2], + "num_res_blocks": 4 + }, + + // DATASET + "data_path": "tests/data/ljspeech/wavs/", + "feature_path": null, 
+ "seq_len": 16384, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "epochs": 1, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. + "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 1e-4, + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
+ "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 10, + + // PATHS + "output_path": "tests/outputs/train_outputs/" +} + diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json index 2f56c6ce..b032f191 100644 --- a/tests/outputs/dummy_model_config.json +++ b/tests/outputs/dummy_model_config.json @@ -4,11 +4,11 @@ "audio":{ // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. + "hop_length": 256, + "win_length": 1024, "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. @@ -31,19 +31,19 @@ "reinit_layers": [], - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron2", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. 
"lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "windowing": false, // Enables attention windowing. Used only in eval mode. - "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. + "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. + "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, + "forward_attn_mask": false, "attention_type": "original", "attention_heads": 5, "bidirectional_decoder": false, @@ -51,13 +51,15 @@ "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "stopnet": true, // Train stopnet predicting the end of synthesis. + "stopnet": true, // Train stopnet predicting the end of synthesis. "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
- "use_gst": false, - + "use_gst": false, + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "eval_batch_size":16, + "eval_batch_size":16, "r": 1, // Number of frames to predict for step. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" @@ -71,7 +73,7 @@ "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader. "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader. - "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py + "dataset": "mozilla", // DATASET-RELATED: one of mozilla_voice_tts.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. @@ -81,6 +83,20 @@ "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers + "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + } } + diff --git a/tests/symbols_tests.py b/tests/symbols_tests.py index 4c32c7d6..e3cb23da 100644 --- a/tests/symbols_tests.py +++ b/tests/symbols_tests.py @@ -1,6 +1,6 @@ import unittest -from TTS.utils.text import phonemes +from mozilla_voice_tts.tts.utils.text import phonemes class SymbolsTest(unittest.TestCase): def test_uniqueness(self): #pylint: disable=no-self-use diff --git a/tests/test_audio.py b/tests/test_audio.py index 4b8ee276..6796c644 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,16 +1,17 @@ import os import unittest -from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path -from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_config +from tests import get_tests_input_path, get_tests_output_path, get_tests_path + +from mozilla_voice_tts.utils.audio import AudioProcessor +from mozilla_voice_tts.utils.io import load_config TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), 
"example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) -conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) +conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) # pylint: disable=protected-access @@ -103,7 +104,7 @@ class TestAudio(unittest.TestCase): assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() + assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type assert x_norm.min() <= 0, x_norm.min() # check denorm. x_ = self.ap._denormalize(x_norm) @@ -120,7 +121,7 @@ class TestAudio(unittest.TestCase): assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() + assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type assert x_norm.min() <= 0, x_norm.min() # check denorm. 
x_ = self.ap._denormalize(x_norm) @@ -148,7 +149,7 @@ class TestAudio(unittest.TestCase): assert (x_old - x).sum() == 0 assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() + assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type assert x_norm.min() < 0, x_norm.min() x_ = self.ap._denormalize(x_norm) assert (x - x_).sum() < 1e-3 diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 51cbf341..2ec15aba 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -1,13 +1,14 @@ import os import unittest -import torch as T +from tests import get_tests_input_path, get_tests_output_path -from TTS.server.synthesizer import Synthesizer -from TTS.tests import get_tests_input_path, get_tests_output_path -from TTS.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.utils.generic_utils import setup_model -from TTS.utils.io import load_config, save_checkpoint +from mozilla_voice_tts.server.synthesizer import Synthesizer +from mozilla_voice_tts.tts.utils.generic_utils import setup_model +from mozilla_voice_tts.tts.utils.io import save_checkpoint +from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes, + symbols) +from mozilla_voice_tts.utils.io import load_config class DemoServerTest(unittest.TestCase): @@ -32,3 +33,27 @@ class DemoServerTest(unittest.TestCase): config['tts_config'] = os.path.join(tts_root_path, config['tts_config']) synthesizer = Synthesizer(config) synthesizer.tts("Better this test works!!") + + def test_split_into_sentences(self): + """Check demo server sentences split as expected""" + print("\n > Testing demo server sentence splitting") + # pylint: disable=attribute-defined-outside-init + self.seg = Synthesizer.get_segmenter("en") + sis = Synthesizer.split_into_sentences + assert sis(self, 'Hello. 
Two sentences') == ['Hello.', 'Two sentences'] + assert sis(self, 'He went to meet the adviser from Scott, Waltman & Co. next morning.') == ['He went to meet the adviser from Scott, Waltman & Co. next morning.'] + assert sis(self, 'Let\'s run it past Sarah and co. They\'ll want to see this.') == ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.'] + assert sis(self, 'Where is Bobby Jr.\'s rabbit?') == ['Where is Bobby Jr.\'s rabbit?'] + assert sis(self, 'Please inform the U.K. authorities right away.') == ['Please inform the U.K. authorities right away.'] + assert sis(self, 'Were David and co. at the event?') == ['Were David and co. at the event?'] + assert sis(self, 'paging dr. green, please come to theatre four immediately.') == ['paging dr. green, please come to theatre four immediately.'] + assert sis(self, 'The email format is Firstname.Lastname@example.com. I think you reversed them.') == ['The email format is Firstname.Lastname@example.com.', 'I think you reversed them.'] + assert sis(self, 'The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.') == ['The demo site is: https://top100.example.com/subsection/latestnews.html.', 'Please send us your feedback.'] + assert sis(self, 'Scowling at him, \'You are not done yet!\' she yelled.') == ['Scowling at him, \'You are not done yet!\' she yelled.'] # with the final lowercase "she" we see it's all one sentence + assert sis(self, 'Hey!! So good to see you.') == ['Hey!!', 'So good to see you.'] + assert sis(self, 'He went to Yahoo! but I don\'t know the division.') == ['He went to Yahoo! but I don\'t know the division.'] + assert sis(self, 'If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."') == ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."'] + assert sis(self, 'The address is not google.com.') == ['The address is not google.com.'] + assert sis(self, '1.) The first item 2.) 
The second item') == ['1.) The first item', '2.) The second item'] + assert sis(self, '1) The first item 2) The second item') == ['1) The first item', '2) The second item'] + assert sis(self, 'a. The first item b. The second item c. The third list item') == ['a. The first item', 'b. The second item', 'c. The third list item'] diff --git a/speaker_encoder/tests.py b/tests/test_encoder.py similarity index 69% rename from speaker_encoder/tests.py rename to tests/test_encoder.py index 039833fc..46266f29 100644 --- a/speaker_encoder/tests.py +++ b/tests/test_encoder.py @@ -1,13 +1,14 @@ import os import unittest + import torch as T +from tests import get_tests_input_path -from TTS.speaker_encoder.model import SpeakerEncoder -from TTS.speaker_encoder.loss import GE2ELoss -from TTS.utils.io import load_config +from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss +from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder +from mozilla_voice_tts.utils.io import load_config - -file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/" +file_path = get_tests_input_path() c = load_config(os.path.join(file_path, "test_config.json")) @@ -58,6 +59,7 @@ class GE2ELossTests(unittest.TestCase): dummy_input = T.ones(4, 5, 64) # num_speaker x num_utterance x dim loss = GE2ELoss(loss_method="softmax") output = loss.forward(dummy_input) + assert output.item() >= 0.0 # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) dummy_input = T.nn.init.orthogonal(dummy_input) @@ -72,6 +74,34 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 +class AngleProtoLossTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + # check random input + dummy_input = T.rand(4, 5, 64) # num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() >= 0.0 + + # check all zeros + dummy_input = T.ones(4, 5, 64) # 
num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() >= 0.0 + + # check speaker loss with orthogonal d-vectors + dummy_input = T.empty(3, 64) + dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.cat( + [ + dummy_input[0].repeat(5, 1, 1).transpose(0, 1), + dummy_input[1].repeat(5, 1, 1).transpose(0, 1), + dummy_input[2].repeat(5, 1, 1).transpose(0, 1), + ] + ) # num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() < 0.005 # class LoaderTest(unittest.TestCase): # def test_output(self): diff --git a/tests/test_layers.py b/tests/test_layers.py index d7c8829f..0b5315c5 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -1,15 +1,15 @@ import unittest import torch as T -from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder -from TTS.layers.losses import L1LossMasked -from TTS.utils.generic_utils import sequence_mask +from mozilla_voice_tts.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder +from mozilla_voice_tts.tts.layers.losses import L1LossMasked +from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask # pylint: disable=unused-variable class PrenetTests(unittest.TestCase): - def test_in_out(self): + def test_in_out(self): #pylint: disable=no-self-use layer = Prenet(128, out_features=[256, 128]) dummy_input = T.rand(4, 128) @@ -31,7 +31,7 @@ class CBHGTests(unittest.TestCase): gru_features=80, num_highways=4) # B x D x T - dummy_input = T.rand(4, 128, 8) + dummy_input = T.rand(4, 128, 8) print(layer) output = layer(dummy_input) @@ -44,8 +44,8 @@ class DecoderTests(unittest.TestCase): @staticmethod def test_in_out(): layer = Decoder( - in_features=256, - memory_dim=80, + in_channels=256, + frame_channels=80, r=2, memory_size=4, attn_windowing=False, @@ -58,8 +58,7 @@ class DecoderTests(unittest.TestCase): trans_agent=True, forward_attn_mask=True, location_attn=True, - 
separate_stopnet=True, - speaker_embedding_dim=0) + separate_stopnet=True) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) @@ -71,40 +70,8 @@ class DecoderTests(unittest.TestCase): assert output.shape[2] == 2, "size not {}".format(output.shape[2]) assert stop_tokens.shape[0] == 4 - @staticmethod - def test_in_out_multispeaker(): - layer = Decoder( - in_features=256, - memory_dim=80, - r=2, - memory_size=4, - attn_windowing=False, - attn_norm="sigmoid", - attn_K=5, - attn_type="graves", - prenet_type='original', - prenet_dropout=True, - forward_attn=True, - trans_agent=True, - forward_attn_mask=True, - location_attn=True, - separate_stopnet=True, - speaker_embedding_dim=80) - dummy_input = T.rand(4, 8, 256) - dummy_memory = T.rand(4, 2, 80) - dummy_embed = T.rand(4, 80) - - output, alignment, stop_tokens = layer( - dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed) - - assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) - assert stop_tokens.shape[0] == 4 - - class EncoderTests(unittest.TestCase): - def test_in_out(self): + def test_in_out(self): #pylint: disable=no-self-use layer = Encoder(128) dummy_input = T.rand(4, 8, 128) @@ -117,7 +84,7 @@ class EncoderTests(unittest.TestCase): class L1LossMaskedTests(unittest.TestCase): - def test_in_out(self): + def test_in_out(self): #pylint: disable=no-self-use # test input == target layer = L1LossMasked(seq_len_norm=False) dummy_input = T.ones(4, 8, 128).float() diff --git a/tests/test_loader.py b/tests/test_loader.py index 9edd233f..9f084f8f 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,21 +1,22 @@ import os -import unittest import shutil -import torch -import numpy as np +import unittest +import numpy as np +import torch +from tests import get_tests_input_path, get_tests_output_path from torch.utils.data import DataLoader -from TTS.utils.io import 
load_config -from TTS.utils.audio import AudioProcessor -from TTS.datasets import TTSDataset -from TTS.datasets.preprocess import ljspeech + +from mozilla_voice_tts.tts.datasets import TTSDataset +from mozilla_voice_tts.tts.datasets.preprocess import ljspeech +from mozilla_voice_tts.utils.audio import AudioProcessor +from mozilla_voice_tts.utils.io import load_config #pylint: disable=unused-variable -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(file_path, "outputs/loader_tests/") +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) ok_ljspeech = os.path.exists(c.data_path) DATA_EXIST = True @@ -32,7 +33,7 @@ class TestTTSDataset(unittest.TestCase): self.ap = AudioProcessor(**c.audio) def _create_dataloader(self, batch_size, r, bgs): - items = ljspeech(c.data_path,'metadata.csv') + items = ljspeech(c.data_path, 'metadata.csv') dataset = TTSDataset.MyDataset( r, c.text_cleaner, @@ -74,15 +75,15 @@ class TestTTSDataset(unittest.TestCase): assert check_count == 0, \ " !! 
Negative values in text_input: {}".format(check_count) # TODO: more assertion here - assert type(speaker_name[0]) is str + assert isinstance(speaker_name[0], str) assert linear_input.shape[0] == c.batch_size - assert linear_input.shape[2] == self.ap.num_freq + assert linear_input.shape[2] == self.ap.fft_size // 2 + 1 assert mel_input.shape[0] == c.batch_size assert mel_input.shape[2] == c.audio['num_mels'] # check normalization ranges if self.ap.symmetric_norm: assert mel_input.max() <= self.ap.max_norm - assert mel_input.min() >= -self.ap.max_norm + assert mel_input.min() >= -self.ap.max_norm #pylint: disable=invalid-unary-operand-type assert mel_input.min() < 0 else: assert mel_input.max() <= self.ap.max_norm diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 993ee495..5c875ce6 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -1,13 +1,13 @@ import unittest import os -from TTS.tests import get_tests_input_path +from tests import get_tests_input_path -from TTS.datasets.preprocess import common_voice +from mozilla_voice_tts.tts.datasets.preprocess import common_voice class TestPreprocessors(unittest.TestCase): - def test_common_voice_preprocessor(self): + def test_common_voice_preprocessor(self): #pylint: disable=no-self-use root_path = get_tests_input_path() meta_file = "common_voice.tsv" items = common_voice(root_path, meta_file) diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 9fe5e8b1..a5205cdd 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -12,9 +12,12 @@ pip install --quiet --upgrade pip setuptools wheel rm -f dist/*.whl python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json -pip install --quiet dist/TTS*.whl +pip install --quiet dist/mozilla_voice_tts*.whl -python -m TTS.server.server & +# this is related to https://github.com/librosa/librosa/issues/1160 +pip 
install numba==0.48 + +python -m mozilla_voice_tts.server.server & SERVER_PID=$! echo 'Waiting for server...' diff --git a/tests/test_stft_torch.py b/tests/test_stft_torch.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index eb91b3cc..28d39de5 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -1,14 +1,15 @@ -import os import copy -import torch +import os import unittest -import numpy as np -from torch import optim -from torch import nn -from TTS.utils.io import load_config -from TTS.layers.losses import MSELossMasked -from TTS.models.tacotron2 import Tacotron2 +import torch +from tests import get_tests_input_path +from torch import nn, optim + +from mozilla_voice_tts.tts.layers.losses import MSELossMasked +from mozilla_voice_tts.tts.models.tacotron2 import Tacotron2 +from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.utils.audio import AudioProcessor #pylint: disable=unused-variable @@ -16,25 +17,28 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) + +ap = AudioProcessor(**c.audio) +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") class TacotronTrainTest(unittest.TestCase): - def test_train_step(self): - input = torch.randint(0, 24, (8, 128)).long().to(device) + def test_train_step(self): # pylint: disable=no-self-use + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8, )).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_lengths = 
torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) for idx in mel_lengths: stop_targets[:, int(idx.item()):, 0] = 1.0 - stop_targets = stop_targets.view(input.shape[0], + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() @@ -51,7 +55,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input, input_lengths, mel_spec, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) assert torch.sigmoid(stop_tokens).data.max() <= 1.0 assert torch.sigmoid(stop_tokens).data.min() >= 0.0 optimizer.zero_grad() @@ -70,3 +74,167 @@ class TacotronTrainTest(unittest.TestCase): ), "param {} with shape {} not updated!! 
\n{}\n{}".format( count, param.shape, param, param_ref) count += 1 + + +class MultiSpeakeTacotronTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_embeddings = torch.rand(8, 55).to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(5): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in 
zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + +class TacotronGSTTrainTest(unittest.TestCase): + #pylint: disable=no-self-use + def test_train_step(self): + # with random gst mel style + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(10): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert 
torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + name, param = name_param + if name == 'gst_layer.encoder.recurrence.weight_hh_l0': + #print(param.grad) + continue + assert (param != param_ref).any( + ), "param {} {} with shape {} not updated!! \n{}\n{}".format( + name, count, param.shape, param, param_ref) + count += 1 + + # with file gst style + mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device) + mel_spec = mel_spec.repeat(8, 1, 1) + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], 
gst_style_tokens=c.gst['gst_style_tokens']).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(10): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + name, param = name_param + if name == 'gst_layer.encoder.recurrence.weight_hh_l0': + #print(param.grad) + continue + assert (param != param_ref).any( + ), "param {} {} with shape {} not updated!! 
\n{}\n{}".format( + name, count, param.shape, param, param_ref) + count += 1 diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py index aca363a8..50853e9a 100644 --- a/tests/test_tacotron2_tf_model.py +++ b/tests/test_tacotron2_tf_model.py @@ -1,11 +1,19 @@ import os -import torch import unittest + import numpy as np import tensorflow as tf +import torch +from tests import get_tests_input_path + +from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2 +from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite, + load_tflite_model) +from mozilla_voice_tts.utils.io import load_config + +tf.get_logger().setLevel('INFO') + -from TTS.utils.io import load_config -from TTS.tf.models.tacotron2 import Tacotron2 #pylint: disable=unused-variable @@ -13,8 +21,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) class TacotronTFTrainTest(unittest.TestCase): @@ -61,3 +68,71 @@ class TacotronTFTrainTest(unittest.TestCase): # inference pass output = model(chars_seq, training=False) + + def test_forward_attention(self,): + chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ + stop_targets, speaker_ids = self.generate_dummy_inputs() + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(chars_seq.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True) + # training pass + output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) + + # check model output shapes + assert np.all(output[0].shape == mel_spec.shape) + assert 
np.all(output[1].shape == mel_spec.shape) + assert output[2].shape[2] == chars_seq.shape[1] + assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r) + assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) + + # inference pass + output = model(chars_seq, training=False) + + def test_tflite_conversion(self, ): #pylint:disable=no-self-use + model = Tacotron2(num_chars=24, + num_speakers=0, + r=3, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type='original', + attn_win=False, + attn_norm='sigmoid', + prenet_type='original', + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=0, + separate_stopnet=True, + bidirectional_decoder=False, + enable_tflite=True) + model.build_inference() + convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True) + # init tflite model + tflite_model = load_tflite_model('test_tacotron2.tflite') + # fake input + inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) #pylint:disable=unexpected-keyword-arg + # run inference + # get input and output details + input_details = tflite_model.get_input_details() + output_details = tflite_model.get_output_details() + # reshape input tensor for the new input shape + tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape) #pylint:disable=unexpected-keyword-arg + tflite_model.allocate_tensors() + detail = input_details[0] + input_shape = detail['shape'] + tflite_model.set_tensor(detail['index'], inputs) + # run the tflite_model + tflite_model.invoke() + # collect outputs + decoder_output = tflite_model.get_tensor(output_details[0]['index']) + postnet_output = tflite_model.get_tensor(output_details[1]['index']) + # remove tflite binary + os.remove('test_tacotron2.tflite') diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 7053a580..0b80243f 100644 --- a/tests/test_tacotron_model.py +++ 
b/tests/test_tacotron_model.py @@ -1,13 +1,15 @@ -import os import copy -import torch +import os import unittest -from torch import optim -from torch import nn -from TTS.utils.io import load_config -from TTS.layers.losses import L1LossMasked -from TTS.models.tacotron import Tacotron +import torch +from tests import get_tests_input_path +from torch import nn, optim + +from mozilla_voice_tts.tts.layers.losses import L1LossMasked +from mozilla_voice_tts.tts.models.tacotron import Tacotron +from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.utils.audio import AudioProcessor #pylint: disable=unused-variable @@ -15,8 +17,10 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) + +ap = AudioProcessor(**c.audio) +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") def count_parameters(model): @@ -31,7 +35,7 @@ class TacotronTrainTest(unittest.TestCase): input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) + linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) @@ -49,7 +53,7 @@ class TacotronTrainTest(unittest.TestCase): model = Tacotron( num_chars=32, num_speakers=5, - postnet_output_dim=c.audio['num_freq'], + postnet_output_dim=c.audio['fft_size'], decoder_output_dim=c.audio['num_mels'], r=c.r, memory_size=c.memory_size @@ -66,7 +70,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), 
lr=c.lr) for _ in range(5): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) @@ -85,16 +89,80 @@ class TacotronTrainTest(unittest.TestCase): count, param.shape, param, param_ref) count += 1 - -class TacotronGSTTrainTest(unittest.TestCase): +class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths[-1] = 128 + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_embeddings = torch.rand(8, 55).to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze() + + criterion = L1LossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron( + num_chars=32, + num_speakers=5, + postnet_output_dim=c.audio['fft_size'], + decoder_output_dim=c.audio['num_mels'], + r=c.r, + memory_size=c.memory_size, + speaker_embedding_dim=55, + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + model.train() + print(" > Num parameters for Tacotron model:%s" % + (count_parameters(model))) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), 
lr=c.lr) + for _ in range(5): + mel_out, linear_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, + speaker_embeddings=speaker_embeddings) + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(linear_out, linear_spec, + mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + +class TacotronGSTTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device) + linear_spec = torch.rand(8, 120, c.audio['fft_size']).to(device) mel_lengths = torch.randint(20, 120, (8, )).long().to(device) + mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) @@ -112,13 +180,16 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=32, num_speakers=5, gst=True, - postnet_output_dim=c.audio['num_freq'], + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + postnet_output_dim=c.audio['fft_size'], decoder_output_dim=c.audio['num_mels'], r=c.r, memory_size=c.memory_size ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(model) + # print(model) print(" > Num parameters for 
Tacotron GST model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -130,7 +201,73 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(linear_out, linear_spec, + mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + + # with file gst style + mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device) + mel_spec = mel_spec.repeat(8, 1, 1) + + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths[-1] = 128 + linear_spec = torch.rand(8, mel_spec.size(1), c.audio['fft_size']).to(device) + mel_lengths = torch.randint(20, mel_spec.size(1), (8, )).long().to(device) + mel_lengths[-1] = mel_spec.size(1) + stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze() + + criterion = L1LossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + 
model = Tacotron( + num_chars=32, + num_speakers=5, + gst=True, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + postnet_output_dim=c.audio['fft_size'], + decoder_output_dim=c.audio['num_mels'], + r=c.r, + memory_size=c.memory_size + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + model.train() + # print(model) + print(" > Num parameters for Tacotron GST model:%s" % + (count_parameters(model))) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for _ in range(10): + mel_out, linear_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 93edabe7..61c2a407 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -3,12 +3,12 @@ import os # pylint: disable=wildcard-import # pylint: disable=unused-import import unittest -from TTS.utils.text import * -from TTS.tests import get_tests_path -from TTS.utils.io import load_config +from tests import get_tests_input_path +from mozilla_voice_tts.tts.utils.text import * +from tests import get_tests_path +from mozilla_voice_tts.utils.io import load_config -TESTS_PATH = get_tests_path() -conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) +conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) def test_phoneme_to_sequence(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" 
@@ -16,16 +16,16 @@ def test_phoneme_to_sequence(): lang = "en-us" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" - assert text_hat == text_hat_with_params == gt + assert text_hat == text_hat_with_params == gt # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) @@ -36,7 +36,7 @@ def test_phoneme_to_sequence(): text = "Be a voice, not an! echo" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) @@ -47,7 +47,7 @@ def test_phoneme_to_sequence(): text = "Be a voice, not an echo!" 
sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) @@ -58,7 +58,7 @@ def test_phoneme_to_sequence(): text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) @@ -69,7 +69,7 @@ def test_phoneme_to_sequence(): text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, lang, True) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) @@ -80,7 +80,7 @@ def test_phoneme_to_sequence(): text = "_Be a _voice, not an! echo_" sequence = phoneme_to_sequence(text, text_cleaner, lang) text_hat = sequence_to_phoneme(sequence) - sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "biː ɐ vɔɪs, nɑːt ɐn! 
ɛkoʊ" print(text_hat) diff --git a/tests/test_train_tts.py b/tests/test_train_tts.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_tts_train.sh b/tests/test_tts_train.sh new file mode 100755 index 00000000..b7adbdd0 --- /dev/null +++ b/tests/test_tts_train.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# run training +CUDA_VISIBLE_DEVICES="" python mozilla_voice_tts/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python mozilla_voice_tts/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_datasets.py new file mode 100644 index 00000000..bdfebfc5 --- /dev/null +++ b/tests/test_vocoder_datasets.py @@ -0,0 +1,95 @@ +import os + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from mozilla_voice_tts.utils.audio import AudioProcessor +from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset +from mozilla_voice_tts.vocoder.datasets.preprocess import load_wav_data + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +ok_ljspeech = os.path.exists(test_data_path) + + +def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers): + ''' run dataloader with given parameters and check conditions ''' + ap = 
AudioProcessor(**C.audio) + _, train_items = load_wav_data(test_data_path, 10) + dataset = GANDataset(ap, + train_items, + seq_len=seq_len, + hop_len=hop_len, + pad_short=2000, + conv_pad=conv_pad, + return_segments=return_segments, + use_noise_augment=use_noise_augment, + use_cache=use_cache) + loader = DataLoader(dataset=dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + pin_memory=True, + drop_last=True) + + max_iter = 10 + count_iter = 0 + + # return random segments or return the whole audio + if return_segments: + for item1, _ in loader: + feat1, wav1 = item1 + # feat2, wav2 = item2 + expected_feat_shape = (batch_size, ap.num_mels, seq_len // hop_len + conv_pad * 2) + + # check shapes + assert np.all(feat1.shape == expected_feat_shape), f" [!] {feat1.shape} vs {expected_feat_shape}" + assert (feat1.shape[2] - conv_pad * 2) * hop_len == wav1.shape[2] + + # check feature vs audio match + if not use_noise_augment: + for idx in range(batch_size): + audio = wav1[idx].squeeze() + feat = feat1[idx] + mel = ap.melspectrogram(audio) + # the first 2 and the last 2 frames are skipped due to the padding + # differences in stft + assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum() <= 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum()}' + + count_iter += 1 + # if count_iter == max_iter: + # break + else: + for item in loader: + feat, wav = item + expected_feat_shape = (batch_size, ap.num_mels, (wav.shape[-1] // hop_len) + (conv_pad * 2)) + assert np.all(feat.shape == expected_feat_shape), f" [!] 
{feat.shape} vs {expected_feat_shape}" + assert (feat.shape[2] - conv_pad * 2) * hop_len == wav.shape[2] + count_iter += 1 + if count_iter == max_iter: + break + + +def test_parametrized_gan_dataset(): + ''' test dataloader with different parameters ''' + params = [ + [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0], + [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, True, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 0, True, True, True, 0], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, True, True, True, 0], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, True, True, 0], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, False, 0], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, False, False, 0], + ] + for param in params: + print(param) + gan_dataset_case(*param) diff --git a/tests/test_vocoder_losses.py b/tests/test_vocoder_losses.py new file mode 100644 index 00000000..6cf0f6a9 --- /dev/null +++ b/tests/test_vocoder_losses.py @@ -0,0 +1,54 @@ +import os + +import torch +from tests import get_tests_input_path, get_tests_output_path, get_tests_path + +from mozilla_voice_tts.utils.audio import AudioProcessor +from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.vocoder.layers.losses import MultiScaleSTFTLoss, STFTLoss, TorchSTFT + +TESTS_PATH = get_tests_path() + +OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") +os.makedirs(OUT_PATH, exist_ok=True) + +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + +C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +ap = AudioProcessor(**C.audio) + + +def test_torch_stft(): + torch_stft = TorchSTFT(ap.fft_size, ap.hop_length, ap.win_length) + # librosa 
stft + wav = ap.load_wav(WAV_FILE) + M_librosa = abs(ap._stft(wav)) # pylint: disable=protected-access + # torch stft + wav = torch.from_numpy(wav[None, :]).float() + M_torch = torch_stft(wav) + # check the difference b/w librosa and torch outputs + assert (M_librosa - M_torch[0].data.numpy()).max() < 1e-5 + + +def test_stft_loss(): + stft_loss = STFTLoss(ap.fft_size, ap.hop_length, ap.win_length) + wav = ap.load_wav(WAV_FILE) + wav = torch.from_numpy(wav[None, :]).float() + loss_m, loss_sc = stft_loss(wav, wav) + assert loss_m + loss_sc == 0 + loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav)) + assert loss_sc < 1.0 + assert loss_m + loss_sc > 0 + + +def test_multiscale_stft_loss(): + stft_loss = MultiScaleSTFTLoss([ap.fft_size//2, ap.fft_size, ap.fft_size*2], + [ap.hop_length // 2, ap.hop_length, ap.hop_length * 2], + [ap.win_length // 2, ap.win_length, ap.win_length * 2]) + wav = ap.load_wav(WAV_FILE) + wav = torch.from_numpy(wav[None, :]).float() + loss_m, loss_sc = stft_loss(wav, wav) + assert loss_m + loss_sc == 0 + loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav)) + assert loss_sc < 1.0 + assert loss_m + loss_sc > 0 diff --git a/tests/test_vocoder_melgan_discriminator.py b/tests/test_vocoder_melgan_discriminator.py new file mode 100644 index 00000000..feafa60b --- /dev/null +++ b/tests/test_vocoder_melgan_discriminator.py @@ -0,0 +1,26 @@ +import numpy as np +import torch + +from mozilla_voice_tts.vocoder.models.melgan_discriminator import MelganDiscriminator +from mozilla_voice_tts.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator + + +def test_melgan_discriminator(): + model = MelganDiscriminator() + print(model) + dummy_input = torch.rand((4, 1, 256 * 10)) + output, _ = model(dummy_input) + assert np.all(output.shape == (4, 1, 10)) + + +def test_melgan_multi_scale_discriminator(): + model = MelganMultiscaleDiscriminator() + print(model) + dummy_input = torch.rand((4, 1, 256 * 16)) + scores, feats = 
model(dummy_input) + assert len(scores) == 3 + assert len(scores) == len(feats) + assert np.all(scores[0].shape == (4, 1, 64)) + assert np.all(feats[0][0].shape == (4, 16, 4096)) + assert np.all(feats[0][1].shape == (4, 64, 1024)) + assert np.all(feats[0][2].shape == (4, 256, 256)) diff --git a/tests/test_vocoder_melgan_generator.py b/tests/test_vocoder_melgan_generator.py new file mode 100644 index 00000000..c9cf5e2d --- /dev/null +++ b/tests/test_vocoder_melgan_generator.py @@ -0,0 +1,13 @@ +import numpy as np +import torch + +from mozilla_voice_tts.vocoder.models.melgan_generator import MelganGenerator + +def test_melgan_generator(): + model = MelganGenerator() + print(model) + dummy_input = torch.rand((4, 80, 64)) + output = model(dummy_input) + assert np.all(output.shape == (4, 1, 64 * 256)) + output = model.inference(dummy_input) + assert np.all(output.shape == (4, 1, (64 + 4) * 256)) diff --git a/tests/test_vocoder_parallel_wavegan_discriminator.py b/tests/test_vocoder_parallel_wavegan_discriminator.py new file mode 100644 index 00000000..ac0b5393 --- /dev/null +++ b/tests/test_vocoder_parallel_wavegan_discriminator.py @@ -0,0 +1,41 @@ +import numpy as np +import torch + +from mozilla_voice_tts.vocoder.models.parallel_wavegan_discriminator import ParallelWaveganDiscriminator, ResidualParallelWaveganDiscriminator + + +def test_pwgan_disciminator(): + model = ParallelWaveganDiscriminator( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=10, + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True) + dummy_x = torch.rand((4, 1, 64 * 256)) + output = model(dummy_x) + assert np.all(output.shape == (4, 1, 64 * 256)) + model.remove_weight_norm() + + +def test_redisual_pwgan_disciminator(): + model = ResidualParallelWaveganDiscriminator( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=30, + stacks=3, + res_channels=64, + gate_channels=128, + 
skip_channels=64, + dropout=0.0, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}) + dummy_x = torch.rand((4, 1, 64 * 256)) + output = model(dummy_x) + assert np.all(output.shape == (4, 1, 64 * 256)) + model.remove_weight_norm() diff --git a/tests/test_vocoder_parallel_wavegan_generator.py b/tests/test_vocoder_parallel_wavegan_generator.py new file mode 100644 index 00000000..72af728f --- /dev/null +++ b/tests/test_vocoder_parallel_wavegan_generator.py @@ -0,0 +1,27 @@ +import numpy as np +import torch + +from mozilla_voice_tts.vocoder.models.parallel_wavegan_generator import ParallelWaveganGenerator + + +def test_pwgan_generator(): + model = ParallelWaveganGenerator( + in_channels=1, + out_channels=1, + kernel_size=3, + num_res_blocks=30, + stacks=3, + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=80, + dropout=0.0, + bias=True, + use_weight_norm=True, + upsample_factors=[4, 4, 4, 4]) + dummy_c = torch.rand((2, 80, 5)) + output = model(dummy_c) + assert np.all(output.shape == (2, 1, 5 * 256)), output.shape + model.remove_weight_norm() + output = model.inference(dummy_c) + assert np.all(output.shape == (2, 1, (5 + 4) * 256)) diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py new file mode 100644 index 00000000..485e2f2b --- /dev/null +++ b/tests/test_vocoder_pqmf.py @@ -0,0 +1,27 @@ +import os +import torch + +import soundfile as sf +from librosa.core import load + +from tests import get_tests_path, get_tests_input_path +from mozilla_voice_tts.vocoder.layers.pqmf import PQMF + + +TESTS_PATH = get_tests_path() +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + + +def test_pqmf(): + w, sr = load(WAV_FILE) + + layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) + w, sr = load(WAV_FILE) + w2 = torch.from_numpy(w[None, None, :]) + b2 = layer.analysis(w2) + w2_ = layer.synthesis(b2) + + print(w2_.max()) + print(w2_.min()) + print(w2_.mean()) + 
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_rwd.py b/tests/test_vocoder_rwd.py new file mode 100644 index 00000000..266415db --- /dev/null +++ b/tests/test_vocoder_rwd.py @@ -0,0 +1,21 @@ +import torch +import numpy as np + +from mozilla_voice_tts.vocoder.models.random_window_discriminator import RandomWindowDiscriminator + + +def test_rwd(): + layer = RandomWindowDiscriminator(cond_channels=80, + window_sizes=(512, 1024, 2048, 4096, + 8192), + cond_disc_downsample_factors=[ + (8, 4, 2, 2, 2), (8, 4, 2, 2), + (8, 4, 2), (8, 4), (4, 2, 2) + ], + hop_length=256) + x = torch.rand([4, 1, 22050]) + c = torch.rand([4, 80, 22050 // 256]) + + scores, _ = layer(x, c) + assert len(scores) == 10 + assert np.all(scores[0].shape == (4, 1, 1)) diff --git a/tests/test_vocoder_tf_melgan_generator.py b/tests/test_vocoder_tf_melgan_generator.py new file mode 100644 index 00000000..5fdfc295 --- /dev/null +++ b/tests/test_vocoder_tf_melgan_generator.py @@ -0,0 +1,13 @@ +import numpy as np +import tensorflow as tf + +from mozilla_voice_tts.vocoder.tf.models.melgan_generator import MelganGenerator + + +def test_melgan_generator(): + hop_length = 256 + model = MelganGenerator() + # pylint: disable=no-value-for-parameter + dummy_input = tf.random.uniform((4, 80, 64)) + output = model(dummy_input, training=False) + assert np.all(output.shape == (4, 1, 64 * hop_length)), output.shape diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py new file mode 100644 index 00000000..851c0fb0 --- /dev/null +++ b/tests/test_vocoder_tf_pqmf.py @@ -0,0 +1,28 @@ +import os +import tensorflow as tf + +import soundfile as sf +from librosa.core import load + +from tests import get_tests_path, get_tests_input_path +from mozilla_voice_tts.vocoder.tf.layers.pqmf import PQMF + + +TESTS_PATH = get_tests_path() +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + + +def test_pqmf(): + w, sr = load(WAV_FILE) + + layer = PQMF(N=4, 
taps=62, cutoff=0.15, beta=9.0) + w, sr = load(WAV_FILE) + w2 = tf.convert_to_tensor(w[None, None, :]) + b2 = layer.analysis(w2) + w2_ = layer.synthesis(b2) + w2_ = w2_.numpy()  # convert the synthesized signal, not the input tensor + + print(w2_.max()) + print(w2_.min()) + print(w2_.mean()) + sf.write('tf_pqmf_output.wav', w2_.flatten(), sr) diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_train.sh new file mode 100755 index 00000000..6be7177d --- /dev/null +++ b/tests/test_vocoder_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create the run dir that is listed and resumed from below +mkdir -p $BASEDIR/outputs/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python mozilla_voice_tts/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/outputs/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python mozilla_voice_tts/bin/train_vocoder.py --continue_path $BASEDIR/outputs/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/outputs/train_outputs/ diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb deleted file mode 100644 index 4a21ae17..00000000 --- a/tf/notebooks/Benchmark-TTS_tf.ipynb +++ /dev/null @@ -1,714 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "This is to test TTS tensorflow models with benchmark sentences.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related models.\n", - " - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n", - "- download or clone related repos, linked below.\n", - "- setup the repositories. 
```python setup.py install```\n", - "- to checkout right commit versions (given next to the model in the models page).\n", - "- to set the file paths below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n", - "\n", - "Known Issues:\n", - "- To load the model second time you need to restart the notebook kernel. \n", - "- Some of the advance methods are not yet implemented for Tensorflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "\n", - "import sys\n", - "import io\n", - "import torch \n", - "import tensorflow as tf\n", - "print(tf.config.list_physical_devices('GPU'))\n", - "\n", - "import time\n", - "import json\n", - "import yaml\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "import matplotlib.pyplot as plt\n", - "plt.rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.tf.models.tacotron2 import Tacotron2\n", - "from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, 
mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # coorect the normalization differences b/w TTS and the Vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " print(mel_postnet_spec.shape)\n", - " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n", - " if use_cuda and not use_gl:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " waveform = waveform.squeeze()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=True)) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# Set constants\n", - "ROOT_PATH = '../torch_model/'\n", - "MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "# Run FLAGs\n", 
- "use_cuda = True # use the available GPU (only for torch)\n", - "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "BACKEND = 'tf' # set the backend for inference " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n", - "from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n", - "c = CONFIG\n", - "num_speakers = 0\n", - "r = 1\n", - "num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, num_speakers, c)\n", - "\n", - "# before loading weights you need to run the model once to generate the variables\n", - "input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n", - "mel_pred = model(input_ids, training=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "model = load_checkpoint(model, MODEL_PATH)\n", - "# model = tf.function(model, experimental_relax_shapes=True)\n", - "ap = AudioProcessor(**CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# wrapper class to use tf.function\n", - "class ModelInference(tf.keras.Model):\n", - " def __init__(self, model):\n", - " super(ModelInference, self).__init__()\n", - " self.model = model\n", - " \n", - " @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n", - " def call(self, characters):\n", - " return self.model(characters, training=False)\n", - " \n", - "model = ModelInference(model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# LOAD WAVERNN\n", - "if use_gl == False:\n", - " from 
parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n", - " \n", - " vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", - " vocoder_model.remove_weight_norm()\n", - " ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", - " if use_cuda:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval();\n", - " print(count_parameters(vocoder_model))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparision with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### https://espnet.github.io/icassp2020-tts/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The Commission also recommends\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Other examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. 
\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This cake is great. 
It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, 
figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, 
CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe 
.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "wavs = []\n", - "model.eval()\n", - "model.decoder.prenet.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "# model.decoder.prenet.train()\n", - 
"speaker_id = None\n", - "sentence = '''This is App Store Optimization report.\n", - "The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and what’s new section.\n", - "In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n", - "Product name is the actual app name on the AppStore or Google Play Store.\n", - "Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n", - "Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n", - "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", - "If you scroll down in the widget, you can see the older app versions for the same apps. 
Or you can filter Datetime to see a specific timeframe and the apps’ Store presence back then.\n", - "You can also filter for a specific app using Product Name.\n", - "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", - "'''\n", - "\n", - "for s in sentence.split('\\n'):\n", - " print(s)\n", - " align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n", - " wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tts_namespace/README.md b/tts_namespace/README.md deleted file mode 100644 index c5b2ddbf..00000000 --- a/tts_namespace/README.md +++ /dev/null @@ -1,29 +0,0 @@ -This folder contains a symlink called TTS to the parent folder: - - lrwxr-xr-x TTS -> .. - -This is used to appease the distribute/setuptools gods. When the project was -initially set up, the repository folder itself was considered a namespace, and -development was done with `sys.path` hacks. This means if you tried to install -TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of - `TTS.models`, `TTS.utils`... - -Installing TTS would then pollute the package namespace with generic names like -those above. 
In order to make things installable in both install and development -modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed -to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect -using `packages_dir` in `setup.py` is not enough because it breaks the editable -installation, which can only handle the simplest of `package_dir` redirects. - -Our solution is to use a symlink in order to add the extra `TTS` namespace. In -`setup.py`, we only look for packages inside `tts_namespace` (this folder), -which contains a symlink called TTS pointing to the repository root. The final -result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`... - -With this hack, `pip install -e` will then add a symlink to the `tts_namespace` -in your `site-packages` folder, which works properly. It's important not to add -anything else in this folder because it will pollute the package namespace when -installing the project. - -This does not work if you check out your project on a filesystem that does not -support symlinks. \ No newline at end of file diff --git a/tts_namespace/TTS b/tts_namespace/TTS deleted file mode 120000 index a96aa0ea..00000000 --- a/tts_namespace/TTS +++ /dev/null @@ -1 +0,0 @@ -.. 
\ No newline at end of file diff --git a/utils/.generic_utils.py.swo b/utils/.generic_utils.py.swo deleted file mode 100644 index ab1b3870..00000000 Binary files a/utils/.generic_utils.py.swo and /dev/null differ diff --git a/utils/.model.py.swp b/utils/.model.py.swp deleted file mode 100644 index 24a8152e..00000000 Binary files a/utils/.model.py.swp and /dev/null differ diff --git a/utils/generic_utils.py b/utils/generic_utils.py index e6466e0c..3bb99e08 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -107,15 +107,15 @@ def sequence_mask(sequence_length, max_len=None): return seq_range_expand < seq_length_expand -def set_init_dict(model_dict, checkpoint, c): +def set_init_dict(model_dict, checkpoint_state, c): # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint['model'].items(): + for k, v in checkpoint_state.items(): if k not in model_dict: print(" | > Layer missing in the model definition: {}".format(k)) # 1. filter out unnecessary keys pretrained_dict = { k: v - for k, v in checkpoint['model'].items() if k in model_dict + for k, v in checkpoint_state.items() if k in model_dict } # 2. 
filter out different size layers pretrained_dict = { @@ -146,9 +146,12 @@ def setup_model(num_chars, num_speakers, c): model = MyModel(num_chars=num_chars, num_speakers=num_speakers, r=c.r, - postnet_output_dim=c.audio['num_freq'], + postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), decoder_output_dim=c.audio['num_mels'], gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], memory_size=c.memory_size, attn_type=c.attention_type, attn_win=c.windowing, @@ -161,13 +164,19 @@ def setup_model(num_chars, num_speakers, c): location_attn=c.location_attn, attn_K=c.attention_heads, separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder) + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r) elif c.model.lower() == "tacotron2": model = MyModel(num_chars=num_chars, num_speakers=num_speakers, r=c.r, postnet_output_dim=c.audio['num_mels'], decoder_output_dim=c.audio['num_mels'], + gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], attn_type=c.attention_type, attn_win=c.windowing, attn_norm=c.attention_norm, @@ -179,7 +188,9 @@ def setup_model(num_chars, num_speakers, c): location_attn=c.location_attn, attn_K=c.attention_heads, separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder) + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r) return model class KeepAverage(): @@ -198,14 +209,19 @@ class KeepAverage(): self.iters[name] = init_iter def update_value(self, name, value, weighted_avg=False): - if weighted_avg: - self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value - self.iters[name] += 1 + if name not in self.avg_values: + # add value if not exist before + 
self.add_value(name, init_val=value) else: - self.avg_values[name] = self.avg_values[name] * \ - self.iters[name] + value - self.iters[name] += 1 - self.avg_values[name] /= self.iters[name] + # else update existing value + if weighted_avg: + self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value + self.iters[name] += 1 + else: + self.avg_values[name] = self.avg_values[name] * \ + self.iters[name] + value + self.iters[name] += 1 + self.avg_values[name] /= self.iters[name] def add_values(self, name_dict): for key, value in name_dict.items(): @@ -242,7 +258,7 @@ def check_config(c): # audio processing parameters _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) - _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') @@ -268,6 +284,7 @@ def check_config(c): _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100) _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) _check_argument('trim_db', c['audio'], restricted=True, val_type=int) @@ -309,6 +326,8 @@ def check_config(c): _check_argument('transition_agent', c, restricted=True, val_type=bool) _check_argument('location_attn', c, 
restricted=True, val_type=bool) _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool) + _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) # stopnet _check_argument('stopnet', c, restricted=True, val_type=bool) @@ -316,6 +335,7 @@ def check_config(c): # tensorboard _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) _check_argument('checkpoint', c, restricted=True, val_type=bool) _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) @@ -334,10 +354,16 @@ def check_config(c): # paths _check_argument('output_path', c, restricted=True, val_type=str) - # multi-speaker gst + # multi-speaker _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) - _check_argument('style_wav_for_test', c, restricted=True, val_type=str) + + # GST _check_argument('use_gst', c, restricted=True, val_type=bool) + _check_argument('gst', c, restricted=True, val_type=dict) + _check_argument('gst_style_input', c['gst'], restricted=True, val_type=str) + _check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1) + _check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) + _check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) # datasets - checking only the first entry _check_argument('datasets', c, restricted=True, val_type=list) diff --git a/utils/text/number_norm.py b/utils/text/number_norm.py deleted file mode 100644 index d3d9a46b..00000000 --- a/utils/text/number_norm.py +++ /dev/null @@ -1,127 +0,0 @@ -import re - -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') - -_units = [ - '', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', - 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', - 'seventeen', 'eighteen', 'nineteen' -] - -_tens = [ - '', - 'ten', - 'twenty', - 'thirty', - 'forty', - 'fifty', - 'sixty', - 'seventy', - 'eighty', - 'ninety', -] - -_digit_groups = [ - '', - 'thousand', - 'million', - 'billion', - 'trillion', - 'quadrillion', -] - -_ordinal_suffixes = [ - ('one', 'first'), - ('two', 'second'), - ('three', 'third'), - ('five', 'fifth'), - ('eight', 'eighth'), - ('nine', 'ninth'), - ('twelve', 'twelfth'), - ('ty', 'tieth'), -] - - -def _remove_commas(m): - return m.group(1).replace(',', '') - - -def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') - - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - if dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - if cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - return 'zero dollars' - - -def _standard_number_to_words(n, digit_group): - parts = [] - if n >= 1000: - # Format next higher digit group. 
- parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) - n = n % 1000 - - if n >= 100: - parts.append('%s hundred' % _units[n // 100]) - if n % 100 >= len(_units): - parts.append(_tens[(n % 100) // 10]) - parts.append(_units[(n % 100) % 10]) - else: - parts.append(_units[n % 100]) - if n > 0: - parts.append(_digit_groups[digit_group]) - return ' '.join([x for x in parts if x]) - - -def _number_to_words(n): - # Handle special cases first, then go to the standard case: - if n >= 1000000000000000000: - return str(n) # Too large, just return the digits - if n == 0: - return 'zero' - if n % 100 == 0 and n % 1000 != 0 and n < 3000: - return _standard_number_to_words(n // 100, 0) + ' hundred' - return _standard_number_to_words(n, 0) - - -def _expand_number(m): - return _number_to_words(int(m.group(0))) - - -def _expand_ordinal(m): - num = _number_to_words(int(m.group(1))) - for suffix, replacement in _ordinal_suffixes: - if num.endswith(suffix): - return num[:-len(suffix)] + replacement - return num + 'th' - - -def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text