diff --git a/.travis.yml b/.travis.yml index 83ba25a3..5f20cb78 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,5 +17,9 @@ matrix: python: "3.6" install: pip install --quiet -r requirements_tests.txt env: TEST_SUITE="unittest" + - name: "Unit tests" + python: "3.6" + install: pip install --quiet -r requirements_tests.txt + env: TEST_SUITE="testscripts" script: ./.travis/script diff --git a/.travis/script b/.travis/script index 76d74aea..c793d9e7 100755 --- a/.travis/script +++ b/.travis/script @@ -14,9 +14,15 @@ if [[ "$TEST_SUITE" == "unittest" ]]; then pushd tts_namespace nosetests TTS.speaker_encoder.tests --nocapture nosetests TTS.vocoder.tests --nocapture - nosetests TTS.tests --nocapture - nosetests TTS.tf.tests --nocapture + nosetests TTS.tts.tests --nocapture + nosetests TTS.tts.tf.tests --nocapture popd - # Test server package - ./tests/test_server_package.sh +fi + +if [[ "$TEST_SUITE" == "testscripts" ]]; then + # Test server package + ./tts/tests/test_server_package.sh + # test model training scripts + ./tts/tests/test_tts_train.sh + ./vocoder/tests/test_vocoder_train.sh fi diff --git a/compute_statistics.py b/compute_statistics.py deleted file mode 100755 index 399ae512..00000000 --- a/compute_statistics.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import argparse - -import numpy as np -from tqdm import tqdm - -from TTS.datasets.preprocess import load_meta_data -from TTS.utils.io import load_config -from TTS.utils.audio import AudioProcessor - -def main(): - """Run preprocessing process.""" - parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features.") - parser.add_argument("--config_path", type=str, required=True, - help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") - args = parser.parse_args() - - # load config - CONFIG = 
load_config(args.config_path) - CONFIG.audio['signal_norm'] = False # do not apply earlier normalization - CONFIG.audio['stats_path'] = None # discard pre-defined stats - - # load audio processor - ap = AudioProcessor(**CONFIG.audio) - - # load the meta data of target dataset - dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data - print(f" > There are {len(dataset_items)} files.") - - mel_sum = 0 - mel_square_sum = 0 - linear_sum = 0 - linear_square_sum = 0 - N = 0 - for item in tqdm(dataset_items): - # compute features - wav = ap.load_wav(item[1]) - linear = ap.spectrogram(wav) - mel = ap.melspectrogram(wav) - - # compute stats - N += mel.shape[1] - mel_sum += mel.sum(1) - linear_sum += linear.sum(1) - mel_square_sum += (mel ** 2).sum(axis=1) - linear_square_sum += (linear ** 2).sum(axis=1) - - mel_mean = mel_sum / N - mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2) - linear_mean = linear_sum / N - linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2) - - output_file_path = os.path.join(args.out_path, "scale_stats.npy") - stats = {} - stats['mel_mean'] = mel_mean - stats['mel_std'] = mel_scale - stats['linear_mean'] = linear_mean - stats['linear_std'] = linear_scale - - print(f' > Avg mel spec mean: {mel_mean.mean()}') - print(f' > Avg mel spec scale: {mel_scale.mean()}') - print(f' > Avg linear spec mean: {linear_mean.mean()}') - print(f' > Avg lienar spec scale: {linear_scale.mean()}') - - # set default config values for mean-var scaling - CONFIG.audio['stats_path'] = output_file_path - CONFIG.audio['signal_norm'] = True - # remove redundant values - del CONFIG.audio['max_norm'] - del CONFIG.audio['min_level_db'] - del CONFIG.audio['symmetric_norm'] - del CONFIG.audio['clip_norm'] - stats['audio_config'] = CONFIG.audio - np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') - - -if __name__ == "__main__": - main() diff --git a/datasets/TTSDataset.py 
b/datasets/TTSDataset.py deleted file mode 100644 index 7fe966d7..00000000 --- a/datasets/TTSDataset.py +++ /dev/null @@ -1,240 +0,0 @@ -import os -import numpy as np -import collections -import torch -import random -from torch.utils.data import Dataset - -from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos -from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target - - -class MyDataset(Dataset): - def __init__(self, - outputs_per_step, - text_cleaner, - compute_linear_spec, - ap, - meta_data, - tp=None, - batch_group_size=0, - min_seq_len=0, - max_seq_len=float("inf"), - use_phonemes=True, - phoneme_cache_path=None, - phoneme_language="en-us", - enable_eos_bos=False, - verbose=False): - """ - Args: - outputs_per_step (int): number of time frames predicted per step. - text_cleaner (str): text cleaner used for the dataset. - compute_linear_spec (bool): compute linear spectrogram if True. - ap (TTS.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - batch_group_size (int): (0) range of batch randomization after sorting - sequences by length. - min_seq_len (int): (0) minimum sequence length to be processed - by the loader. - max_seq_len (int): (float("inf")) maximum sequence length. - use_phonemes (bool): (true) if true, text converted to phonemes. - phoneme_cache_path (str): path to cache phoneme features. - phoneme_language (str): one the languages from - https://github.com/bootphon/phonemizer#languages - enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. - verbose (bool): print diagnostic information. 
- """ - self.batch_group_size = batch_group_size - self.items = meta_data - self.outputs_per_step = outputs_per_step - self.sample_rate = ap.sample_rate - self.cleaners = text_cleaner - self.compute_linear_spec = compute_linear_spec - self.min_seq_len = min_seq_len - self.max_seq_len = max_seq_len - self.ap = ap - self.tp = tp - self.use_phonemes = use_phonemes - self.phoneme_cache_path = phoneme_cache_path - self.phoneme_language = phoneme_language - self.enable_eos_bos = enable_eos_bos - self.verbose = verbose - if use_phonemes and not os.path.isdir(phoneme_cache_path): - os.makedirs(phoneme_cache_path, exist_ok=True) - if self.verbose: - print("\n > DataLoader initialization") - print(" | > Use phonemes: {}".format(self.use_phonemes)) - if use_phonemes: - print(" | > phoneme language: {}".format(phoneme_language)) - print(" | > Number of instances : {}".format(len(self.items))) - self.sort_items() - - def load_wav(self, filename): - audio = self.ap.load_wav(filename) - return audio - - @staticmethod - def load_np(filename): - data = np.load(filename).astype('float32') - return data - - def _generate_and_cache_phoneme_sequence(self, text, cache_path): - """generate a phoneme sequence from text. - since the usage is for subsequent caching, we never add bos and - eos chars here. 
Instead we add those dynamically later; based on the - config option.""" - phonemes = phoneme_to_sequence(text, [self.cleaners], - language=self.phoneme_language, - enable_eos_bos=False, - tp=self.tp) - phonemes = np.asarray(phonemes, dtype=np.int32) - np.save(cache_path, phonemes) - return phonemes - - def _load_or_generate_phoneme_sequence(self, wav_file, text): - file_name = os.path.splitext(os.path.basename(wav_file))[0] - cache_path = os.path.join(self.phoneme_cache_path, - file_name + '_phoneme.npy') - try: - phonemes = np.load(cache_path) - except FileNotFoundError: - phonemes = self._generate_and_cache_phoneme_sequence(text, - cache_path) - except (ValueError, IOError): - print(" > ERROR: failed loading phonemes for {}. " - "Recomputing.".format(wav_file)) - phonemes = self._generate_and_cache_phoneme_sequence(text, - cache_path) - if self.enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes, tp=self.tp) - phonemes = np.asarray(phonemes, dtype=np.int32) - return phonemes - - def load_data(self, idx): - text, wav_file, speaker_name = self.items[idx] - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) - - if self.use_phonemes: - text = self._load_or_generate_phoneme_sequence(wav_file, text) - else: - text = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp), dtype=np.int32) - - assert text.size > 0, self.items[idx][1] - assert wav.size > 0, self.items[idx][1] - - sample = { - 'text': text, - 'wav': wav, - 'item_idx': self.items[idx][1], - 'speaker_name': speaker_name - } - return sample - - def sort_items(self): - r"""Sort instances based on text length in ascending order""" - lengths = np.array([len(ins[0]) for ins in self.items]) - - idxs = np.argsort(lengths) - new_items = [] - ignored = [] - for i, idx in enumerate(idxs): - length = lengths[idx] - if length < self.min_seq_len or length > self.max_seq_len: - ignored.append(idx) - else: - new_items.append(self.items[idx]) - # shuffle batch groups - if self.batch_group_size > 0: - 
for i in range(len(new_items) // self.batch_group_size): - offset = i * self.batch_group_size - end_offset = offset + self.batch_group_size - temp_items = new_items[offset:end_offset] - random.shuffle(temp_items) - new_items[offset:end_offset] = temp_items - self.items = new_items - - if self.verbose: - print(" | > Max length sequence: {}".format(np.max(lengths))) - print(" | > Min length sequence: {}".format(np.min(lengths))) - print(" | > Avg length sequence: {}".format(np.mean(lengths))) - print(" | > Num. instances discarded by max-min (max={}, min={}) seq limits: {}".format( - self.max_seq_len, self.min_seq_len, len(ignored))) - print(" | > Batch group size: {}.".format(self.batch_group_size)) - - def __len__(self): - return len(self.items) - - def __getitem__(self, idx): - return self.load_data(idx) - - def collate_fn(self, batch): - r""" - Perform preprocessing and create a final data batch: - 1. Sort batch instances by text-length - 2. Convert Audio signal to Spectrograms. - 3. PAD sequences wrt r. - 4. Load to Torch. - """ - - # Puts each data field into a tensor with outer dimension batch size - if isinstance(batch[0], collections.Mapping): - - text_lenghts = np.array([len(d["text"]) for d in batch]) - - # sort items with text input length for RNN efficiency - text_lenghts, ids_sorted_decreasing = torch.sort( - torch.LongTensor(text_lenghts), dim=0, descending=True) - - wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing] - item_idxs = [ - batch[idx]['item_idx'] for idx in ids_sorted_decreasing - ] - text = [batch[idx]['text'] for idx in ids_sorted_decreasing] - speaker_name = [batch[idx]['speaker_name'] - for idx in ids_sorted_decreasing] - - # compute features - mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] - - mel_lengths = [m.shape[1] for m in mel] - - # compute 'stop token' targets - stop_targets = [ - np.array([0.] 
* (mel_len - 1) + [1.]) for mel_len in mel_lengths - ] - - # PAD stop targets - stop_targets = prepare_stop_target(stop_targets, - self.outputs_per_step) - - # PAD sequences with longest instance in the batch - text = prepare_data(text).astype(np.int32) - - # PAD features with longest instance - mel = prepare_tensor(mel, self.outputs_per_step) - - # B x D x T --> B x T x D - mel = mel.transpose(0, 2, 1) - - # convert things to pytorch - text_lenghts = torch.LongTensor(text_lenghts) - text = torch.LongTensor(text) - mel = torch.FloatTensor(mel).contiguous() - mel_lengths = torch.LongTensor(mel_lengths) - stop_targets = torch.FloatTensor(stop_targets) - - # compute linear spectrogram - if self.compute_linear_spec: - linear = [self.ap.spectrogram(w).astype('float32') for w in wav] - linear = prepare_tensor(linear, self.outputs_per_step) - linear = linear.transpose(0, 2, 1) - assert mel.shape[1] == linear.shape[1] - linear = torch.FloatTensor(linear).contiguous() - else: - linear = None - return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ - stop_targets, item_idxs - - raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ - found {}".format(type(batch[0])))) diff --git a/datasets/preprocess.py b/datasets/preprocess.py deleted file mode 100644 index e8700c6b..00000000 --- a/datasets/preprocess.py +++ /dev/null @@ -1,207 +0,0 @@ -import os -from glob import glob -import re -import sys -from TTS.utils.generic_utils import split_dataset - - -def load_meta_data(datasets): - meta_data_train_all = [] - meta_data_eval_all = [] - for dataset in datasets: - name = dataset['name'] - root_path = dataset['path'] - meta_file_train = dataset['meta_file_train'] - meta_file_val = dataset['meta_file_val'] - preprocessor = get_preprocessor_by_name(name) - - meta_data_train = preprocessor(root_path, meta_file_train) - if meta_file_val is None: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - else: - meta_data_eval = 
preprocessor(root_path, meta_file_val) - meta_data_train_all += meta_data_train - meta_data_eval_all += meta_data_eval - return meta_data_train_all, meta_data_eval_all - - -def get_preprocessor_by_name(name): - """Returns the respective preprocessing function.""" - thismodule = sys.modules[__name__] - return getattr(thismodule, name.lower()) - - -def tweb(root_path, meta_file): - """Normalize TWEB dataset. - https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset - """ - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "tweb" - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('\t') - wav_file = os.path.join(root_path, cols[0] + '.wav') - text = cols[1] - items.append([text, wav_file, speaker_name]) - return items - - -# def kusal(root_path, meta_file): -# txt_file = os.path.join(root_path, meta_file) -# texts = [] -# wavs = [] -# with open(txt_file, "r", encoding="utf8") as f: -# frames = [ -# line.split('\t') for line in f -# if line.split('\t')[0] in self.wav_files_dict.keys() -# ] -# # TODO: code the rest -# return {'text': texts, 'wavs': wavs} - - -def mozilla(root_path, meta_file): - """Normalizes Mozilla meta data files to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "mozilla" - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('|') - wav_file = cols[1].strip() - text = cols[0].strip() - wav_file = os.path.join(root_path, "wavs", wav_file) - items.append([text, wav_file, speaker_name]) - return items - - -def mozilla_de(root_path, meta_file): - """Normalizes Mozilla meta data files to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "mozilla" - with open(txt_file, 'r', encoding="ISO 8859-1") as ttf: - for line in ttf: - cols = line.strip().split('|') - wav_file = cols[0].strip() - text = cols[1].strip() - folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL" - wav_file = 
os.path.join(root_path, folder_name, wav_file) - items.append([text, wav_file, speaker_name]) - return items - - -def mailabs(root_path, meta_files=None): - """Normalizes M-AI-Labs meta data files to TTS format""" - speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") - if meta_files is None: - csv_files = glob(root_path+"/**/metadata.csv", recursive=True) - else: - csv_files = meta_files - # meta_files = [f.strip() for f in meta_files.split(",")] - items = [] - for csv_file in csv_files: - txt_file = os.path.join(root_path, csv_file) - folder = os.path.dirname(txt_file) - # determine speaker based on folder structure... - speaker_name_match = speaker_regex.search(txt_file) - if speaker_name_match is None: - continue - speaker_name = speaker_name_match.group("speaker_name") - print(" | > {}".format(csv_file)) - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('|') - if meta_files is None: - wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav') - else: - wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav') - if os.path.isfile(wav_file): - text = cols[1].strip() - items.append([text, wav_file, speaker_name]) - else: - raise RuntimeError("> File %s does not exist!"%(wav_file)) - return items - - -def ljspeech(root_path, meta_file): - """Normalizes the Nancy meta data file to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "ljspeech" - with open(txt_file, 'r') as ttf: - for line in ttf: - cols = line.split('|') - wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav') - text = cols[1] - items.append([text, wav_file, speaker_name]) - return items - - -def nancy(root_path, meta_file): - """Normalizes the Nancy meta data file to TTS format""" - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "nancy" - with open(txt_file, 'r') as ttf: - for line in ttf: - utt_id = line.split()[1] - text = line[line.find('"') + 
1:line.rfind('"') - 1] - wav_file = os.path.join(root_path, "wavn", utt_id + ".wav") - items.append([text, wav_file, speaker_name]) - return items - - -def common_voice(root_path, meta_file): - """Normalize the common voice meta data file to TTS format.""" - txt_file = os.path.join(root_path, meta_file) - items = [] - with open(txt_file, 'r') as ttf: - for line in ttf: - if line.startswith("client_id"): - continue - cols = line.split("\t") - text = cols[2] - speaker_name = cols[0] - wav_file = os.path.join(root_path, "clips", cols[1] + ".wav") - items.append([text, wav_file, speaker_name]) - return items - - -def libri_tts(root_path, meta_files=None): - """https://ai.google/tools/datasets/libri-tts/""" - items = [] - if meta_files is None: - meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True) - for meta_file in meta_files: - _meta_file = os.path.basename(meta_file).split('.')[0] - speaker_name = _meta_file.split('_')[0] - chapter_id = _meta_file.split('_')[1] - _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}") - with open(meta_file, 'r') as ttf: - for line in ttf: - cols = line.split('\t') - wav_file = os.path.join(_root_path, cols[0] + '.wav') - text = cols[1] - items.append([text, wav_file, speaker_name]) - for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" - return items - - -def custom_turkish(root_path, meta_file): - txt_file = os.path.join(root_path, meta_file) - items = [] - speaker_name = "turkish-female" - skipped_files = [] - with open(txt_file, 'r', encoding='utf-8') as ttf: - for line in ttf: - cols = line.split('|') - wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav') - if not os.path.exists(wav_file): - skipped_files.append(wav_file) - continue - text = cols[1].strip() - items.append([text, wav_file, speaker_name]) - print(f" [!] {len(skipped_files)} files skipped. 
They don't exist...") - return items diff --git a/distribute.py b/distribute.py deleted file mode 100644 index b0fc8b07..00000000 --- a/distribute.py +++ /dev/null @@ -1,178 +0,0 @@ -# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py -import os, sys -import math -import time -import subprocess -import argparse -import torch -import torch.distributed as dist -from torch.utils.data.sampler import Sampler -from torch.autograd import Variable -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from TTS.utils.generic_utils import create_experiment_folder - - -class DistributedSampler(Sampler): - """ - Non shuffling Distributed Sampler - """ - - def __init__(self, dataset, num_replicas=None, rank=None): - super(DistributedSampler, self).__init__(dataset) - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - indices = torch.arange(len(self.dataset)).tolist() - - # add extra samples to make it evenly divisible - indices += indices[:(self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank:self.total_size:self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch - - -def reduce_tensor(tensor, num_gpus): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.reduce_op.SUM) - rt /= num_gpus - return rt - - -def 
init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): - assert torch.cuda.is_available(), "Distributed mode requires CUDA." - - # Set cuda device so everything is done on the right GPU. - torch.cuda.set_device(rank % torch.cuda.device_count()) - - # Initialize distributed communication - dist.init_process_group( - dist_backend, - init_method=dist_url, - world_size=num_gpus, - rank=rank, - group_name=group_name) - - -def apply_gradient_allreduce(module): - - # sync model parameters - for p in module.state_dict().values(): - if not torch.is_tensor(p): - continue - dist.broadcast(p, 0) - - def allreduce_params(): - if module.needs_reduction: - module.needs_reduction = False - # bucketing params based on value types - buckets = {} - for param in module.parameters(): - if param.requires_grad and param.grad is not None: - tp = type(param.data) - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - dist.all_reduce(coalesced, op=dist.reduce_op.SUM) - coalesced /= dist.get_world_size() - for buf, synced in zip( - grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) - - for param in list(module.parameters()): - - def allreduce_hook(*_): - Variable._execution_engine.queue_callback(allreduce_params) - - if param.requires_grad: - param.register_hook(allreduce_hook) - - def set_needs_reduction(self, *_): - self.needs_reduction = True - - module.register_forward_hook(set_needs_reduction) - return module - - -def main(): - """ - Call train.py as a new process and pass command arguments - """ - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. 
If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - args = parser.parse_args() - - # OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name, - # True) - # stdout_path = os.path.join(OUT_PATH, "process_stdout/") - - num_gpus = torch.cuda.device_count() - group_id = time.strftime("%Y_%m_%d-%H%M%S") - - # set arguments for train.py - command = ['train.py'] - command.append('--continue_path={}'.format(args.continue_path)) - command.append('--restore_path={}'.format(args.restore_path)) - command.append('--config_path={}'.format(args.config_path)) - command.append('--group_id=group_{}'.format(group_id)) - command.append('') - - # run processes - processes = [] - for i in range(num_gpus): - my_env = os.environ.copy() - my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) - command[-1] = '--rank={}'.format(i) - stdout = None if i == 0 else open(os.devnull, 'w') - p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env) - processes.append(p) - print(command) - - for p in processes: - p.wait() - - -if __name__ == '__main__': - main() diff --git a/layers/__init__.py b/layers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/layers/common_layers.py b/layers/common_layers.py deleted file mode 100644 index b7d02c2d..00000000 --- a/layers/common_layers.py +++ /dev/null @@ -1,389 +0,0 @@ -import torch -from torch import nn -from torch.autograd import Variable -from torch.nn import functional as F - - -class Linear(nn.Module): - def __init__(self, - in_features, - out_features, - bias=True, - init_gain='linear'): - super(Linear, self).__init__() - self.linear_layer = torch.nn.Linear( - in_features, 
out_features, bias=bias) - self._init_w(init_gain) - - def _init_w(self, init_gain): - torch.nn.init.xavier_uniform_( - self.linear_layer.weight, - gain=torch.nn.init.calculate_gain(init_gain)) - - def forward(self, x): - return self.linear_layer(x) - - -class LinearBN(nn.Module): - def __init__(self, - in_features, - out_features, - bias=True, - init_gain='linear'): - super(LinearBN, self).__init__() - self.linear_layer = torch.nn.Linear( - in_features, out_features, bias=bias) - self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5) - self._init_w(init_gain) - - def _init_w(self, init_gain): - torch.nn.init.xavier_uniform_( - self.linear_layer.weight, - gain=torch.nn.init.calculate_gain(init_gain)) - - def forward(self, x): - out = self.linear_layer(x) - if len(out.shape) == 3: - out = out.permute(1, 2, 0) - out = self.batch_normalization(out) - if len(out.shape) == 3: - out = out.permute(2, 0, 1) - return out - - -class Prenet(nn.Module): - def __init__(self, - in_features, - prenet_type="original", - prenet_dropout=True, - out_features=[256, 256], - bias=True): - super(Prenet, self).__init__() - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - in_features = [in_features] + out_features[:-1] - if prenet_type == "bn": - self.linear_layers = nn.ModuleList([ - LinearBN(in_size, out_size, bias=bias) - for (in_size, out_size) in zip(in_features, out_features) - ]) - elif prenet_type == "original": - self.linear_layers = nn.ModuleList([ - Linear(in_size, out_size, bias=bias) - for (in_size, out_size) in zip(in_features, out_features) - ]) - - def forward(self, x): - for linear in self.linear_layers: - if self.prenet_dropout: - x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training) - else: - x = F.relu(linear(x)) - return x - - -#################### -# ATTENTION MODULES -#################### - - -class LocationLayer(nn.Module): - def __init__(self, - attention_dim, - attention_n_filters=32, - 
attention_kernel_size=31): - super(LocationLayer, self).__init__() - self.location_conv1d = nn.Conv1d( - in_channels=2, - out_channels=attention_n_filters, - kernel_size=attention_kernel_size, - stride=1, - padding=(attention_kernel_size - 1) // 2, - bias=False) - self.location_dense = Linear( - attention_n_filters, attention_dim, bias=False, init_gain='tanh') - - def forward(self, attention_cat): - processed_attention = self.location_conv1d(attention_cat) - processed_attention = self.location_dense( - processed_attention.transpose(1, 2)) - return processed_attention - - -class GravesAttention(nn.Module): - """ Discretized Graves attention: - - https://arxiv.org/abs/1910.10288 - - https://arxiv.org/pdf/1906.01083.pdf - """ - COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi)) - - def __init__(self, query_dim, K): - super(GravesAttention, self).__init__() - self._mask_value = 1e-8 - self.K = K - # self.attention_alignment = 0.05 - self.eps = 1e-5 - self.J = None - self.N_a = nn.Sequential( - nn.Linear(query_dim, query_dim, bias=True), - nn.ReLU(), - nn.Linear(query_dim, 3*K, bias=True)) - self.attention_weights = None - self.mu_prev = None - self.init_layers() - - def init_layers(self): - torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.) 
# bias mean - torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10) # bias std - - def init_states(self, inputs): - if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]: - self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5 - self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device) - self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device) - - # pylint: disable=R0201 - # pylint: disable=unused-argument - def preprocess_inputs(self, inputs): - return None - - def forward(self, query, inputs, processed_inputs, mask): - """ - shapes: - query: B x D_attention_rnn - inputs: B x T_in x D_encoder - processed_inputs: place_holder - mask: B x T_in - """ - gbk_t = self.N_a(query) - gbk_t = gbk_t.view(gbk_t.size(0), -1, self.K) - - # attention model parameters - # each B x K - g_t = gbk_t[:, 0, :] - b_t = gbk_t[:, 1, :] - k_t = gbk_t[:, 2, :] - - # dropout to decorrelate attention heads - g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training) - - # attention GMM parameters - sig_t = torch.nn.functional.softplus(b_t) + self.eps - - mu_t = self.mu_prev + torch.nn.functional.softplus(k_t) - g_t = torch.softmax(g_t, dim=-1) + self.eps - - j = self.J[:inputs.size(1)+1] - - # attention weights - phi_t = g_t.unsqueeze(-1) * (1 / (1 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1)))) - - # discritize attention weights - alpha_t = torch.sum(phi_t, 1) - alpha_t = alpha_t[:, 1:] - alpha_t[:, :-1] - alpha_t[alpha_t == 0] = 1e-8 - - # apply masking - if mask is not None: - alpha_t.data.masked_fill_(~mask, self._mask_value) - - context = torch.bmm(alpha_t.unsqueeze(1), inputs).squeeze(1) - self.attention_weights = alpha_t - self.mu_prev = mu_t - return context - - -class OriginalAttention(nn.Module): - """Following the methods proposed here: - - https://arxiv.org/abs/1712.05884 - - https://arxiv.org/abs/1807.06736 + state masking at inference - - Using sigmoid instead of softmax 
normalization - - Attention windowing at inference time - """ - # Pylint gets confused by PyTorch conventions here - #pylint: disable=attribute-defined-outside-init - def __init__(self, query_dim, embedding_dim, attention_dim, - location_attention, attention_location_n_filters, - attention_location_kernel_size, windowing, norm, forward_attn, - trans_agent, forward_attn_mask): - super(OriginalAttention, self).__init__() - self.query_layer = Linear( - query_dim, attention_dim, bias=False, init_gain='tanh') - self.inputs_layer = Linear( - embedding_dim, attention_dim, bias=False, init_gain='tanh') - self.v = Linear(attention_dim, 1, bias=True) - if trans_agent: - self.ta = nn.Linear( - query_dim + embedding_dim, 1, bias=True) - if location_attention: - self.location_layer = LocationLayer( - attention_dim, - attention_location_n_filters, - attention_location_kernel_size, - ) - self._mask_value = -float("inf") - self.windowing = windowing - self.win_idx = None - self.norm = norm - self.forward_attn = forward_attn - self.trans_agent = trans_agent - self.forward_attn_mask = forward_attn_mask - self.location_attention = location_attention - - def init_win_idx(self): - self.win_idx = -1 - self.win_back = 2 - self.win_front = 6 - - def init_forward_attn(self, inputs): - B = inputs.shape[0] - T = inputs.shape[1] - self.alpha = torch.cat( - [torch.ones([B, 1]), - torch.zeros([B, T])[:, :-1] + 1e-7], dim=1).to(inputs.device) - self.u = (0.5 * torch.ones([B, 1])).to(inputs.device) - - def init_location_attention(self, inputs): - B = inputs.shape[0] - T = inputs.shape[1] - self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_()) - - def init_states(self, inputs): - B = inputs.shape[0] - T = inputs.shape[1] - self.attention_weights = Variable(inputs.data.new(B, T).zero_()) - if self.location_attention: - self.init_location_attention(inputs) - if self.forward_attn: - self.init_forward_attn(inputs) - if self.windowing: - self.init_win_idx() - - def 
preprocess_inputs(self, inputs): - return self.inputs_layer(inputs) - - def update_location_attention(self, alignments): - self.attention_weights_cum += alignments - - def get_location_attention(self, query, processed_inputs): - attention_cat = torch.cat((self.attention_weights.unsqueeze(1), - self.attention_weights_cum.unsqueeze(1)), - dim=1) - processed_query = self.query_layer(query.unsqueeze(1)) - processed_attention_weights = self.location_layer(attention_cat) - energies = self.v( - torch.tanh(processed_query + processed_attention_weights + - processed_inputs)) - energies = energies.squeeze(-1) - return energies, processed_query - - def get_attention(self, query, processed_inputs): - processed_query = self.query_layer(query.unsqueeze(1)) - energies = self.v(torch.tanh(processed_query + processed_inputs)) - energies = energies.squeeze(-1) - return energies, processed_query - - def apply_windowing(self, attention, inputs): - back_win = self.win_idx - self.win_back - front_win = self.win_idx + self.win_front - if back_win > 0: - attention[:, :back_win] = -float("inf") - if front_win < inputs.shape[1]: - attention[:, front_win:] = -float("inf") - # this is a trick to solve a special problem. - # but it does not hurt. - if self.win_idx == -1: - attention[:, 0] = attention.max() - # Update the window - self.win_idx = torch.argmax(attention, 1).long()[0].item() - return attention - - def apply_forward_attention(self, alignment): - # forward attention - fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device), - (1, 0, 0, 0)) - # compute transition potentials - alpha = ((1 - self.u) * self.alpha - + self.u * fwd_shifted_alpha - + 1e-8) * alignment - # force incremental alignment - if not self.training and self.forward_attn_mask: - _, n = fwd_shifted_alpha.max(1) - val, n2 = alpha.max(1) - for b in range(alignment.shape[0]): - alpha[b, n[b] + 3:] = 0 - alpha[b, :( - n[b] - 1 - )] = 0 # ignore all previous states to prevent repetition. 
- alpha[b, - (n[b] - 2 - )] = 0.01 * val[b] # smoothing factor for the prev step - # renormalize attention weights - alpha = alpha / alpha.sum(dim=1, keepdim=True) - return alpha - - def forward(self, query, inputs, processed_inputs, mask): - """ - shapes: - query: B x D_attn_rnn - inputs: B x T_en x D_en - processed_inputs:: B x T_en x D_attn - mask: B x T_en - """ - if self.location_attention: - attention, _ = self.get_location_attention( - query, processed_inputs) - else: - attention, _ = self.get_attention( - query, processed_inputs) - # apply masking - if mask is not None: - attention.data.masked_fill_(~mask, self._mask_value) - # apply windowing - only in eval mode - if not self.training and self.windowing: - attention = self.apply_windowing(attention, inputs) - - # normalize attention values - if self.norm == "softmax": - alignment = torch.softmax(attention, dim=-1) - elif self.norm == "sigmoid": - alignment = torch.sigmoid(attention) / torch.sigmoid( - attention).sum( - dim=1, keepdim=True) - else: - raise ValueError("Unknown value for attention norm type") - - if self.location_attention: - self.update_location_attention(alignment) - - # apply forward attention if enabled - if self.forward_attn: - alignment = self.apply_forward_attention(alignment) - self.alpha = alignment - - context = torch.bmm(alignment.unsqueeze(1), inputs) - context = context.squeeze(1) - self.attention_weights = alignment - - # compute transition agent - if self.forward_attn and self.trans_agent: - ta_input = torch.cat([context, query.squeeze(1)], dim=-1) - self.u = torch.sigmoid(self.ta(ta_input)) - return context - - -def init_attn(attn_type, query_dim, embedding_dim, attention_dim, - location_attention, attention_location_n_filters, - attention_location_kernel_size, windowing, norm, forward_attn, - trans_agent, forward_attn_mask, attn_K): - if attn_type == "original": - return OriginalAttention(query_dim, embedding_dim, attention_dim, - location_attention, - 
attention_location_n_filters, - attention_location_kernel_size, windowing, - norm, forward_attn, trans_agent, - forward_attn_mask) - if attn_type == "graves": - return GravesAttention(query_dim, attn_K) - raise RuntimeError( - " [!] Given Attention Type '{attn_type}' is not exist.") diff --git a/layers/gst_layers.py b/layers/gst_layers.py deleted file mode 100644 index 8058d5ed..00000000 --- a/layers/gst_layers.py +++ /dev/null @@ -1,169 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class GST(nn.Module): - """Global Style Token Module for factorizing prosody in speech. - - See https://arxiv.org/pdf/1803.09017""" - - def __init__(self, num_mel, num_heads, num_style_tokens, embedding_dim): - super().__init__() - self.encoder = ReferenceEncoder(num_mel, embedding_dim) - self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, - embedding_dim) - - def forward(self, inputs): - enc_out = self.encoder(inputs) - style_embed = self.style_token_layer(enc_out) - - return style_embed - - -class ReferenceEncoder(nn.Module): - """NN module creating a fixed size prosody embedding from a spectrogram. 
- - inputs: mel spectrograms [batch_size, num_spec_frames, num_mel] - outputs: [batch_size, embedding_dim] - """ - - def __init__(self, num_mel, embedding_dim): - - super().__init__() - self.num_mel = num_mel - filters = [1] + [32, 32, 64, 64, 128, 128] - num_layers = len(filters) - 1 - convs = [ - nn.Conv2d( - in_channels=filters[i], - out_channels=filters[i + 1], - kernel_size=(3, 3), - stride=(2, 2), - padding=(1, 1)) for i in range(num_layers) - ] - self.convs = nn.ModuleList(convs) - self.bns = nn.ModuleList([ - nn.BatchNorm2d(num_features=filter_size) - for filter_size in filters[1:] - ]) - - post_conv_height = self.calculate_post_conv_height( - num_mel, 3, 2, 1, num_layers) - self.recurrence = nn.GRU( - input_size=filters[-1] * post_conv_height, - hidden_size=embedding_dim // 2, - batch_first=True) - - def forward(self, inputs): - batch_size = inputs.size(0) - x = inputs.view(batch_size, 1, -1, self.num_mel) - # x: 4D tensor [batch_size, num_channels==1, num_frames, num_mel] - for conv, bn in zip(self.convs, self.bns): - x = conv(x) - x = bn(x) - x = F.relu(x) - - x = x.transpose(1, 2) - # x: 4D tensor [batch_size, post_conv_width, - # num_channels==128, post_conv_height] - post_conv_width = x.size(1) - x = x.contiguous().view(batch_size, post_conv_width, -1) - # x: 3D tensor [batch_size, post_conv_width, - # num_channels*post_conv_height] - self.recurrence.flatten_parameters() - memory, out = self.recurrence(x) - # out: 3D tensor [seq_len==1, batch_size, encoding_size=128] - - return out.squeeze(0) - - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, - n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - - -class StyleTokenLayer(nn.Module): - """NN Module attending to style tokens based on prosody encodings.""" - - def __init__(self, num_heads, num_style_tokens, - embedding_dim): - 
super().__init__() - self.query_dim = embedding_dim // 2 - self.key_dim = embedding_dim // num_heads - self.style_tokens = nn.Parameter( - torch.FloatTensor(num_style_tokens, self.key_dim)) - nn.init.orthogonal_(self.style_tokens) - self.attention = MultiHeadAttention( - query_dim=self.query_dim, - key_dim=self.key_dim, - num_units=embedding_dim, - num_heads=num_heads) - - def forward(self, inputs): - batch_size = inputs.size(0) - prosody_encoding = inputs.unsqueeze(1) - # prosody_encoding: 3D tensor [batch_size, 1, encoding_size==128] - tokens = torch.tanh(self.style_tokens) \ - .unsqueeze(0) \ - .expand(batch_size, -1, -1) - # tokens: 3D tensor [batch_size, num tokens, token embedding size] - style_embed = self.attention(prosody_encoding, tokens) - - return style_embed - - -class MultiHeadAttention(nn.Module): - ''' - input: - query --- [N, T_q, query_dim] - key --- [N, T_k, key_dim] - output: - out --- [N, T_q, num_units] - ''' - - def __init__(self, query_dim, key_dim, num_units, num_heads): - - super().__init__() - self.num_units = num_units - self.num_heads = num_heads - self.key_dim = key_dim - - self.W_query = nn.Linear( - in_features=query_dim, out_features=num_units, bias=False) - self.W_key = nn.Linear( - in_features=key_dim, out_features=num_units, bias=False) - self.W_value = nn.Linear( - in_features=key_dim, out_features=num_units, bias=False) - - def forward(self, query, key): - queries = self.W_query(query) # [N, T_q, num_units] - keys = self.W_key(key) # [N, T_k, num_units] - values = self.W_value(key) - - split_size = self.num_units // self.num_heads - queries = torch.stack( - torch.split(queries, split_size, dim=2), - dim=0) # [h, N, T_q, num_units/h] - keys = torch.stack( - torch.split(keys, split_size, dim=2), - dim=0) # [h, N, T_k, num_units/h] - values = torch.stack( - torch.split(values, split_size, dim=2), - dim=0) # [h, N, T_k, num_units/h] - - # score = softmax(QK^T / (d_k ** 0.5)) - scores = torch.matmul(queries, keys.transpose(2, 3)) # 
[h, N, T_q, T_k] - scores = scores / (self.key_dim**0.5) - scores = F.softmax(scores, dim=3) - - # out = score * V - out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat( - torch.split(out, 1, dim=0), - dim=3).squeeze(0) # [N, T_q, num_units] - - return out diff --git a/layers/losses.py b/layers/losses.py deleted file mode 100644 index f7745b6e..00000000 --- a/layers/losses.py +++ /dev/null @@ -1,246 +0,0 @@ -import numpy as np -import torch -from torch import nn -from torch.nn import functional -from TTS.utils.generic_utils import sequence_mask - - -class L1LossMasked(nn.Module): - - def __init__(self, seq_len_norm): - super(L1LossMasked, self).__init__() - self.seq_len_norm = seq_len_norm - - def forward(self, x, target, length): - """ - Args: - x: A Variable containing a FloatTensor of size - (batch, max_len, dim) which contains the - unnormalized probability for each class. - target: A Variable containing a LongTensor of size - (batch, max_len, dim) which contains the index of the true - class for each corresponding step. - length: A Variable containing a LongTensor of size (batch,) - which contains the length of each data in a batch. - Returns: - loss: An average loss value in range [0, 1] masked by the length. 
- """ - # mask: (batch, max_len, 1) - target.requires_grad = False - mask = sequence_mask( - sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - if self.seq_len_norm: - norm_w = mask / mask.sum(dim=1, keepdim=True) - out_weights = norm_w.div(target.shape[0] * target.shape[2]) - mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.to(loss.device)).sum() - else: - mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction='sum') - loss = loss / mask.sum() - return loss - - -class MSELossMasked(nn.Module): - - def __init__(self, seq_len_norm): - super(MSELossMasked, self).__init__() - self.seq_len_norm = seq_len_norm - - def forward(self, x, target, length): - """ - Args: - x: A Variable containing a FloatTensor of size - (batch, max_len, dim) which contains the - unnormalized probability for each class. - target: A Variable containing a LongTensor of size - (batch, max_len, dim) which contains the index of the true - class for each corresponding step. - length: A Variable containing a LongTensor of size (batch,) - which contains the length of each data in a batch. - Returns: - loss: An average loss value in range [0, 1] masked by the length. 
- """ - # mask: (batch, max_len, 1) - target.requires_grad = False - mask = sequence_mask( - sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() - if self.seq_len_norm: - norm_w = mask / mask.sum(dim=1, keepdim=True) - out_weights = norm_w.div(target.shape[0] * target.shape[2]) - mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction='none') - loss = loss.mul(out_weights.to(loss.device)).sum() - else: - mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction='sum') - loss = loss / mask.sum() - return loss - - -class AttentionEntropyLoss(nn.Module): - # pylint: disable=R0201 - def forward(self, align): - """ - Forces attention to be more decisive by penalizing - soft attention weights - - TODO: arguments - TODO: unit_test - """ - entropy = torch.distributions.Categorical(probs=align).entropy() - loss = (entropy / np.log(align.shape[1])).mean() - return loss - - -class BCELossMasked(nn.Module): - - def __init__(self, pos_weight): - super(BCELossMasked, self).__init__() - self.pos_weight = pos_weight - - def forward(self, x, target, length): - """ - Args: - x: A Variable containing a FloatTensor of size - (batch, max_len) which contains the - unnormalized probability for each class. - target: A Variable containing a LongTensor of size - (batch, max_len) which contains the index of the true - class for each corresponding step. - length: A Variable containing a LongTensor of size (batch,) - which contains the length of each data in a batch. - Returns: - loss: An average loss value in range [0, 1] masked by the length. 
- """ - # mask: (batch, max_len, 1) - target.requires_grad = False - mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float() - loss = functional.binary_cross_entropy_with_logits( - x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum') - loss = loss / mask.sum() - return loss - - -class GuidedAttentionLoss(torch.nn.Module): - def __init__(self, sigma=0.4): - super(GuidedAttentionLoss, self).__init__() - self.sigma = sigma - - def _make_ga_masks(self, ilens, olens): - B = len(ilens) - max_ilen = max(ilens) - max_olen = max(olens) - ga_masks = torch.zeros((B, max_olen, max_ilen)) - for idx, (ilen, olen) in enumerate(zip(ilens, olens)): - ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma) - return ga_masks - - def forward(self, att_ws, ilens, olens): - ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device) - seq_masks = self._make_masks(ilens, olens).to(att_ws.device) - losses = ga_masks * att_ws - loss = torch.mean(losses.masked_select(seq_masks)) - return loss - - @staticmethod - def _make_ga_mask(ilen, olen, sigma): - grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen)) - grid_x, grid_y = grid_x.float(), grid_y.float() - return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2))) - - @staticmethod - def _make_masks(ilens, olens): - in_masks = sequence_mask(ilens) - out_masks = sequence_mask(olens) - return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) - - -class TacotronLoss(torch.nn.Module): - def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4): - super(TacotronLoss, self).__init__() - self.stopnet_pos_weight = stopnet_pos_weight - self.ga_alpha = c.ga_alpha - self.config = c - # postnet decoder loss - if c.loss_masking: - self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [ - "Tacotron" - ] else MSELossMasked(c.seq_len_norm) - else: - self.criterion = nn.L1Loss() if c.model in ["Tacotron" - ] else nn.MSELoss() - # guided attention loss - 
if c.ga_alpha > 0: - self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma) - # stopnet loss - # pylint: disable=not-callable - self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None - - def forward(self, postnet_output, decoder_output, mel_input, linear_input, - stopnet_output, stopnet_target, output_lens, decoder_b_output, - alignments, alignment_lens, alignments_backwards, input_lens): - - return_dict = {} - # decoder and postnet losses - if self.config.loss_masking: - decoder_loss = self.criterion(decoder_output, mel_input, - output_lens) - if self.config.model in ["Tacotron", "TacotronGST"]: - postnet_loss = self.criterion(postnet_output, linear_input, - output_lens) - else: - postnet_loss = self.criterion(postnet_output, mel_input, - output_lens) - else: - decoder_loss = self.criterion(decoder_output, mel_input) - if self.config.model in ["Tacotron", "TacotronGST"]: - postnet_loss = self.criterion(postnet_output, linear_input) - else: - postnet_loss = self.criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss - return_dict['decoder_loss'] = decoder_loss - return_dict['postnet_loss'] = postnet_loss - - # stopnet loss - stop_loss = self.criterion_st( - stopnet_output, stopnet_target, - output_lens) if self.config.stopnet else torch.zeros(1) - if not self.config.separate_stopnet and self.config.stopnet: - loss += stop_loss - return_dict['stopnet_loss'] = stop_loss - - # backward decoder loss (if enabled) - if self.config.bidirectional_decoder: - if self.config.loss_masking: - decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens) - else: - decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output) - loss += decoder_b_loss + decoder_c_loss - return_dict['decoder_b_loss'] = decoder_b_loss - return_dict['decoder_c_loss'] = 
decoder_c_loss - - # double decoder consistency loss (if enabled) - if self.config.double_decoder_consistency: - decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens) - # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output) - attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards) - loss += decoder_b_loss + attention_c_loss - return_dict['decoder_coarse_loss'] = decoder_b_loss - return_dict['decoder_ddc_loss'] = attention_c_loss - - # guided attention loss (if enabled) - if self.config.ga_alpha > 0: - ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens) - loss += ga_loss * self.ga_alpha - return_dict['ga_loss'] = ga_loss * self.ga_alpha - - return_dict['loss'] = loss - return return_dict - diff --git a/layers/tacotron.py b/layers/tacotron.py deleted file mode 100644 index 20fd1e52..00000000 --- a/layers/tacotron.py +++ /dev/null @@ -1,496 +0,0 @@ -# coding: utf-8 -import torch -from torch import nn -from .common_layers import Prenet, init_attn, Linear - - -class BatchNormConv1d(nn.Module): - r"""A wrapper for Conv1d with BatchNorm. It sets the activation - function between Conv and BatchNorm layers. BatchNorm layer - is initialized with the TF default values for momentum and eps. 
- - Args: - in_channels: size of each input sample - out_channels: size of each output samples - kernel_size: kernel size of conv filters - stride: stride of conv filters - padding: padding of conv filters - activation: activation function set b/w Conv1d and BatchNorm - - Shapes: - - input: batch x dims - - output: batch x dims - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - activation=None): - - super(BatchNormConv1d, self).__init__() - self.padding = padding - self.padder = nn.ConstantPad1d(padding, 0) - self.conv1d = nn.Conv1d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=0, - bias=False) - # Following tensorflow's default parameters - self.bn = nn.BatchNorm1d(out_channels, momentum=0.99, eps=1e-3) - self.activation = activation - # self.init_layers() - - def init_layers(self): - if type(self.activation) == torch.nn.ReLU: - w_gain = 'relu' - elif type(self.activation) == torch.nn.Tanh: - w_gain = 'tanh' - elif self.activation is None: - w_gain = 'linear' - else: - raise RuntimeError('Unknown activation function') - torch.nn.init.xavier_uniform_( - self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_gain)) - - def forward(self, x): - x = self.padder(x) - x = self.conv1d(x) - x = self.bn(x) - if self.activation is not None: - x = self.activation(x) - return x - - -class Highway(nn.Module): - # TODO: Try GLU layer - def __init__(self, in_size, out_size): - super(Highway, self).__init__() - self.H = nn.Linear(in_size, out_size) - self.H.bias.data.zero_() - self.T = nn.Linear(in_size, out_size) - self.T.bias.data.fill_(-1) - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - # self.init_layers() - - def init_layers(self): - torch.nn.init.xavier_uniform_( - self.H.weight, gain=torch.nn.init.calculate_gain('relu')) - torch.nn.init.xavier_uniform_( - self.T.weight, gain=torch.nn.init.calculate_gain('sigmoid')) - - def forward(self, inputs): - H = self.relu(self.H(inputs)) - 
T = self.sigmoid(self.T(inputs)) - return H * T + inputs * (1.0 - T) - - -class CBHG(nn.Module): - """CBHG module: a recurrent neural network composed of: - - 1-d convolution banks - - Highway networks + residual connections - - Bidirectional gated recurrent units - - Args: - in_features (int): sample size - K (int): max filter size in conv bank - projections (list): conv channel sizes for conv projections - num_highways (int): number of highways layers - - Shapes: - - input: B x D x T_in - - output: B x T_in x D*2 - """ - - def __init__(self, - in_features, - K=16, - conv_bank_features=128, - conv_projections=[128, 128], - highway_features=128, - gru_features=128, - num_highways=4): - super(CBHG, self).__init__() - self.in_features = in_features - self.conv_bank_features = conv_bank_features - self.highway_features = highway_features - self.gru_features = gru_features - self.conv_projections = conv_projections - self.relu = nn.ReLU() - # list of conv1d bank with filter size k=1...K - # TODO: try dilational layers instead - self.conv1d_banks = nn.ModuleList([ - BatchNormConv1d(in_features, - conv_bank_features, - kernel_size=k, - stride=1, - padding=[(k - 1) // 2, k // 2], - activation=self.relu) for k in range(1, K + 1) - ]) - # max pooling of conv bank, with padding - # TODO: try average pooling OR larger kernel size - out_features = [K * conv_bank_features] + conv_projections[:-1] - activations = [self.relu] * (len(conv_projections) - 1) - activations += [None] - # setup conv1d projection layers - layer_set = [] - for (in_size, out_size, ac) in zip(out_features, conv_projections, - activations): - layer = BatchNormConv1d(in_size, - out_size, - kernel_size=3, - stride=1, - padding=[1, 1], - activation=ac) - layer_set.append(layer) - self.conv1d_projections = nn.ModuleList(layer_set) - # setup Highway layers - if self.highway_features != conv_projections[-1]: - self.pre_highway = nn.Linear(conv_projections[-1], - highway_features, - bias=False) - self.highways = 
nn.ModuleList([ - Highway(highway_features, highway_features) - for _ in range(num_highways) - ]) - # bi-directional GPU layer - self.gru = nn.GRU(gru_features, - gru_features, - 1, - batch_first=True, - bidirectional=True) - - def forward(self, inputs): - # (B, in_features, T_in) - x = inputs - # (B, hid_features*K, T_in) - # Concat conv1d bank outputs - outs = [] - for conv1d in self.conv1d_banks: - out = conv1d(x) - outs.append(out) - x = torch.cat(outs, dim=1) - assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks) - for conv1d in self.conv1d_projections: - x = conv1d(x) - x += inputs - x = x.transpose(1, 2) - if self.highway_features != self.conv_projections[-1]: - x = self.pre_highway(x) - # Residual connection - # TODO: try residual scaling as in Deep Voice 3 - # TODO: try plain residual layers - for highway in self.highways: - x = highway(x) - # (B, T_in, hid_features*2) - # TODO: replace GRU with convolution as in Deep Voice 3 - self.gru.flatten_parameters() - outputs, _ = self.gru(x) - return outputs - - -class EncoderCBHG(nn.Module): - def __init__(self): - super(EncoderCBHG, self).__init__() - self.cbhg = CBHG( - 128, - K=16, - conv_bank_features=128, - conv_projections=[128, 128], - highway_features=128, - gru_features=128, - num_highways=4) - - def forward(self, x): - return self.cbhg(x) - - -class Encoder(nn.Module): - r"""Encapsulate Prenet and CBHG modules for encoder""" - - def __init__(self, in_features): - super(Encoder, self).__init__() - self.prenet = Prenet(in_features, out_features=[256, 128]) - self.cbhg = EncoderCBHG() - - def forward(self, inputs): - r""" - Args: - inputs (FloatTensor): embedding features - - Shapes: - - inputs: batch x time x in_features - - outputs: batch x time x 128*2 - """ - # B x T x prenet_dim - outputs = self.prenet(inputs) - outputs = self.cbhg(outputs.transpose(1, 2)) - return outputs - - -class PostCBHG(nn.Module): - def __init__(self, mel_dim): - super(PostCBHG, self).__init__() - self.cbhg = 
CBHG( - mel_dim, - K=8, - conv_bank_features=128, - conv_projections=[256, mel_dim], - highway_features=128, - gru_features=128, - num_highways=4) - - def forward(self, x): - return self.cbhg(x) - - -class Decoder(nn.Module): - """Decoder module. - - Args: - in_features (int): input vector (encoder output) sample size. - memory_dim (int): memory vector (prev. time-step output) sample size. - r (int): number of outputs per time step. - memory_size (int): size of the past window. if <= 0 memory_size = r - TODO: arguments - """ - - # Pylint gets confused by PyTorch conventions here - #pylint: disable=attribute-defined-outside-init - - def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing, - attn_norm, prenet_type, prenet_dropout, forward_attn, - trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, speaker_embedding_dim): - super(Decoder, self).__init__() - self.r_init = r - self.r = r - self.in_features = in_features - self.max_decoder_steps = 500 - self.use_memory_queue = memory_size > 0 - self.memory_size = memory_size if memory_size > 0 else r - self.memory_dim = memory_dim - self.separate_stopnet = separate_stopnet - self.query_dim = 256 - # memory -> |Prenet| -> processed_memory - prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim - self.prenet = Prenet( - prenet_dim, - prenet_type, - prenet_dropout, - out_features=[256, 128]) - # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State - # attention_rnn generates queries for the attention mechanism - self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim) - - self.attention = init_attn(attn_type=attn_type, - query_dim=self.query_dim, - embedding_dim=in_features, - attention_dim=128, - location_attention=location_attn, - attention_location_n_filters=32, - attention_location_kernel_size=31, - windowing=attn_windowing, - norm=attn_norm, - 
forward_attn=forward_attn, - trans_agent=trans_agent, - forward_attn_mask=forward_attn_mask, - attn_K=attn_K) - # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input - self.project_to_decoder_in = nn.Linear(256 + in_features, 256) - # decoder_RNN_input -> |RNN| -> RNN_state - self.decoder_rnns = nn.ModuleList( - [nn.GRUCell(256, 256) for _ in range(2)]) - # RNN_state -> |Linear| -> mel_spec - self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init) - # learn init values instead of zero init. - self.stopnet = StopNet(256 + memory_dim * self.r_init) - - def set_r(self, new_r): - self.r = new_r - - def _reshape_memory(self, memory): - """ - Reshape the spectrograms for given 'r' - """ - # Grouping multiple frames if necessary - if memory.size(-1) == self.memory_dim: - memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1) - # Time first (T_decoder, B, memory_dim) - memory = memory.transpose(0, 1) - return memory - - def _init_states(self, inputs): - """ - Initialization of decoder states - """ - B = inputs.size(0) - T = inputs.size(1) - # go frame as zeros matrix - if self.use_memory_queue: - self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size) - else: - self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim) - # decoder states - self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256) - self.decoder_rnn_hiddens = [ - torch.zeros(1, device=inputs.device).repeat(B, 256) - for idx in range(len(self.decoder_rnns)) - ] - self.context_vec = inputs.data.new(B, self.in_features).zero_() - # cache attention inputs - self.processed_inputs = self.attention.preprocess_inputs(inputs) - - def _parse_outputs(self, outputs, attentions, stop_tokens): - # Back to batch first - attentions = torch.stack(attentions).transpose(0, 1) - stop_tokens = torch.stack(stop_tokens).transpose(0, 1) - outputs = torch.stack(outputs).transpose(0, 1).contiguous() - 
outputs = outputs.view( - outputs.size(0), -1, self.memory_dim) - outputs = outputs.transpose(1, 2) - return outputs, attentions, stop_tokens - - def decode(self, inputs, mask=None): - # Prenet - processed_memory = self.prenet(self.memory_input) - # Attention RNN - self.attention_rnn_hidden = self.attention_rnn( - torch.cat((processed_memory, self.context_vec), -1), - self.attention_rnn_hidden) - self.context_vec = self.attention( - self.attention_rnn_hidden, inputs, self.processed_inputs, mask) - # Concat RNN output and attention context vector - decoder_input = self.project_to_decoder_in( - torch.cat((self.attention_rnn_hidden, self.context_vec), -1)) - - # Pass through the decoder RNNs - for idx in range(len(self.decoder_rnns)): - self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx]( - decoder_input, self.decoder_rnn_hiddens[idx]) - # Residual connection - decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input - decoder_output = decoder_input - - # predict mel vectors from decoder vectors - output = self.proj_to_mel(decoder_output) - # output = torch.sigmoid(output) - # predict stop token - stopnet_input = torch.cat([decoder_output, output], -1) - if self.separate_stopnet: - stop_token = self.stopnet(stopnet_input.detach()) - else: - stop_token = self.stopnet(stopnet_input) - output = output[:, : self.r * self.memory_dim] - return output, stop_token, self.attention.attention_weights - - def _update_memory_input(self, new_memory): - if self.use_memory_queue: - if self.memory_size > self.r: - # memory queue size is larger than number of frames per decoder iter - self.memory_input = torch.cat([ - new_memory, self.memory_input[:, :( - self.memory_size - self.r) * self.memory_dim].clone() - ], dim=-1) - else: - # memory queue size smaller than number of frames per decoder iter - self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] - else: - # use only the last frame prediction - # assert new_memory.shape[-1] == self.r * self.memory_dim - 
self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):] - - def forward(self, inputs, memory, mask, speaker_embeddings=None): - """ - Args: - inputs: Encoder outputs. - memory: Decoder memory (autoregression. If None (at eval-time), - decoder outputs are used as decoder inputs. If None, it uses the last - output as the input. - mask: Attention mask for sequence padding. - - Shapes: - - inputs: batch x time x encoder_out_dim - - memory: batch x #mel_specs x mel_spec_dim - """ - # Run greedy decoding if memory is None - memory = self._reshape_memory(memory) - outputs = [] - attentions = [] - stop_tokens = [] - t = 0 - self._init_states(inputs) - self.attention.init_states(inputs) - while len(outputs) < memory.size(0): - if t > 0: - new_memory = memory[t - 1] - self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) - output, stop_token, attention = self.decode(inputs, mask) - outputs += [output] - attentions += [attention] - stop_tokens += [stop_token.squeeze(1)] - t += 1 - return self._parse_outputs(outputs, attentions, stop_tokens) - - def inference(self, inputs, speaker_embeddings=None): - """ - Args: - inputs: encoder outputs. - speaker_embeddings: speaker vectors. 
- - Shapes: - - inputs: batch x time x encoder_out_dim - - speaker_embeddings: batch x embed_dim - """ - outputs = [] - attentions = [] - stop_tokens = [] - t = 0 - self._init_states(inputs) - self.attention.init_win_idx() - self.attention.init_states(inputs) - while True: - if t > 0: - new_memory = outputs[-1] - self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) - output, stop_token, attention = self.decode(inputs, None) - stop_token = torch.sigmoid(stop_token.data) - outputs += [output] - attentions += [attention] - stop_tokens += [stop_token] - t += 1 - if t > inputs.shape[1] / 4 and (stop_token > 0.6 - or attention[:, -1].item() > 0.6): - break - elif t > self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") - break - return self._parse_outputs(outputs, attentions, stop_tokens) - - -class StopNet(nn.Module): - r""" - Args: - in_features (int): feature dimension of input. 
- """ - - def __init__(self, in_features): - super(StopNet, self).__init__() - self.dropout = nn.Dropout(0.1) - self.linear = nn.Linear(in_features, 1) - torch.nn.init.xavier_uniform_( - self.linear.weight, gain=torch.nn.init.calculate_gain('linear')) - - def forward(self, inputs): - outputs = self.dropout(inputs) - outputs = self.linear(outputs) - return outputs diff --git a/layers/tacotron2.py b/layers/tacotron2.py deleted file mode 100644 index f11aee65..00000000 --- a/layers/tacotron2.py +++ /dev/null @@ -1,353 +0,0 @@ -import torch -from torch.autograd import Variable -from torch import nn -from torch.nn import functional as F -from .common_layers import init_attn, Prenet, Linear - - -class ConvBNBlock(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, activation=None): - super(ConvBNBlock, self).__init__() - assert (kernel_size - 1) % 2 == 0 - padding = (kernel_size - 1) // 2 - self.convolution1d = nn.Conv1d(in_channels, - out_channels, - kernel_size, - padding=padding) - self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5) - self.dropout = nn.Dropout(p=0.5) - if activation == 'relu': - self.activation = nn.ReLU() - elif activation == 'tanh': - self.activation = nn.Tanh() - else: - self.activation = nn.Identity() - - def forward(self, x): - o = self.convolution1d(x) - o = self.batch_normalization(o) - o = self.activation(o) - o = self.dropout(o) - return o - - -class Postnet(nn.Module): - def __init__(self, output_dim, num_convs=5): - super(Postnet, self).__init__() - self.convolutions = nn.ModuleList() - self.convolutions.append( - ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh')) - for _ in range(1, num_convs - 1): - self.convolutions.append( - ConvBNBlock(512, 512, kernel_size=5, activation='tanh')) - self.convolutions.append( - ConvBNBlock(512, output_dim, kernel_size=5, activation=None)) - - def forward(self, x): - o = x - for layer in self.convolutions: - o = layer(o) - return o - - -class 
Encoder(nn.Module): - def __init__(self, output_input_dim=512): - super(Encoder, self).__init__() - self.convolutions = nn.ModuleList() - for _ in range(3): - self.convolutions.append( - ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu')) - self.lstm = nn.LSTM(output_input_dim, - int(output_input_dim / 2), - num_layers=1, - batch_first=True, - bias=True, - bidirectional=True) - self.rnn_state = None - - def forward(self, x, input_lengths): - o = x - for layer in self.convolutions: - o = layer(o) - o = o.transpose(1, 2) - o = nn.utils.rnn.pack_padded_sequence(o, - input_lengths, - batch_first=True) - self.lstm.flatten_parameters() - o, _ = self.lstm(o) - o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True) - return o - - def inference(self, x): - o = x - for layer in self.convolutions: - o = layer(o) - o = o.transpose(1, 2) - # self.lstm.flatten_parameters() - o, _ = self.lstm(o) - return o - - -# adapted from https://github.com/NVIDIA/tacotron2/ -class Decoder(nn.Module): - # Pylint gets confused by PyTorch conventions here - #pylint: disable=attribute-defined-outside-init - def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm, - prenet_type, prenet_dropout, forward_attn, trans_agent, - forward_attn_mask, location_attn, attn_K, separate_stopnet, - speaker_embedding_dim): - super(Decoder, self).__init__() - self.frame_dim = frame_dim - self.r_init = r - self.r = r - self.encoder_embedding_dim = input_dim - self.separate_stopnet = separate_stopnet - self.max_decoder_steps = 1000 - self.gate_threshold = 0.5 - - # model dimensions - self.query_dim = 1024 - self.decoder_rnn_dim = 1024 - self.prenet_dim = 256 - self.attn_dim = 128 - self.p_attention_dropout = 0.1 - self.p_decoder_dropout = 0.1 - - # memory -> |Prenet| -> processed_memory - prenet_dim = self.frame_dim - self.prenet = Prenet(prenet_dim, - prenet_type, - prenet_dropout, - out_features=[self.prenet_dim, self.prenet_dim], - bias=False) - - self.attention_rnn = 
nn.LSTMCell(self.prenet_dim + input_dim, - self.query_dim, - bias=True) - - self.attention = init_attn(attn_type=attn_type, - query_dim=self.query_dim, - embedding_dim=input_dim, - attention_dim=128, - location_attention=location_attn, - attention_location_n_filters=32, - attention_location_kernel_size=31, - windowing=attn_win, - norm=attn_norm, - forward_attn=forward_attn, - trans_agent=trans_agent, - forward_attn_mask=forward_attn_mask, - attn_K=attn_K) - - self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim, - self.decoder_rnn_dim, - bias=True) - - self.linear_projection = Linear(self.decoder_rnn_dim + input_dim, - self.frame_dim * self.r_init) - - self.stopnet = nn.Sequential( - nn.Dropout(0.1), - Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init, - 1, - bias=True, - init_gain='sigmoid')) - self.memory_truncated = None - - def set_r(self, new_r): - self.r = new_r - - def get_go_frame(self, inputs): - B = inputs.size(0) - memory = torch.zeros(1, device=inputs.device).repeat(B, - self.frame_dim * self.r) - return memory - - def _init_states(self, inputs, mask, keep_states=False): - B = inputs.size(0) - # T = inputs.size(1) - if not keep_states: - self.query = torch.zeros(1, device=inputs.device).repeat( - B, self.query_dim) - self.attention_rnn_cell_state = torch.zeros( - 1, device=inputs.device).repeat(B, self.query_dim) - self.decoder_hidden = torch.zeros(1, device=inputs.device).repeat( - B, self.decoder_rnn_dim) - self.decoder_cell = torch.zeros(1, device=inputs.device).repeat( - B, self.decoder_rnn_dim) - self.context = torch.zeros(1, device=inputs.device).repeat( - B, self.encoder_embedding_dim) - self.inputs = inputs - self.processed_inputs = self.attention.preprocess_inputs(inputs) - self.mask = mask - - def _reshape_memory(self, memory): - """ - Reshape the spectrograms for given 'r' - """ - # Grouping multiple frames if necessary - if memory.size(-1) == self.frame_dim: - memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1) 
- # Time first (T_decoder, B, frame_dim) - memory = memory.transpose(0, 1) - return memory - - def _parse_outputs(self, outputs, stop_tokens, alignments): - alignments = torch.stack(alignments).transpose(0, 1) - stop_tokens = torch.stack(stop_tokens).transpose(0, 1) - outputs = torch.stack(outputs).transpose(0, 1).contiguous() - outputs = outputs.view(outputs.size(0), -1, self.frame_dim) - outputs = outputs.transpose(1, 2) - return outputs, stop_tokens, alignments - - def _update_memory(self, memory): - if len(memory.shape) == 2: - return memory[:, self.frame_dim * (self.r - 1):] - return memory[:, :, self.frame_dim * (self.r - 1):] - - def decode(self, memory): - ''' - shapes: - - memory: B x r * self.frame_dim - ''' - # self.context: B x D_en - # query_input: B x D_en + (r * self.frame_dim) - query_input = torch.cat((memory, self.context), -1) - # self.query and self.attention_rnn_cell_state : B x D_attn_rnn - self.query, self.attention_rnn_cell_state = self.attention_rnn( - query_input, (self.query, self.attention_rnn_cell_state)) - self.query = F.dropout(self.query, self.p_attention_dropout, - self.training) - self.attention_rnn_cell_state = F.dropout( - self.attention_rnn_cell_state, self.p_attention_dropout, - self.training) - # B x D_en - self.context = self.attention(self.query, self.inputs, - self.processed_inputs, self.mask) - # B x (D_en + D_attn_rnn) - decoder_rnn_input = torch.cat((self.query, self.context), -1) - # self.decoder_hidden and self.decoder_cell: B x D_decoder_rnn - self.decoder_hidden, self.decoder_cell = self.decoder_rnn( - decoder_rnn_input, (self.decoder_hidden, self.decoder_cell)) - self.decoder_hidden = F.dropout(self.decoder_hidden, - self.p_decoder_dropout, self.training) - # B x (D_decoder_rnn + D_en) - decoder_hidden_context = torch.cat((self.decoder_hidden, self.context), - dim=1) - # B x (self.r * self.frame_dim) - decoder_output = self.linear_projection(decoder_hidden_context) - # B x (D_decoder_rnn + (self.r * self.frame_dim)) 
- stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1) - if self.separate_stopnet: - stop_token = self.stopnet(stopnet_input.detach()) - else: - stop_token = self.stopnet(stopnet_input) - # select outputs for the reduction rate self.r - decoder_output = decoder_output[:, :self.r * self.frame_dim] - return decoder_output, self.attention.attention_weights, stop_token - - def forward(self, inputs, memories, mask, speaker_embeddings=None): - memory = self.get_go_frame(inputs).unsqueeze(0) - memories = self._reshape_memory(memories) - memories = torch.cat((memory, memories), dim=0) - memories = self._update_memory(memories) - if speaker_embeddings is not None: - memories = torch.cat([memories, speaker_embeddings], dim=-1) - memories = self.prenet(memories) - - self._init_states(inputs, mask=mask) - self.attention.init_states(inputs) - - outputs, stop_tokens, alignments = [], [], [] - while len(outputs) < memories.size(0) - 1: - memory = memories[len(outputs)] - decoder_output, attention_weights, stop_token = self.decode(memory) - outputs += [decoder_output.squeeze(1)] - stop_tokens += [stop_token.squeeze(1)] - alignments += [attention_weights] - - outputs, stop_tokens, alignments = self._parse_outputs( - outputs, stop_tokens, alignments) - return outputs, alignments, stop_tokens - - def inference(self, inputs, speaker_embeddings=None): - memory = self.get_go_frame(inputs) - memory = self._update_memory(memory) - - self._init_states(inputs, mask=None) - self.attention.init_states(inputs) - - outputs, stop_tokens, alignments, t = [], [], [], 0 - while True: - memory = self.prenet(memory) - if speaker_embeddings is not None: - memory = torch.cat([memory, speaker_embeddings], dim=-1) - decoder_output, alignment, stop_token = self.decode(memory) - stop_token = torch.sigmoid(stop_token.data) - outputs += [decoder_output.squeeze(1)] - stop_tokens += [stop_token] - alignments += [alignment] - - if stop_token > 0.7 and t > inputs.shape[0] / 2: - break - if 
len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") - break - - memory = self._update_memory(decoder_output) - t += 1 - - outputs, stop_tokens, alignments = self._parse_outputs( - outputs, stop_tokens, alignments) - - return outputs, alignments, stop_tokens - - def inference_truncated(self, inputs): - """ - Preserve decoder states for continuous inference - """ - if self.memory_truncated is None: - self.memory_truncated = self.get_go_frame(inputs) - self._init_states(inputs, mask=None, keep_states=False) - else: - self._init_states(inputs, mask=None, keep_states=True) - - self.attention.init_win_idx() - self.attention.init_states(inputs) - outputs, stop_tokens, alignments, t = [], [], [], 0 - stop_flags = [True, False, False] - while True: - memory = self.prenet(self.memory_truncated) - decoder_output, alignment, stop_token = self.decode(memory) - stop_token = torch.sigmoid(stop_token.data) - outputs += [decoder_output.squeeze(1)] - stop_tokens += [stop_token] - alignments += [alignment] - - if stop_token > 0.7: - break - if len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") - break - - self.memory_truncated = decoder_output - t += 1 - - outputs, stop_tokens, alignments = self._parse_outputs( - outputs, stop_tokens, alignments) - - return outputs, alignments, stop_tokens - - def inference_step(self, inputs, t, memory=None): - """ - For debug purposes - """ - if t == 0: - memory = self.get_go_frame(inputs) - self._init_states(inputs, mask=None) - - memory = self.prenet(memory) - decoder_output, stop_token, alignment = self.decode(memory) - stop_token = torch.sigmoid(stop_token.data) - memory = decoder_output - return decoder_output, stop_token, alignment diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/tacotron.py b/models/tacotron.py deleted file mode 100644 index ba42610c..00000000 --- 
a/models/tacotron.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding: utf-8 -import torch -from torch import nn - -from TTS.layers.gst_layers import GST -from TTS.layers.tacotron import Decoder, Encoder, PostCBHG -from TTS.models.tacotron_abstract import TacotronAbstract - - -class Tacotron(TacotronAbstract): - def __init__(self, - num_chars, - num_speakers, - r=5, - postnet_output_dim=1025, - decoder_output_dim=80, - attn_type='original', - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - gst=False, - memory_size=5): - super(Tacotron, - self).__init__(num_chars, num_speakers, r, postnet_output_dim, - decoder_output_dim, attn_type, attn_win, - attn_norm, prenet_type, prenet_dropout, - forward_attn, trans_agent, forward_attn_mask, - location_attn, attn_K, separate_stopnet, - bidirectional_decoder, double_decoder_consistency, - ddc_r, gst) - decoder_in_features = 512 if num_speakers > 1 else 256 - encoder_in_features = 512 if num_speakers > 1 else 256 - speaker_embedding_dim = 256 - proj_speaker_dim = 80 if num_speakers > 1 else 0 - # base model layers - self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) - self.embedding.weight.data.normal_(0, 0.3) - self.encoder = Encoder(encoder_in_features) - self.decoder = Decoder(decoder_in_features, decoder_output_dim, r, - memory_size, attn_type, attn_win, attn_norm, - prenet_type, prenet_dropout, forward_attn, - trans_agent, forward_attn_mask, location_attn, - attn_K, separate_stopnet, proj_speaker_dim) - self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, - postnet_output_dim) - # speaker embedding layers - if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - 
self.speaker_embedding.weight.data.normal_(0, 0.3) - self.speaker_project_mel = nn.Sequential( - nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh()) - self.speaker_embeddings = None - self.speaker_embeddings_projected = None - # global style token layers - if self.gst: - gst_embedding_dim = 256 - self.gst_layer = GST(num_mel=80, - num_heads=4, - num_style_tokens=10, - embedding_dim=gst_embedding_dim) - # backward pass decoder - if self.bidirectional_decoder: - self._init_backward_decoder() - # setup DDC - if self.double_decoder_consistency: - self.coarse_decoder = Decoder( - decoder_in_features, decoder_output_dim, ddc_r, memory_size, - attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, - forward_attn, trans_agent, forward_attn_mask, location_attn, - attn_K, separate_stopnet, proj_speaker_dim) - - - def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None): - """ - Shapes: - - characters: B x T_in - - text_lengths: B - - mel_specs: B x T_out x D - - speaker_ids: B x 1 - """ - self._init_states() - input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) - # B x T_in x embed_dim - inputs = self.embedding(characters) - # B x speaker_embed_dim - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - # B x T_in x embed_dim + speaker_embed_dim - inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) - # B x T_in x encoder_in_features - encoder_outputs = self.encoder(inputs) - # sequence masking - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) - # global style token - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) - if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, self.speaker_embeddings) - # decoder_outputs: B x decoder_in_features x T_out - # alignments: B x T_in x encoder_in_features - # stop_tokens: 
B x T_in - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask, - self.speaker_embeddings_projected) - # sequence masking - if output_mask is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) - # B x T_out x decoder_in_features - postnet_outputs = self.postnet(decoder_outputs) - # sequence masking - if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) - # B x T_out x posnet_dim - postnet_outputs = self.last_linear(postnet_outputs) - # B x T_out x decoder_in_features - decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() - if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward - if self.double_decoder_consistency: - decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask) - return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward - return decoder_outputs, postnet_outputs, alignments, stop_tokens - - @torch.no_grad() - def inference(self, characters, speaker_ids=None, style_mel=None): - inputs = self.embedding(characters) - self._init_states() - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) - encoder_outputs = self.encoder(inputs) - if self.gst and style_mel is not None: - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, self.speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs, 
self.speaker_embeddings_projected) - postnet_outputs = self.postnet(decoder_outputs) - postnet_outputs = self.last_linear(postnet_outputs) - decoder_outputs = decoder_outputs.transpose(1, 2) - return decoder_outputs, postnet_outputs, alignments, stop_tokens diff --git a/models/tacotron2.py b/models/tacotron2.py deleted file mode 100644 index 4a22b7fa..00000000 --- a/models/tacotron2.py +++ /dev/null @@ -1,169 +0,0 @@ -import torch -from torch import nn - -from TTS.layers.gst_layers import GST -from TTS.layers.tacotron2 import Decoder, Encoder, Postnet -from TTS.models.tacotron_abstract import TacotronAbstract - - -# TODO: match function arguments with tacotron -class Tacotron2(TacotronAbstract): - def __init__(self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type='original', - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - gst=False): - super(Tacotron2, - self).__init__(num_chars, num_speakers, r, postnet_output_dim, - decoder_output_dim, attn_type, attn_win, - attn_norm, prenet_type, prenet_dropout, - forward_attn, trans_agent, forward_attn_mask, - location_attn, attn_K, separate_stopnet, - bidirectional_decoder, double_decoder_consistency, - ddc_r, gst) - decoder_in_features = 512 if num_speakers > 1 else 512 - encoder_in_features = 512 if num_speakers > 1 else 512 - proj_speaker_dim = 80 if num_speakers > 1 else 0 - # base layers - self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) - if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, 512) - self.speaker_embedding.weight.data.normal_(0, 0.3) - self.encoder = Encoder(encoder_in_features) - self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, - 
attn_norm, prenet_type, prenet_dropout, - forward_attn, trans_agent, forward_attn_mask, - location_attn, attn_K, separate_stopnet, proj_speaker_dim) - self.postnet = Postnet(self.postnet_output_dim) - # global style token layers - if self.gst: - gst_embedding_dim = encoder_in_features - self.gst_layer = GST(num_mel=80, - num_heads=4, - num_style_tokens=10, - embedding_dim=gst_embedding_dim) - # backward pass decoder - if self.bidirectional_decoder: - self._init_backward_decoder() - # setup DDC - if self.double_decoder_consistency: - self.coarse_decoder = Decoder( - decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, - attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, - trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, proj_speaker_dim) - - @staticmethod - def shape_outputs(mel_outputs, mel_outputs_postnet, alignments): - mel_outputs = mel_outputs.transpose(1, 2) - mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) - return mel_outputs, mel_outputs_postnet, alignments - - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): - self._init_states() - # compute mask for padding - # B x T_in_max (boolean) - input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) - # B x D_embed x T_in_max - embedded_inputs = self.embedding(text).transpose(1, 2) - # B x T_in_max x D_en - encoder_outputs = self.encoder(embedded_inputs, text_lengths) - # adding speaker embeddding to encoder output - # TODO: multi-speaker - # B x speaker_embed_dim - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - # B x T_in x embed_dim + speaker_embed_dim - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) - # global style token - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, 
mel_specs) - # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask) - # sequence masking - if mel_lengths is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) - # B x mel_dim x T_out - postnet_outputs = self.postnet(decoder_outputs) - postnet_outputs = decoder_outputs + postnet_outputs - # sequence masking - if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) - # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in - decoder_outputs, postnet_outputs, alignments = self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) - if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward - if self.double_decoder_consistency: - decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask) - return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward - return decoder_outputs, postnet_outputs, alignments, stop_tokens - - @torch.no_grad() - def inference(self, text, speaker_ids=None): - embedded_inputs = self.embedding(text).transpose(1, 2) - encoder_outputs = self.encoder.inference(embedded_inputs) - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) - postnet_outputs = self.postnet(decoder_outputs) - postnet_outputs = decoder_outputs + postnet_outputs - decoder_outputs, postnet_outputs, alignments = 
self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) - return decoder_outputs, postnet_outputs, alignments, stop_tokens - - def inference_truncated(self, text, speaker_ids=None): - """ - Preserve model states for continuous inference - """ - embedded_inputs = self.embedding(text).transpose(1, 2) - encoder_outputs = self.encoder.inference_truncated(embedded_inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) - mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( - encoder_outputs) - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs( - mel_outputs, mel_outputs_postnet, alignments) - return mel_outputs, mel_outputs_postnet, alignments, stop_tokens - - - def _speaker_embedding_pass(self, encoder_outputs, speaker_ids): - # TODO: multi-speaker - # if hasattr(self, "speaker_embedding") and speaker_ids is None: - # raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") - # if hasattr(self, "speaker_embedding") and speaker_ids is not None: - - # speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0), - # encoder_outputs.size(1), - # -1) - # encoder_outputs = encoder_outputs + speaker_embeddings - # return encoder_outputs - pass diff --git a/models/tacotron_abstract.py b/models/tacotron_abstract.py deleted file mode 100644 index 75a1a5cd..00000000 --- a/models/tacotron_abstract.py +++ /dev/null @@ -1,180 +0,0 @@ -import copy -from abc import ABC, abstractmethod - -import torch -from torch import nn - -from TTS.utils.generic_utils import sequence_mask - - -class TacotronAbstract(ABC, nn.Module): - def __init__(self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type='original', - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - gst=False): - """ Abstract Tacotron class """ - super().__init__() - self.num_chars = num_chars - self.r = r - self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim - self.gst = gst - self.num_speakers = num_speakers - self.bidirectional_decoder = bidirectional_decoder - self.double_decoder_consistency = double_decoder_consistency - self.ddc_r = ddc_r - self.attn_type = attn_type - self.attn_win = attn_win - self.attn_norm = attn_norm - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.forward_attn = forward_attn - self.trans_agent = trans_agent - self.forward_attn_mask = forward_attn_mask - self.location_attn = location_attn - self.attn_K = attn_K - self.separate_stopnet = separate_stopnet - - # layers - self.embedding = None - self.encoder = None - self.decoder = None - 
self.postnet = None - - # global style token - if self.gst: - self.gst_layer = None - - # model states - self.speaker_embeddings = None - self.speaker_embeddings_projected = None - - # additional layers - self.decoder_backward = None - self.coarse_decoder = None - - ############################# - # INIT FUNCTIONS - ############################# - - def _init_states(self): - self.speaker_embeddings = None - self.speaker_embeddings_projected = None - - def _init_backward_decoder(self): - self.decoder_backward = copy.deepcopy(self.decoder) - - def _init_coarse_decoder(self): - self.coarse_decoder = copy.deepcopy(self.decoder) - self.coarse_decoder.r_init = self.ddc_r - self.coarse_decoder.set_r(self.ddc_r) - - ############################# - # CORE FUNCTIONS - ############################# - - @abstractmethod - def forward(self): - pass - - @abstractmethod - def inference(self): - pass - - ############################# - # COMMON COMPUTE FUNCTIONS - ############################# - - def compute_masks(self, text_lengths, mel_lengths): - """Compute masks against sequence paddings.""" - # B x T_in_max (boolean) - device = text_lengths.device - input_mask = sequence_mask(text_lengths).to(device) - output_mask = None - if mel_lengths is not None: - max_len = mel_lengths.max() - r = self.decoder.r - max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device) - return input_mask, output_mask - - def _backward_pass(self, mel_specs, encoder_outputs, mask): - """ Run backwards decoder """ - decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask, - self.speaker_embeddings_projected) - decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() - return decoder_outputs_b, alignments_b - - def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, - input_mask): - """ Double Decoder Consistency """ - T = 
mel_specs.shape[1] - if T % self.coarse_decoder.r > 0: - padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) - mel_specs = torch.nn.functional.pad(mel_specs, - (0, 0, 0, padding_size, 0, 0)) - decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( - encoder_outputs.detach(), mel_specs, input_mask) - # scale_factor = self.decoder.r_init / self.decoder.r - alignments_backward = torch.nn.functional.interpolate( - alignments_backward.transpose(1, 2), - size=alignments.shape[1], - mode='nearest').transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward[:, :T, :] - return decoder_outputs_backward, alignments_backward - - ############################# - # EMBEDDING FUNCTIONS - ############################# - - def compute_speaker_embedding(self, speaker_ids): - """ Compute speaker embedding vectors """ - if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError( - " [!] 
Model has speaker embedding layer but speaker_id is not provided" - ) - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) - if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.speaker_embeddings_projected = self.speaker_project_mel( - self.speaker_embeddings).squeeze(1) - - def compute_gst(self, inputs, mel_specs): - """ Compute global style token """ - # pylint: disable=not-callable - gst_outputs = self.gst_layer(mel_specs) - inputs = self._add_speaker_embedding(inputs, gst_outputs) - return inputs - - @staticmethod - def _add_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand( - outputs.size(0), outputs.size(1), -1) - outputs = outputs + speaker_embeddings_ - return outputs - - @staticmethod - def _concat_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand( - outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) - return outputs diff --git a/notebooks/CheckSpectrograms.ipynb b/notebooks/CheckSpectrograms.ipynb index 66c3c7cc..dbb7a1be 100644 --- a/notebooks/CheckSpectrograms.ipynb +++ b/notebooks/CheckSpectrograms.ipynb @@ -16,9 +16,9 @@ "outputs": [], "source": [ "%matplotlib inline\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.visual import plot_spectrogram\n", - "from TTS.utils.generic_utils import load_config\n", + "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.visual import plot_spectrogram\n", + "from TTS.tts.utils.generic_utils import load_config\n", "import glob \n", "import IPython.display as ipd" ] diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index c747c764..b28489e0 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -22,12 +22,12 @@ "import numpy as np\n", 
"from tqdm import tqdm as tqdm\n", "from torch.utils.data import DataLoader\n", - "from TTS.datasets.TTSDataset import MyDataset\n", - "from TTS.layers.losses import L1LossMasked\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.visual import plot_spectrogram\n", - "from TTS.utils.generic_utils import load_config, setup_model, sequence_mask\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", + "from TTS.tts.datasets.TTSDataset import MyDataset\n", + "from TTS.tts.layers.losses import L1LossMasked\n", + "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.visual import plot_spectrogram\n", + "from TTS.tts.utils.generic_utils import load_config, setup_model, sequence_mask\n", + "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "%matplotlib inline\n", "\n", @@ -108,7 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", + "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 92b1d6c4..51413099 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -36,14 +36,14 @@ "import librosa\n", "import librosa.display\n", "\n", - "from TTS.layers import *\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import setup_model\n", - "from TTS.utils.io import load_config\n", - "from TTS.utils.text import text_to_sequence\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual 
import plot_alignment\n", - "from TTS.utils.measures import alignment_diagonal_score\n", + "from TTS.tts.layers import *\n", + "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.tts.utils.io import load_config\n", + "from TTS.tts.utils.text import text_to_sequence\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.tts.utils.visual import plot_alignment\n", + "from TTS.tts.utils.measures import alignment_diagonal_score\n", "\n", "import IPython\n", "from IPython.display import Audio\n", @@ -96,7 +96,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", + "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", diff --git a/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb b/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb new file mode 100644 index 00000000..390b20e2 --- /dev/null +++ b/notebooks/dataset_analysis/AnalyzeDataset-Copy1.ipynb @@ -0,0 +1,3406 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "TTS_PATH = \"/home/erogol/projects/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "import glob\n", + "import librosa\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import norm\n", + "from tqdm import tqdm_notebook as tqdm\n", + "from multiprocessing import Pool\n", + "from matplotlib import pylab as plt\n", + "from collections import Counter\n", + "from TTS.tts.datasets.preprocess import *\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + 
"metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "DATA_PATH = \"/home/erogol/Data/Spectie/audio/output/\"\n", + "META_DATA = \"metadata.txt\"\n", + "NUM_PROC = 8" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/home/erogol/Data/Spectie/audio/output/metadata.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# use your own preprocessor at this stage - TTS/datasets/proprocess.py\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mitems\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmozilla_de\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMETA_DATA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Number of audio files: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/TTS/tts_namespace/TTS/datasets/preprocess.py\u001b[0m in \u001b[0;36mmozilla_de\u001b[0;34m(root_path, meta_file)\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0mitems\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mspeaker_name\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m\"mozilla\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 83\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtxt_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ISO 8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mttf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 84\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mttf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0mcols\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'|'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/erogol/Data/Spectie/audio/output/metadata.txt'" + ] + } + ], + "source": [ + "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n", + "items = mozilla_de(DATA_PATH, META_DATA)\n", + "print(\" > Number of audio files: {}\".format(len(items)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# check wavs if exist\n", + "wav_files = []\n", + "for item in items:\n", + " wav_file = item[1].strip()\n", + " wav_files.append(wav_file)\n", + " if not os.path.exists(wav_file):\n", + " print(wav_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"['/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav']\n" + ] + } + ], + "source": [ + "# show duplicate items\n", + "c = Counter(wav_files)\n", + "duplicates = [item for item, count in c.items() if count > 1]\n", + "print(duplicates)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "folders = [w.split('/')[5] for w in wav_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'BATCH_10_FINAL',\n", + " 'BATCH_11_FINAL',\n", + " 'BATCH_12_FINAL',\n", + " 'BATCH_13_FINAL',\n", + " 'BATCH_14_FINAL',\n", + " 'BATCH_15_FINAL',\n", + " 'BATCH_16_FINAL',\n", + " 'BATCH_17_FINAL',\n", + " 'BATCH_18_FINAL',\n", + " 'BATCH_19_FINAL',\n", + " 'BATCH_1_FINAL',\n", + " 'BATCH_20_FINAL',\n", + " 'BATCH_2_FINAL',\n", + " 'BATCH_3_FINAL',\n", + " 'BATCH_4_FINAL',\n", + " 'BATCH_5_FINAL',\n", + " 'BATCH_6_FINAL',\n", + " 'BATCH_7_FINAL',\n", + " 'BATCH_8_FINAL',\n", + " 'BATCH_9_FINAL'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(folders)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "afbb94c274fe4913b256a8756584c0f6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "def load_item(item):\n", + " file_name = item[1].strip()\n", + " text = item[0].strip()\n", + " audio = librosa.load(file_name, sr=None)\n", + " sr = audio[1]\n", + " audio = audio[0]\n", + " audio_len = len(audio) / sr\n", + " text_len = len(text)\n", + " return file_name, text, text_len, audio, audio_len\n", + "\n", + "# This will take a while depending on size of dataset\n", + "if NUM_PROC == 1:\n", + " data = []\n", + " for m in tqdm(items):\n", + " data += [load_item(m)]\n", + "else:\n", + " with Pool(8) as p:\n", + " data = list(tqdm(p.imap(load_item, items), total=len(items)))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "53b7f6adb4db47279927ec064addb3c7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " > Number of words: 27102\n" + ] + } + ], + "source": [ + "# count words in the dataset\n", + "w_count = Counter()\n", + "for item in tqdm(data):\n", + " text = item[1].lower().strip()\n", + " for word in text.split():\n", + " w_count[word] += 1\n", + "print(\" > Number of words: {}\".format(len(w_count)))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stderr", 
+ "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8b48c3415e2a4ac1a174502c2308501d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14610.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "text_vs_durs = {} # text length vs audio duration\n", + "text_len_counter = Counter() # number of sentences with the keyed length\n", + "for item in tqdm(data):\n", + " text = item[1].lower().strip()\n", + " text_len = len(text)\n", + " text_len_counter[text_len] += 1\n", + " audio_len = item[-1]\n", + " try:\n", + " text_vs_durs[text_len] += [audio_len]\n", + " except:\n", + " text_vs_durs[text_len] = [audio_len]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# text_len vs avg_audio_len, median_audio_len, std_audio_len\n", + "text_vs_avg = {}\n", + "text_vs_median = {}\n", + "text_vs_std = {}\n", + "for key, durs in text_vs_durs.items():\n", + " text_vs_avg[key] = np.mean(durs)\n", + " text_vs_median[key] = np.median(durs)\n", + " text_vs_std[key] = np.std(durs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Avg audio length per char" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_7.wav', 'Schickes Heimkino!', 18, array([1.28518932e-05, 1.68334354e-05, 1.03571265e-05, ...,\n", + " 2.77877753e-05, 1.10460878e-05, 2.05760971e-05], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_12.wav', 'Das sieht ihm ähnlich.', 23, array([7.6380376e-05, 9.3327515e-05, 6.1386294e-05, ..., 3.4380835e-05,\n", + " 2.6692895e-05, 2.2882025e-06], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_20.wav', 'Oh, das Programm ist mir neu.', 29, array([-3.6327918e-05, -5.8332487e-05, -5.0294046e-05, ...,\n", + " -3.2606560e-05, -5.3037817e-05, -3.6754736e-05], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_21.wav', 'Niemand ist ein Alleskönner.', 29, array([2.5469655e-05, 1.5675920e-05, 2.6378759e-05, ..., 3.4840865e-05,\n", + " 3.4687979e-05, 2.3448023e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_25.wav', 'Dagegen ist kein Kraut gewachsen.', 33, array([8.6409571e-05, 1.6211446e-04, 1.2149933e-04, ..., 1.4264301e-05,\n", + " 2.6473885e-05, 4.1174495e-05], dtype=float32), 1.91225)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_39.wav', 'Seid gegrüÃ\\x9ft!', 15, array([-4.95165441e-05, -9.18527076e-05, -1.06668835e-04, ...,\n", + " -4.00948884e-05, -6.23805026e-05, -4.42093369e-05], dtype=float32), 1.1808541666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_43.wav', 'Nicht mit dem FuÃ\\x9f!', 19, array([-2.4153460e-05, -9.5195399e-05, -1.8093537e-04, ...,\n", + " 2.0667248e-05, 2.7399163e-05, 5.0344559e-05], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_44.wav', 'Wissen ist Macht.', 17, array([-1.9221216e-05, -2.1811753e-05, -4.0165878e-06, ...,\n", + " -5.0537183e-06, -1.3825783e-05, -2.8384518e-05], 
dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_45.wav', 'Guck mal, ein Eichhörnchen!', 28, array([-8.8387278e-05, -7.1484370e-05, -9.1183894e-05, ...,\n", + " -2.6602589e-05, 1.1369466e-05, -1.4236821e-06], dtype=float32), 1.5245208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_58.wav', 'Ich bin mein eigener Hund.', 26, array([-1.3441265e-05, -1.3771249e-05, 2.1415319e-06, ...,\n", + " -2.9998329e-05, 6.4692267e-06, 1.6420488e-05], dtype=float32), 1.91225)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_68.wav', 'Lach ich, oder was?', 19, array([1.20631594e-04, 2.69133277e-04, 3.61918297e-04, ...,\n", + " 2.52288628e-05, 1.12787602e-05, 2.01150815e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_76.wav', 'Moment mal, das ist neu.', 24, array([-4.0444505e-05, -5.6087447e-05, -7.0869857e-05, ...,\n", + " -5.9735464e-07, 1.4513580e-05, 1.7241922e-05], dtype=float32), 1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_79.wav', 'Wie lange zieht der Tee schon?', 30, array([ 1.3359761e-05, 1.4845427e-06, -8.4266394e-06, ...,\n", + " 8.4090761e-06, 5.6682808e-07, 1.4266146e-06], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_95.wav', 'Schlaf gut!', 11, array([-8.3705861e-05, -1.3769916e-04, -1.0772650e-04, ...,\n", + " -1.2876300e-05, -3.5042558e-05, -1.5538299e-05], dtype=float32), 1.0839166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_97.wav', 'Entschuldigen Sie die Verwechslung!', 35, array([-4.3585667e-05, -4.9360351e-05, -2.4610319e-05, ...,\n", + " -1.4282005e-05, -7.0760620e-07, -2.8634834e-06], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_99.wav', 'Schönes Ding!', 14, array([-4.9598326e-05, -4.2029962e-05, -2.2566113e-05, ...,\n", + " 
7.5142352e-06, -3.1275456e-05, -1.8421564e-05], dtype=float32), 0.9252916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_101.wav', 'Dann nichts wie weg hier!', 25, array([ 1.2582598e-05, 1.4227808e-05, 1.0588883e-05, ...,\n", + " 1.8725707e-07, -4.0784824e-05, -7.0644560e-06], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_120.wav', \"Wie geht's?\", 11, array([ 3.6131805e-05, 2.3445213e-05, 4.7948160e-05, ...,\n", + " -3.3656095e-05, -4.0791183e-05, -4.5296023e-05], dtype=float32), 0.9341041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_179.wav', 'Das ganze Haus hat gewackelt.', 29, array([ 1.31893430e-05, -2.02163919e-05, -5.92077959e-06, ...,\n", + " -8.03239527e-06, -1.91841791e-05, -1.46886205e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_186.wav', 'Woher kommt all der Hass?', 25, array([-1.0393358e-05, -4.2540119e-05, -1.8952907e-05, ...,\n", + " 1.9931360e-05, 2.8833035e-06, 2.6874868e-06], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_189.wav', 'Stillgestanden!', 15, array([ 4.4343769e-06, 1.3210945e-05, 1.7683087e-05, ...,\n", + " 2.6131744e-05, -5.4923967e-06, 9.4311863e-06], dtype=float32), 1.2689791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_199.wav', 'Eine Sache zur Zeit.', 20, array([5.1501018e-05, 6.3279913e-05, 7.3763011e-05, ..., 1.0348874e-05,\n", + " 1.0562905e-05, 3.0424892e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_218.wav', 'Nichts für ungut!', 18, array([-4.0355466e-05, -4.5107645e-05, -7.7510209e-05, ...,\n", + " -2.0305148e-05, -3.0419576e-05, -1.7718892e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_220.wav', 'Sieh genau hin!', 15, array([-1.2045763e-02, -1.6849384e-02, 
-1.4799301e-02, ...,\n", + " 1.6059141e-06, -1.4713467e-05, 1.0609662e-05], dtype=float32), 1.3042291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_224.wav', 'Und welches Baujahr?', 20, array([-3.5566740e-05, -2.3342436e-05, -2.8526230e-05, ...,\n", + " 3.1306794e-05, 3.2872085e-05, 2.9171426e-05], dtype=float32), 1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_226.wav', 'Sofort umkehren!', 16, array([ 1.2734158e-04, 1.4998924e-04, 1.2418727e-04, ...,\n", + " -6.3872926e-06, -5.1714401e-06, -1.2052229e-05], dtype=float32), 1.3923541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_232.wav', 'Da muss man locker bleiben.', 27, array([-3.2585725e-05, -3.3840271e-05, 1.3126293e-05, ...,\n", + " -1.8632261e-05, -6.3017387e-06, -5.6675367e-06], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_237.wav', 'Probier es mal mit Aceton.', 26, array([ 7.5771743e-05, 1.0223542e-04, 1.0343192e-04, ...,\n", + " -2.1570906e-05, -3.1918564e-05, -1.1135696e-05], dtype=float32), 1.8858125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_244.wav', 'Kommt drauf an.', 15, array([ 2.7207607e-05, 1.8057373e-05, 1.2512723e-05, ...,\n", + " -6.0103289e-06, -2.1828011e-05, -8.1472344e-06], dtype=float32), 1.3571041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_246.wav', 'Man darf gespannt sein.', 23, array([-2.3668355e-03, -3.7321844e-03, -3.6732492e-03, ...,\n", + " 1.7768043e-06, 2.0778492e-05, 5.1516781e-06], dtype=float32), 1.5685833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_251.wav', 'Daran scheiden sich die Geister.', 32, array([-2.39492147e-05, -4.70898958e-05, -2.53186899e-05, ...,\n", + " -4.88899059e-06, -1.34801885e-05, 1.04552892e-05], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_258.wav', 'Was habt ihr heute 
erlebt?', 26, array([ 3.5868085e-05, 8.2530729e-05, 4.6677309e-05, ...,\n", + " -8.4167405e-06, -2.0942105e-05, -6.2113932e-06], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_265.wav', 'Lass das sein!', 14, array([2.4356419e-05, 5.5347311e-05, 5.1189338e-05, ..., 2.7182332e-05,\n", + " 1.6106302e-05, 2.1714099e-05], dtype=float32), 1.2425208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_267.wav', 'Auch heute noch.', 16, array([ 1.6202603e-05, 1.8275598e-05, 1.5345126e-05, ...,\n", + " -9.9319268e-06, -1.4463866e-05, 7.9376441e-06], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_270.wav', 'Wir sehen uns in Bielefeld.', 27, array([5.0975410e-05, 4.6619494e-05, 5.2299667e-05, ..., 2.4641362e-05,\n", + " 2.0409352e-05, 1.7508868e-05], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_274.wav', 'Gerald muss Dampf ablassen.', 27, array([-1.4112990e-04, -2.2197423e-04, -2.2060136e-04, ...,\n", + " -4.0291343e-05, -3.2744192e-05, -1.7507429e-05], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_277.wav', 'Sehen Sie selbst!', 17, array([-3.6524234e-05, -2.8097162e-05, 4.4066533e-06, ...,\n", + " 2.1528131e-06, -1.2273627e-05, -8.5409883e-06], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_282.wav', 'Haben wir jemanden vergessen?', 29, array([-2.1900923e-05, -8.0311016e-05, -4.5058856e-05, ...,\n", + " 8.6369282e-06, 2.3358027e-05, 1.4141980e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_300.wav', 'Oh, der Besuch ist da!', 22, array([-1.1763951e-06, -6.4509544e-07, -2.1343028e-05, ...,\n", + " 8.3751611e-06, -2.0755753e-05, -3.9365756e-07], dtype=float32), 1.5157083333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_303.wav', 'Kannst du das bitte übernehmen?', 32, array([1.9790201e-05, 2.5795589e-05, 2.3016226e-05, ..., 4.4700668e-05,\n", + " 2.9440445e-05, 4.1151830e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_309.wav', 'Ich muss verrückt sein.', 24, array([-3.7773843e-05, -2.5238944e-05, -4.5549310e-05, ...,\n", + " -1.4228171e-05, -1.3738420e-05, -2.5079733e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_312.wav', 'Gestrichen!', 11, array([4.6765574e-05, 8.2428312e-05, 6.1315681e-05, ..., 1.7959255e-06,\n", + " 5.7119927e-08, 3.7900886e-06], dtype=float32), 0.9693541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_321.wav', 'Gott atmet nicht.', 17, array([3.9337472e-05, 4.7041980e-05, 5.6819965e-05, ..., 1.6601467e-05,\n", + " 1.5404070e-05, 3.0179035e-05], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_327.wav', 'Das ist mir auch klar.', 22, array([ 6.4578126e-05, 9.0902526e-05, 7.7864941e-05, ...,\n", + " -1.0411938e-05, -3.7324537e-06, 1.4365208e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_329.wav', 'Es sieht nach Unsinn aus.', 25, array([ 1.1480927e-06, 7.0667493e-06, -3.8140864e-05, ...,\n", + " 5.6332779e-06, 3.7668069e-05, 7.3043757e-06], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_333.wav', 'Das ist nur von auÃ\\x9fen.', 23, array([-3.8521201e-05, -4.7468315e-05, -3.4236415e-05, ...,\n", + " 5.2493826e-05, 3.7984686e-05, 3.3584591e-05], dtype=float32), 1.9915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_334.wav', 'Ich habe gerade ein DéjÃ\\xa0-vu.', 30, array([ 4.4728897e-04, 3.7400136e-04, -4.0894563e-04, ...,\n", + " 2.4757979e-05, 1.1479871e-05, 2.5551706e-05], dtype=float32), 
1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_336.wav', 'Ich muss mich verzählt haben.', 30, array([-3.9173494e-05, -2.9986420e-05, -1.9012801e-05, ...,\n", + " -6.0724019e-06, 2.7600961e-05, -3.4350986e-05], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_342.wav', 'So kann man sich täuschen.', 27, array([-3.5296402e-05, -6.0332448e-05, -5.2051670e-05, ...,\n", + " -1.2274999e-05, -6.2373409e-05, 1.2240975e-05], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_345.wav', 'Ich weiÃ\\x9f nicht woher.', 22, array([-2.05518299e-05, -1.30783865e-05, -1.48754107e-05, ...,\n", + " -5.49699544e-05, -3.01012133e-05, -1.70801268e-05], dtype=float32), 1.4980833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_352.wav', 'Bist du jetzt beleidigt?', 24, array([-1.0385954e-05, 1.1672010e-05, -2.3844843e-05, ...,\n", + " 6.0053999e-06, -2.3204884e-05, -9.7573111e-06], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_357.wav', 'Gib mir zwei Minuten, ja?', 25, array([-1.8705783e-05, -3.0273133e-05, -2.4814160e-05, ...,\n", + " 1.4705538e-05, 9.7520942e-06, 1.7873571e-06], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_360.wav', 'Voll der Psycho-Blick!', 22, array([ 5.0691519e-06, 1.2665058e-05, 1.4902340e-06, ...,\n", + " 9.9865492e-06, -2.0948526e-05, -1.1750392e-05], dtype=float32), 1.4980833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_365.wav', 'Mein Freund ist Musiker.', 24, array([ 4.2413834e-05, 2.3999601e-05, 1.0646096e-05, ...,\n", + " -1.9632445e-05, -2.5183452e-05, -1.8877656e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_386.wav', 'Hast du Knoblauch gegessen?', 27, array([ 4.2124993e-06, 1.6061234e-05, 1.6008022e-05, 
...,\n", + " 4.7057729e-05, -5.8230005e-05, -6.6850065e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_391.wav', 'Ist mir gar nicht aufgefallen.', 30, array([-1.2801524e-04, -1.8332504e-04, -1.6864720e-04, ...,\n", + " -1.7935792e-05, 1.3743926e-05, 4.5144670e-06], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_396.wav', 'Verdammt noch mal!', 18, array([-1.9188805e-05, 2.9282862e-06, 3.1274089e-06, ...,\n", + " 3.8011989e-05, 4.4447512e-05, 3.0465781e-05], dtype=float32), 1.3218541666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_403.wav', 'Klingt moralisch einwandfrei.', 29, array([-1.5154625e-06, -1.1907745e-05, -3.7140951e-06, ...,\n", + " 1.4816231e-06, -1.0694354e-05, -2.7909247e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_412.wav', 'Wie wunderschön du bist.', 25, array([ 8.1452117e-06, 1.2316134e-05, 1.2410718e-05, ...,\n", + " -2.5919973e-05, -1.5394140e-05, -1.6787388e-05], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_419.wav', 'Ich kann nichts erkennen.', 25, array([-2.1261691e-05, -2.6662590e-05, -3.2895186e-05, ...,\n", + " -8.6166056e-06, 1.0871788e-06, -5.8716050e-06], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_423.wav', 'Jetzt aber zackig!', 18, array([ 2.4374567e-06, 2.0842881e-05, -1.5250983e-05, ...,\n", + " -1.6002667e-05, -4.2002972e-05, -2.0723968e-05], dtype=float32), 1.2953958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_425.wav', 'Ich bin schon ganz wirr im Kopf.', 32, array([2.9025901e-05, 3.5920395e-05, 4.5607205e-05, ..., 1.6718976e-05,\n", + " 2.1111184e-05, 3.3797973e-05], dtype=float32), 1.98275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_430.wav', 'Ihr gefällt die Kulisse.', 25, 
array([ 2.0069625e-05, 6.2984320e-05, 4.6121866e-05, ...,\n", + " -3.1357740e-05, -2.2353357e-05, -2.2545100e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_445.wav', 'GrüÃ\\x9f dich!', 12, array([-1.0602423e-05, -7.0546007e-06, 1.1231577e-05, ...,\n", + " -4.8423290e-06, -2.5039872e-05, -2.4532073e-05], dtype=float32), 0.7842916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_456.wav', 'Nach mir die Sintflut!', 22, array([ 2.0728099e-05, -9.0359263e-06, -4.4944873e-06, ...,\n", + " 6.8659042e-06, -1.2404760e-05, -2.2153192e-06], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_460.wav', 'Was soll das denn bringen?', 26, array([ 3.9292016e-05, 5.6996982e-05, 6.4746971e-05, ...,\n", + " -3.1001658e-05, -9.7075417e-06, -1.9902369e-05], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_461.wav', 'Er lädt immer noch.', 20, array([-1.6651324e-05, -5.8167420e-06, 5.8412393e-06, ...,\n", + " -5.8599158e-05, -5.3942535e-05, -2.6054968e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_479.wav', 'Was sollen wir nur tun?', 23, array([-4.4440752e-05, -5.3991145e-05, -4.1732972e-05, ...,\n", + " -5.2980035e-06, 1.0908753e-05, 1.9730707e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_481.wav', 'Schluss damit!', 14, array([-2.9023191e-05, -4.2109135e-05, -3.8624265e-05, ...,\n", + " -1.9805097e-05, -6.0203884e-06, 1.1789062e-05], dtype=float32), 0.9605416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_483.wav', 'Können sie mir ihr Passwort geben?', 35, array([ 2.5537942e-05, 5.2574283e-05, 5.7736743e-05, ...,\n", + " -5.4731267e-06, -2.9014491e-05, 3.6238887e-06], dtype=float32), 1.7624583333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_491.wav', 'Sie muss zum BogenschieÃ\\x9fen.', 28, array([-3.1108371e-05, -5.1357423e-05, -7.0860064e-05, ...,\n", + " -4.0438888e-05, -2.6810346e-06, -1.3582417e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_513.wav', 'Gib ihm die Schaufel wieder!', 28, array([-2.5840678e-05, -2.4174828e-05, -1.2895588e-05, ...,\n", + " 3.6998503e-05, 3.0887943e-05, 1.9229607e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_514.wav', 'Ich will mich kurzfassen.', 25, array([-5.4538796e-06, 1.6863480e-05, -2.4184583e-05, ...,\n", + " -7.9238208e-07, 9.8597202e-06, 2.5041477e-06], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_515.wav', 'Die ist hart im Nehmen.', 23, array([ 3.2496322e-05, 3.8166479e-05, 3.2249674e-05, ...,\n", + " -1.0363748e-05, 1.9095280e-05, 9.2708688e-06], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_516.wav', 'Oh mein Gott!', 13, array([ 1.0293347e-05, 2.3256578e-05, -2.6419082e-06, ...,\n", + " -1.2127157e-05, 1.4263560e-06, 3.2800324e-06], dtype=float32), 0.8812291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_517.wav', 'Einer noch!', 11, array([ 1.8490386e-05, 9.7866017e-05, 1.1555837e-04, ...,\n", + " -5.3282761e-08, -1.5481584e-05, 1.1070631e-06], dtype=float32), 0.7578541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_536.wav', 'Da hat er sich verhaspelt.', 26, array([-1.2101016e-05, -4.1350278e-05, -2.5068364e-05, ...,\n", + " -9.8568984e-05, 1.2527088e-04, 2.5078503e-04], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_538.wav', 'Kann ich mir nicht vorstellen.', 30, array([-7.1259085e-05, -6.6917557e-05, -7.5606287e-05, ...,\n", + " -1.7281625e-05, 1.9208239e-06, 
9.8984492e-06], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_541.wav', 'Kannst du sie mal anstupsen?', 28, array([-3.0119493e-06, 3.5770699e-06, 8.4955855e-06, ...,\n", + " 1.3389642e-05, 2.2122082e-05, 1.8456800e-05], dtype=float32), 1.67875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_554.wav', 'Das wird nicht billig.', 22, array([-1.2833251e-05, -2.6942225e-05, -1.1592191e-05, ...,\n", + " -1.1226616e-05, 2.4460544e-05, 4.6120007e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_555.wav', 'Ã\\x9cberall wird hier gebaut.', 26, array([ 3.0397489e-06, 1.6576083e-05, 1.7184460e-05, ...,\n", + " -4.7443868e-06, 1.7984281e-07, 1.7898132e-05], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_556.wav', 'Was möchten Sie zu trinken?', 28, array([3.6597925e-05, 3.9522194e-05, 3.4265908e-05, ..., 4.9602304e-04,\n", + " 4.0240673e-04, 2.1699475e-04], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_559.wav', 'Waren Sie schon einmal bei uns?', 31, array([ 2.5204083e-06, -9.7146321e-06, 1.0508998e-05, ...,\n", + " 1.6337053e-05, 4.2958636e-05, 3.6466561e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_579.wav', 'Traut sich sonst noch jemand?', 29, array([-3.4311914e-05, -1.9934920e-05, -3.6420348e-05, ...,\n", + " -8.5477677e-06, -8.7745884e-06, -2.7311040e-05], dtype=float32), 1.9739583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_587.wav', 'Hier noch mal die Kurzform.', 27, array([ 4.8683055e-06, -9.0082349e-06, -6.4492651e-06, ...,\n", + " 1.2890940e-05, 1.4272653e-05, 9.0988487e-06], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_594.wav', 'Haste mal nen Euro?', 19, array([-8.6395357e-06, -1.0812845e-05, -3.0906973e-05, 
...,\n", + " 9.5510404e-06, 1.9230547e-05, 3.1346096e-06], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_599.wav', 'Wie schreibt man das?', 21, array([-3.6024519e-06, -2.5525418e-05, -2.9170100e-05, ...,\n", + " -1.0803048e-05, 3.5519159e-05, 6.3340508e-06], dtype=float32), 1.6831458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_600.wav', 'Er kann es nicht mehr hören.', 29, array([-3.8066657e-05, -3.2469205e-05, -5.3206204e-05, ...,\n", + " 2.6021740e-05, -1.0833596e-06, 1.9787998e-05], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_601.wav', 'Bleib einfach cool.', 19, array([-4.1984731e-05, -2.3916245e-05, -3.1576215e-05, ...,\n", + " -1.8820670e-05, 6.2404342e-07, -9.7557686e-06], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_603.wav', 'Davon können Sie ausgehen.', 27, array([ 1.0824577e-05, -1.7968627e-05, -1.6179658e-05, ...,\n", + " -5.5361601e-05, -4.2508735e-05, -3.1106232e-05], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_606.wav', 'So ist das im Leben.', 20, array([ 1.0786475e-05, -1.3495748e-05, 6.5641157e-06, ...,\n", + " -3.1349493e-05, -2.5596510e-05, -2.9100025e-05], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_625.wav', 'Du musst anders fragen.', 23, array([ 4.8367940e-03, 6.8724523e-03, 6.1804145e-03, ...,\n", + " -7.8923513e-06, 1.7550767e-06, 7.2876783e-06], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_628.wav', 'Es war nicht alles schlecht.', 28, array([ 1.08825125e-05, 1.04639130e-05, 8.46001694e-06, ...,\n", + " -2.05042506e-05, 7.06381434e-06, 2.37766089e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_643.wav', 'Das lasse ich mir nicht 
bieten!', 31, array([-8.2775728e-07, -4.0987805e-05, -1.7558119e-05, ...,\n", + " -2.1388867e-06, -4.9800960e-06, -1.3807499e-05], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_665.wav', 'Hallo, ich bin der Neue!', 24, array([-2.4004371e-04, -3.8098267e-04, -3.8909691e-04, ...,\n", + " -3.5481004e-05, 3.5560199e-05, -1.3612277e-05], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_667.wav', 'Fastest du?', 11, array([-6.0218765e-05, -8.1393919e-05, -8.6645297e-05, ...,\n", + " 6.8678496e-06, -8.2385115e-05, -5.4868913e-05], dtype=float32), 1.2072708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_675.wav', 'Nur um das klarzustellen.', 25, array([ 2.7598284e-05, 4.3499585e-05, -7.3542742e-06, ...,\n", + " 4.4517365e-06, -9.3571025e-06, 3.8795395e-05], dtype=float32), 1.8681875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_699.wav', 'Jetzt wird es gemein.', 21, array([ 2.8973442e-05, 5.4584369e-05, 2.5356880e-05, ...,\n", + " 7.6631528e-05, 5.6628844e-05, -4.1394928e-06], dtype=float32), 1.8681875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_704.wav', 'So sieht das aus.', 17, array([7.2620540e-05, 1.0683333e-04, 1.9689680e-04, ..., 2.9477818e-05,\n", + " 1.5229379e-05, 4.7805424e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_710.wav', 'Gute Nacht ihr Lausbuben!', 25, array([-3.4681521e-04, -4.7425818e-04, -4.6133957e-04, ...,\n", + " 8.0735008e-06, -6.7210376e-06, 6.1622823e-06], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_727.wav', 'Tschüss, Mädels!', 18, array([ 5.8768086e-07, -7.6773445e-05, -4.4017674e-05, ...,\n", + " -7.9999263e-05, 3.1158263e-06, 9.4530027e-05], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_750.wav', 'Geh mir 
nicht auf den Keks.', 27, array([ 3.7033031e-05, -1.8765691e-05, 3.5605895e-05, ...,\n", + " -4.1894207e-05, -5.0918239e-05, -8.2971856e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_758.wav', \"Dir werd ich's zeigen.\", 22, array([ 5.9986287e-05, 3.1676023e-05, 9.2681257e-05, ...,\n", + " -2.7595996e-05, -4.2494954e-05, -1.1851616e-06], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_764.wav', 'Macht euch bereit!', 18, array([1.5598367e-04, 1.9868747e-04, 1.1692408e-04, ..., 8.2378487e-05,\n", + " 6.5455366e-05, 4.8687412e-05], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_766.wav', 'Da kiekste wa?', 14, array([ 5.4184136e-07, -6.1094812e-05, -6.1461476e-05, ...,\n", + " 9.7159907e-05, 2.3223305e-05, 8.9147768e-05], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_778.wav', 'Das gibt es ja nicht!', 21, array([ 2.0350570e-04, 3.1676778e-04, 2.1080665e-04, ...,\n", + " -6.1200735e-05, 1.1813832e-05, -2.1792879e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_10_FINAL/10_789.wav', 'Das ist nicht mein Problem.', 27, array([-5.5885310e-05, -6.4690561e-05, -3.0270432e-05, ...,\n", + " -7.1330876e-05, -1.6931441e-05, -1.1536635e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_23.wav', 'Finde dich damit ab.', 20, array([ 7.2009592e-05, -2.1050539e-05, -8.4551131e-05, ...,\n", + " 5.7306173e-05, 9.7603959e-05, 1.5820342e-04], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_41.wav', 'Wie im Wilden Westen!', 21, array([ 1.4756477e-05, 3.1426986e-05, 9.2355578e-05, ...,\n", + " 8.1666811e-05, 7.9924212e-06, -1.6274511e-05], dtype=float32), 1.9915729166666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_63.wav', 'Da gehe ich mit.', 16, array([-1.10742374e-04, -1.88132090e-05, 1.54691588e-05, ...,\n", + " 2.89936361e-06, -3.01086147e-05, 3.05973408e-05], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_75.wav', 'Warum nur werktags?', 19, array([-0.00052728, -0.00052381, -0.00042873, ..., -0.00014365,\n", + " -0.00010449, -0.00010741], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_76.wav', 'Geht ihr zur Kommunion?', 23, array([-1.0898075e-04, -9.7388023e-05, -6.8978305e-05, ...,\n", + " -5.0831288e-05, -1.5921889e-05, 6.4072694e-05], dtype=float32), 1.7271979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_80.wav', 'Ihr Blick spricht Bände.', 25, array([-4.6483423e-05, -1.6536529e-04, -9.5357966e-05, ...,\n", + " -8.0715154e-06, -4.8390953e-05, -5.0536739e-05], dtype=float32), 1.6655104166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_85.wav', 'Ruhe in Frieden.', 16, array([ 1.12481954e-04, 1.02392871e-04, 1.89193961e-05, ...,\n", + " -1.02047234e-05, -6.91346722e-05, -7.76782108e-05], dtype=float32), 1.7095729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_97.wav', 'Es wird hart gekämpft.', 23, array([-0.0001628 , -0.00018412, -0.00010292, ..., 0.0001769 ,\n", + " 0.00018152, 0.00018817], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_98.wav', 'Warum das alles?', 16, array([-9.8717544e-05, -8.1991704e-05, -1.4659751e-04, ...,\n", + " -6.5778313e-06, -7.7343866e-05, 1.8901783e-05], dtype=float32), 1.3218333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_109.wav', 'Und Action!', 11, array([-2.8484770e-05, 8.8463985e-06, 5.4628901e-05, ...,\n", + " 6.9029898e-05, -7.5049247e-06, 2.7110993e-05], dtype=float32), 1.23371875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_112.wav', 'Bist du dir sicher?', 19, array([ 1.8312603e-05, -8.6757791e-07, -5.3837293e-06, ...,\n", + " 1.1187289e-05, -3.2346459e-05, 9.6363983e-06], dtype=float32), 1.6302708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_113.wav', 'Nur über meine Leiche!', 23, array([ 7.7449629e-05, 1.5036203e-04, 1.0243297e-04, ...,\n", + " -9.4819125e-06, -6.9288013e-05, 2.3950559e-05], dtype=float32), 1.8858229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_120.wav', 'Hoffentlich schafft er das.', 27, array([-1.6298418e-05, 1.6150392e-05, 2.2071041e-04, ...,\n", + " 5.1459443e-05, -2.1589445e-05, 3.2091139e-05], dtype=float32), 1.9210729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_147.wav', 'Komm, spiel mit mir!', 20, array([ 1.9483854e-05, 1.7799211e-06, 3.3775228e-05, ...,\n", + " 2.8417478e-05, -4.2961314e-05, -3.5597783e-05], dtype=float32), 1.9386979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_152.wav', 'Ui ui ui!', 9, array([5.5120941e-05, 5.6017692e-05, 4.3216096e-06, ..., 7.1505703e-05,\n", + " 3.5192006e-05, 7.0440023e-05], dtype=float32), 1.14559375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_157.wav', 'Riech mal!', 10, array([ 1.6765174e-05, 6.2451771e-05, 1.0707039e-04, ...,\n", + " -7.5908087e-05, -1.0923214e-04, -7.9517071e-05], dtype=float32), 1.03984375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_165.wav', 'Ich war nicht dabei.', 20, array([-9.2572387e-05, -7.4509240e-05, -3.5020537e-05, ...,\n", + " 2.8946462e-05, 6.8536661e-05, 1.4004428e-05], dtype=float32), 1.8065104166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_170.wav', 'Danke für die Einladung.', 25, array([-5.4829288e-05, -5.2409945e-05, -1.6216440e-05, ...,\n", + " 1.8202516e-05, 1.6152997e-05, 7.3245174e-05], dtype=float32), 1.5597708333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_197.wav', 'So soll es sein.', 16, array([ 6.0843304e-05, 1.4244186e-05, -1.4521269e-05, ...,\n", + " -1.3551622e-04, -8.4085783e-05, -1.3086156e-04], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_200.wav', 'Erschütternd!', 14, array([-1.85466139e-04, -1.61985561e-04, -1.26282161e-04, ...,\n", + " 6.37752237e-05, 1.00840225e-04, 1.20959485e-04], dtype=float32), 1.1543958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_203.wav', 'Nur das Ã\\x9cbliche.', 17, array([ 7.9542246e-05, 8.5164116e-05, 5.9246326e-05, ...,\n", + " -2.9600615e-05, 4.1036237e-05, 5.5239609e-05], dtype=float32), 1.8153229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_206.wav', 'Die hat nämlich ein Loch.', 26, array([-1.4263311e-05, 3.4131535e-05, -3.4750206e-05, ...,\n", + " -5.7866608e-05, 1.9035106e-05, 3.3172044e-05], dtype=float32), 1.9827604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_230.wav', 'Hol das Stöckchen.', 19, array([-0.00064988, -0.00065917, -0.00059873, ..., 0.00020419,\n", + " 0.00022752, 0.00016691], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_237.wav', 'Und bei dir?', 12, array([-2.9914919e-04, -2.2948935e-04, -2.3748397e-04, ...,\n", + " 1.1257434e-05, -3.9087045e-05, -2.3366434e-05], dtype=float32), 1.07509375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_264.wav', 'Es liegt in der Natur der Sache.', 32, array([ 3.1785059e-04, 3.4756004e-04, 3.4774767e-04, ...,\n", + " -3.1788899e-05, -7.7856974e-05, -7.3492403e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_268.wav', 'Mission erfolgreich!', 20, array([-5.1757845e-05, -2.9873547e-05, -5.2602922e-05, ...,\n", + " -1.0881226e-04, -7.0386566e-05, -4.1912252e-05], dtype=float32), 1.7977083333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_274.wav', 'Kommt nicht in die Tüte!', 25, array([-2.6346192e-05, -6.4550313e-06, -4.2296477e-05, ...,\n", + " 6.7257854e-05, 5.5296507e-05, 6.6974962e-06], dtype=float32), 1.8505729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_284.wav', 'Ja, guten Tag!', 14, array([ 3.1975062e-05, 7.6259523e-05, 7.8669080e-05, ...,\n", + " -1.8048113e-05, -4.4206077e-05, -4.7247828e-05], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_308.wav', 'Es ist noch nicht lange her.', 28, array([ 2.2859822e-06, 6.0211198e-05, 5.7821064e-05, ...,\n", + " -8.3175619e-06, -2.3456680e-05, -1.9626390e-05], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_316.wav', 'Wiedersehen!', 12, array([2.8599703e-05, 6.1528997e-05, 8.9646070e-05, ..., 2.7208553e-06,\n", + " 2.9898734e-05, 9.2172457e-05], dtype=float32), 1.12796875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_328.wav', 'Mir ist schwindelig.', 20, array([ 2.4521294e-05, 5.4549360e-05, 2.9534258e-06, ...,\n", + " -8.9185494e-05, -1.0303867e-04, -5.3436386e-05], dtype=float32), 1.7976979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_329.wav', 'Sprechen sie deutsch?', 21, array([-2.4279220e-04, -2.6937225e-04, -2.3713916e-04, ...,\n", + " -2.8695989e-05, -2.7513888e-06, 5.1191882e-06], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_360.wav', 'So war es nicht gemeint.', 24, array([-5.8561371e-05, 8.4504954e-06, 3.6038864e-06, ...,\n", + " 9.6144824e-05, 5.4328477e-05, 8.8002511e-05], dtype=float32), 1.8681979166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_399.wav', 'Schluss jetzt!', 14, array([ 1.60011361e-04, 1.10784895e-04, 1.05728453e-04, ...,\n", + " 1.56215738e-05, -7.51677726e-06, 3.21154062e-06], dtype=float32), 1.1940625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_414.wav', 'Sehen Sie genau hin!', 20, array([ 4.0775692e-05, 7.8341225e-05, 5.9709568e-05, ...,\n", + " 1.6227934e-05, 3.3044285e-05, -1.1752409e-06], dtype=float32), 1.7448229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_432.wav', 'Christina Habeck?', 17, array([-7.0921145e-05, -8.7887020e-05, -1.0741340e-04, ...,\n", + " 6.9928697e-05, 6.0020051e-05, 4.4092048e-05], dtype=float32), 1.6831354166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_491.wav', 'Olé, olé!', 11, array([-3.5300669e-05, -3.0546897e-05, -4.6127847e-05, ...,\n", + " -4.5910983e-06, 9.3032322e-06, 4.1992083e-05], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_498.wav', 'Nur mal so als Anregung.', 24, array([-5.8754493e-05, -2.6690983e-05, -4.8782116e-05, ...,\n", + " -4.1356816e-05, -3.8702921e-05, -2.8129245e-05], dtype=float32), 1.929875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_503.wav', 'Ich glaube ihr kein Wort.', 25, array([-1.92081643e-06, -2.77346317e-05, -5.22437476e-05, ...,\n", + " 6.71621965e-05, 1.27864005e-05, 3.48269168e-05], dtype=float32), 1.9915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_537.wav', 'Wie könnt ihr es wagen?', 24, array([-1.4561453e-03, -1.4608348e-03, -1.4617005e-03, ...,\n", + " 7.5047151e-06, -8.1957251e-07, 1.6147833e-05], dtype=float32), 1.8417604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_540.wav', 'Nach was schmeckt das genau?', 28, array([5.2316565e-05, 4.9443977e-05, 5.7626901e-05, ..., 2.5021756e-05,\n", + " 4.5578519e-05, 5.3426527e-05], dtype=float32), 1.9651354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_548.wav', 'Gänsehaut pur!', 15, array([-9.5325144e-05, -7.7983823e-05, -6.6722314e-05, ...,\n", + " 5.7276593e-05, 2.5111651e-05, 1.1992834e-05], dtype=float32), 
1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_564.wav', 'Höret, höret!', 15, array([-6.9055131e-05, -6.1163970e-05, -7.0053116e-05, ...,\n", + " -1.7221355e-05, -7.2541329e-06, 1.8846076e-06], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_574.wav', 'Das Haus ist umstellt.', 22, array([ 4.3151813e-05, 5.5632776e-05, 2.7663889e-05, ...,\n", + " -4.0600127e-05, -3.0027895e-05, -4.6370071e-05], dtype=float32), 1.7183958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_606.wav', 'Den versteht keiner.', 20, array([-6.2417603e-05, -8.2428480e-05, -4.4267428e-05, ...,\n", + " -6.2675332e-05, -4.0452942e-05, -5.3965356e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_612.wav', 'Halten Sie sich fest!', 21, array([2.8007184e-05, 3.2632157e-05, 6.2635645e-06, ..., 5.3581707e-06,\n", + " 1.5780075e-05, 2.3362747e-06], dtype=float32), 1.6390729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_640.wav', 'Können Sie sich ausweisen?', 27, array([-4.1133004e-05, -3.4346365e-05, -2.0997140e-06, ...,\n", + " 2.5395755e-05, 1.5488129e-05, 1.3214269e-05], dtype=float32), 1.9298854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_645.wav', 'Genug ist genug.', 16, array([1.4217473e-04, 1.3088981e-04, 1.2007774e-04, ..., 8.0914921e-05,\n", + " 5.1820301e-05, 7.9144287e-05], dtype=float32), 1.7448229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_647.wav', 'Da bin ich ganz bei Ihnen!', 26, array([-6.2454426e-05, -7.3873220e-05, -9.7365184e-05, ...,\n", + " 1.7943923e-05, 1.8189858e-05, 2.0363577e-05], dtype=float32), 1.7183854166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_666.wav', 'Ich hasse dich!', 15, array([-4.7738231e-06, 1.0362664e-06, 9.6731110e-06, ...,\n", + " 3.2887896e-05, 6.7240894e-06, 7.3296378e-06], 
dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_676.wav', 'Jetzt weiÃ\\x9f ich es wieder.', 26, array([-2.9731807e-05, -2.5498804e-05, -5.7221558e-05, ...,\n", + " -1.3199271e-05, -1.1122796e-05, -1.5994978e-05], dtype=float32), 1.9915729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_708.wav', 'Täuschkörper einsetzen!', 25, array([3.3980694e-05, 5.6047942e-05, 3.6845995e-05, ..., 2.0433601e-05,\n", + " 5.5359560e-05, 3.6635800e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_711.wav', 'So sind die Regeln.', 19, array([ 1.0646171e-05, 2.1217951e-05, -8.0062582e-06, ...,\n", + " -4.2156036e-05, -1.8816583e-05, -4.4005763e-05], dtype=float32), 1.6038229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_712.wav', 'Es schmeckt nach Zimt.', 22, array([ 2.2929296e-05, 2.9111379e-05, 4.6064979e-05, ...,\n", + " -1.8768259e-06, 7.4329464e-06, 1.2982395e-05], dtype=float32), 1.6831354166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_717.wav', 'Auch bei feuchtem Wetter nicht.', 31, array([1.6887316e-05, 6.2355371e-05, 7.5977659e-05, ..., 1.6490449e-05,\n", + " 2.1054177e-05, 1.1164552e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_731.wav', 'Warum denn nicht?', 17, array([ 6.4304750e-06, -6.7788221e-07, -1.0204109e-06, ...,\n", + " -9.7024295e-06, -3.1934254e-05, -2.7286467e-05], dtype=float32), 1.25134375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_746.wav', 'Was isst du da?', 15, array([ 4.1260464e-05, 1.0193682e-05, 3.5085955e-05, ...,\n", + " -3.5494733e-05, -1.2306450e-05, 1.2647797e-05], dtype=float32), 1.6919479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_750.wav', 'Alle schreien hier!', 19, array([-1.3079788e-04, -1.3171590e-04, -1.1580650e-04, ...,\n", + " -2.0512020e-05, 
-2.3779969e-05, -2.4454272e-05], dtype=float32), 1.7007708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_754.wav', 'Das ist genau mein Ding.', 24, array([-1.1629934e-05, -2.1403244e-05, 1.6778110e-06, ...,\n", + " 1.0532378e-05, 4.3498221e-05, 4.0848565e-05], dtype=float32), 1.6390729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_757.wav', 'Wo denken Sie hin?', 18, array([ 2.1430247e-05, 2.1772265e-05, 2.0838190e-05, ...,\n", + " 2.2910473e-05, -5.1848092e-06, -1.5559262e-06], dtype=float32), 1.4540208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_758.wav', 'Reine Gewöhnungssache.', 23, array([-4.3785589e-05, -4.8620215e-05, -4.8604503e-05, ...,\n", + " 1.0856102e-05, 7.9429465e-06, 6.5844351e-06], dtype=float32), 1.6126458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_760.wav', 'Tschüss!', 9, array([1.6893557e-05, 3.7733011e-05, 4.6923491e-05, ..., 3.5450230e-05,\n", + " 5.7595411e-05, 5.0426086e-05], dtype=float32), 0.6873541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_765.wav', 'Vergiss die Waschtasche nicht!', 30, array([-5.2931227e-05, -5.9350517e-05, -5.4635959e-05, ...,\n", + " -3.9712177e-05, -3.0881067e-05, -1.9957897e-05], dtype=float32), 1.929875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_788.wav', 'Längs oder quer?', 17, array([-5.8456011e-05, -4.5964895e-05, -2.6546955e-05, ...,\n", + " 1.1356072e-05, 1.8672996e-05, -7.0059104e-07], dtype=float32), 1.5597708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_14.wav', 'Wer hat euch geschickt?', 23, array([-1.1148760e-04, 2.4612555e-05, 9.3476447e-05, ...,\n", + " -9.7927412e-05, -3.4095574e-05, -1.7279797e-05], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_34.wav', 'Wo bin ich hier nur gelandet?', 29, array([-1.3307537e-05, -1.0089541e-04, -1.2360289e-05, ...,\n", + " 
-4.9649680e-05, -7.3272109e-05, -6.8251233e-05], dtype=float32), 1.9306666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_40.wav', 'Natürlich behauptet sie das.', 29, array([ 1.2778574e-04, 5.9959311e-05, -8.1008322e-05, ...,\n", + " 1.9905625e-04, 2.6344018e-05, 1.1490170e-04], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_43.wav', 'Du hattest recht.', 17, array([-1.1000242e-04, -1.6242996e-04, -2.2294538e-04, ...,\n", + " 1.1730633e-04, -8.3676481e-05, -2.5764350e-05], dtype=float32), 1.152)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_44.wav', 'Verklagen Sie mich doch!', 24, array([ 1.94306958e-05, 1.91541476e-04, 6.15894969e-05, ...,\n", + " -1.00529454e-04, -2.00755429e-04, 5.24241113e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_45.wav', 'Die Bremse schleift.', 20, array([ 1.8599353e-04, 8.8273533e-05, 1.5005667e-04, ...,\n", + " -1.6525917e-04, -2.2365544e-05, -2.3978014e-04], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_51.wav', 'Hilfe!', 6, array([-1.7958642e-04, -2.2338594e-04, -2.7969983e-04, ...,\n", + " -1.4840752e-04, -3.4539087e-05, 3.2946355e-06], dtype=float32), 0.704)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_57.wav', 'Jetzt liegt es an dir.', 22, array([ 2.1328227e-04, 8.1810067e-05, -1.6158322e-04, ...,\n", + " 1.6350237e-04, 1.0099774e-04, 1.6040609e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_58.wav', 'Wo kann ich das kaufen?', 23, array([-9.1674337e-05, -1.6169342e-04, -1.8347435e-04, ...,\n", + " 4.6268760e-06, 2.3974455e-05, -1.1637783e-04], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_60.wav', 'Kann man jetzt auch nicht mehr ändern.', 39, array([-3.5826775e-04, -3.3033665e-04, -2.3628448e-04, ...,\n", + " -1.9967039e-04, 
-1.7616056e-05, 6.7053217e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_68.wav', 'Hör mir doch mal zu.', 21, array([-1.0109342e-04, -3.4855773e-06, 9.0611480e-05, ...,\n", + " -1.0345047e-04, -4.0894301e-05, -6.3259591e-05], dtype=float32), 1.4613333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_75.wav', 'Gibt es die Person wirklich?', 28, array([1.8891362e-04, 2.3809298e-04, 1.1160582e-04, ..., 2.3936841e-06,\n", + " 4.5461587e-05, 9.1474227e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_79.wav', 'Wo waren wir stehen geblieben?', 30, array([-6.7620305e-05, 3.2152042e-05, 6.8106332e-05, ...,\n", + " -1.8769420e-04, -6.5137865e-05, -2.5653889e-04], dtype=float32), 1.824)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_91.wav', 'Grundgütiger!', 14, array([ 7.70497209e-05, -5.13312625e-05, 7.22193681e-06, ...,\n", + " -1.11605725e-04, -1.26782295e-04, 8.50337819e-05], dtype=float32), 1.3546666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_92.wav', 'Wer bist du?', 12, array([-4.3348764e-04, -4.4667200e-04, -4.2408684e-04, ...,\n", + " -3.9185648e-05, -3.1797776e-05, -2.2222506e-04], dtype=float32), 1.024)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_95.wav', 'Schon gut.', 10, array([-3.07407812e-04, -4.31929773e-04, -5.19388705e-04, ...,\n", + " -1.07154076e-04, -7.57433227e-05, -1.24133236e-04], dtype=float32), 0.9173333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_99.wav', 'Murat, was ist los mit dir?', 27, array([-3.84323685e-05, 6.48807691e-05, -5.84455011e-05, ...,\n", + " 1.45171012e-04, -1.50349506e-05, 1.20676006e-04], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_101.wav', 'HeiÃ\\x9fe Würstchen!', 18, array([-0.00027939, -0.00039175, -0.00025548, ..., 0.00027689,\n", + " 0.00011903, 0.00012768], 
dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_106.wav', 'Ich will auch mal einer werden.', 31, array([ 1.36086979e-04, -1.76298781e-05, -4.00176577e-05, ...,\n", + " 1.72844579e-04, 1.29597363e-04, -1.02162725e-04], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_112.wav', 'Ich will auch haben!', 20, array([-4.40885342e-05, -2.34828622e-04, -3.29593284e-04, ...,\n", + " -3.05666414e-04, -1.31685141e-04, -1.00833015e-04], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_116.wav', 'Setz dich bitte gerade hin!', 27, array([-2.2211492e-04, -2.0630175e-04, -1.4655131e-04, ...,\n", + " 1.6456892e-04, 1.0634777e-06, -1.4669505e-04], dtype=float32), 1.9306666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_122.wav', 'Findest du mich erwachsen?', 26, array([3.0208268e-04, 3.6579225e-04, 3.3154435e-04, ..., 6.2579543e-06,\n", + " 4.9250040e-05, 1.8107957e-04], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_123.wav', 'Schrei nicht so!', 16, array([ 8.03208750e-05, 1.33657450e-04, -1.13144284e-04, ...,\n", + " 4.64295183e-04, 4.82034549e-04, 2.86602415e-04], dtype=float32), 1.152)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_136.wav', 'Das kam unerwartet.', 19, array([-3.3067852e-05, -4.8878199e-05, 5.8831414e-05, ...,\n", + " -3.5621467e-04, -3.7723745e-04, -2.3875662e-04], dtype=float32), 1.7386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_149.wav', 'Das ergibt doch keinen Sinn.', 28, array([6.0471892e-05, 8.1125305e-05, 2.7437322e-04, ..., 9.1583250e-05,\n", + " 2.0055164e-04, 2.2477485e-04], dtype=float32), 1.9733333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_155.wav', 'Aller Abschied fällt schwer.', 29, array([-2.2813781e-04, -5.5478893e-05, 1.6814301e-04, ...,\n", + " 
1.2765558e-04, 1.7368943e-04, 2.6105065e-04], dtype=float32), 1.6533333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_165.wav', 'Erkennst du mich nicht?', 23, array([-2.3624673e-04, -3.1934463e-04, -2.9434697e-04, ...,\n", + " 1.7059442e-04, 1.9742029e-06, 1.3172596e-04], dtype=float32), 1.4293333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_169.wav', 'Willst du sie mal streicheln?', 29, array([ 1.9991475e-04, 3.4090909e-04, 3.2008073e-04, ...,\n", + " 4.6425943e-05, -8.5656990e-05, -1.2934266e-05], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_181.wav', 'Zur Anmeldung klicken Sie hier.', 31, array([ 5.3989668e-05, -9.8630007e-05, -1.1361165e-04, ...,\n", + " -2.2555150e-05, 3.3015600e-05, 1.0129590e-04], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_194.wav', 'Elvis war nie tot.', 18, array([-6.78355209e-05, -5.90024465e-05, -1.47034181e-04, ...,\n", + " 1.19253775e-04, 2.40493591e-05, 3.28276219e-04], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_196.wav', 'Irgendetwas zu verzollen?', 25, array([-1.2399687e-04, -3.0497483e-06, -1.2210968e-04, ...,\n", + " 1.4703360e-05, 4.4073422e-05, 2.5880148e-04], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_216.wav', 'Du bist doch nicht aus Zucker.', 30, array([-3.7417009e-05, -2.1370529e-04, -1.0503333e-04, ...,\n", + " -3.4687804e-05, -1.0006884e-04, 8.2270970e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_217.wav', 'Bald hat er sein Abi.', 21, array([-7.6955817e-05, -7.4724245e-05, -5.4779473e-05, ...,\n", + " -3.2609492e-05, -1.9532166e-04, -4.0988740e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_227.wav', 'Da lacht das Herz.', 18, array([0.000232 , 0.00019664, 0.00015979, 
..., 0.00012966, 0.0001156 ,\n", + " 0.00015061], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_238.wav', 'Steht mir die Bluse?', 20, array([ 5.00293754e-05, 1.15090246e-04, -1.61606382e-04, ...,\n", + " -1.10758898e-04, 9.87306703e-05, 2.25929121e-04], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_240.wav', 'Kommt ihr zurecht?', 18, array([-1.4166623e-04, -1.7185905e-04, -1.0146119e-04, ...,\n", + " -1.9281202e-05, -4.6475827e-05, -7.9622550e-05], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_246.wav', 'Her damit!', 10, array([-1.0743736e-04, -6.3287393e-05, 5.4618115e-05, ...,\n", + " 1.7166793e-04, 1.5052129e-04, -4.3305259e-05], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_256.wav', 'Talente muss man fördern.', 26, array([ 2.9789119e-06, 2.0445570e-05, 3.6582744e-05, ...,\n", + " -8.0595542e-05, 2.8049317e-06, -2.4196431e-04], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_298.wav', 'Kein Kommentar!', 15, array([2.0757825e-04, 2.0225085e-05, 1.0584419e-04, ..., 2.2611262e-05,\n", + " 2.2597586e-04, 5.2457988e-05], dtype=float32), 1.1093333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_304.wav', 'Der atmet noch.', 15, array([-0.0001642 , -0.00022683, -0.00021831, ..., 0.00013961,\n", + " 0.00017319, 0.00013602], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_308.wav', 'Das dauert aber lange!', 22, array([4.1067542e-05, 4.3461972e-05, 1.7915755e-04, ..., 1.1849359e-04,\n", + " 1.6261388e-04, 1.4937650e-05], dtype=float32), 1.44)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_311.wav', 'Du kennst mich, Danton.', 23, array([-5.2089547e-04, -4.7035489e-04, -5.9835758e-04, ...,\n", + " -9.4374191e-05, -2.0053205e-05, 
1.2992002e-06], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_314.wav', 'Mein Gott, Walter!', 18, array([ 4.9858125e-05, -2.4514409e-05, -4.7797763e-05, ...,\n", + " -2.9001143e-05, -1.4190034e-04, -2.5762929e-05], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_318.wav', 'Und was machst du sonst so?', 27, array([ 0.00041733, 0.00037329, 0.00035271, ..., -0.00016106,\n", + " -0.00041058, -0.00029774], dtype=float32), 1.6106666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_331.wav', 'Dort wird dir geholfen.', 23, array([-1.9671346e-04, -1.1574107e-04, 5.4965103e-06, ...,\n", + " 4.3039094e-05, -3.2543256e-05, -7.8007070e-05], dtype=float32), 1.5466666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_336.wav', 'Was ist denn hier los?', 22, array([0.00012079, 0.00029083, 0.00013022, ..., 0.00036718, 0.00031168,\n", + " 0.00049887], dtype=float32), 1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_358.wav', 'Gleich sind wir dort.', 21, array([ 1.5992192e-04, 2.5509403e-04, 2.3052108e-04, ...,\n", + " 1.9194868e-04, 6.2326435e-05, -2.0080882e-04], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_366.wav', 'Sind sie gut informiert?', 24, array([-1.2915327e-04, 5.4154119e-05, 9.4311297e-05, ...,\n", + " 1.4842945e-04, 1.6595995e-04, 1.6055972e-04], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_376.wav', \"Was soll's, ich bin bereit.\", 27, array([-0.00025371, -0.00037118, -0.00054651, ..., -0.00013142,\n", + " 0.000133 , 0.0001903 ], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_387.wav', 'Was soll das heiÃ\\x9fen?', 21, array([ 6.26799228e-05, -1.15550021e-04, -1.60253039e-04, ...,\n", + " -1.14853225e-04, 3.62789683e-06, 
-1.25641367e-04], dtype=float32), 1.6106666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_398.wav', 'Oder so!', 8, array([-0.00011172, -0.00021632, -0.0003379 , ..., 0.00016637,\n", + " 0.00021105, 0.00035037], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_431.wav', 'Fauche mich nicht so an!', 24, array([-1.69856430e-04, -2.14659201e-04, -1.17017007e-04, ...,\n", + " 1.06098436e-04, 1.30685687e-04, 8.11223654e-05], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_439.wav', 'Genau zweihundert.', 18, array([ 4.3691549e-04, 4.2721629e-04, 2.1283170e-04, ...,\n", + " -1.0831581e-05, 6.4474931e-05, 1.3399551e-04], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_446.wav', 'Ja ja, das schickt!', 19, array([-1.5079082e-05, 1.2119063e-04, 1.9518439e-04, ...,\n", + " -8.6470172e-05, -3.4930470e-04, -3.7717246e-04], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_460.wav', 'Stein schlägt Schere.', 22, array([ 5.7708825e-05, 1.6740670e-04, 1.9982990e-04, ...,\n", + " -3.3077580e-05, 1.1591193e-04, 7.5874494e-05], dtype=float32), 1.936)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_468.wav', 'Simsalabim!', 11, array([-1.8192175e-05, -1.2427589e-04, 4.0916457e-05, ...,\n", + " -3.6532696e-05, 2.9238325e-05, 2.0148496e-05], dtype=float32), 1.0506666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_478.wav', 'Bitte Zutreffendes ankreuzen.', 29, array([-5.4858734e-05, -6.8480607e-05, -7.1117909e-05, ...,\n", + " -3.5092820e-05, 4.6205354e-05, 3.1237360e-05], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_480.wav', 'Dich kenne ich doch!', 20, array([-3.4106572e-04, -2.6489299e-04, -1.9887066e-04, ...,\n", + " 5.8086891e-05, 2.0823347e-04, -4.3870667e-05], dtype=float32), 
1.4026666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_484.wav', 'Und los!', 8, array([ 2.0759732e-04, 2.4903464e-04, -3.9741102e-05, ...,\n", + " -1.4017121e-04, -2.2582384e-04, -2.2852831e-04], dtype=float32), 0.8906666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_485.wav', 'Der Patient ist eh schon tot.', 29, array([ 2.8383749e-04, 1.6098749e-04, 5.8996215e-05, ...,\n", + " -1.5776475e-04, -1.0137054e-04, -1.0374457e-04], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_487.wav', 'Und zwar nicht zu knapp!', 24, array([-4.9983555e-05, 1.0859955e-04, 1.3262806e-04, ...,\n", + " 1.4716771e-04, 2.1034098e-04, 2.6678585e-04], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_488.wav', 'Was ist mit dem Co-Piloten?', 27, array([-4.6707326e-04, -3.3664281e-04, -1.6913723e-04, ...,\n", + " 9.7057833e-05, -3.0600113e-05, -3.3933247e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_510.wav', 'Sie würde ihr letztes Hemd geben.', 34, array([ 1.5112071e-04, 9.9046929e-06, -7.1756775e-05, ...,\n", + " 1.4958363e-04, 2.2523174e-04, 4.5510088e-04], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_521.wav', 'Das wird eh nur Werbung sein.', 29, array([-0.00043494, -0.00045403, -0.00052693, ..., -0.00037776,\n", + " -0.00013905, -0.00029146], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_539.wav', 'Jetzt gibt es Kloppe.', 21, array([ 5.6757370e-05, 1.2752461e-05, -1.0132902e-04, ...,\n", + " -2.8363563e-04, -4.8957689e-04, -4.9631519e-04], dtype=float32), 1.4666666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_549.wav', 'Nee, lieber nicht.', 18, array([-6.2041539e-03, -6.1025852e-03, -5.7721483e-03, ...,\n", + " -4.7201215e-06, -8.9430447e-05, -4.9632461e-05], dtype=float32), 
1.5626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_563.wav', 'Er soll schlieÃ\\x9flich etwas lernen.', 34, array([-5.03349729e-05, -2.22053477e-05, 5.14282438e-05, ...,\n", + " 1.08890556e-04, 3.83222614e-05, 6.10036659e-05], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_566.wav', 'Angeblich ja.', 13, array([ 1.7242544e-04, 1.8572621e-04, 1.3631192e-04, ...,\n", + " -4.0973751e-05, -1.5965881e-04, -1.0953719e-04], dtype=float32), 1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_567.wav', 'Wie wäre es mit Wiesbaden?', 27, array([-9.5517004e-05, -2.3826263e-04, -1.0132407e-04, ...,\n", + " 4.5667308e-05, 1.4000830e-04, 2.1524900e-05], dtype=float32), 1.9093333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_576.wav', 'Hört mal zu, ihr Checker!', 26, array([-0.00049925, -0.00049119, -0.00044878, ..., 0.00019171,\n", + " 0.00023476, 0.00022403], dtype=float32), 1.7013333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_580.wav', \"Irgendwann wird's langweilig.\", 29, array([-0.00039041, -0.00038523, -0.00025343, ..., -0.00031044,\n", + " -0.00019142, -0.00014154], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_586.wav', 'Spuck ihn wieder aus!', 21, array([ 0.00012375, 0.00025117, 0.0001871 , ..., -0.00021903,\n", + " -0.00034992, -0.00024192], dtype=float32), 1.712)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_589.wav', 'Unterschätze den Knirps nicht.', 31, array([2.5606243e-04, 2.5400775e-04, 2.3841709e-04, ..., 2.1033855e-05,\n", + " 1.9420990e-04, 1.0694992e-04], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_596.wav', 'Darf ich vorkosten?', 19, array([-1.3477511e-04, -2.3315112e-04, 1.3153857e-05, ...,\n", + " 1.0751128e-04, 1.8084023e-04, 1.6106233e-04], dtype=float32), 
1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_597.wav', 'Ich traue mich nicht!', 21, array([-2.9329595e-04, -3.9892262e-04, -2.9478277e-04, ...,\n", + " -1.0763263e-04, 1.1553553e-04, 7.1091476e-05], dtype=float32), 1.4506666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_602.wav', 'Warum seid ihr so leise?', 24, array([ 2.9226076e-05, 1.6949150e-04, 1.3950269e-04, ...,\n", + " 2.4965027e-05, 7.3044146e-05, -1.8916466e-05], dtype=float32), 1.5786666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_603.wav', 'Nun stellt euch nicht so an!', 28, array([1.4806543e-04, 1.4012858e-04, 7.7195640e-05, ..., 1.4235765e-04,\n", + " 1.3738184e-04, 1.3289873e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_608.wav', 'Das Essen wird kalt.', 20, array([ 2.36780070e-05, -1.06394495e-04, -1.18256241e-04, ...,\n", + " 8.05624004e-05, -4.60968913e-05, -8.52375670e-05], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_614.wav', 'Fachidioten soll es auch geben.', 31, array([ 7.9924423e-05, 2.0709680e-04, -6.6771558e-05, ...,\n", + " 2.4189356e-05, 6.7659719e-05, -2.3424522e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_627.wav', 'Du bist vielleicht eine Knalltüte!', 35, array([ 1.7171216e-04, -3.8676033e-05, -8.2237340e-05, ...,\n", + " -1.8530877e-04, -1.3380373e-04, -1.6169780e-04], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_629.wav', 'Natürlich nicht seine eigene.', 30, array([-2.2751655e-04, -1.5005520e-04, -9.8528086e-05, ...,\n", + " 1.8771169e-04, 2.7484499e-04, 3.0332521e-04], dtype=float32), 1.8026666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_630.wav', 'Halten Sie die Presse zurück!', 30, array([ 3.1129293e-06, 7.3669260e-05, 3.3459681e-05, ...,\n", 
+ " -1.5276406e-04, 2.6472675e-05, -1.9852230e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_636.wav', 'Ruf schnell die Polizei!', 24, array([5.1400399e-05, 6.7014749e-05, 5.1501669e-05, ..., 1.8976731e-04,\n", + " 2.0147586e-04, 1.5075490e-04], dtype=float32), 1.5573333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_637.wav', 'Dann nimmt man sie sich.', 24, array([-0.00050762, -0.00047607, -0.00053025, ..., 0.00035113,\n", + " 0.00017673, 0.00026363], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_674.wav', 'Gibst du mir deine Nummer?', 26, array([-1.0660516e-04, -1.8238377e-05, 9.7913333e-05, ...,\n", + " 3.0329258e-05, 9.0803427e-05, 2.0600615e-05], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_675.wav', 'Man kann nicht alles haben.', 27, array([ 3.6246947e-04, 3.3836463e-04, 3.9515106e-04, ...,\n", + " 1.9603693e-05, -1.0797187e-07, 4.7195343e-05], dtype=float32), 1.696)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_686.wav', 'Wie oft denn noch?', 18, array([-0.00025807, -0.00045327, -0.00041516, ..., -0.00053778,\n", + " -0.00065512, -0.00057833], dtype=float32), 1.2906666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_703.wav', 'Der Erste in was?', 17, array([3.7513164e-05, 2.3692524e-05, 9.2795723e-05, ..., 1.8559145e-04,\n", + " 8.4898209e-05, 1.3820640e-05], dtype=float32), 1.4323645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_720.wav', 'Wie denn nun?', 13, array([-7.8975081e-06, -2.1718148e-05, 2.7641279e-05, ...,\n", + " 3.3564411e-05, 3.3564411e-05, 1.9743769e-05], dtype=float32), 0.9525625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_722.wav', 'Ihm wäre das zu müÃ\\x9fig.', 25, array([ 5.1333802e-05, 6.3180065e-05, -1.3820640e-05, ...,\n", + " -1.9743769e-05, 3.9487541e-06, -4.7385049e-05], dtype=float32), 
1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_724.wav', 'Ã\\x96l ist ausgelaufen.', 20, array([-3.7513164e-05, -7.8975081e-06, -1.5795016e-05, ...,\n", + " -1.3820640e-05, -1.3820640e-05, 4.5410670e-05], dtype=float32), 1.6087708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_730.wav', 'Willkommen im Neuland!', 22, array([-6.910320e-05, -6.515444e-05, 1.382064e-05, ..., -3.356441e-05,\n", + " -1.974377e-06, 8.489821e-05], dtype=float32), 1.6652083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_737.wav', 'Kannst du mich mal zwicken?', 27, array([ 3.9487539e-05, 3.9487541e-06, 3.3564411e-05, ...,\n", + " -1.3820640e-05, -3.1590032e-05, 5.9231312e-05], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_752.wav', 'Friede sei mit dir.', 19, array([-0.00018362, -0.00025075, -0.00027839, ..., -0.00025864,\n", + " -0.0002389 , -0.00026457], dtype=float32), 1.2347916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_753.wav', 'Mit Speck fängt man Mäuse.', 28, array([-1.61898919e-04, -1.04641986e-04, -8.68725911e-05, ...,\n", + " -5.92313118e-05, 6.31800649e-05, 7.70007027e-05], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_766.wav', 'Bin ich die Auskunft oder was?', 30, array([2.96156559e-05, 1.04641986e-04, 1.26360130e-04, ...,\n", + " 2.46797135e-04, 2.94182188e-04, 3.25772213e-04], dtype=float32), 1.99684375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_767.wav', 'Sesam, öffne dich!', 19, array([-3.8500351e-04, -3.3366971e-04, -3.5933661e-04, ...,\n", + " -5.9231312e-05, -2.3692524e-05, 2.9615656e-05], dtype=float32), 1.4253125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_772.wav', 'Er kennt seine Pappenheimer.', 28, array([-3.7513164e-05, -1.9743769e-05, -1.3820640e-05, ...,\n", + " -8.6872591e-05, -1.5202703e-04, 
-1.7177081e-04], dtype=float32), 1.7146145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_792.wav', 'Da geht noch was.', 17, array([ 2.0336083e-04, 1.6979642e-04, 1.6189892e-04, ...,\n", + " -4.9359427e-05, -2.9615656e-05, -7.3051953e-05], dtype=float32), 1.25596875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_803.wav', 'Er macht es eben gründlich.', 28, array([-5.5282559e-05, -8.2923834e-05, 1.9743769e-05, ...,\n", + " -9.4770097e-05, -1.8361707e-04, -2.5469463e-04], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_816.wav', 'Spionierst du mich aus?', 23, array([3.5538786e-04, 4.5015797e-04, 4.8767112e-04, ..., 4.3436296e-05,\n", + " 1.7769393e-04, 1.7769393e-04], dtype=float32), 1.7992708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_817.wav', 'Komm zurück!', 13, array([4.0672167e-04, 2.2902773e-04, 6.3180065e-05, ..., 3.7513164e-05,\n", + " 4.7385049e-05, 6.3180065e-05], dtype=float32), 1.11484375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_819.wav', 'Sie schwebt auf Wolke sieben.', 29, array([7.5026328e-05, 1.2438576e-04, 1.5005266e-04, ..., 1.1056512e-04,\n", + " 1.4215514e-04, 1.3820639e-04], dtype=float32), 1.9756770833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_826.wav', 'Wehret den Anfängen!', 21, array([ 1.4610391e-04, 1.3425764e-04, 1.2636013e-04, ...,\n", + " -5.9231311e-06, -1.5795016e-05, -2.9615656e-05], dtype=float32), 1.8486666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_834.wav', 'Altes Haus, lass dich drücken!', 31, array([1.75719557e-04, 1.63873294e-04, 8.88469658e-05, ...,\n", + " 1.04641986e-04, 2.15207096e-04, 1.46103906e-04], dtype=float32), 1.9333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_837.wav', 'Nicht nötig.', 13, array([-1.6189892e-04, -7.7000703e-05, -5.7256933e-05, ...,\n", + " 3.5538786e-05, 4.5410670e-05, 
1.9743769e-05], dtype=float32), 1.2277395833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_838.wav', 'Wir sind eine Familie.', 22, array([-1.2241138e-04, -1.5992454e-04, -2.3100211e-04, ...,\n", + " 7.3051953e-05, 5.9231312e-05, 6.9103196e-05], dtype=float32), 1.7146041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_842.wav', 'Was schlagen Sie vor?', 21, array([ 3.1590032e-05, 3.5538786e-05, 4.9359427e-05, ...,\n", + " -8.6872591e-05, -6.1205690e-05, -1.2438576e-04], dtype=float32), 1.3406458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_844.wav', 'Probier mal!', 12, array([ 1.4018077e-04, 1.6782204e-04, 2.2902773e-04, ...,\n", + " -2.1718148e-05, 4.9359427e-05, 7.3051953e-05], dtype=float32), 1.0583958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_855.wav', 'Der Schein trügt.', 18, array([ 1.3228325e-04, 4.3436296e-05, 9.8718847e-06, ...,\n", + " 7.5026328e-05, 7.8975081e-06, -3.9487541e-06], dtype=float32), 1.45353125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_861.wav', 'Du hast mich nie geliebt.', 25, array([ 1.02667604e-04, 1.57950155e-04, 1.50052656e-04, ...,\n", + " -2.17181478e-05, 2.76412793e-05, 0.00000000e+00], dtype=float32), 1.7146041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_872.wav', 'Chili ist scharf.', 17, array([-1.1253949e-04, -8.6872591e-05, -1.1648824e-04, ...,\n", + " -1.1846262e-04, -2.5666901e-05, 1.9743770e-06], dtype=float32), 1.7710520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_874.wav', 'Das lässt mich kalt.', 21, array([ 2.1718148e-05, 3.3564411e-05, 5.3308180e-05, ...,\n", + " -1.1846262e-05, -1.9743769e-05, -7.3051953e-05], dtype=float32), 1.5805416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_12_FINAL/12_881.wav', 'Kinder brauchen Helden.', 23, array([-1.8361707e-04, -1.4610391e-04, -1.1846262e-04, ...,\n", + " -1.9743770e-06, 
-2.7641279e-05, 5.9231312e-05], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_3.wav', 'Voll der gute Vergleich!', 24, array([-3.94875406e-06, -1.08590735e-04, -1.40180768e-04, ...,\n", + " 3.94875387e-05, 1.12539492e-04, 1.16488241e-04], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_14.wav', 'Gibt es das überhaupt?', 23, array([-1.0069323e-04, -1.5202703e-04, -1.8164268e-04, ...,\n", + " -6.9103196e-05, -3.9487539e-05, -6.5154440e-05], dtype=float32), 1.5523125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_35.wav', 'Bleib wachsam.', 14, array([-1.5597578e-04, -1.4807828e-04, -3.1590032e-05, ...,\n", + " -1.9743770e-06, -5.9231311e-06, 4.5410670e-05], dtype=float32), 1.2983020833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_43.wav', 'Jeder hat das Recht auf Bildung.', 32, array([ 5.72569334e-05, 1.04641986e-04, 1.89540195e-04, ...,\n", + " -7.50263280e-05, -5.92313118e-05, -1.14513867e-04], dtype=float32), 1.8204479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_64.wav', 'Nur nicht politisch werden!', 27, array([-7.8975081e-06, 8.2923834e-05, 1.3425764e-04, ...,\n", + " -8.0949460e-05, -6.3180065e-05, -1.3623202e-04], dtype=float32), 1.6652083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_67.wav', 'Wir sprechen uns später noch mal.', 34, array([ 6.8037030e-03, 6.8649091e-03, 7.0327311e-03, ...,\n", + " 5.9231311e-06, -3.1590032e-05, -1.5795016e-05], dtype=float32), 1.9051145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_69.wav', 'Wem gehört welcher Becher?', 27, array([ 6.0810812e-04, 1.8756582e-04, 8.8846966e-05, ...,\n", + " 8.6872591e-05, -1.5795016e-05, -2.1323272e-04], dtype=float32), 1.7498854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_74.wav', 'Was kann der Arbeiter dafür?', 29, array([ 6.71288217e-05, 
7.89750775e-05, 1.02667604e-04, ...,\n", + " -5.52825586e-05, -2.56669009e-05, -1.57950162e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_94.wav', 'Wir möchten abreisen.', 22, array([-1.2636013e-04, -7.3051953e-05, -7.7000703e-05, ...,\n", + " -3.1590032e-05, -4.1461917e-05, -1.7769393e-05], dtype=float32), 1.7075520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_97.wav', 'Halbe Fahrt voraus!', 19, array([ 5.3308180e-05, 2.7641279e-05, -1.1253949e-04, ...,\n", + " -7.8975081e-06, 1.9743769e-05, 7.3051953e-05], dtype=float32), 1.5382083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_101.wav', 'Gute Wahl!', 10, array([-5.3308180e-05, -4.1461917e-05, -4.3436296e-05, ...,\n", + " 1.9743769e-05, 2.5666901e-05, -1.9743769e-05], dtype=float32), 0.8608333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_111.wav', 'Ich kenne den doch gar nicht!', 29, array([ 4.9359427e-05, 3.5538786e-05, 6.9103196e-05, ...,\n", + " -2.7641279e-05, 1.3228325e-04, 7.7000703e-05], dtype=float32), 1.98978125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_135.wav', 'Die Hände auf den Rücken!', 27, array([-7.7000703e-05, -5.1333802e-05, -7.1077571e-05, ...,\n", + " -2.7641279e-05, -4.1461917e-05, 1.7769393e-05], dtype=float32), 1.6087604166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_142.wav', 'Am Deal wird nichts geändert.', 30, array([1.4412952e-04, 1.6979642e-04, 1.7571956e-04, ..., 4.5410670e-05,\n", + " 5.7256933e-05, 6.1205690e-05], dtype=float32), 1.9051145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_148.wav', 'Das ist eine Wucht.', 19, array([-4.93594271e-05, -1.57950155e-04, -1.08590735e-04, ...,\n", + " 2.44822761e-04, 1.61898919e-04, 1.16488241e-04], dtype=float32), 1.58053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_152.wav', 'Renitent!', 9, 
array([2.8233591e-04, 2.6061776e-04, 2.2902773e-04, ..., 1.5795015e-04,\n", + " 1.5202703e-04, 2.9615656e-05], dtype=float32), 1.3124166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_161.wav', 'Ist noch Kaffee da?', 19, array([-1.46103906e-04, -6.91031964e-05, -1.02667604e-04, ...,\n", + " -7.89750775e-05, -2.17181478e-05, 7.89750811e-06], dtype=float32), 1.6369895833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_166.wav', 'Da werden Erinnerungen wach.', 28, array([ 2.1718148e-05, 1.9743769e-05, -9.8718854e-05, ...,\n", + " 8.4898209e-05, 9.2795723e-05, 1.1846262e-05], dtype=float32), 1.7922291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_175.wav', 'Suchen Sie die Herausforderung?', 31, array([-1.4215514e-04, -9.4770097e-05, -1.2833450e-04, ...,\n", + " -4.5410670e-05, -8.2923834e-05, -6.9103196e-05], dtype=float32), 1.764)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_213.wav', 'Kommt ihr mit zur Demo?', 23, array([-7.3051953e-05, -3.7513164e-05, -6.3180065e-05, ...,\n", + " 6.1205690e-05, 1.2241138e-04, 1.4807828e-04], dtype=float32), 1.7781041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_214.wav', 'Was sagt er?', 12, array([-2.6456651e-04, -2.2507898e-04, -2.0928397e-04, ...,\n", + " 4.3436296e-05, 8.0949460e-05, 1.8164268e-04], dtype=float32), 1.622875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_215.wav', 'Ich will mehr Geld!', 19, array([-8.4898209e-05, -9.4770097e-05, -1.1451387e-04, ...,\n", + " -1.1056512e-04, -8.2923834e-05, -1.1846262e-04], dtype=float32), 1.5664270833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_219.wav', 'Du bist überstimmt.', 20, array([ 1.04641986e-04, 6.91031964e-05, 2.76412793e-05, ...,\n", + " -1.02667604e-04, -2.58643384e-04, -2.05335207e-04], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_242.wav', 'Rutsch mir 
doch den Buckel runter.', 34, array([-5.7256933e-05, -3.9487541e-06, 4.5410670e-05, ...,\n", + " 1.6979642e-04, 7.5026328e-05, -1.5795016e-05], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_280.wav', 'Und ab dafür!', 14, array([ 7.1077571e-05, 1.1056512e-04, 2.0138646e-04, ...,\n", + " -4.3436296e-05, 2.7641279e-05, -6.9103196e-05], dtype=float32), 1.2030416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_283.wav', 'Er meint den Doppeldecker.', 26, array([-8.0949460e-05, -7.7000703e-05, -2.9615656e-05, ...,\n", + " -1.2833450e-04, -8.0949460e-05, -1.8164268e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_284.wav', 'Oder spricht etwas dagegen?', 27, array([ 2.0533521e-04, 1.4215514e-04, 1.4018077e-04, ...,\n", + " -1.3820639e-04, -7.8975077e-05, -1.6584767e-04], dtype=float32), 1.7851666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_288.wav', 'Auf zu neuen Ufern!', 19, array([ 3.5736224e-04, 4.6990174e-04, 6.1798003e-04, ...,\n", + " 9.2795723e-05, 2.1718148e-05, -4.9359427e-05], dtype=float32), 1.7216666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_291.wav', 'Kostprobe gefällig?', 20, array([-1.7571956e-04, -2.3889962e-04, -1.9348894e-04, ...,\n", + " -2.5864338e-04, -1.6584767e-04, -2.9615656e-05], dtype=float32), 1.4182604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_299.wav', 'Der Wein muss noch atmen.', 25, array([-3.5341349e-04, -2.4482276e-04, -2.2705336e-04, ...,\n", + " -6.1205690e-05, 5.9231311e-06, 4.5410670e-05], dtype=float32), 1.9333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_300.wav', 'Das ist nichts Ernstes.', 23, array([1.5597578e-04, 1.7177081e-04, 6.1205690e-05, ..., 2.7641279e-05,\n", + " 3.1590032e-05, 4.9359427e-05], dtype=float32), 1.9121666666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_311.wav', 'Nee, lass mal stecken.', 22, array([ 2.3692524e-05, 3.1590032e-05, -4.7385049e-05, ...,\n", + " 3.8105476e-04, 4.1264479e-04, 6.8313448e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_321.wav', 'Ha, das war die Rache!', 22, array([-1.6979642e-04, 3.3564411e-05, 1.1056512e-04, ...,\n", + " 1.6387329e-04, 2.7048966e-04, 2.0533521e-04], dtype=float32), 1.7922291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_326.wav', 'Eigentlich ist es logisch.', 26, array([ 7.3051953e-05, 3.9487541e-06, 2.5666901e-05, ...,\n", + " -1.5795016e-05, -7.1077571e-05, 7.8975081e-06], dtype=float32), 1.7075520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_351.wav', 'Der Wein muss atmen können.', 28, array([2.9615656e-05, 4.3436296e-05, 8.0949460e-05, ..., 4.7385049e-05,\n", + " 1.7769393e-05, 1.9743770e-06], dtype=float32), 1.8063229166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_354.wav', 'Mieter haben Rechte.', 20, array([-1.5795016e-05, -9.8718847e-06, 3.3564411e-05, ...,\n", + " -2.1520710e-04, -1.5992454e-04, -4.5410670e-05], dtype=float32), 1.7216666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_369.wav', 'Was für eine Erkenntnis!', 25, array([-1.02667604e-04, -8.68725911e-05, -4.73850487e-05, ...,\n", + " 3.35644108e-05, 7.70007027e-05, 8.68725911e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_371.wav', 'Ich schieÃ\\x9fe mit rechts.', 24, array([ 1.3623202e-04, 7.8975077e-05, 4.3436296e-05, ...,\n", + " -1.1056512e-04, -1.1451387e-04, -7.3051953e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_376.wav', 'Ist Scooter nicht eine Band?', 28, array([ 3.5538786e-05, 0.0000000e+00, -5.9231311e-06, ...,\n", + " -6.3180065e-05, -1.3820639e-04, -1.2043700e-04], 
dtype=float32), 1.9474479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_387.wav', 'Wir sind Dickhäuter.', 21, array([1.52027031e-04, 1.12539492e-04, 1.02667604e-04, ...,\n", + " 1.38206397e-05, 5.92313108e-06, 8.09494595e-05], dtype=float32), 1.5946458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_389.wav', 'Sei nicht so streng mit ihm!', 28, array([-3.5538786e-05, 1.7769393e-05, 7.1077571e-05, ...,\n", + " -1.1451387e-04, -1.6189892e-04, -2.0928397e-04], dtype=float32), 1.9474479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_390.wav', 'Na ja, was willst du machen?', 28, array([-1.3820640e-05, -4.1461917e-05, -4.5410670e-05, ...,\n", + " -9.0821341e-05, -1.1846262e-05, -4.3436296e-05], dtype=float32), 1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_419.wav', 'Die Einschläge kommen näher.', 30, array([ 7.5026328e-05, 5.5282559e-05, 1.5597578e-04, ...,\n", + " 3.1590032e-05, 2.1718148e-05, -4.7385049e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_420.wav', 'Willst du mit mir gehen?', 24, array([-1.2438576e-04, -1.9546332e-04, -1.6782204e-04, ...,\n", + " -3.7513164e-05, -1.0661636e-04, 7.7000703e-05], dtype=float32), 1.8204479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_426.wav', 'Hier bitte eine Unterschrift.', 29, array([1.6979642e-04, 1.8361707e-04, 1.7177081e-04, ..., 1.5202703e-04,\n", + " 2.1718148e-05, 0.0000000e+00], dtype=float32), 1.891)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_430.wav', 'Zum Glück nicht.', 17, array([-6.7128822e-05, -9.8718854e-05, -3.1590032e-05, ...,\n", + " -7.3051953e-05, -9.4770097e-05, -1.1056512e-04], dtype=float32), 1.1571770833333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_437.wav', 'Einfach nur top!', 16, array([-3.9684979e-04, -4.2646544e-04, -4.0277292e-04, ...,\n", + " 
-2.4087400e-04, -3.7513164e-05, -1.4412952e-04], dtype=float32), 1.35475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_439.wav', 'Mach dir nichts daraus.', 23, array([ 4.2843982e-04, 5.0938927e-04, 4.6595297e-04, ...,\n", + " 7.8975081e-06, -3.1590032e-05, 2.5666901e-05], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_443.wav', 'Lauf doch nicht immer durchs Bild!', 34, array([-3.5538786e-04, -1.7769393e-04, -1.1451387e-04, ...,\n", + " -3.9487539e-05, -4.3436296e-05, -3.9487539e-05], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_464.wav', 'Hände hoch!', 12, array([-2.0138646e-04, -1.3425764e-04, -8.0949460e-05, ...,\n", + " 2.0138646e-04, 1.8756582e-04, 2.6061776e-04], dtype=float32), 1.04428125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_475.wav', 'Was weiÃ\\x9f ich denn?', 19, array([-2.7641279e-05, -1.9743770e-06, 8.2923834e-05, ...,\n", + " 7.3051953e-05, 9.8718854e-05, -4.9359427e-05], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_487.wav', 'Ich will noch nicht ins Bett!', 29, array([ 5.7256933e-05, -7.8975081e-06, 1.7769393e-05, ...,\n", + " -2.9615656e-05, -1.1846262e-05, 2.5666901e-05], dtype=float32), 1.9615625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_499.wav', 'Darüber kann man streiten.', 27, array([ 0.00011846, 0.00020534, 0.00027839, ..., -0.00031195,\n", + " -0.00021521, -0.00017769], dtype=float32), 1.8486666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_500.wav', 'Dazu braucht man Ruhe.', 22, array([-1.1056512e-04, -1.4610391e-04, -1.3425764e-04, ...,\n", + " 6.9103196e-05, 1.6189892e-04, 2.2507898e-04], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_524.wav', 'Das Zeug ist wirklich gut.', 26, array([-3.9487541e-06, 3.5538786e-05, -9.8718847e-06, ...,\n", + 
" 1.1846262e-05, 1.9743769e-05, 9.8718847e-06], dtype=float32), 1.891)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_539.wav', 'Betäubungsgewehr geladen!', 26, array([ 3.5538786e-05, 2.3692524e-05, 0.0000000e+00, ...,\n", + " -9.2795723e-05, -1.9151457e-04, -1.8756582e-04], dtype=float32), 1.8768854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_547.wav', 'Was wird wie geschrieben?', 25, array([-0.00012636, -0.00020336, -0.0002231 , ..., 0.00021521,\n", + " 0.00020336, 0.0001619 ], dtype=float32), 1.8768854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_569.wav', 'Nicht schon wieder, bitte.', 26, array([ 0.00035736, 0.00043436, 0.00037316, ..., -0.00013821,\n", + " -0.00013031, -0.0001619 ], dtype=float32), 1.5523229166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_582.wav', 'Bist du blind?', 14, array([ 1.9743769e-05, -1.5795016e-05, -5.7256933e-05, ...,\n", + " 0.0000000e+00, 8.6872591e-05, 4.5410670e-05], dtype=float32), 1.2841875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_592.wav', 'Blinzeln zählt nicht.', 22, array([ 5.9231311e-06, -4.5410670e-05, -9.8718854e-05, ...,\n", + " 1.6387329e-04, 1.3820639e-04, 7.1077571e-05], dtype=float32), 1.8768958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_595.wav', 'Ja, warum denn bitte schön nicht?', 34, array([-0.00036329, -0.00033959, -0.00036131, ..., -0.00016585,\n", + " -0.00021521, -0.0001619 ], dtype=float32), 1.93334375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_597.wav', 'Mir wäre das peinlich.', 23, array([ 5.9231312e-05, 1.2241138e-04, 7.5026328e-05, ...,\n", + " -1.5795016e-05, -8.2923834e-05, -6.7128822e-05], dtype=float32), 1.8345520833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_637.wav', 'So kann es gehen.', 17, array([-7.8975081e-06, 3.9487541e-06, 4.1461917e-05, ...,\n", + " 5.1333802e-05, 1.3030888e-04, 
3.9487539e-05], dtype=float32), 1.3688645833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_642.wav', 'Es bleibt spannend.', 19, array([1.8559145e-04, 1.8559145e-04, 1.5597578e-04, ..., 7.3051953e-05,\n", + " 5.7256933e-05, 1.1451387e-04], dtype=float32), 1.559375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_647.wav', 'Marek will noch mal.', 20, array([-2.44822761e-04, -1.04641986e-04, -8.09494595e-05, ...,\n", + " 1.48078281e-04, 1.81642681e-04, 2.50745885e-04], dtype=float32), 1.79221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_661.wav', 'Ruhig Brauner!', 14, array([-1.02667604e-04, -4.73850487e-05, 8.09494595e-05, ...,\n", + " -9.87188469e-06, -8.88469658e-05, -1.12539492e-04], dtype=float32), 1.2771354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_682.wav', 'Meine Rede!', 11, array([5.9428747e-04, 5.0544052e-04, 2.0730958e-04, ..., 7.5026328e-05,\n", + " 6.5154440e-05, 6.5154440e-05], dtype=float32), 1.0725104166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_685.wav', 'Was versprichst du dir davon?', 29, array([-1.1846262e-05, -9.8718847e-06, 4.3436296e-05, ...,\n", + " -2.3692524e-05, 1.9743770e-06, 2.7641279e-05], dtype=float32), 1.7710520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_693.wav', 'Ich nehme euch alle.', 20, array([-3.1590032e-05, -5.9231311e-06, -7.5026328e-05, ...,\n", + " -8.8846966e-05, -7.3051953e-05, -5.1333802e-05], dtype=float32), 1.7851666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_711.wav', 'Warum nämlich?', 15, array([ 5.9231312e-05, 5.9231312e-05, 3.1590032e-05, ...,\n", + " 1.1846262e-05, -5.9231311e-06, -7.5026328e-05], dtype=float32), 1.3688645833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_712.wav', 'Das hätte ich beinahe vergessen.', 33, array([-2.1125835e-04, -2.4482276e-04, -1.4610391e-04, ...,\n", + " 9.0821341e-05, 
1.7966831e-04, 1.0661636e-04], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_726.wav', 'Möchtest du auch einen Muffin?', 31, array([-3.9487539e-05, -2.7641279e-05, 6.3180065e-05, ...,\n", + " 1.7769393e-05, 6.7128822e-05, 7.1077571e-05], dtype=float32), 1.9545104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_727.wav', 'Es hat nichts mit dir zu tun.', 29, array([-1.6584767e-04, -1.9348894e-04, -2.7641279e-04, ...,\n", + " 6.5154440e-05, 4.3436296e-05, 1.2438576e-04], dtype=float32), 1.7569479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_732.wav', 'Vielleicht war ich etwas vorschnell.', 36, array([1.7177081e-04, 1.6584767e-04, 8.6872591e-05, ..., 1.9546332e-04,\n", + " 1.8954019e-04, 1.5597578e-04], dtype=float32), 1.9192291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_735.wav', 'Hatschi!', 8, array([ 1.2043700e-04, -1.7769393e-05, -1.9743770e-06, ...,\n", + " -1.1846262e-05, -4.5410670e-05, -7.7000703e-05], dtype=float32), 0.8114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_739.wav', 'Ich bleibe dabei.', 17, array([-2.0533521e-04, -1.2438576e-04, -5.5282559e-05, ...,\n", + " 4.5410670e-05, -1.3820640e-05, -7.7000703e-05], dtype=float32), 1.2418541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_762.wav', 'Nicht zu fassen!', 16, array([1.7414006e-03, 1.4353720e-03, 9.6547039e-04, ..., 6.3180065e-05,\n", + " 1.8164268e-04, 8.0949460e-05], dtype=float32), 1.1712916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_773.wav', 'Gute Besserung!', 15, array([-5.1333802e-05, 0.0000000e+00, 2.1718148e-05, ...,\n", + " -1.2636013e-04, -1.9546332e-04, -1.4215514e-04], dtype=float32), 1.2771354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_775.wav', 'Ja, so ist es wohl.', 19, array([-1.61898919e-04, 1.97437694e-05, 1.02667604e-04, ...,\n", + 
" -6.51544397e-05, -1.26360130e-04, -6.71288217e-05], dtype=float32), 1.44646875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_778.wav', 'Mich selbst hat das überrascht.', 32, array([7.7000703e-05, 1.1846262e-04, 1.2241138e-04, ..., 1.3820639e-04,\n", + " 9.8718847e-06, 1.3820640e-05], dtype=float32), 1.8275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_782.wav', 'Wer kennt das nicht?', 20, array([-2.7641279e-05, 7.8975081e-06, -3.7513164e-05, ...,\n", + " -2.3297648e-04, -2.2902773e-04, -2.4087400e-04], dtype=float32), 1.72871875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_787.wav', 'Ich liebe diese Musik!', 22, array([-1.8361707e-04, -6.9103196e-05, -9.0821341e-05, ...,\n", + " 5.6862057e-04, 6.2587752e-04, 5.3110742e-04], dtype=float32), 1.8580729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_800.wav', 'Na endlich!', 11, array([-1.1846262e-04, -1.5202703e-04, -8.4898209e-05, ...,\n", + " 9.0821341e-05, -9.0821341e-05, -7.8975081e-06], dtype=float32), 0.91021875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_805.wav', 'Juliane gruselt sich.', 21, array([1.3425764e-04, 7.1077571e-05, 6.5154440e-05, ..., 9.8718854e-05,\n", + " 8.6872591e-05, 5.1333802e-05], dtype=float32), 1.86278125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_808.wav', 'Der andere nimmt.', 17, array([-8.6872591e-05, -1.1451387e-04, -8.2923834e-05, ...,\n", + " 2.5666901e-05, -7.3051953e-05, -7.5026328e-05], dtype=float32), 1.52409375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_814.wav', 'Wieso ich?', 10, array([-1.14513867e-04, -1.02667604e-04, -1.77693932e-04, ...,\n", + " -1.18462622e-05, 0.00000000e+00, 1.38206397e-05], dtype=float32), 0.9031666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_816.wav', 'Die Haare müssen ab.', 21, array([ 1.9546332e-04, 1.2636013e-04, 2.1125835e-04, ...,\n", + " 9.8718847e-06, -4.1461917e-05, 
-5.5282559e-05], dtype=float32), 1.2065729166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_13_FINAL/13_844.wav', 'Die ganze Woche steht das schon an.', 35, array([ 1.0602404e-03, 1.1017023e-03, 9.0031594e-04, ...,\n", + " -3.3564411e-05, -3.5538786e-05, 0.0000000e+00], dtype=float32), 1.8839479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_7.wav', 'Meinen Respekt hast du.', 23, array([-8.1613541e-07, 3.6258320e-05, 5.8615900e-05, ...,\n", + " -3.0361010e-05, 4.6051988e-05, 6.1613529e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_12.wav', 'Mein SchweiÃ\\x9f stinkt nicht.', 27, array([1.2758464e-03, 1.4472028e-03, 1.4819785e-03, ..., 1.1448720e-05,\n", + " 2.5002395e-05, 5.3266147e-05], dtype=float32), 1.872)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_24.wav', 'So sieht es jedenfalls aus.', 27, array([ 3.5462443e-05, -3.6511621e-05, -2.4387444e-05, ...,\n", + " 7.4399744e-05, 7.2159133e-07, 2.3660252e-05], dtype=float32), 1.808)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_31.wav', 'Es brennt lichterloh.', 21, array([-7.8527468e-05, -1.9054073e-04, -1.8275550e-04, ...,\n", + " -1.4771417e-05, 2.4868292e-05, -1.4910699e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_36.wav', 'Hat jemand Deo dabei?', 21, array([5.0298637e-05, 4.8803475e-05, 5.4532258e-05, ..., 3.4226623e-06,\n", + " 9.2322180e-06, 3.0618612e-05], dtype=float32), 1.7386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_44.wav', 'Der Hund will raus.', 19, array([-8.2374172e-05, -8.4805586e-05, -9.4096496e-05, ...,\n", + " 2.0108973e-05, 3.4747383e-05, -3.9627314e-05], dtype=float32), 1.5413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_46.wav', 'Nur Fliegen ist schöner.', 25, array([-2.5430196e-05, -6.4560918e-05, -6.8181558e-05, ...,\n", + " 6.0105547e-05, 
9.7991426e-05, 2.9888753e-05], dtype=float32), 1.6693333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_57.wav', 'Endlich wieder Nachschub!', 25, array([-3.0662410e-05, -3.7799236e-05, -1.0512020e-04, ...,\n", + " -1.2799338e-04, -3.7069469e-05, 3.4687200e-05], dtype=float32), 1.568)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_63.wav', \"Jetzt langt's dann aber.\", 24, array([ 1.3113129e-06, -5.7142366e-05, 3.9664551e-06, ...,\n", + " 4.8476216e-04, 4.0935431e-04, 5.0957059e-04], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_76.wav', 'Ist hier noch ein Platz frei?', 29, array([ 4.6084756e-06, 2.1333383e-06, 1.0840034e-05, ...,\n", + " 4.7717163e-05, -4.3301993e-06, 5.9024904e-07], dtype=float32), 1.7653333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_91.wav', 'Möchten Sie durch?', 19, array([5.3242915e-05, 1.1775635e-04, 9.1564674e-05, ..., 6.9772730e-05,\n", + " 3.2825061e-05, 5.5504606e-05], dtype=float32), 1.1786666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_94.wav', 'Du hast sie angemalt.', 21, array([-8.2009647e-06, -7.8560508e-05, -1.1781590e-04, ...,\n", + " 5.8809797e-05, 3.5827401e-05, -3.8682600e-05], dtype=float32), 1.5946666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_97.wav', 'Anfassen heiÃ\\x9ft kaufen.', 23, array([ 6.7132327e-04, 6.4567651e-04, 4.5344225e-04, ...,\n", + " -2.1742040e-05, -1.2411790e-04, -3.8199389e-05], dtype=float32), 1.472)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_104.wav', 'Warum nicht lieber hier?', 24, array([-1.0701143e-05, -1.5738879e-06, 6.8153045e-06, ...,\n", + " -6.3156702e-05, -1.6941859e-04, -6.0139148e-05], dtype=float32), 1.4986666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_112.wav', 'Das war ein Abenteuer.', 22, array([ 2.6408197e-05, -6.0915321e-05, -9.1295704e-05, ...,\n", + 
" -5.6715970e-05, -3.1489210e-05, 1.5612791e-06], dtype=float32), 1.9466666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_127.wav', 'Das wäre fatal.', 16, array([ 4.4660061e-05, -6.5924425e-05, -5.6830704e-05, ...,\n", + " -5.5352357e-06, 3.0260082e-05, 9.7271128e-05], dtype=float32), 1.4666666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_141.wav', 'Nicht doch!', 11, array([-1.4546166e-04, -1.4626759e-04, -9.7611184e-05, ...,\n", + " 9.3360104e-05, 3.5025540e-05, -1.6926177e-06], dtype=float32), 0.928)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_146.wav', 'Heiliger Strohsack!', 19, array([-3.7175673e-04, -2.1206291e-04, -8.9090288e-05, ...,\n", + " 1.0547445e-04, 1.0614831e-04, 5.8346381e-05], dtype=float32), 1.376)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_153.wav', 'Gehen wir in die Eisdiele?', 26, array([-3.72752729e-05, -6.43968451e-05, -1.19852075e-05, ...,\n", + " 6.90084271e-05, -1.81738214e-05, -2.24471933e-05], dtype=float32), 1.4826666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_158.wav', 'Das ist halt so.', 16, array([ 2.1661433e-05, -9.2656213e-05, -2.0038491e-05, ...,\n", + " 3.4980503e-06, 8.1309692e-05, -1.6156602e-05], dtype=float32), 1.2853333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_174.wav', 'Ich habe dich noch nie gesehen.', 31, array([ 1.68298247e-05, 2.35711445e-06, -1.13152724e-04, ...,\n", + " -5.31522637e-05, 5.38938584e-05, 1.89053408e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_179.wav', 'Das muss hart für dich sein.', 29, array([-9.2038817e-06, -9.7612574e-06, -6.3460277e-05, ...,\n", + " -5.0950723e-05, 2.0168585e-05, -1.5738755e-05], dtype=float32), 1.5893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_183.wav', \"Packen wir's!\", 13, array([-2.2114466e-05, 6.0876686e-05, -8.3392551e-05, 
...,\n", + " 3.5826326e-06, -1.4385004e-05, -5.6348257e-05], dtype=float32), 0.9546666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_194.wav', 'Wir werden siegen!', 18, array([ 1.6911860e-04, 7.4598174e-05, 1.0261347e-04, ...,\n", + " 6.5378241e-05, 3.2076507e-06, -6.6169787e-06], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_197.wav', 'Darf ich mal bei dir abbeiÃ\\x9fen?', 31, array([-1.0340806e-05, 7.1646286e-06, 3.3313339e-05, ...,\n", + " -7.5323747e-05, -2.6892374e-07, -3.3816039e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_199.wav', 'Das ging aber fix!', 18, array([-9.3143040e-05, -4.3784836e-05, -1.1206182e-04, ...,\n", + " 8.7669920e-05, 1.0557293e-05, 4.2041685e-07], dtype=float32), 1.328)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_211.wav', 'Ich habe nachgedacht.', 21, array([ 5.0232731e-05, 1.2072114e-04, 1.8210443e-04, ...,\n", + " -6.5402834e-05, -5.1763345e-05, -6.0046054e-06], dtype=float32), 1.5093333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_229.wav', 'Wir lassen uns nicht erpressen.', 31, array([1.37981799e-04, 1.52958339e-04, 1.10953624e-04, ...,\n", + " 6.50644288e-05, 8.02592767e-05, 1.01248879e-04], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_236.wav', 'Sag du es mir.', 14, array([ 7.4462928e-06, -2.0409609e-05, -3.6314952e-05, ...,\n", + " -2.1986765e-05, -8.3042978e-05, 8.2145634e-06], dtype=float32), 1.216)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_240.wav', 'Ich vermisse ihn seit gestern.', 30, array([ 2.9365596e-04, 3.4678026e-04, 3.5397714e-04, ...,\n", + " -1.5735781e-05, -2.9272232e-05, 4.2558597e-05], dtype=float32), 1.9893333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_257.wav', 'So kannte ich sie gar nicht.', 28, array([ 4.4733344e-05, 
7.7341829e-05, 1.1480036e-04, ...,\n", + " -1.8965245e-04, -1.4387793e-04, -1.2223862e-04], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_258.wav', 'Dem Kind geht es gut.', 21, array([ 2.3389544e-05, -1.0488247e-05, 1.0429079e-05, ...,\n", + " -8.0030593e-05, -9.8967379e-05, -4.5314195e-05], dtype=float32), 1.3066666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_260.wav', 'Lasst es krachen!', 17, array([-2.1083563e-04, -8.3892046e-05, -3.2037347e-05, ...,\n", + " -6.8306355e-05, -1.3884228e-04, -6.5104126e-05], dtype=float32), 1.2)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_266.wav', 'Wie sehen Sie überhaupt aus?', 29, array([-1.0680479e-05, -1.9320854e-05, -7.0852952e-06, ...,\n", + " -1.0408241e-05, 3.3198389e-06, 2.1512881e-06], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_281.wav', 'Damit könnte es klappen.', 25, array([-2.3432081e-05, -2.4900844e-05, -1.3450766e-04, ...,\n", + " 2.1617279e-05, 3.1534404e-05, -2.2315735e-05], dtype=float32), 1.488)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_307.wav', 'Tut das Husten weh?', 19, array([ 9.1145994e-06, 1.5820089e-05, 5.0116945e-05, ...,\n", + " 1.9206882e-05, -2.6969181e-05, -2.7526901e-05], dtype=float32), 1.5626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_310.wav', 'Und jetzt kräftig kurbeln!', 27, array([-8.4867512e-05, -1.3528325e-05, 6.7344299e-05, ...,\n", + " -5.5355646e-05, 3.2757125e-05, -1.3706725e-05], dtype=float32), 1.968)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_311.wav', 'Und was bekommt man geboten?', 28, array([-9.42486338e-07, -6.20736901e-05, -1.13615904e-04, ...,\n", + " 1.05647247e-04, 4.75407724e-05, 7.68981190e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_313.wav', 'Nimm doch mal den Hut ab!', 25, 
array([-1.4411381e-06, 1.8580539e-04, 1.8933907e-04, ...,\n", + " -1.0257358e-04, -9.1900030e-05, -2.2193763e-04], dtype=float32), 1.5733333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_322.wav', 'Der ist sauber.', 15, array([1.3459381e-04, 1.1068168e-04, 1.4088971e-04, ..., 1.4206764e-04,\n", + " 1.0958829e-05, 9.0381429e-05], dtype=float32), 1.344)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_333.wav', 'Danke für nichts!', 18, array([-2.6258719e-04, -2.9124424e-04, -4.0630574e-04, ...,\n", + " 9.1923815e-05, -9.6123731e-06, 3.9555922e-05], dtype=float32), 1.408)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_351.wav', 'Hier ist sie.', 13, array([-3.23740860e-05, -1.03745086e-04, -6.84802653e-05, ...,\n", + " 6.36538107e-06, 6.47425259e-05, -2.68384956e-05], dtype=float32), 1.2693333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_354.wav', 'Ist sie international bekannt?', 30, array([ 1.5060005e-05, 5.7448578e-05, 1.3811006e-04, ...,\n", + " 6.0413648e-05, -4.7934391e-05, -1.9190535e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_363.wav', 'Ich meine ja nur.', 17, array([ 5.6321147e-05, 9.9655284e-05, -8.9936962e-05, ...,\n", + " 1.1549123e-05, 3.7268135e-05, 7.3645397e-06], dtype=float32), 1.1253333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_390.wav', 'Gib mal die Seriennummer durch.', 31, array([ 7.2849958e-05, 9.1718932e-05, 5.6555116e-05, ...,\n", + " -2.9702240e-05, 3.8465154e-05, 2.2035034e-05], dtype=float32), 1.9466666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_399.wav', 'Steht das Wasser auf dem Herd?', 30, array([6.5801214e-05, 1.3084775e-04, 8.1372353e-05, ..., 6.8494905e-05,\n", + " 2.1234882e-06, 2.7409065e-05], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_401.wav', 'Oh ja!', 6, array([ 2.2632883e-05, 
-2.7574149e-05, 2.7717488e-05, ...,\n", + " 2.9032512e-07, 1.7548422e-05, -1.3465881e-05], dtype=float32), 0.7146666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_409.wav', 'Ja oder nein?', 13, array([ 3.4988134e-05, -6.8858870e-05, -8.5955844e-06, ...,\n", + " -4.4800227e-06, 1.7184280e-05, 3.7901282e-05], dtype=float32), 1.4346666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_411.wav', 'Ist doch Jacke wie Hose.', 24, array([ 1.1507938e-04, 5.0565839e-05, -2.7287895e-05, ...,\n", + " 3.7775626e-05, -1.4040452e-05, 1.4159415e-06], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_413.wav', 'Ich habe es nie gelernt.', 24, array([ 2.58978853e-05, 6.50478396e-05, -1.03702390e-04, ...,\n", + " 8.01785427e-05, 3.00699157e-05, -1.05522995e-04], dtype=float32), 1.776)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_429.wav', 'Nicht schon wieder eine Razzia!', 31, array([-5.1378167e-05, -2.5352152e-05, -3.2764001e-05, ...,\n", + " 2.1145966e-05, 5.4651609e-05, -7.9359561e-05], dtype=float32), 1.888)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_431.wav', 'Niemand will es gewesen sein.', 29, array([6.13634029e-06, 1.00043821e-04, 1.26646410e-04, ...,\n", + " 4.00160025e-05, 6.57281998e-05, 1.20079676e-04], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_443.wav', 'Ihr seid doch bloÃ\\x9f neidisch.', 29, array([ 4.71922749e-06, -1.42986255e-05, 4.10590292e-05, ...,\n", + " -1.13690789e-04, -4.82848300e-05, 3.64537264e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_453.wav', 'Lesen lohnt sich.', 17, array([-1.1143904e-04, -9.7466742e-05, -1.4505965e-04, ...,\n", + " -1.1429377e-04, -8.0892445e-05, -8.6921274e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_458.wav', 'Oder er wurde dabei 
gestört.', 29, array([-1.8823694e-05, -3.1060394e-05, -9.3846960e-05, ...,\n", + " -1.2105788e-05, -3.4755056e-05, 3.5802004e-05], dtype=float32), 1.84)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_459.wav', 'Die Seele baumeln lassen.', 25, array([-4.6934008e-05, -1.4115409e-04, -1.9004452e-04, ...,\n", + " -4.7015623e-05, -2.2894224e-07, -4.3300730e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_468.wav', 'Der Nächste, bitte!', 20, array([ 8.1093880e-05, 2.9958397e-05, -3.9947310e-05, ...,\n", + " 6.6704742e-05, 1.2609754e-04, 1.1871241e-04], dtype=float32), 1.3386666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_469.wav', 'Wird schon schiefgehen.', 23, array([-1.8012641e-05, -6.1548446e-05, -1.2534855e-04, ...,\n", + " -2.9845067e-05, 3.1653948e-05, 1.2874776e-04], dtype=float32), 1.552)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_476.wav', 'Keine falsche Bewegung!', 23, array([-1.3065083e-04, -1.9577878e-04, -9.6719399e-05, ...,\n", + " 9.7838973e-05, -1.6546634e-05, 3.1119489e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_484.wav', 'Danach geht es ins Bett.', 24, array([1.4125947e-04, 1.4533960e-04, 1.3352933e-04, ..., 4.6569412e-06,\n", + " 8.5400243e-06, 1.0347654e-04], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_495.wav', 'Vorwärts immer, rückwarts nimmer!', 35, array([ 9.8868964e-05, 1.4638813e-04, 8.2029030e-05, ...,\n", + " 3.1947344e-05, -3.3244356e-05, -8.5653497e-05], dtype=float32), 1.5893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_508.wav', 'Ein Spanngurt ist gerissen.', 27, array([-1.3210842e-05, 5.2183852e-05, 1.1509426e-05, ...,\n", + " -6.6147322e-06, -1.3790486e-05, 4.0188141e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_511.wav', 
'Das musst du gerade sagen!', 26, array([ 8.16162283e-05, 1.48853534e-04, 1.20252385e-04, ...,\n", + " -2.43115683e-05, 3.36854064e-05, -3.11621625e-05], dtype=float32), 1.9893333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_517.wav', 'Lösen Sie das Captcha!', 23, array([-3.2288870e-05, 5.6598521e-05, 4.2188087e-05, ...,\n", + " 7.7064447e-05, -4.7475376e-05, 4.4163811e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_520.wav', 'Ihr werdet schon sehen.', 23, array([-6.5363100e-05, 4.7253379e-05, 5.9942446e-05, ...,\n", + " 3.2326661e-05, 8.2957842e-05, 7.4098658e-05], dtype=float32), 1.7973333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_521.wav', 'Ich erkläre es dir.', 20, array([ 5.3491673e-05, -1.2072490e-05, 3.4197161e-05, ...,\n", + " -3.4515979e-05, -5.6132449e-05, 1.3709931e-04], dtype=float32), 1.5093333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_525.wav', 'Hau rein!', 9, array([ 2.57931824e-04, 2.11816674e-04, 1.78339556e-04, ...,\n", + " 7.76832676e-05, 1.51795175e-05, -4.37384588e-05], dtype=float32), 1.104)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_526.wav', 'Tief durchatmen!', 16, array([-2.6787920e-05, -3.2204316e-05, -5.5490927e-05, ...,\n", + " 2.2508255e-05, 5.4639313e-05, 1.8989524e-05], dtype=float32), 1.5253333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_538.wav', 'Und was bringt das?', 19, array([-5.9224880e-05, -4.4477289e-05, 3.8521583e-05, ...,\n", + " 9.5605545e-05, 1.2830349e-06, 1.5070126e-05], dtype=float32), 1.6213333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_541.wav', 'Karnickelfangschlag?', 20, array([3.9227842e-05, 3.2782922e-05, 4.6346566e-05, ..., 1.3389443e-05,\n", + " 3.6067817e-05, 6.0468155e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_562.wav', 'Ist ja 
mega!', 12, array([-1.1508126e-04, -1.5385580e-04, -1.8046032e-04, ...,\n", + " -4.1180385e-05, 2.7804810e-05, -9.9901524e-07], dtype=float32), 0.992)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_570.wav', 'Jasmin, du bist dran.', 21, array([-6.0017886e-05, 3.1120195e-05, 1.0854354e-04, ...,\n", + " -2.5416332e-06, 4.4546370e-05, -4.6334655e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_579.wav', 'Läuft es separat ab?', 21, array([ 2.2939121e-05, 2.0304271e-05, 4.7305216e-06, ...,\n", + " -4.0958774e-05, 8.3991254e-06, -4.0800154e-05], dtype=float32), 1.7813333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_594.wav', 'Ich zitiere!', 12, array([ 7.3269119e-05, 4.1316580e-06, -7.5483302e-05, ...,\n", + " 4.5700057e-05, 1.0702889e-06, 1.2143076e-05], dtype=float32), 1.2853333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_597.wav', 'Die Karten sind ja markiert!', 28, array([-7.7787427e-06, 1.3373171e-05, 1.1130486e-04, ...,\n", + " -3.4429740e-05, -9.2525712e-05, -3.0399795e-05], dtype=float32), 1.8613333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_600.wav', 'Weniger ist manchmal mehr.', 26, array([-3.2105188e-05, -1.2411436e-04, -1.7373836e-04, ...,\n", + " 1.9536817e-05, 4.0033923e-05, -4.9835093e-05], dtype=float32), 1.6693333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_610.wav', 'Zur Hölle mit ihm!', 19, array([ 4.1287938e-05, -1.5668693e-05, -4.7829257e-05, ...,\n", + " 1.2091287e-04, 3.0301053e-05, 5.0707073e-05], dtype=float32), 1.28)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_622.wav', 'Sonst kommt die Polizei.', 24, array([ 1.33967542e-05, -2.86651575e-05, 1.20430150e-05, ...,\n", + " -4.97728324e-05, -9.77511445e-05, -1.07504595e-04], dtype=float32), 1.9786666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_623.wav', 'Papa 
fährt immer schneller.', 28, array([-4.1551000e-05, 1.8333099e-05, -4.5995697e-05, ...,\n", + " 7.4864365e-05, -2.8456698e-05, -3.1763777e-06], dtype=float32), 1.7653333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_627.wav', 'Das Problem kenne ich.', 22, array([-1.6575548e-06, -6.4681786e-05, -2.4183499e-05, ...,\n", + " -6.1924133e-05, 4.0877181e-05, -4.8742072e-06], dtype=float32), 1.3973333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_646.wav', 'Gerade jetzt wird es spannend.', 30, array([-7.0382644e-05, -2.6976499e-05, -8.4537001e-05, ...,\n", + " 1.9848225e-05, 1.8570287e-05, 1.1454727e-04], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_650.wav', 'Pass mal auf!', 13, array([ 8.8038476e-05, 6.2287538e-05, 8.6767104e-05, ...,\n", + " -4.7867183e-05, 1.7106903e-06, -2.8001863e-05], dtype=float32), 1.0773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_653.wav', 'Führe mich nicht in Versuchung!', 32, array([ 1.5389375e-04, 8.4856605e-05, 1.1764471e-04, ...,\n", + " -4.1702488e-06, 4.8200640e-05, 3.7042355e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_658.wav', 'Dabei soll es bleiben.', 22, array([-6.8817273e-05, -1.4116750e-04, -2.5068663e-04, ...,\n", + " 3.3109423e-05, -1.2034771e-05, 5.3297503e-05], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_677.wav', 'Ich denke nicht daran.', 22, array([ 2.7965652e-06, -8.1217448e-05, -1.5171595e-04, ...,\n", + " -6.0021226e-05, 5.8105360e-07, -2.3721210e-05], dtype=float32), 1.472)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_702.wav', 'Sieh zu, dass du Land gewinnst!', 31, array([-3.9686485e-05, -4.1371659e-05, -5.1444043e-05, ...,\n", + " -6.5746033e-05, -6.9277223e-05, -3.0258396e-05], dtype=float32), 1.9466666666666668)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_705.wav', 'Was sagt uns das?', 17, array([-1.11950721e-04, -1.12432775e-04, -1.54395209e-04, ...,\n", + " 1.18786911e-05, -6.98161457e-05, -2.93514750e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_715.wav', 'Von nichts komm nichts.', 23, array([ 5.0694278e-05, -1.0824220e-04, -7.8278521e-05, ...,\n", + " 5.2878531e-05, 3.1005864e-05, 2.5896241e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_718.wav', 'Warum auch?', 11, array([ 2.5824769e-05, 7.0119269e-05, 3.9937982e-05, ...,\n", + " 1.3905319e-05, -2.6308078e-05, -5.1800267e-05], dtype=float32), 0.9493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_721.wav', 'Wo wohne ich noch mal?', 22, array([ 1.1702570e-04, 1.8368529e-04, 1.5237987e-04, ...,\n", + " -3.3846823e-05, -4.2944125e-06, 2.2590933e-05], dtype=float32), 1.6)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_725.wav', 'Zum Wohl!', 9, array([-2.1576473e-06, 2.8079157e-05, -2.9355248e-05, ...,\n", + " -2.9330091e-05, -3.0764484e-05, -1.3724362e-05], dtype=float32), 0.7466666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_735.wav', 'Wie geht es dir?', 16, array([ 3.0780422e-05, -4.9582297e-05, -8.5829226e-05, ...,\n", + " 2.1407772e-05, -4.8474238e-05, -4.5784309e-05], dtype=float32), 1.232)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_746.wav', 'Einmal drücken reicht.', 23, array([-4.4286557e-05, -5.6155724e-05, -5.2055671e-05, ...,\n", + " -5.5887984e-05, 1.7236773e-05, 9.8498596e-05], dtype=float32), 1.4373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_761.wav', 'Ersatz muss her.', 16, array([ 8.3686442e-05, 9.1279635e-06, -8.3661522e-05, ...,\n", + " 3.3542208e-05, 9.7035401e-05, -4.7421363e-05], dtype=float32), 1.3333333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_769.wav', 'Kennen Sie diesen Eisbären?', 28, array([ 1.8226114e-04, 1.1602399e-04, 8.7942906e-05, ...,\n", + " -3.1415253e-05, 6.8828485e-05, 2.8598015e-05], dtype=float32), 1.7173333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_774.wav', 'Du tüdelst wohl!', 17, array([4.2244592e-05, 4.7479767e-05, 4.4327684e-05, ..., 2.9398587e-05,\n", + " 1.3265206e-04, 9.8947305e-05], dtype=float32), 1.312)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_776.wav', 'Einen Versuch ist es wert.', 26, array([-2.0919964e-05, -8.0129103e-05, -7.8644814e-05, ...,\n", + " 3.4572986e-05, 8.1091166e-05, 5.6626621e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_779.wav', 'Kruzifix noch mal!', 18, array([ 5.9276794e-05, 7.1346542e-05, 1.3115312e-05, ...,\n", + " -7.0933937e-05, 2.6771322e-05, 3.3997876e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_781.wav', 'Sind die echt?', 14, array([-3.2039690e-05, -4.8189206e-05, -9.0187306e-05, ...,\n", + " 2.1210299e-05, 9.5539394e-07, -6.0049209e-05], dtype=float32), 1.1946666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_789.wav', 'Wie war euer Jahrgangstreffen?', 30, array([ 9.86098894e-05, 1.05807514e-04, 1.31781504e-04, ...,\n", + " -6.47349443e-05, 5.55652514e-06, 6.68639914e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_796.wav', 'Langt das?', 10, array([-2.58835917e-05, -1.11602596e-04, -2.00994928e-05, ...,\n", + " 3.40378210e-05, 4.15314862e-05, -2.47353237e-05], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_851.wav', 'Nein, das gehört so.', 21, array([4.30460314e-05, 1.00948644e-04, 1.14135793e-04, ...,\n", + " 2.88395531e-04, 1.62498865e-04, 8.75307087e-05], dtype=float32), 1.7493333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_852.wav', 'Stellen Sie Blickkontakt her.', 29, array([-2.3877754e-05, -3.1883523e-05, -1.3378897e-04, ...,\n", + " -3.8810729e-05, 4.3067663e-05, 3.8920269e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_858.wav', 'Also echt jetzt!', 16, array([ 1.62354499e-05, 4.22473058e-05, -1.46273105e-05, ...,\n", + " -2.93930316e-05, 5.34094252e-05, 7.98595574e-05], dtype=float32), 1.216)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_7.wav', 'Ich glaube nicht.', 17, array([-1.0143876e-05, -3.8619244e-05, 8.2748767e-05, ...,\n", + " -9.9806406e-05, -4.3946784e-05, 6.9558562e-05], dtype=float32), 1.1946666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_18.wav', 'Hier ist es sicherer.', 21, array([ 4.6870970e-05, 9.9823235e-05, -4.0877108e-05, ...,\n", + " -1.4616339e-05, 7.3614872e-05, 1.0970575e-04], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_49.wav', 'Ja ja, als ob!', 14, array([ 5.3198488e-05, 1.8346685e-04, -2.1753046e-06, ...,\n", + " 1.7834389e-05, 5.3522737e-05, 8.4725587e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_59.wav', 'Geh, such deine Schwester!', 26, array([ 9.13840049e-05, 1.68439132e-04, 3.04173911e-04, ...,\n", + " -8.56241095e-05, -1.02150196e-04, 8.91289255e-06], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_94.wav', 'Gib mir meinen Becher wieder!', 29, array([-2.1092707e-04, -2.3195105e-04, -2.0152969e-04, ...,\n", + " 8.9153917e-05, -2.4260396e-06, 5.9283586e-05], dtype=float32), 1.8453333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_100.wav', 'Das führt doch zu nichts.', 26, array([-1.0273771e-04, -8.6229462e-05, -1.2574486e-04, ...,\n", + " 2.4963025e-05, 4.4582037e-05, 4.7964921e-05], 
dtype=float32), 1.9733333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_105.wav', 'Wo denn?', 8, array([-4.0845240e-05, 1.0149255e-04, 5.9910049e-05, ...,\n", + " -3.8421931e-05, 2.8110459e-05, 1.7339922e-05], dtype=float32), 0.9493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_106.wav', 'Du sitzt hinten.', 16, array([ 1.1350374e-04, 1.3197908e-04, 5.9344729e-05, ...,\n", + " -1.6409816e-04, -7.1399249e-05, -4.2459251e-05], dtype=float32), 1.44)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_112.wav', 'Das kann ich nicht.', 19, array([-9.4199102e-05, -3.3980414e-05, 9.0330948e-05, ...,\n", + " 1.1509175e-04, 2.2319029e-05, 5.1328014e-05], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_139.wav', 'Das hat sie gelernt.', 20, array([ 1.5456244e-04, 3.1872053e-04, 3.7880472e-04, ...,\n", + " -8.6764321e-06, -1.7240205e-05, -5.7155878e-05], dtype=float32), 1.4826666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_153.wav', 'Nicht alle Teenager sind so.', 28, array([7.9220721e-05, 5.8759109e-05, 1.1493213e-04, ..., 6.8786328e-05,\n", + " 1.5815135e-04, 8.5130850e-05], dtype=float32), 1.9946666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_156.wav', 'Frische Seeluft macht gesund.', 29, array([ 1.8124521e-04, 1.7306159e-04, 5.9669415e-05, ...,\n", + " 4.9480139e-05, 1.2296322e-04, -5.5897519e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_164.wav', 'Gönn dir!', 10, array([ 5.2993961e-05, 2.8179937e-05, 7.8242076e-05, ...,\n", + " -4.9057824e-05, 1.8003910e-05, 8.8817593e-05], dtype=float32), 0.9386666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_176.wav', 'Sag ich doch!', 13, array([ 4.2398951e-05, 5.6847359e-05, 7.0788061e-05, ...,\n", + " -3.2739328e-05, 9.7135853e-05, 6.0795941e-05], dtype=float32), 
1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_177.wav', 'Das darf doch nicht wahr sein.', 30, array([-5.1426803e-05, -5.0517308e-05, 4.6803252e-05, ...,\n", + " -8.1146150e-05, 2.9068062e-05, 7.5193479e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_199.wav', 'Jetzt sind wir quitt.', 21, array([-2.4918138e-05, 8.0159109e-05, -7.1328832e-05, ...,\n", + " -2.1099215e-04, -3.0862509e-05, -3.5725458e-05], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_207.wav', 'Eben ging das noch.', 19, array([-5.0324921e-05, 1.3549793e-04, -3.3347860e-05, ...,\n", + " 9.8024408e-05, 1.5384333e-04, 1.5966935e-04], dtype=float32), 1.53875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_208.wav', 'Bug oder Feature?', 17, array([-3.7243055e-06, 6.9413843e-05, 7.5392752e-05, ...,\n", + " 5.2070121e-05, 2.8219682e-05, 8.4193009e-05], dtype=float32), 1.8053020833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_274.wav', 'Wir brauchen mehr davon!', 24, array([-2.0753406e-04, -1.9484414e-05, -2.8117347e-04, ...,\n", + " 1.2726737e-04, 2.6360145e-04, 2.9073044e-04], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_280.wav', 'Lass uns raus gehen.', 20, array([ 1.03469618e-04, 1.97744346e-04, -7.93442814e-06, ...,\n", + " 8.44921742e-05, 2.30915975e-05, -1.33781205e-05], dtype=float32), 1.5508645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_286.wav', 'SchluÃ\\x9f mit lustig.', 19, array([ 2.99623178e-04, 2.43378381e-04, 1.65333462e-04, ...,\n", + " -2.71533063e-05, 7.85075972e-05, -1.17198346e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_302.wav', 'Woher nehmt ihr eure Bildung?', 29, array([1.7700881e-04, 2.1893253e-04, 1.3036304e-04, ..., 1.3868474e-04,\n", + " 1.0062666e-04, 8.4173589e-05], 
dtype=float32), 1.9749270833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_308.wav', 'Du fährst, ich schieÃ\\x9fe!', 25, array([1.5563566e-04, 1.4856170e-04, 2.2446582e-04, ..., 6.8505600e-05,\n", + " 2.0769508e-04, 1.1925176e-04], dtype=float32), 1.99915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_328.wav', 'Wirkt die Betäubung noch?', 26, array([-8.7537330e-05, -3.0825776e-04, -2.8424736e-04, ...,\n", + " 1.1261477e-04, 2.0012977e-04, 1.0000553e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_385.wav', 'Es kann nur einen geben!', 24, array([-1.8947560e-04, -2.3450297e-05, -1.2145152e-04, ...,\n", + " -6.9378242e-05, -1.1301338e-04, -2.5457976e-04], dtype=float32), 1.8901145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_400.wav', 'Wer weiÃ\\x9f es?', 13, array([ 8.2401210e-05, 1.2261249e-05, 1.3193028e-04, ...,\n", + " -9.9374527e-05, -2.4473227e-05, 7.3499345e-05], dtype=float32), 1.49028125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_406.wav', 'Tja, das ist Pech.', 18, array([2.4313416e-04, 4.7331341e-05, 1.6022228e-04, ..., 3.0806483e-04,\n", + " 2.9170502e-04, 3.0395557e-04], dtype=float32), 1.7810729166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_412.wav', 'Alles muss raus.', 16, array([2.3146431e-04, 2.1641712e-04, 1.4716707e-04, ..., 1.4341300e-04,\n", + " 3.7975753e-06, 9.1287213e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_415.wav', 'Stell die Heizung höher.', 25, array([-3.96930409e-05, 1.02812344e-04, 1.21250734e-04, ...,\n", + " -3.47016321e-05, -2.01824150e-04, -9.76954325e-05], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_418.wav', 'Etwa über mich?', 16, array([-0.00020996, -0.00011494, -0.00010331, ..., -0.00017556,\n", + " -0.00020319, -0.00027111], dtype=float32), 
1.7689479166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_421.wav', 'Das ist natürlich bitter.', 26, array([-3.3627803e-04, -2.5203897e-04, -2.3072124e-04, ...,\n", + " 4.6018063e-06, 1.7239379e-05, 4.0267703e-05], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_440.wav', 'Hier knicken.', 13, array([-0.000481 , -0.00023708, -0.00018911, ..., -0.00022185,\n", + " -0.00025873, -0.00026997], dtype=float32), 1.30853125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_464.wav', 'Alles Lügen!', 13, array([-0.00027017, -0.00016623, -0.00022159, ..., -0.00033337,\n", + " -0.00044782, -0.00022404], dtype=float32), 1.4175833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_465.wav', 'Alles oder nichts!', 18, array([2.8375158e-05, 6.5034241e-05, 9.6457785e-05, ..., 1.0699107e-04,\n", + " 9.6596435e-05, 1.2572719e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_467.wav', 'Warum bleibst du stehen?', 24, array([-1.4808709e-04, -1.8631479e-04, -1.2836477e-04, ...,\n", + " -6.0794730e-05, -1.5104183e-05, -2.5347929e-04], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_473.wav', 'Zumindest ein bisschen.', 23, array([-0.00024013, -0.00025727, -0.00025987, ..., -0.00023257,\n", + " -0.00033333, -0.00025996], dtype=float32), 1.5993229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_474.wav', 'Sprich mir nach!', 16, array([-1.7584162e-04, -1.6248986e-04, -8.6785782e-05, ...,\n", + " 3.5318243e-04, 3.7314874e-04, 3.2366288e-04], dtype=float32), 1.4175833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_500.wav', 'Sehr witzig!', 12, array([ 7.5077987e-05, 1.1926649e-04, 1.8323194e-04, ...,\n", + " -3.8680941e-04, -3.2216642e-04, -3.3234112e-04], dtype=float32), 1.39334375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_502.wav', 'Achtung, Achtung!', 17, array([-4.0950408e-04, -2.9606355e-04, -3.7786187e-04, ...,\n", + " -2.1742952e-05, 3.0543149e-05, 8.8129680e-05], dtype=float32), 1.5145104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_512.wav', 'Wo bitte schön steht das?', 26, array([ 2.2647387e-04, 1.4740237e-04, 1.2381608e-04, ...,\n", + " -1.1670060e-04, -5.8438465e-05, -5.2704141e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_513.wav', 'SchlieÃ\\x9fen Sie bitte die Luke.', 30, array([ 0.00012086, 0.00019177, 0.00012352, ..., -0.00014259,\n", + " -0.00024671, -0.00014045], dtype=float32), 1.69625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_549.wav', 'Ich hasse meinen Wecker.', 24, array([-1.9575720e-05, -1.5009989e-04, -1.6873972e-04, ...,\n", + " -6.5268898e-05, -1.8595096e-04, -1.7330179e-04], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_576.wav', 'Nicht so laut!', 14, array([-1.6541444e-04, -8.3816949e-06, -1.0135791e-04, ...,\n", + " 3.1510697e-04, 4.1878404e-04, 3.6531710e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_595.wav', 'Ich tu mein Bestes.', 19, array([ 8.3501960e-05, 1.7197721e-04, 2.2250456e-04, ...,\n", + " -1.2569079e-04, -1.3276993e-04, -2.5823418e-04], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_597.wav', 'Alle guten Dinge sind drei.', 27, array([-1.1909505e-05, -8.7172106e-05, -1.2401433e-04, ...,\n", + " -1.4987224e-04, -1.3219267e-05, -7.9211000e-05], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_607.wav', 'Welche Vase?', 12, array([-1.8119848e-04, -2.7736003e-04, -1.8833524e-04, ...,\n", + " 5.6385907e-05, 1.3869893e-04, 1.9968288e-04], dtype=float32), 1.4539375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_651.wav', 'Zeig mal deine Muckis.', 22, array([-0.00038406, -0.0003124 , -0.00026326, ..., 0.00032153,\n", + " 0.00029355, 0.0004676 ], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_661.wav', 'Wir sind umzingelt.', 19, array([ 4.0317194e-05, 2.1714004e-04, 1.5210512e-04, ...,\n", + " 1.1821459e-04, 9.8579549e-05, -3.1008281e-06], dtype=float32), 1.57509375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_669.wav', 'Du zitterst ja!', 15, array([-0.0002655 , -0.00018808, -0.00023504, ..., 0.00028222,\n", + " 0.00025013, 0.00041103], dtype=float32), 1.2116145833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_687.wav', 'Ob sie schon Hunger haben?', 26, array([-7.1925861e-05, 1.8567745e-06, -5.7103756e-05, ...,\n", + " 2.6770154e-04, 7.6355340e-05, 2.2662200e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_699.wav', 'Das wird schon wieder.', 22, array([-2.5816666e-04, -8.4095438e-05, -1.2401373e-05, ...,\n", + " -1.9085000e-04, -2.3972438e-04, -1.5835713e-04], dtype=float32), 1.69625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_700.wav', 'Köpfe runter!', 14, array([ 8.14295272e-05, 1.14302085e-04, 1.28549975e-04, ...,\n", + " -2.10746948e-04, -2.65351351e-04, -3.40027531e-04], dtype=float32), 1.32065625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_712.wav', 'Sie sollten sich schämen!', 26, array([ 2.6346499e-04, 9.5443167e-05, 1.6159609e-04, ...,\n", + " -2.1241463e-04, -1.5395934e-04, -8.9938527e-05], dtype=float32), 1.6477916666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_732.wav', 'Schwund ist überall.', 21, array([-0.00039054, -0.00025168, -0.00026237, ..., 0.00020222,\n", + " 0.0002156 , 0.00019633], dtype=float32), 1.6356666666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_734.wav', 'Schon fertig?', 13, array([-6.8748363e-06, 5.9082297e-05, -3.8726441e-05, ...,\n", + " -1.3909466e-04, -2.0350730e-04, -1.0977411e-04], dtype=float32), 1.2237291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_743.wav', 'Musst du da reinschieÃ\\x9fen?', 26, array([0.00038867, 0.00026221, 0.0002308 , ..., 0.0001513 , 0.00017203,\n", + " 0.00012958], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_764.wav', 'Das wäre mir neu.', 18, array([-1.6335897e-04, -1.3920359e-04, -6.9949492e-05, ...,\n", + " 3.2939854e-05, 3.5769459e-05, -3.7220154e-05], dtype=float32), 1.91434375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_773.wav', 'Mission gescheitert!', 20, array([ 5.22215014e-05, 1.20894714e-04, 1.96668057e-04, ...,\n", + " -2.58956774e-04, -1.39872835e-04, -1.39142721e-04], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_782.wav', 'Dir kann geholfen werden.', 25, array([-5.4091932e-05, -2.9271763e-05, 1.2364880e-04, ...,\n", + " -1.4125406e-04, -2.3545137e-04, -2.5170582e-04], dtype=float32), 1.7810625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_790.wav', 'Vertraust du mir blind?', 23, array([-1.3496955e-04, -4.5282133e-05, 1.7263924e-04, ...,\n", + " 1.0330205e-05, -1.9022463e-04, -1.3715150e-04], dtype=float32), 1.6235520833333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_793.wav', 'Wie stellen Sie sich das vor?', 29, array([5.7090012e-05, 9.3246163e-05, 1.4314597e-04, ..., 1.8600497e-04,\n", + " 1.2342732e-04, 2.2610810e-04], dtype=float32), 1.8901145833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_802.wav', 'Ist es nicht so?', 16, array([ 8.5881460e-05, 1.9039282e-04, 2.1635044e-04, ...,\n", + " 1.2600829e-04, 4.5968747e-05, -1.7667595e-05], dtype=float32), 1.4297083333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_808.wav', 'Willst du mich umbringen?', 25, array([3.4704231e-04, 2.2213293e-04, 1.1007244e-04, ..., 1.0426929e-05,\n", + " 6.0499657e-05, 4.4495686e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_817.wav', 'Da ist die Tür!', 16, array([ 0.00014472, 0.00027025, 0.00040617, ..., -0.0001791 ,\n", + " -0.00014576, -0.00017543], dtype=float32), 1.7931770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_15_FINAL/15_820.wav', 'Ihr könnt nicht fliehen.', 25, array([ 3.3208958e-04, 1.8373384e-04, 2.8849186e-05, ...,\n", + " -1.9994991e-04, -4.2732576e-05, 5.1437601e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_8.wav', 'Erkennst du ihn wieder?', 23, array([-7.1132112e-05, 1.8191178e-04, 2.2640963e-04, ...,\n", + " -1.5948209e-04, -4.8810096e-05, -7.1736489e-05], dtype=float32), 1.69625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_29.wav', 'Du bist so ein Charmeur!', 24, array([ 8.7156383e-05, -7.5441625e-05, -8.7413508e-05, ...,\n", + " -3.7287452e-04, -2.6756592e-04, -2.7199855e-04], dtype=float32), 1.99915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_30.wav', 'Das Wochenende war sehr schón.', 31, array([0.00010696, 0.00019241, 0.00022398, ..., 0.00018996, 0.00018264,\n", + " 0.00021606], dtype=float32), 1.9506875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_44.wav', 'Na, GroÃ\\x9fer!', 12, array([2.9556373e-05, 1.2606342e-04, 2.0366564e-04, ..., 1.8486078e-04,\n", + " 1.2593277e-04, 1.4429759e-04], dtype=float32), 1.7083645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_53.wav', 'Lassen wir das!', 15, array([-1.0015550e-03, -1.1123064e-03, -1.0633026e-03, ...,\n", + " -8.7814760e-06, 1.5665671e-04, 2.6885752e-04], dtype=float32), 1.2843020833333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_57.wav', 'Es kann jeden treffen.', 22, array([-1.2930187e-04, -3.5622310e-05, 1.1325534e-04, ...,\n", + " 2.8466255e-05, -1.7107872e-04, -3.0454184e-04], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_61.wav', 'Das dürfen Sie nicht!', 22, array([7.9696401e-05, 2.5238540e-05, 2.6919068e-05, ..., 2.0004300e-04,\n", + " 1.7159608e-04, 2.0384404e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_69.wav', 'Oder muss man die einfrieren?', 29, array([ 2.3387831e-04, 2.0287969e-04, 2.3305746e-04, ...,\n", + " -2.0109433e-04, -1.5938835e-04, 1.9864538e-06], dtype=float32), 1.9628020833333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_75.wav', 'Nur für einen Tag.', 19, array([ 1.1010072e-04, 7.5059768e-05, 1.5811465e-04, ...,\n", + " -1.6034159e-04, 6.0707155e-09, -5.6600587e-05], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_85.wav', 'Ã\\x9cberall lauern Fallen.', 23, array([ 7.3672440e-05, 1.1084337e-04, 5.4723707e-05, ...,\n", + " -3.4976221e-04, -1.6772485e-04, -2.3993225e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_89.wav', 'Schön, dass du da warst.', 25, array([-1.5644990e-04, -1.6062504e-04, -1.5125731e-04, ...,\n", + " -1.4215022e-04, -3.6906120e-05, -1.1689674e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_104.wav', 'Bleib, wo du bist!', 18, array([-6.7565779e-05, -2.1604590e-06, 1.6737869e-04, ...,\n", + " -5.7721576e-05, -1.0027820e-05, -4.2661872e-05], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_113.wav', 'Erwischt!', 9, array([ 8.53675301e-05, -1.39195807e-04, -1.12849986e-04, ...,\n", + " -6.49508947e-05, -6.88307264e-05, -2.25101539e-04], dtype=float32), 
1.06621875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_119.wav', 'Dann lass es liegen.', 20, array([-1.4928725e-04, 2.6696865e-05, -8.1158723e-05, ...,\n", + " 1.0134692e-04, 7.8540448e-05, -3.6887606e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_125.wav', \"Mach 'ne Fliege!\", 16, array([-4.2133670e-05, -4.1710995e-05, -9.2710856e-05, ...,\n", + " 6.1932937e-05, 5.9015078e-05, 1.2269965e-04], dtype=float32), 1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_132.wav', 'Bei wem?', 8, array([ 3.2050626e-05, -1.8802975e-05, 6.2951531e-06, ...,\n", + " 3.6152644e-05, 5.9682232e-05, 1.7530509e-04], dtype=float32), 1.2479583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_164.wav', 'Einer reicht.', 13, array([-2.7248763e-05, -1.8096254e-04, -6.8749752e-05, ...,\n", + " -5.8457640e-06, -6.7224923e-06, -2.3102484e-05], dtype=float32), 1.5145104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_167.wav', 'Komm du mal hier her!', 21, array([-1.5554769e-04, 3.7891259e-06, 4.7066398e-05, ...,\n", + " -2.3639805e-05, 2.0737947e-05, 4.9913662e-05], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_187.wav', 'Die Dämmerung bricht an.', 25, array([-1.3250955e-06, 2.9998255e-05, 7.1768205e-05, ...,\n", + " 8.1620914e-05, -2.1789680e-05, -2.0792277e-04], dtype=float32), 1.82953125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_209.wav', 'Ich will nur mal gucken!', 24, array([ 2.0323754e-05, -4.8527312e-05, 7.2813884e-05, ...,\n", + " 5.2759733e-05, -1.1957207e-05, -4.8190817e-05], dtype=float32), 1.74471875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_219.wav', 'Weg damit!', 10, array([-3.5334317e-05, -1.1389485e-04, -8.2927254e-05, ...,\n", + " 9.7957432e-05, 2.3025880e-04, 8.2124512e-05], dtype=float32), 0.9935208333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_228.wav', 'Der kleine Tümpel?', 19, array([-1.11052366e-04, -1.58417228e-04, 1.12858004e-04, ...,\n", + " -7.95750821e-05, 1.25983679e-05, 3.80305464e-05], dtype=float32), 1.7810729166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_261.wav', 'Danke vielmals!', 15, array([-1.0886707e-04, -2.8663597e-04, -2.3995244e-04, ...,\n", + " -9.9315126e-05, -1.0518550e-04, 8.9717643e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_262.wav', 'Greifen Sie zu!', 15, array([ 1.7402765e-04, 5.4675427e-05, -2.1378555e-05, ...,\n", + " -3.0241612e-05, -1.6510607e-05, 1.9972253e-05], dtype=float32), 1.5145208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_270.wav', 'Sein Telefon ist verwanzt.', 26, array([ 1.7227376e-05, 1.3369569e-04, 2.4036576e-04, ...,\n", + " -1.2941840e-04, -7.5057469e-05, 4.6790487e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_280.wav', 'Das kann ich nicht gutheiÃ\\x9fen.', 30, array([-2.46016367e-04, -1.46169405e-04, -1.01338104e-04, ...,\n", + " -2.12353916e-06, -4.44089965e-05, 4.71521271e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_284.wav', 'Nicht im Geringsten.', 20, array([ 6.9896785e-05, 4.9565413e-05, -5.2745858e-05, ...,\n", + " 4.9021692e-05, 4.1371193e-05, -4.8943206e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_290.wav', 'Magst du Sushi?', 15, array([ 4.5281922e-06, -7.7349956e-05, -9.6111427e-05, ...,\n", + " 6.7945102e-06, 5.8605725e-05, -4.7947608e-05], dtype=float32), 1.5993229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_294.wav', 'Ich hätte warten sollen.', 25, array([ 1.3215349e-05, 2.5886698e-05, 9.2406181e-06, ...,\n", + " 3.3613727e-05, -7.8962090e-05, 3.6267331e-05], 
dtype=float32), 1.5872083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_300.wav', 'Vielen Dank für den Hinweis.', 29, array([ 1.21899466e-04, 1.44075893e-04, 1.06153289e-04, ...,\n", + " 1.94679887e-04, -1.92022708e-05, -8.20819259e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_306.wav', 'Her mit dem Zaster!', 19, array([ 9.2032889e-05, -7.7123856e-05, 1.8857928e-06, ...,\n", + " 5.2272848e-05, 1.2463648e-04, -4.8004724e-05], dtype=float32), 1.90221875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_324.wav', 'Moment mal!', 11, array([-9.6486969e-05, -8.5642452e-05, 1.3726056e-05, ...,\n", + " 3.6692109e-05, 2.4882122e-05, -5.4820499e-05], dtype=float32), 1.2721875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_332.wav', 'Lass es sein.', 13, array([-6.2611114e-05, 8.5420121e-05, 1.1575574e-06, ...,\n", + " 1.8824625e-05, 2.6618896e-05, 5.5844474e-05], dtype=float32), 1.4296979166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_334.wav', 'Wir kommen bei ihnen vorbei.', 28, array([ 3.2983281e-04, 5.1712846e-05, -1.6061698e-04, ...,\n", + " 8.1734914e-05, -2.4410097e-05, 1.5291570e-04], dtype=float32), 1.99915625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_350.wav', 'Es fehlt nicht mehr viel.', 25, array([ 3.4581102e-05, -3.2403619e-05, 6.4223466e-05, ...,\n", + " -4.1160070e-05, 2.3247363e-05, 1.4443042e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_351.wav', 'So entdeckt man Fehler.', 23, array([-1.5804017e-05, -7.4724383e-05, 1.1222719e-05, ...,\n", + " 4.8898462e-05, 3.6749603e-05, -3.3983986e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_356.wav', 'Salve!', 6, array([-1.3447071e-04, 5.3523188e-05, 8.5717998e-05, ...,\n", + " 4.4749868e-05, -5.5393906e-05, 1.0913220e-05], dtype=float32), 
1.0056354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_374.wav', 'Angeber!', 8, array([ 4.8461781e-05, 1.5487269e-04, 9.4685849e-05, ...,\n", + " -1.4769383e-04, -1.8351457e-05, -1.8764535e-05], dtype=float32), 1.1146875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_376.wav', 'Wer duckt sich da weg?', 22, array([ 8.9025889e-05, 2.0651723e-04, -8.5901571e-05, ...,\n", + " 8.8148518e-05, 1.3756873e-04, 1.2379605e-04], dtype=float32), 1.6356770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_16_FINAL/16_396.wav', 'Schlaf schön.', 14, array([ 1.56835347e-04, 2.10795515e-05, 6.19498023e-05, ...,\n", + " -4.29836909e-05, -1.05784595e-04, 4.19116714e-06], dtype=float32), 1.1631458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_22.wav', 'Eindeutig nein.', 15, array([ 1.7040480e-06, -2.4771760e-05, 2.0656289e-05, ...,\n", + " -4.9639581e-05, -6.2789266e-05, -6.4883228e-05], dtype=float32), 1.885)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_26.wav', 'Sie nickte.', 11, array([ 1.3571361e-04, 1.4810856e-04, 1.6444136e-04, ...,\n", + " -8.4158353e-05, -6.3345658e-05, -6.6707049e-05], dtype=float32), 1.3556458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_34.wav', 'Von wegen Rabenmutter!', 22, array([ 3.9614300e-05, 3.0917236e-05, 1.4100775e-05, ...,\n", + " 3.3664131e-05, -3.6520869e-05, -5.6032222e-05], dtype=float32), 1.7171666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_35.wav', 'Woran liegt das?', 16, array([ 1.03992148e-04, 8.12370126e-05, 1.09074477e-04, ...,\n", + " 5.26995609e-05, -2.80062741e-05, -1.37729285e-05], dtype=float32), 1.4718541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_68.wav', 'Das ist schlecht fürs Geschäft.', 33, array([ 3.3433552e-04, 4.7215325e-04, 3.9332887e-04, ...,\n", + " -3.3291522e-05, -7.3073941e-05, -6.2871884e-05], dtype=float32), 1.975375)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_70.wav', 'Das überlege ich mir noch.', 27, array([-2.7926452e-04, -4.7232458e-04, -4.5905521e-04, ...,\n", + " -5.0401053e-05, -7.6573851e-05, -1.9868592e-05], dtype=float32), 1.7688125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_71.wav', 'Oder er behält ihn.', 20, array([-1.95691573e-05, 1.42454119e-05, -1.12822245e-05, ...,\n", + " 6.27729896e-05, 6.37731318e-06, 7.33020497e-05], dtype=float32), 1.7429791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_76.wav', 'Viel Vergnügen!', 16, array([-1.4641756e-04, -2.3690579e-04, -2.0291538e-04, ...,\n", + " -6.4597036e-05, -3.9596798e-05, -5.9615340e-05], dtype=float32), 1.2975625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_82.wav', 'Sehr schön erklärt.', 21, array([ 2.8675116e-04, 4.4330378e-04, 3.8435950e-04, ...,\n", + " 9.6497361e-06, 3.9338884e-06, -3.2766162e-05], dtype=float32), 1.5880625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_104.wav', 'Du bist nicht fair.', 19, array([ 8.8045017e-05, 1.6864744e-04, 1.3682757e-04, ...,\n", + " -9.7046555e-05, -1.7125324e-04, -8.5282416e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_119.wav', 'Die in Pulverform.', 18, array([ 2.4460370e-04, 3.1504090e-04, 2.7829470e-04, ...,\n", + " 4.2608990e-05, -1.4765085e-05, -1.9486206e-05], dtype=float32), 1.7817083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_141.wav', 'Hier machen wir einen Schnitt.', 30, array([ 1.7057944e-04, 2.5346698e-04, 2.6541931e-04, ...,\n", + " -5.5827346e-05, -5.5662604e-05, -4.4612902e-05], dtype=float32), 1.6074166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_161.wav', 'Ganz und gar nicht!', 19, array([-1.09976885e-04, -1.06159037e-04, -9.40025275e-05, ...,\n", + " 5.14636531e-06, -7.86106375e-06, -1.38592986e-05], dtype=float32), 
1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_162.wav', 'Du schnarchst.', 14, array([ 9.8031691e-05, 1.0789345e-04, 1.0408189e-04, ...,\n", + " 2.8527650e-06, 1.8555178e-05, -1.7833072e-05], dtype=float32), 1.2911041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_175.wav', 'Die in der zweiten Reihe.', 25, array([-7.3503418e-04, -1.0330433e-03, -9.6690352e-04, ...,\n", + " 1.0845856e-04, 9.5128053e-05, 1.3117766e-04], dtype=float32), 1.6590625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_185.wav', 'Viertel nach neun.', 18, array([-0.00025316, -0.00042128, -0.00041847, ..., 0.00012852,\n", + " 0.00010431, 0.00010823], dtype=float32), 1.794625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_214.wav', 'Der hat gut reden!', 18, array([-3.1999915e-04, -4.8188152e-04, -4.3341244e-04, ...,\n", + " 7.4479853e-05, 1.0070496e-04, 9.9988407e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_230.wav', \"Was gibt's denn?\", 16, array([ 5.3894956e-04, 7.5124111e-04, 6.7086820e-04, ...,\n", + " -4.5820485e-05, -5.6413213e-05, -2.6967809e-05], dtype=float32), 1.6719791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_233.wav', 'Fahren Sie bitte schneller.', 27, array([ 4.8254660e-04, 7.2192971e-04, 6.9296843e-04, ...,\n", + " -3.3325745e-05, 1.5315249e-05, 3.6237780e-05], dtype=float32), 1.8204583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_243.wav', 'Keine Ursache!', 14, array([-2.2485174e-04, -3.4637007e-04, -2.4121681e-04, ...,\n", + " -5.3969983e-05, -1.2160699e-05, -7.7381246e-06], dtype=float32), 1.2588333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_245.wav', 'Ich glaube, es geht los.', 24, array([-7.1201968e-05, -1.1457155e-04, -8.4426887e-05, ...,\n", + " 6.9712019e-05, 1.4468420e-05, 7.2575887e-05], dtype=float32), 1.8398125)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_266.wav', 'Nicht sehr lange.', 17, array([-4.25688399e-04, -5.72862104e-04, -4.54291090e-04, ...,\n", + " 1.15649045e-05, -7.03342175e-06, 9.42021143e-06], dtype=float32), 1.34275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_268.wav', 'Fahr vorsichtig!', 16, array([ 4.7249952e-05, 6.6685003e-05, 8.1438702e-05, ...,\n", + " -7.7767829e-05, -4.4103599e-05, -3.7954072e-05], dtype=float32), 1.4589375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_278.wav', 'Dann aber mit Fanfare.', 22, array([ 3.0009818e-04, 5.0011458e-04, 4.6210812e-04, ...,\n", + " -1.1364354e-04, -6.8604320e-05, -7.7980949e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_280.wav', 'Habe ich doch!', 14, array([-0.00020733, -0.00032169, -0.00027389, ..., -0.00016337,\n", + " -0.00020018, -0.00013392], dtype=float32), 1.233)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_311.wav', 'Regnet es drauÃ\\x9fen?', 19, array([ 7.4172771e-04, 9.9716149e-04, 9.2472351e-04, ...,\n", + " -1.0082213e-04, -1.2750884e-04, -8.1061611e-05], dtype=float32), 1.8721041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_316.wav', 'Das ist eine lange Geschichte.', 30, array([ 2.24287433e-04, 1.93610642e-04, 1.16401294e-04, ...,\n", + " -1.26720734e-05, 2.45919164e-05, 5.34417049e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_342.wav', 'Welches Rad?', 12, array([4.8121543e-05, 4.5563989e-05, 2.0835963e-05, ..., 3.9729348e-05,\n", + " 3.7650581e-05, 3.3080996e-05], dtype=float32), 1.6397083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_367.wav', 'Nichts zu danken!', 17, array([-3.7277619e-05, -4.9238584e-05, -7.1403243e-05, ...,\n", + " -3.3696429e-05, 3.0755796e-06, -3.4646106e-05], dtype=float32), 1.5105833333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_376.wav', 'Bitte noch einmal!', 18, array([ 0.00030744, 0.00045197, 0.00040104, ..., -0.00010688,\n", + " -0.00015312, -0.00013671], dtype=float32), 1.6267916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_378.wav', 'Immer in diese Richtung!', 24, array([-4.00174977e-05, 3.99114288e-05, 1.92868242e-06, ...,\n", + " -1.14653565e-04, -7.80621922e-05, -3.85478379e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_380.wav', 'Gefällt dir die Farbe rot?', 27, array([ 3.0378540e-04, 4.3046009e-04, 3.8851614e-04, ...,\n", + " 2.1661093e-05, -2.6406319e-06, -1.4788465e-05], dtype=float32), 1.9495625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_393.wav', 'Gib mir mal die Knarre.', 23, array([-4.2177099e-04, -5.7642709e-04, -4.9111585e-04, ...,\n", + " -8.2453604e-05, -1.6147584e-05, -7.7549201e-05], dtype=float32), 1.8075416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_410.wav', 'Einer geht noch.', 16, array([ 1.5394030e-04, 2.1875372e-04, 2.0080485e-04, ...,\n", + " -5.6117624e-05, -5.4007505e-05, -1.0993878e-05], dtype=float32), 1.3169166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_418.wav', 'Setzen Sie sich!', 16, array([-2.4320874e-05, -3.2748470e-05, -2.0884192e-05, ...,\n", + " 6.3705025e-05, 1.3131127e-04, 7.7887824e-05], dtype=float32), 1.4976875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_423.wav', 'Es geht ja nicht anders.', 24, array([-2.8326374e-04, -3.8826582e-04, -3.3924755e-04, ...,\n", + " -6.5105633e-05, -6.3098807e-05, -8.6217944e-05], dtype=float32), 1.9495625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_451.wav', 'Kopf hoch!', 10, array([-1.68412909e-04, -1.73757420e-04, -1.55442147e-04, ...,\n", + " -8.23870796e-05, -1.52904060e-04, -1.15380506e-04], dtype=float32), 1.3685625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_458.wav', 'Endlich geht es weiter!', 23, array([-1.5851386e-03, 2.6465717e-03, 5.2893539e-03, ...,\n", + " 3.7729558e-06, 3.5277069e-05, -3.3997758e-06], dtype=float32), 1.70425)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_460.wav', 'Schluss mit der Raserei!', 24, array([ 2.88873882e-04, 4.21624194e-04, 4.14417736e-04, ...,\n", + " -1.55140384e-04, -1.10896304e-04, -8.53765887e-05], dtype=float32), 1.6526041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_470.wav', 'Der Kerl ist dufte.', 19, array([-7.1235799e-04, -1.0205780e-03, -9.3518692e-04, ...,\n", + " -1.5202124e-04, -1.4708345e-04, -7.5756463e-05], dtype=float32), 1.9624791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_475.wav', 'Nicht hauen!', 12, array([-3.9160714e-04, -5.2419491e-04, -4.0734027e-04, ...,\n", + " -3.5391298e-05, -1.9862022e-05, -4.2017076e-05], dtype=float32), 1.613875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_489.wav', 'Davon ist auszugehen.', 21, array([-3.8098158e-05, -1.8117305e-05, -9.3444651e-05, ...,\n", + " -4.6410118e-05, -5.4083579e-05, -6.1566949e-05], dtype=float32), 1.8591875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_490.wav', 'Ã\\x84ndern wir das!', 16, array([-3.1039584e-04, -5.0911406e-04, -3.8009215e-04, ...,\n", + " -1.0358073e-05, 2.3063526e-06, -3.8572562e-05], dtype=float32), 1.4847708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_537.wav', 'Viel hilft viel.', 16, array([-7.0020906e-04, -9.7590697e-04, -8.4232452e-04, ...,\n", + " 2.6748754e-05, 3.9436178e-05, -1.5542679e-05], dtype=float32), 1.5105833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_561.wav', 'Voll abgezogen!', 15, array([0.0009425 , 0.00131688, 0.00114336, ..., 0.00054311, 0.00053014,\n", + " 0.00059172], dtype=float32), 1.304)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_569.wav', 'Was ist Liebe?', 14, array([ 1.7119097e-04, 2.4002905e-04, 1.4028113e-04, ...,\n", + " -1.1777198e-05, 4.3154125e-07, 1.1548834e-05], dtype=float32), 1.3814791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_613.wav', 'Bitte wenden Sie.', 17, array([-3.2704248e-04, -4.7001868e-04, -4.4811977e-04, ...,\n", + " 3.9887604e-05, 4.2593329e-05, -1.2635800e-05], dtype=float32), 1.5751458333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_618.wav', \"Das spar'n wir uns jetzt.\", 25, array([-2.1968294e-04, -2.5130660e-04, -2.3470224e-04, ...,\n", + " 4.6512545e-05, 1.0168094e-04, 8.9639499e-05], dtype=float32), 1.9882916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_625.wav', 'Doppelt hält besser.', 21, array([1.0242802e-04, 1.4422902e-04, 1.5433358e-04, ..., 1.8618872e-05,\n", + " 2.6657151e-05, 8.0320706e-06], dtype=float32), 1.3169166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_645.wav', 'Die beiden werden bestimmt schwer.', 34, array([ 2.2716461e-04, 3.7214963e-04, 3.4043228e-04, ...,\n", + " -7.0017355e-05, -5.9255068e-05, -4.9753759e-05], dtype=float32), 1.975375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_650.wav', 'Dort steppt der Bär.', 21, array([3.4111144e-05, 3.9471229e-06, 1.3943841e-05, ..., 2.8798750e-04,\n", + " 3.4306329e-04, 2.3900693e-04], dtype=float32), 1.8204583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_658.wav', 'Offensichtlich nicht.', 21, array([-6.4643849e-05, -1.4843927e-04, -1.9616121e-04, ...,\n", + " 6.0427959e-05, 2.8176541e-05, 1.0887287e-04], dtype=float32), 1.8462708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_690.wav', 'Ganz sicher.', 12, array([-2.1219352e-04, -2.6916104e-04, -2.2152660e-04, ...,\n", + " -8.3999286e-05, -3.9927592e-05, -1.1057539e-04], dtype=float32), 
1.4460208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_734.wav', 'Bin ich ein Mensch?', 19, array([7.3225739e-05, 8.4229468e-05, 6.0397753e-05, ..., 1.4409037e-04,\n", + " 5.4610227e-05, 2.8432718e-05], dtype=float32), 1.8721041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_737.wav', 'Wohl bekommts.', 14, array([3.8544985e-04, 5.4862851e-04, 4.7615587e-04, ..., 1.1308860e-05,\n", + " 1.5347328e-05, 3.9165672e-05], dtype=float32), 1.5880416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_747.wav', 'So eine will ich auch.', 22, array([-0.00023466, -0.00034498, -0.00035786, ..., 0.00014857,\n", + " 0.00014895, 0.00018565], dtype=float32), 1.9366458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_751.wav', 'Guter Rat ist teuer.', 20, array([-5.8482616e-05, -9.7700511e-05, -1.4372601e-04, ...,\n", + " 8.8569423e-06, 4.0626270e-05, -2.2441051e-05], dtype=float32), 1.885)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_785.wav', 'Noch Fragen?', 12, array([-5.4637530e-05, -9.7329437e-05, -6.5443433e-05, ...,\n", + " 1.3526098e-05, -1.7008400e-05, -2.3395469e-05], dtype=float32), 1.542875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_790.wav', 'Wie tut man das?', 16, array([ 1.4673925e-06, -7.7766053e-06, 2.2737586e-05, ...,\n", + " -2.2371720e-04, -2.6603421e-04, -2.1358255e-04], dtype=float32), 1.6009583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_794.wav', 'Ein billiger Trick.', 19, array([ 2.5642010e-05, 5.9448335e-05, 7.9047953e-05, ...,\n", + " -1.4398795e-05, -2.7475784e-05, -3.0437941e-05], dtype=float32), 1.4912291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_1.wav', 'Woher soll ich sie kennen?', 26, array([-7.0670452e-05, -2.2751169e-04, 3.6274258e-05, ...,\n", + " 6.2137144e-05, -1.4069478e-04, 1.5651318e-04], dtype=float32), 1.865875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_4.wav', 'Wo soll es hingehen?', 20, array([-4.3062766e-05, 6.9635964e-05, 2.7200711e-05, ...,\n", + " 7.3389943e-05, 9.7813630e-05, 7.5023250e-05], dtype=float32), 1.526625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_5.wav', 'Ã\\x84tsch!', 7, array([-5.5343335e-05, -1.0754153e-04, 1.0636374e-04, ...,\n", + " -2.3993191e-04, -1.1428300e-04, -1.9587418e-04], dtype=float32), 1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_24.wav', 'Den mit dem Hund.', 17, array([-2.9083933e-05, -4.5743432e-06, -1.1590145e-04, ...,\n", + " -6.4060594e-05, -5.3663935e-06, -6.9100148e-05], dtype=float32), 1.5993333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_42.wav', 'Sieh mal schnell nach!', 22, array([ 1.1187345e-05, -2.7101662e-04, -4.0457569e-05, ...,\n", + " 3.8478026e-04, 1.3185160e-04, 1.9724603e-04], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_43.wav', 'Zieh Leine!', 11, array([ 1.4326118e-04, 1.4733149e-04, 2.3666536e-04, ...,\n", + " -8.1889502e-06, -2.2159066e-04, -1.0789347e-04], dtype=float32), 1.4115416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_55.wav', 'Meistens eher nicht.', 20, array([-1.2833555e-04, -4.5777502e-04, -2.9062675e-04, ...,\n", + " 3.7303114e-05, 1.7912805e-04, 9.5502997e-05], dtype=float32), 1.968875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_89.wav', \"Komm Du 'mal hier her!\", 22, array([-2.4593925e-05, -1.5391175e-04, -3.5177112e-05, ...,\n", + " -2.8054212e-05, -8.3761133e-06, -3.3427594e-05], dtype=float32), 1.8840625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_102.wav', 'Keine halben Sachen.', 20, array([-0.00010684, -0.00018609, -0.00036967, ..., 0.00014736,\n", + " 0.00013171, 0.00024668], dtype=float32), 1.7810625)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_172.wav', 'Zugriff!', 8, array([-1.7191633e-04, -2.6422989e-04, -1.8970467e-04, ...,\n", + " 1.4085844e-05, -6.5849432e-05, -1.2668260e-04], dtype=float32), 1.3994166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_187.wav', 'Viel SpaÃ\\x9f dabei!', 17, array([ 1.26890489e-04, 4.78873408e-04, 3.36644967e-04, ...,\n", + " -1.14277915e-04, 1.15070587e-04, -4.50995103e-05], dtype=float32), 1.8234791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_189.wav', 'Krass, oder?', 12, array([ 5.2673863e-06, -3.2042470e-05, 6.3032145e-05, ...,\n", + " 4.9474946e-04, 4.8315409e-04, 3.1584961e-04], dtype=float32), 1.4054791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_195.wav', 'Hat es geregnet?', 16, array([-1.5500655e-05, -2.4765370e-05, -1.3535780e-04, ...,\n", + " 1.0218658e-04, -7.7519953e-06, 8.1419450e-05], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_200.wav', 'Die Maschine läuft heiÃ\\x9f.', 26, array([-2.9662095e-05, -1.3571499e-04, -4.9048278e-05, ...,\n", + " 4.0860983e-04, 3.3467117e-04, 2.8713685e-04], dtype=float32), 1.890125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_232.wav', 'Friss ScheiÃ\\x9fe!', 15, array([-3.1462018e-04, -4.3994249e-04, -1.8601233e-04, ...,\n", + " 1.2004693e-04, 6.4006366e-05, 1.4038217e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_244.wav', 'Wasser marsch!', 14, array([-5.2966818e-05, -1.3111959e-06, -2.3756520e-05, ...,\n", + " -4.7830945e-05, -1.0526282e-04, 5.8504538e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_255.wav', 'Ein Halbstarker!', 16, array([8.9307170e-05, 4.3556365e-04, 5.6998286e-04, ..., 7.5660588e-05,\n", + " 1.9409347e-04, 7.0803260e-05], dtype=float32), 1.6841458333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_266.wav', 'Stell den Fernseher ab!', 23, array([-6.1183324e-05, -1.4089182e-04, -1.1948228e-04, ...,\n", + " -1.9923897e-04, -1.7150129e-04, -2.3940729e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_283.wav', 'Kopf oder Zahl?', 15, array([-1.3454640e-04, -4.2848653e-05, -2.3553993e-04, ...,\n", + " -6.3240882e-06, -5.2672884e-05, -1.6467538e-04], dtype=float32), 1.550875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_300.wav', 'Jetzt verstanden?', 17, array([-1.7701862e-04, 3.8073360e-06, 6.6768931e-05, ...,\n", + " 1.5635177e-04, 2.4184166e-04, 2.0308173e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_305.wav', 'Jeder nur eine Kugel!', 21, array([ 1.0893906e-04, 3.5140860e-05, -8.6934997e-05, ...,\n", + " -1.5842280e-04, -7.1798029e-05, -2.1561602e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_327.wav', 'Leider nein.', 12, array([ 0.00020925, 0.00038225, 0.00030209, ..., -0.0002834 ,\n", + " -0.00024066, -0.000164 ], dtype=float32), 1.0783333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_345.wav', 'Irgendwas ist anders.', 21, array([-0.00026138, -0.00012453, -0.00022627, ..., -0.00013074,\n", + " -0.00016786, -0.00011485], dtype=float32), 1.9991666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_371.wav', 'Ein bisschen.', 13, array([ 6.19466882e-05, 1.81855256e-04, 2.56517378e-04, ...,\n", + " 9.61327260e-06, 2.89863237e-05, -1.07233864e-04], dtype=float32), 1.211625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_375.wav', 'Wir sitzen fest.', 16, array([-1.0016920e-05, -5.8360743e-05, -5.3961080e-06, ...,\n", + " -1.4201126e-07, -8.1081940e-05, -1.3083526e-05], dtype=float32), 1.5326875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_424.wav', 'Hat er nicht gesagt.', 20, array([1.4237937e-04, 3.5439979e-04, 4.2451522e-04, ..., 2.9889754e-05,\n", + " 4.3811939e-05, 5.3790947e-05], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_428.wav', 'Mach das ordentlich!', 20, array([-2.2249017e-04, -3.4736985e-04, -2.4423364e-04, ...,\n", + " -4.8614937e-05, 1.6576583e-04, 1.4303469e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_430.wav', 'Zurück zum Thema.', 18, array([0.00021488, 0.00048195, 0.00039156, ..., 0.00020808, 0.0002092 ,\n", + " 0.00014525], dtype=float32), 1.7023125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_433.wav', 'Auf mich hört sowieso niemand.', 31, array([-4.4078504e-05, 1.2701395e-04, 1.5659831e-04, ...,\n", + " 3.2407068e-05, 1.3882274e-04, 3.7292095e-06], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_434.wav', 'Weiter so!', 10, array([ 7.8434707e-05, 2.3782127e-04, 2.0620505e-04, ...,\n", + " -3.0293613e-06, 7.3579846e-05, 2.1203174e-04], dtype=float32), 0.9571666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_437.wav', 'Darüber herrscht Konsens.', 26, array([ 2.0915098e-04, 1.6340525e-04, -4.4762099e-05, ...,\n", + " 3.0228088e-05, -5.6204710e-05, 1.4202976e-04], dtype=float32), 1.9991666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_440.wav', 'Was ist so schlimm daran?', 25, array([ 5.3402138e-05, -1.7599798e-04, 1.1747003e-04, ...,\n", + " 1.8220089e-04, 2.5114723e-04, 2.9130204e-04], dtype=float32), 1.8053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_447.wav', 'Brüllend komisch!', 18, array([2.5463186e-04, 3.0699532e-04, 1.7949699e-04, ..., 1.3379526e-04,\n", + " 6.0049937e-05, 4.3341170e-05], dtype=float32), 1.4660416666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_462.wav', 'Sehr einfallsreich!', 19, array([ 1.6625131e-04, 1.4804797e-04, 6.6010347e-05, ...,\n", + " -2.8519373e-05, -1.5197203e-05, -1.2542940e-04], dtype=float32), 1.6356875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_468.wav', 'Einer fehlt hier noch.', 22, array([0.00021585, 0.0002281 , 0.00034421, ..., 0.00031288, 0.00025684,\n", + " 0.00014126], dtype=float32), 1.5448125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_505.wav', 'Wollen wir?', 11, array([-0.000173 , -0.00033364, -0.00012876, ..., 0.00012244,\n", + " 0.00032144, 0.00014797], dtype=float32), 1.029875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_514.wav', 'Und wir singen zusammen!', 24, array([ 0.00028886, 0.00030063, 0.00037314, ..., -0.00011231,\n", + " -0.00017524, -0.00013442], dtype=float32), 1.890125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_541.wav', 'Hier, fang!', 11, array([-9.6539197e-06, 9.8090044e-05, 7.5100412e-05, ...,\n", + " 1.8568999e-04, 3.1414471e-04, 1.8397035e-04], dtype=float32), 1.5326875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_548.wav', 'Ignorieren Sie die Warnung nicht.', 33, array([-7.0703449e-05, -2.1341034e-06, -2.6835096e-05, ...,\n", + " 1.0051801e-04, 6.5389222e-06, 2.1216212e-04], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_558.wav', 'Nirgends ist ein Ausweg.', 24, array([ 0.0002789 , 0.00025432, 0.00026059, ..., -0.0001307 ,\n", + " -0.00015316, -0.0001602 ], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_563.wav', 'Er will schmusen.', 17, array([ 8.3865758e-05, -4.9942853e-05, 5.9117421e-05, ...,\n", + " -4.3004973e-05, -1.0278272e-04, -8.9234527e-05], dtype=float32), 1.3146041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_590.wav', 'GrüÃ\\x9f Gott!', 12, array([ 
3.8686660e-05, 8.4167688e-05, -4.1444160e-05, ...,\n", + " 7.9078745e-05, 6.6285960e-05, 7.3457479e-05], dtype=float32), 1.1328541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_591.wav', 'Doch, muss es.', 14, array([-2.7301039e-05, -9.8715776e-05, -5.1679286e-05, ...,\n", + " 1.7480909e-04, 8.8697474e-05, -8.7942986e-05], dtype=float32), 1.5811666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_594.wav', 'Höchst verdächtig!', 20, array([-1.5668831e-04, -1.4814634e-05, 1.2133464e-06, ...,\n", + " 1.1010807e-04, 5.0348262e-05, 3.2340708e-05], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_597.wav', 'Hat man das schon mal gehört?', 30, array([ 2.9468083e-05, 8.5217485e-05, -1.1223685e-05, ...,\n", + " 1.4429020e-05, -3.4263925e-05, -1.7569761e-04], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_628.wav', 'Habt noch ein wenig Geduld.', 27, array([-3.1721203e-05, -6.6361958e-05, 6.2947714e-05, ...,\n", + " 9.7825025e-05, -1.3173591e-04, 3.6439680e-05], dtype=float32), 1.9143541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_630.wav', 'Och, Schnucki!', 14, array([-3.5877591e-05, -2.9018152e-04, -1.0041694e-04, ...,\n", + " 1.2557590e-04, 8.4289997e-05, 1.0620209e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_654.wav', 'Womit kann ich dienen?', 22, array([-3.79744961e-05, 4.58159229e-05, 5.13197449e-07, ...,\n", + " 5.17356311e-05, 2.12984141e-05, 1.14942064e-04], dtype=float32), 1.6235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_657.wav', 'Ich bin der Gerichtsvollzieher.', 31, array([ 2.9084453e-05, -2.4720324e-05, 1.8879551e-06, ...,\n", + " -2.5064335e-04, -1.8888044e-04, -4.7750240e-05], dtype=float32), 1.9749166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_670.wav', 'Gute Nacht 
zusammen!', 20, array([ 1.36263785e-04, 8.22485454e-05, 1.07259955e-04, ...,\n", + " -1.70976884e-04, -4.60869487e-05, -1.28792832e-04], dtype=float32), 1.7810833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_691.wav', 'Läuft die Waschmaschine noch?', 30, array([-1.7628371e-04, 3.7217360e-05, 5.7620698e-05, ...,\n", + " 4.7630738e-06, -1.4578988e-04, -2.1564976e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_728.wav', 'Der zweite war nicht mehr so chic.', 34, array([ 3.7413691e-05, 2.5557930e-04, 3.8776739e-06, ...,\n", + " -1.6214621e-04, -2.7943292e-05, -4.3322394e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_748.wav', 'Das Licht wird schwächer.', 26, array([-8.6605805e-06, -9.4557421e-05, -4.0338778e-05, ...,\n", + " -4.2446409e-05, 4.2122399e-05, -6.5777012e-06], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_789.wav', 'Du hast mich durchschaut.', 25, array([-7.2653616e-05, -5.6117566e-05, -2.1032026e-04, ...,\n", + " -1.6650984e-05, -4.1212854e-05, 1.1137113e-04], dtype=float32), 1.7144375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_797.wav', 'Kennt ihr den Weg?', 18, array([-1.6756072e-04, -1.5301499e-04, -6.5641878e-05, ...,\n", + " 2.0324395e-04, 1.4747797e-04, 2.2508665e-04], dtype=float32), 1.5205833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_835.wav', 'Alles klar bei dir?', 19, array([ 1.1695884e-04, 1.1995935e-05, -1.2846527e-04, ...,\n", + " -1.9988464e-04, -2.4078601e-05, -4.2752044e-06], dtype=float32), 1.4054583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_18_FINAL/18_841.wav', 'Kommt jemand mit?', 17, array([ 4.9882954e-05, 4.0318602e-05, 1.2408203e-04, ...,\n", + " -1.1336284e-04, -1.6859797e-04, -3.4263285e-05], dtype=float32), 1.6356875)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_6.wav', 'Nur vom Hörensagen.', 20, array([ 4.0408637e-04, 5.5643718e-04, 5.7215214e-04, ...,\n", + " -7.1763410e-05, -1.0798458e-04, -3.2582655e-05], dtype=float32), 1.7205)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_14.wav', 'Ich weiÃ\\x9f es nicht mehr.', 24, array([0.00023374, 0.00015971, 0.0001749 , ..., 0.00011659, 0.00024648,\n", + " 0.00010209], dtype=float32), 1.6233125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_17.wav', 'Lass es raus!', 13, array([-3.1531116e-04, -3.3344212e-04, -5.9053692e-04, ...,\n", + " 5.4772248e-05, -1.1641844e-05, -6.8900968e-05], dtype=float32), 1.4902916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_68.wav', 'Sie müssen mir glauben!', 24, array([ 1.4851260e-04, 2.9638095e-04, 2.5485444e-04, ...,\n", + " -1.8143297e-05, 4.6757654e-05, 4.2184558e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_92.wav', 'Ihre Bestellung, bitte!', 23, array([ 9.8706114e-05, 2.2661808e-04, 1.6781769e-04, ...,\n", + " 5.1173961e-06, -2.6828362e-04, -2.2934456e-04], dtype=float32), 1.4297083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_95.wav', 'Was können Sie mir anbieten?', 29, array([-1.9375395e-04, -3.1588171e-04, -3.9896931e-04, ...,\n", + " 1.0834881e-04, -1.4949654e-05, -1.3323028e-05], dtype=float32), 1.7689583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_100.wav', 'Also nicht missverstehen!', 25, array([-1.1475936e-04, 3.5450608e-05, 5.9234120e-05, ...,\n", + " 7.9908222e-07, -7.6752185e-05, 3.1952815e-05], dtype=float32), 1.9507083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_101.wav', 'Jeder macht mal Fehler.', 23, array([-2.0121370e-05, 3.3358188e-05, 1.4433647e-05, ...,\n", + " 2.5029780e-04, 1.0649080e-04, 2.8118977e-04], dtype=float32), 1.8416458333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_107.wav', 'Immer dasselbe mit dir.', 23, array([ 4.04063358e-05, 2.61971072e-05, -1.03683014e-04, ...,\n", + " -2.34830455e-04, -1.33784546e-04, -7.84191070e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_113.wav', 'Jetzt erinnere ich mich.', 24, array([ 5.7016779e-05, 9.8553166e-05, 8.2001083e-05, ...,\n", + " 2.6238111e-05, 1.3704958e-05, -8.3586237e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_131.wav', 'Freiwillige vor!', 16, array([ 5.72854588e-05, 1.07770924e-04, 1.99439557e-04, ...,\n", + " -4.32070919e-05, -3.67913685e-06, 1.42182573e-04], dtype=float32), 1.5300416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_137.wav', 'Ich lehne ihn sogar ab.', 23, array([ 4.1758478e-05, 1.8570285e-05, 2.1333873e-04, ...,\n", + " 2.0144802e-05, -3.2468499e-05, 4.0363415e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_148.wav', 'Setz dich!', 10, array([-1.4053716e-04, -1.2715683e-04, -3.6183195e-04, ...,\n", + " 8.8158406e-05, -4.2700492e-05, 1.4811622e-04], dtype=float32), 1.1631458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_159.wav', 'Wie lief die Klausur?', 21, array([4.5470217e-05, 1.4640424e-04, 9.2724607e-05, ..., 1.4090222e-04,\n", + " 1.8730978e-04, 8.1763144e-05], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_188.wav', 'So viel Zeit muss sein!', 23, array([-7.5860844e-05, -1.8835207e-04, -2.0893685e-04, ...,\n", + " -5.3442498e-05, -6.1138802e-05, -8.8275759e-05], dtype=float32), 1.7810833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_194.wav', 'Zeit fürs Bettchen.', 20, array([-9.7486656e-05, -5.1642677e-05, -8.1966471e-05, ...,\n", + " -7.5118078e-05, -3.0586343e-05, -7.1709837e-05], dtype=float32), 
1.6599166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_236.wav', 'Wir sind gleich da.', 19, array([-6.8177519e-06, 6.7671383e-05, -1.0620675e-04, ...,\n", + " 4.5802376e-06, -7.1226568e-05, -5.8944144e-05], dtype=float32), 1.6622916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_243.wav', 'Herrgott noch mal!', 18, array([ 1.7256364e-04, 1.5818405e-04, 2.4684667e-04, ...,\n", + " -1.7978776e-04, -2.2976559e-05, -3.1599044e-05], dtype=float32), 1.4440208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_262.wav', 'Früher war alles besser.', 25, array([ 1.6410025e-04, 2.0620895e-04, 2.0922835e-04, ...,\n", + " 4.5493864e-05, -7.6417935e-05, 7.0160553e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_264.wav', 'Wie heiÃ\\x9ft du?', 14, array([ 2.3004458e-04, 3.3690900e-04, 3.8855671e-04, ...,\n", + " -1.7735986e-04, -6.0517366e-05, 1.4090910e-05], dtype=float32), 1.24025)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_267.wav', 'Siehst du?', 10, array([ 8.0912840e-05, 5.0722783e-06, 6.0588944e-05, ...,\n", + " -1.2716564e-04, 2.9675630e-05, -1.6470523e-05], dtype=float32), 1.187375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_269.wav', 'Totgesagte leben länger.', 25, array([-1.0916409e-05, -1.7836766e-05, -5.1411305e-05, ...,\n", + " -1.2148214e-04, -2.2084620e-04, 8.5974034e-06], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_291.wav', 'Bin ich männlich?', 18, array([-2.0014251e-05, 2.6616051e-05, 1.2375216e-04, ...,\n", + " 1.3375390e-04, 5.5609209e-05, -7.4272582e-05], dtype=float32), 1.4418125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_295.wav', 'Was war in dem Umschlag?', 24, array([-6.2635612e-05, -4.7769913e-06, -1.3995348e-05, ...,\n", + " 7.0862757e-06, 9.2074784e-05, 9.0880349e-06], dtype=float32), 
1.9507083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_316.wav', 'Ich bin bedient.', 16, array([2.5768091e-05, 1.6018275e-05, 3.7452736e-04, ..., 7.7061843e-05,\n", + " 1.8039568e-04, 7.1911185e-05], dtype=float32), 1.6599166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_317.wav', 'Tschüssikowski!', 16, array([ 1.3183661e-04, 8.4080348e-05, -2.6853681e-05, ...,\n", + " 5.1806877e-05, 1.5268542e-05, -6.9305977e-05], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_323.wav', 'Fang mich, wenn du kannst!', 26, array([-4.0345873e-05, 3.4187411e-05, -3.7680857e-05, ...,\n", + " -8.6350832e-05, -1.6245214e-04, -5.1246581e-05], dtype=float32), 1.7447291666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_343.wav', 'Ich bin kein Einbrecher!', 24, array([ 2.2356608e-05, -6.4235406e-05, -9.0699705e-06, ...,\n", + " 1.2990409e-04, 7.6688739e-05, -4.0372826e-05], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_346.wav', 'Hör nicht auf ihn.', 19, array([-2.9778299e-05, 3.8957646e-06, -7.7031938e-05, ...,\n", + " 1.9274552e-04, 1.7162508e-04, -1.3842691e-06], dtype=float32), 1.2964166666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_349.wav', 'Eine letzte Windung noch.', 25, array([-1.8898114e-05, -4.0488834e-05, 1.2324851e-04, ...,\n", + " -7.7293364e-05, 8.3202161e-05, 1.5701227e-04], dtype=float32), 1.5508541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_359.wav', 'Mir nach!', 9, array([ 9.4505500e-05, 2.3980458e-04, 3.7063317e-05, ...,\n", + " -4.1811028e-04, -4.7733358e-04, -4.6703668e-04], dtype=float32), 1.3489375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_360.wav', 'Schon wieder?', 13, array([-2.7792374e-04, -4.0585164e-04, -4.3411212e-04, ...,\n", + " -6.9041176e-05, -2.6838092e-07, 5.3586686e-05], dtype=float32), 1.0783333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_363.wav', 'Heidi funkelt ihn an.', 21, array([-1.39060983e-04, -9.78735334e-05, 9.33348783e-05, ...,\n", + " -1.00029130e-04, -1.25095859e-04, -1.00360034e-04], dtype=float32), 1.9506875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_375.wav', 'Kein Signal gefunden.', 21, array([-1.1299809e-04, -9.9104131e-05, -2.1005377e-05, ...,\n", + " -2.4724935e-04, 5.5919631e-06, 4.7323024e-06], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_403.wav', 'Entschuldigen Sie die Störung!', 31, array([ 6.84832412e-05, 1.86067002e-04, -1.04915016e-04, ...,\n", + " 1.84468547e-04, 4.62387870e-05, -5.50564218e-05], dtype=float32), 1.8174166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_404.wav', 'Guter Mann!', 11, array([ 4.2475749e-05, -3.8101676e-05, 8.2924860e-05, ...,\n", + " -9.0844223e-06, 8.0864724e-05, -4.9711874e-05], dtype=float32), 1.1268125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_416.wav', 'Oder etwa nicht?', 16, array([ 1.6924678e-05, 8.7618108e-05, 1.1962327e-04, ...,\n", + " -1.5572428e-04, -1.2718650e-04, -2.7018292e-05], dtype=float32), 1.5266458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_427.wav', 'Wer weiÃ\\x9f das schon.', 20, array([-1.2090163e-05, -1.1217411e-04, -3.4340650e-05, ...,\n", + " -1.9305095e-05, 1.0599474e-04, -7.2453047e-05], dtype=float32), 1.9157916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_440.wav', 'Walter hat es verpatzt.', 23, array([-9.9328121e-05, -3.7155328e-07, -5.4411164e-05, ...,\n", + " 1.3715628e-04, -4.9349186e-05, -1.4098950e-04], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_451.wav', 'So läuft das nicht.', 20, array([-1.21481185e-04, -1.13304653e-04, -2.73915475e-07, ...,\n", + " 1.47375540e-04, 1.44234422e-04, -2.10445778e-05], dtype=float32), 
1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_457.wav', 'Unverhofft kommt oft.', 21, array([-1.8882036e-05, -2.5487921e-05, 2.6220470e-04, ...,\n", + " 5.6016044e-05, -7.5536453e-05, -4.1967660e-06], dtype=float32), 1.865875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_461.wav', 'Bist du noch Single?', 20, array([-7.4286567e-05, -1.6158549e-04, -1.6719839e-04, ...,\n", + " -9.1800161e-05, -1.2240406e-04, 3.6517587e-05], dtype=float32), 1.8416458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_468.wav', 'Mein Licht ist kaputt.', 22, array([ 4.02122387e-05, -1.00659774e-04, -8.88236755e-05, ...,\n", + " -4.64872028e-05, -2.63940365e-06, 7.19727832e-05], dtype=float32), 1.7735)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_489.wav', 'Gemeinsam sind wir dumm!', 24, array([-1.4583243e-04, -2.6087323e-04, -2.3470599e-05, ...,\n", + " -2.4694938e-04, -1.5543406e-04, -6.1786144e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_518.wav', 'Er hat Mama gesagt!', 19, array([ 2.6662483e-05, -7.8772522e-05, -5.4227519e-05, ...,\n", + " 1.4953410e-05, -6.7233414e-05, -9.8744909e-05], dtype=float32), 1.9264791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_551.wav', 'Ciao!', 5, array([-8.1419050e-05, -2.2554104e-05, -9.1002643e-05, ...,\n", + " 8.3599451e-05, -1.5038802e-05, 1.8543131e-05], dtype=float32), 0.8966041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_557.wav', 'Die Welt ist ungerecht.', 23, array([-7.9495927e-05, -2.2434435e-04, -1.8575993e-05, ...,\n", + " 4.3908138e-05, 4.8930386e-05, 1.4439608e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_573.wav', 'Wer weiÃ\\x9f?', 10, array([-2.4007348e-05, 2.8211702e-05, 1.1010996e-04, ...,\n", + " 3.2032028e-04, 2.8236501e-04, 3.1412503e-04], dtype=float32), 
1.6356666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_594.wav', 'Feierabend!', 11, array([-1.4223782e-05, -5.6433430e-05, -3.3835067e-06, ...,\n", + " -1.2677837e-04, 4.7294146e-05, 7.4652962e-05], dtype=float32), 1.6356875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_599.wav', 'Sag bloÃ\\x9f!', 10, array([ 2.12539035e-05, -1.20294884e-04, -8.79466315e-05, ...,\n", + " 2.56883359e-04, 2.45794392e-04, 4.15721239e-04], dtype=float32), 1.4781666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_618.wav', 'Geht das in Ordnung?', 20, array([-1.7039385e-04, -4.3828294e-04, -3.7954788e-04, ...,\n", + " 2.5719850e-04, 3.6655194e-05, 4.4241093e-05], dtype=float32), 1.7204791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_649.wav', 'Komm noch etwas näher!', 23, array([ 2.4222159e-06, -1.3579089e-04, -4.4756231e-05, ...,\n", + " -1.4951664e-04, -2.2786215e-04, -3.1124309e-04], dtype=float32), 1.4418125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_656.wav', 'Lach nicht!', 11, array([ 1.79771829e-04, 1.79155570e-04, 4.07271327e-05, ...,\n", + " 1.34896531e-04, 1.24606095e-05, -4.19603248e-06], dtype=float32), 1.3327708333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_678.wav', 'Ich fasse zusammen.', 19, array([-1.0120855e-04, 6.3165186e-05, -2.2567945e-05, ...,\n", + " 6.0140010e-05, 9.6748437e-05, 3.0506399e-05], dtype=float32), 1.708375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_695.wav', 'Umtausch ausgeschlossen!', 24, array([ 7.3856318e-05, 2.8886712e-05, 1.5315624e-04, ...,\n", + " -9.7581760e-05, 8.5684667e-05, -3.2478438e-05], dtype=float32), 1.6720208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_699.wav', 'Setzt euch.', 11, array([-1.6188849e-04, -1.0612092e-04, -6.7996967e-05, ...,\n", + " -1.1114984e-04, -2.0633070e-04, -1.5339212e-05], dtype=float32), 
1.3085416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_703.wav', 'Ja, ist sie.', 12, array([-8.3997344e-05, -2.7474607e-05, -1.9123188e-05, ...,\n", + " 1.8876011e-04, 5.0511160e-05, 9.6139847e-05], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_707.wav', 'Nehmt sie ihnen ab!', 19, array([ 4.0254617e-04, 4.7474771e-04, 3.5727478e-04, ...,\n", + " -1.1594634e-06, -1.5993090e-04, -1.5013713e-05], dtype=float32), 1.8477083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_718.wav', 'Bitte schön lächeln.', 22, array([-5.2708318e-05, -1.2709903e-04, -3.1722573e-04, ...,\n", + " -1.4999519e-04, 1.3614057e-04, -2.6379108e-05], dtype=float32), 1.9809791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_732.wav', 'Kamelle!', 8, array([ 3.4038111e-04, 4.9238594e-04, 3.1708140e-04, ...,\n", + " -8.7314249e-05, -4.2823103e-05, 4.8170114e-06], dtype=float32), 1.0541041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_733.wav', 'Nichts daran war schlimm.', 25, array([2.4388860e-04, 1.5891306e-04, 1.7636098e-04, ..., 6.8294656e-05,\n", + " 7.4376767e-05, 9.9975718e-05], dtype=float32), 1.79925)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_742.wav', 'Hast du das auch gehört?', 25, array([ 2.2057726e-04, 3.3742579e-04, 1.5720318e-05, ...,\n", + " 1.6000369e-05, -1.9323647e-04, -1.1723922e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_751.wav', 'Irritiert dich das?', 19, array([ 3.1597829e-05, -1.0975795e-04, -4.8185088e-05, ...,\n", + " -7.8868754e-05, 9.2668552e-06, 1.6543895e-04], dtype=float32), 1.7750208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_758.wav', 'Das ist gar nicht so lange her.', 31, array([-0.00046848, -0.00072762, -0.00048674, ..., 0.00027484,\n", + " 0.00023592, 0.00020132], dtype=float32), 
1.7750208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_759.wav', 'Die Chemie muss stimmen.', 24, array([ 2.8143785e-04, 3.1653995e-04, 3.5444429e-04, ...,\n", + " 8.1970691e-05, -5.0139199e-05, -1.7111432e-05], dtype=float32), 1.9446458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_763.wav', 'Stimmt eigentlich.', 18, array([-1.2765415e-06, -4.4488741e-05, -1.0883755e-04, ...,\n", + " 2.9581884e-04, 4.5865582e-04, 6.1051000e-04], dtype=float32), 1.3024791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_782.wav', 'Sagen Sie den Zielort.', 22, array([ 1.0137472e-04, 2.3555224e-04, 2.6113808e-04, ...,\n", + " -2.9943618e-05, 3.1559110e-05, 2.7199069e-06], dtype=float32), 1.7810625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_792.wav', 'Meldet euch freiwillig!', 23, array([2.3276571e-04, 3.9564463e-04, 2.9302380e-04, ..., 1.1956793e-04,\n", + " 7.0350601e-05, 1.8581332e-04], dtype=float32), 1.8052916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_793.wav', 'Die Boote liegen auf dem Trockenen.', 35, array([-0.00011364, -0.00017169, -0.00019618, ..., 0.00044204,\n", + " 0.00018713, 0.00049593], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_19_FINAL/19_802.wav', 'Gesundheit!', 11, array([-8.3673913e-05, -7.9538848e-05, -6.8612273e-05, ...,\n", + " 4.4534498e-04, 4.3816061e-04, 2.6374889e-04], dtype=float32), 1.2722083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_18.wav', 'Aber auch nur gerade so.', 24, array([-2.2079168e-05, -1.6145856e-05, 2.9195176e-06, ...,\n", + " -1.0078496e-05, -6.2482263e-06, -5.8464525e-06], dtype=float32), 1.8333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_83.wav', 'Aber gerne!', 11, array([ 8.6439795e-06, -4.9609935e-07, -6.4880319e-06, ...,\n", + " 3.4692115e-05, 2.2026890e-05, 7.4778809e-06], dtype=float32), 
1.0416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_84.wav', 'Aber heute bleiben wir nicht so lang.', 37, array([-1.8894493e-06, 2.0465507e-06, 9.1691445e-06, ...,\n", + " -7.1275235e-06, -1.7749519e-05, -2.3891846e-05], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_91.wav', 'Aber ich kÃ\\x83¶nnte das nicht.', 29, array([-1.3084863e-05, -2.4588813e-05, -3.0510082e-05, ...,\n", + " 9.0740468e-06, 7.3771143e-06, 4.7309027e-06], dtype=float32), 1.7916666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_93.wav', 'Aber ich schweife ab.', 21, array([ 2.0572887e-05, 5.2324990e-06, 8.2274501e-06, ...,\n", + " -4.5831721e-06, -5.6718955e-06, 1.2206646e-06], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_107.wav', 'Aber ja!', 8, array([ 6.5076074e-06, 9.5467785e-06, 6.4050842e-06, ...,\n", + " -2.8310139e-06, -1.7247042e-06, 4.6768464e-06], dtype=float32), 1.25)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_115.wav', 'Aber locker!', 12, array([-3.1642696e-05, -3.3065215e-05, -3.9417675e-05, ...,\n", + " 5.7364587e-06, 8.1942826e-06, 2.0739385e-06], dtype=float32), 1.125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_135.wav', 'Aber nicht mein Koch.', 21, array([-3.2864332e-06, 6.4927585e-06, 1.8139610e-05, ...,\n", + " -1.9440764e-05, 6.6915834e-07, -2.3949342e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_150.wav', 'Aber sie wirkt.', 15, array([ 4.7021126e-06, 7.9376932e-06, 1.9524101e-05, ...,\n", + " -1.0560079e-05, 2.2925117e-07, 7.0664414e-06], dtype=float32), 1.6666666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_154.wav', 'Aber sonst schon.', 17, array([ 1.3162755e-05, 5.1608640e-06, 2.6601656e-06, ...,\n", + " -1.9497929e-05, -1.3883044e-05, -2.9709727e-05], dtype=float32), 1.9166666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_172.wav', 'Aber wie kann das sein?', 23, array([-1.0407030e-05, -1.3223411e-05, -2.4366140e-05, ...,\n", + " 3.1900552e-06, -6.4861370e-06, -5.3326958e-06], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_183.wav', 'Abgemacht!', 10, array([ 4.3209253e-05, 3.8841117e-05, 2.0105661e-05, ...,\n", + " 3.7174163e-07, -1.4371894e-05, -1.6794727e-05], dtype=float32), 1.375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_203.wav', 'Ach Mann.', 9, array([-1.1161302e-05, -4.8241122e-06, 1.0564104e-06, ...,\n", + " 5.0679973e-06, 7.8539133e-06, 9.7488000e-06], dtype=float32), 1.0833333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_205.wav', 'Ach die!', 8, array([-1.2094329e-05, -6.8277895e-06, -9.1963557e-07, ...,\n", + " 1.1451033e-05, -2.4406472e-06, 1.2908078e-05], dtype=float32), 1.125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_206.wav', 'Ach du ScheiÃ\\x83Â\\x9fe!', 18, array([ 2.9578983e-05, 1.8899245e-05, 2.3418788e-05, ...,\n", + " -2.3013935e-07, 1.0615421e-05, 1.1895302e-05], dtype=float32), 1.5416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_208.wav', 'Ach du liebe Zeit!', 18, array([-1.7297025e-05, -4.8105571e-06, 4.0550490e-06, ...,\n", + " 1.3112809e-06, 2.7569813e-06, -5.3473241e-06], dtype=float32), 1.9166666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_209.wav', 'Ach du meine GÃ\\x83¼te!', 21, array([-1.4435645e-06, 1.5456475e-05, 7.5820367e-06, ...,\n", + " -5.9919462e-06, -2.8870822e-06, -8.3686264e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_219.wav', 'Ach nein?', 9, array([ 2.6512873e-05, 3.2190139e-05, 2.3575940e-05, ...,\n", + " 1.2494418e-06, -4.9369064e-06, 5.6602944e-06], dtype=float32), 1.0416666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_220.wav', 'Ach so 
das.', 11, array([1.7692106e-05, 1.0481614e-05, 2.4560395e-05, ..., 1.1682997e-05,\n", + " 1.4096242e-05, 1.0814229e-05], dtype=float32), 1.25)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_221.wav', 'Ach so geht das.', 16, array([-4.7354648e-04, -1.6085681e-04, 6.9589930e-04, ...,\n", + " 2.8736700e-05, 3.1944357e-05, 3.1408650e-05], dtype=float32), 1.3333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_223.wav', 'Ach so.', 7, array([ 1.7158927e-04, 2.4213194e-04, 3.3745603e-04, ...,\n", + " -7.4672876e-06, -9.1694219e-06, 5.6827762e-06], dtype=float32), 0.75)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_227.wav', 'Ach, da bist du ja!', 19, array([2.9949500e-05, 1.6420616e-05, 3.4700156e-06, ..., 1.3191027e-05,\n", + " 1.0943100e-05, 1.8516728e-06], dtype=float32), 1.875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_1_FINAL/1_247.wav', 'Achte auf den Verkehr.', 22, array([-3.3732314e-05, -1.7520404e-05, 3.1957079e-05, ...,\n", + " 9.2553882e-06, 1.9688600e-06, 8.4563535e-06], dtype=float32), 1.8333333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_22.wav', 'Eine letzte Sache noch.', 23, array([6.1327388e-05, 1.8792783e-04, 6.4210355e-05, ..., 9.2773196e-05,\n", + " 9.0997717e-05, 9.3233648e-05], dtype=float32), 1.9870416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_23.wav', 'Es ist aus und vorbei.', 22, array([-1.40103046e-04, -1.22702273e-04, 9.30938695e-05, ...,\n", + " 3.74735857e-04, 3.98035394e-04, 1.15837705e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_57.wav', 'Wie machst du das?', 18, array([ 2.4910548e-04, 4.8663982e-04, 3.5670877e-04, ...,\n", + " -7.4250769e-05, -2.8972838e-05, 5.8696533e-05], dtype=float32), 1.4660625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_88.wav', 'Die Göre lügt wie gedruckt.', 29, array([-2.1256006e-04, -1.5941747e-04, 
-9.0014306e-05, ...,\n", + " 8.4916828e-05, -1.1791480e-04, 2.8579583e-04], dtype=float32), 1.9022291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_94.wav', 'Nur wenn das Essen nicht schmeckt.', 34, array([-7.44715726e-05, -1.21678349e-04, 3.31091655e-07, ...,\n", + " -1.03946346e-04, -1.27610518e-04, -1.86876860e-04], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_97.wav', 'Niemals!', 8, array([-4.9271861e-05, 5.3212247e-05, 3.3188411e-05, ...,\n", + " 6.3736064e-05, 4.1986009e-06, 8.9537862e-05], dtype=float32), 1.2479583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_98.wav', 'Und nun zum Wetter.', 19, array([ 5.21471120e-05, -9.25690911e-05, -1.22024496e-04, ...,\n", + " 6.86152780e-05, -3.58715624e-05, 9.09384198e-06], dtype=float32), 1.6356666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_99.wav', 'Die Ã\\x96ffnung ist dehnbar.', 25, array([ 5.6826313e-05, 6.8275417e-06, 9.2087415e-05, ...,\n", + " 3.3015142e-05, 6.6053515e-05, -1.5007930e-04], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_108.wav', 'Habt ihr schon angefangen?', 26, array([1.8725816e-04, 1.5125435e-04, 1.8410715e-04, ..., 7.2607516e-05,\n", + " 2.0626400e-04, 8.0785358e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_120.wav', 'Wie konnte das passieren?', 25, array([8.6616979e-05, 1.3365489e-04, 4.9586175e-05, ..., 2.3242908e-06,\n", + " 9.4004557e-05, 2.2714035e-04], dtype=float32), 1.5751041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_121.wav', 'Schnappen Sie die!', 18, array([ 3.8893679e-05, -8.0967751e-05, 9.0245063e-05, ...,\n", + " -1.8313204e-04, 3.8293081e-05, -2.9012112e-06], dtype=float32), 1.38125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_127.wav', 'Kommst du mit auf die Demo?', 27, 
array([2.5501425e-04, 3.7619186e-04, 2.3280202e-04, ..., 1.0214894e-04,\n", + " 8.1334627e-05, 1.0037446e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_131.wav', 'Neymar schummelt immer.', 23, array([ 9.5439558e-05, -2.0274975e-04, -2.7297903e-05, ...,\n", + " -1.8293603e-04, -8.1430808e-05, 2.3813642e-05], dtype=float32), 1.6962708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_146.wav', 'Hä, wie jetzt?', 15, array([ 1.0428468e-05, 1.2862872e-04, 1.4709163e-04, ...,\n", + " 2.5179393e-06, -3.9250128e-05, 1.4990567e-04], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_154.wav', 'Gehst du mit mir kicken?', 24, array([-2.3674214e-05, 1.5158611e-04, 2.0247647e-04, ...,\n", + " -5.0921575e-05, 1.6530334e-04, 2.6747581e-05], dtype=float32), 1.5508541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_165.wav', 'Die Erlösung naht.', 19, array([-2.7500470e-05, 4.6476634e-05, 9.3239294e-05, ...,\n", + " 1.3720182e-04, 3.3580043e-05, 1.6966692e-04], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_170.wav', 'Worauf wartest du noch?', 23, array([ 1.9643597e-04, 1.8858226e-04, 1.2341220e-04, ...,\n", + " 1.9399264e-04, 7.9539248e-05, -8.9550871e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_174.wav', 'Wegen der Sicherheit.', 21, array([ 5.0312192e-05, -4.7642745e-05, 7.9094330e-05, ...,\n", + " 1.6562216e-04, -3.8164351e-05, -8.3325220e-05], dtype=float32), 1.53875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_179.wav', 'Was tun Sie da?', 15, array([8.1822727e-05, 1.5520566e-04, 2.9996689e-04, ..., 9.4358256e-05,\n", + " 6.1927640e-05, 1.5151841e-04], dtype=float32), 1.550875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_190.wav', 'Die Play-Offs haben begonnen.', 29, array([-2.7691500e-04, 
-2.5398214e-04, -1.5421546e-04, ...,\n", + " 3.4238459e-05, -1.6769451e-04, -1.3444168e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_193.wav', 'Spinnst du?', 11, array([ 1.0871515e-04, 1.6241276e-04, -7.8830650e-05, ...,\n", + " -1.6421604e-04, -1.6669222e-04, -1.5261788e-04], dtype=float32), 1.5993333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_194.wav', 'Nicht mit mir!', 14, array([ 4.5433408e-05, -1.3075510e-04, 6.4006963e-05, ...,\n", + " -2.2528745e-04, -1.7135930e-05, -1.1135123e-04], dtype=float32), 1.4539375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_197.wav', 'Lang lebe die Königin!', 23, array([-1.6047362e-04, -1.5451153e-05, -1.0221335e-04, ...,\n", + " 7.2540395e-05, 9.8553333e-05, -3.9703427e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_220.wav', 'Der Punkt geht an euch.', 23, array([ 6.8754802e-05, -3.1321447e-06, 2.6729414e-05, ...,\n", + " 5.2136878e-05, 6.9546691e-06, 1.5569202e-04], dtype=float32), 1.878)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_253.wav', 'Ich geh jetzt duschen.', 22, array([-3.17401755e-05, 7.48557359e-05, -5.43324859e-05, ...,\n", + " -1.39205178e-04, -6.44034174e-07, 1.28346255e-05], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_270.wav', 'Ich bin stärker.', 17, array([ 9.11816460e-05, 1.44324003e-04, -2.98500763e-05, ...,\n", + " 1.31568195e-05, 6.36509794e-05, 6.90339657e-05], dtype=float32), 1.3933541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_273.wav', 'Wie lange fahre ich noch?', 25, array([ 2.9487488e-05, -1.3105408e-04, 5.8441510e-05, ...,\n", + " 3.1229702e-05, -5.4796135e-05, -6.3286854e-05], dtype=float32), 1.6841458333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_275.wav', 'Wer hat es dir verraten?', 24, array([ 1.2313928e-04, 
1.3087156e-04, -1.2932777e-04, ...,\n", + " 4.8921556e-05, 1.4495553e-04, -3.3808697e-05], dtype=float32), 1.8295416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_293.wav', 'Gib nicht anderen die Schuld.', 29, array([-7.5512668e-05, -3.6905835e-06, 6.9531779e-05, ...,\n", + " 4.3623371e-05, 1.8721327e-04, 7.1873088e-05], dtype=float32), 1.9628125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_297.wav', 'Ist es schon so weit gekommen?', 30, array([-4.3128319e-05, -1.7937485e-04, -1.0890597e-04, ...,\n", + " -2.6245858e-04, -1.7716063e-04, 2.2997918e-04], dtype=float32), 1.6114375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_318.wav', 'Einfach reinstechen!', 20, array([ 9.1551570e-05, 8.9795518e-05, -6.6505017e-05, ...,\n", + " 1.0614502e-04, 1.8572784e-05, 1.7793228e-04], dtype=float32), 1.7568333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_320.wav', 'Also bleibt alles beim Alten.', 29, array([-4.5057204e-06, 1.0390608e-04, 2.8324797e-05, ...,\n", + " -9.8345605e-05, -4.1500021e-05, -2.5271966e-05], dtype=float32), 1.8053125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_324.wav', 'Fragen wir das Publikum!', 24, array([ 3.0478600e-06, -1.7624698e-04, -1.1634296e-04, ...,\n", + " 1.3709384e-04, 8.2070706e-05, 1.4319613e-04], dtype=float32), 1.7931875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_336.wav', 'Nicht nur in Norddeutschland.', 29, array([ 1.6894817e-05, 7.2304661e-05, -1.7737957e-04, ...,\n", + " 7.4396456e-05, 1.5326528e-04, -3.0850897e-05], dtype=float32), 1.8537708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_339.wav', 'Lass uns welche wegräumen.', 27, array([-1.3355519e-04, 3.6361063e-05, 1.2765500e-04, ...,\n", + " -4.6465106e-05, -9.3052886e-06, -3.1085176e-06], dtype=float32), 1.9264583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_356.wav', 'Bauch schlägt 
Hirn.', 20, array([-8.7791312e-05, -9.9132430e-06, -7.8506528e-05, ...,\n", + " -1.2898828e-04, 1.9388601e-05, -7.8024947e-05], dtype=float32), 1.7326041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_20_FINAL/20_381.wav', 'Polen Sie die Maschine um!', 26, array([ 8.2736617e-05, 1.0996176e-04, 9.2422182e-05, ...,\n", + " -2.2247934e-05, 7.0410904e-05, -2.1137239e-05], dtype=float32), 1.9385833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_125.wav', 'Danke sehr!', 11, array([ 3.3982175e-05, 3.0489264e-05, -3.2230830e-05, ...,\n", + " 1.3063883e-04, 6.5418164e-05, 1.0737507e-04], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_132.wav', 'Nerve ich dich?', 15, array([-1.3204044e-04, -3.8424434e-05, -1.6640245e-04, ...,\n", + " 2.0048997e-04, 2.0114701e-04, 2.8921696e-04], dtype=float32), 1.8133333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_188.wav', 'Kann ich mal riechen?', 21, array([7.4782380e-05, 1.5360968e-04, 1.7683143e-04, ..., 7.1163136e-05,\n", + " 3.2413329e-05, 1.6134117e-04], dtype=float32), 1.5949583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_195.wav', 'Sehe ich das richtig?', 21, array([-4.4274679e-03, -6.2118913e-03, -5.6534973e-03, ...,\n", + " -5.3494594e-05, 1.0948109e-05, 2.8244473e-05], dtype=float32), 1.8706875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_211.wav', 'Ein Dessert gefällig?', 22, array([ 6.1982937e-05, 8.9088433e-05, 2.1896411e-04, ...,\n", + " -5.3060539e-05, 5.5113655e-05, 2.0669409e-06], dtype=float32), 1.6305416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_264.wav', 'Was schätzen Sie?', 18, array([-9.39443475e-04, -1.31584110e-03, -1.22378767e-03, ...,\n", + " 5.19938067e-06, -1.39896365e-05, 3.26375412e-05], dtype=float32), 1.9933125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_266.wav', 'Hast du Geld dabei?', 19, 
array([-1.3200377e-05, 3.8996362e-04, 1.0263748e-03, ...,\n", + " -2.9147041e-05, 9.2981281e-06, -4.0353654e-05], dtype=float32), 1.707375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_327.wav', 'Augen auf die StraÃ\\x9fe!', 22, array([-1.1210357e-04, -1.8035798e-04, -1.8643556e-04, ...,\n", + " 8.4691441e-05, 5.8400867e-05, 5.8256945e-05], dtype=float32), 1.8399791666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_346.wav', 'Was soll ich da machen?', 23, array([-2.5878362e-05, 2.1881026e-05, -1.2260079e-05, ...,\n", + " 4.5499460e-06, 4.0606970e-05, -2.3619448e-05], dtype=float32), 1.9433541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_349.wav', 'Immer dasselbe mit euch!', 24, array([-2.3236821e-04, -3.3517351e-04, -3.0884243e-04, ...,\n", + " 8.0186677e-05, 1.6797509e-05, -1.6808892e-05], dtype=float32), 1.9652708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_373.wav', 'Kennen wir uns?', 15, array([-5.0764916e-06, -7.3543859e-05, 1.1312031e-05, ...,\n", + " -3.2780910e-05, -1.3342450e-04, -8.3744824e-05], dtype=float32), 1.2833125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_378.wav', 'Redet ihr nicht miteinander?', 28, array([ 3.3598881e-05, 2.8617033e-05, -4.8224880e-05, ...,\n", + " 7.4195086e-06, -4.8723170e-05, 6.5784006e-05], dtype=float32), 1.9491458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_420.wav', 'Ich hasse Rituale.', 18, array([ 7.1912136e-06, 3.0618376e-06, 8.3010753e-05, ...,\n", + " -1.4567961e-05, 1.1762774e-05, 3.1641615e-05], dtype=float32), 1.9995833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_486.wav', 'Wie groÃ\\x9f ist er denn?', 22, array([ 3.0858202e-05, 7.4509022e-05, 1.3619277e-04, ...,\n", + " -3.3022930e-06, 9.8051796e-06, -2.7459086e-05], dtype=float32), 1.867625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_537.wav', 'Es ist zum Heulen.', 18, 
array([-1.91718082e-05, 6.43216190e-05, 1.19517106e-04, ...,\n", + " 1.98961898e-05, 2.61543628e-05, -1.34301990e-06], dtype=float32), 1.9879583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_544.wav', 'Nimm es ihm nicht übel.', 24, array([ 4.2532893e-08, -6.0193088e-05, 4.5228205e-07, ...,\n", + " 1.0533330e-04, 4.6245714e-05, -1.5597003e-05], dtype=float32), 1.7243333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_547.wav', 'Um Gottes Willen!', 17, array([-1.3659755e-05, -1.1149528e-04, -7.7302495e-05, ...,\n", + " -5.2225241e-05, -6.4986933e-05, -1.9107327e-05], dtype=float32), 1.5258125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_570.wav', 'Voll der Lauch!', 15, array([ 2.3544633e-05, -8.2356913e-05, -8.4443280e-05, ...,\n", + " -8.3270104e-05, -1.1799393e-04, -4.4736080e-05], dtype=float32), 1.8773958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_587.wav', 'Das will ich meinen!', 20, array([ 1.15228731e-05, -1.00152036e-04, -3.91713802e-05, ...,\n", + " -3.00788033e-05, -2.60362140e-05, -2.54406623e-05], dtype=float32), 1.823375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_595.wav', 'Gib dir keine Mühe!', 20, array([1.1918874e-05, 7.7710565e-06, 2.2653954e-05, ..., 1.2088865e-06,\n", + " 7.3900424e-05, 4.7324560e-05], dtype=float32), 1.7467083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_612.wav', 'Entschuldige', 12, array([-3.3377805e-06, -1.3742609e-05, -3.8612947e-05, ...,\n", + " -4.1617693e-07, -5.6907498e-05, -6.3263155e-06], dtype=float32), 1.096375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_21.wav', 'Ich glaube, ja.', 15, array([-8.5291895e-06, -1.9790486e-05, 2.0588757e-05, ...,\n", + " 4.3540977e-06, 3.3659559e-05, 2.8167133e-05], dtype=float32), 1.7166458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_48.wav', 'Was denn jetzt?', 15, array([3.3551037e-06, 
7.2315837e-05, 9.8261240e-05, ..., 1.8147666e-04,\n", + " 1.3495231e-04, 1.4128252e-05], dtype=float32), 1.5235625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_53.wav', 'Ist es das wert?', 16, array([ 6.972987e-06, -6.975743e-05, -8.996664e-05, ..., -8.399185e-06,\n", + " -8.876120e-05, -7.246290e-05], dtype=float32), 1.8518125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_118.wav', 'Findest du?', 11, array([-1.12564965e-04, -6.36710465e-05, -1.04058718e-05, ...,\n", + " 9.31948132e-04, 8.68959934e-04, 9.69569141e-04], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_139.wav', \"Wohl bekommt's.\", 15, array([-5.15776883e-05, -1.17497526e-04, -1.66595215e-04, ...,\n", + " 2.18412912e-04, 1.14814145e-04, 9.11775787e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_242.wav', 'An die Latte!', 13, array([-2.9736115e-05, 6.2128674e-05, -1.7713173e-06, ...,\n", + " -9.5688220e-06, -3.3155960e-05, -2.0475885e-05], dtype=float32), 1.3866666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_256.wav', 'Wie lange noch?', 15, array([-2.0701043e-05, 4.3786262e-05, -9.4478482e-06, ...,\n", + " -5.2062300e-05, -2.7314949e-05, -9.1643757e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_287.wav', 'Halt die Klappe!', 16, array([5.4399417e-05, 1.7967819e-04, 1.5970672e-04, ..., 6.5669185e-05,\n", + " 5.5145654e-05, 4.6019220e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_320.wav', 'Mach selber!', 12, array([-6.9740723e-05, 4.4339331e-06, -8.3184044e-05, ...,\n", + " 1.4031340e-05, 1.2219901e-05, 7.0223352e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_406.wav', 'Alles frisch?', 13, array([-1.15522525e-04, -1.33178124e-04, -1.96026522e-04, ...,\n", + " 5.01462309e-05, 9.76682568e-05, 2.38532848e-05], 
dtype=float32), 1.4626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_577.wav', 'Nun ja.', 7, array([-4.87583275e-05, -1.09872217e-05, -2.24729556e-05, ...,\n", + " 4.66253441e-05, 1.96394685e-04, 1.52344255e-05], dtype=float32), 1.2373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_584.wav', 'Wer macht Kaffee?', 17, array([ 3.8115049e-05, -9.6357744e-06, 7.8119905e-05, ...,\n", + " -2.0809734e-04, -1.8620661e-04, -1.3914006e-04], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_3_FINAL/3_666.wav', 'Verflixt noch mal!', 18, array([-2.2882066e-04, -2.9250007e-04, -2.8351255e-04, ...,\n", + " 1.1955178e-04, 1.7373663e-04, 7.4429918e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_30.wav', 'Schweigen Sie!', 14, array([-3.1788008e-05, -3.4064793e-05, -2.7987528e-05, ...,\n", + " -1.5091732e-05, -2.6680038e-05, -3.8527149e-05], dtype=float32), 1.7066666666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_152.wav', 'Danke für die Blumen.', 22, array([ 1.7122936e-06, 6.9385942e-06, 3.6246149e-07, ...,\n", + " -1.4888439e-05, 2.3918087e-06, -7.6587348e-06], dtype=float32), 1.8791666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_218.wav', 'Und das stimmt sogar.', 21, array([ 4.1728057e-05, 5.5362845e-05, 6.8501140e-05, ...,\n", + " -2.8829272e-05, -9.4307861e-06, -1.7323953e-05], dtype=float32), 1.77075)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_228.wav', 'Oder etwa doch?', 15, array([-1.9058538e-05, -1.6082793e-05, -2.4990761e-05, ...,\n", + " -3.7682898e-05, -2.6903717e-05, -2.3563476e-05], dtype=float32), 1.8430416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_235.wav', 'Lass es gut sein.', 17, array([2.5800218e-05, 2.4886122e-05, 2.6301905e-05, ..., 2.0628368e-05,\n", + " 1.3992375e-05, 1.1405512e-05], dtype=float32), 1.8430416666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_243.wav', 'Was für ein Schwachsinn!', 25, array([-3.7606616e-05, -4.6087491e-05, -5.2579282e-05, ...,\n", + " -9.6937197e-07, -2.7171711e-05, -4.9796104e-06], dtype=float32), 1.79625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_247.wav', 'Meinen Sie etwa mich?', 21, array([3.4092998e-05, 2.4871710e-05, 3.1290274e-05, ..., 3.8184229e-05,\n", + " 3.8311930e-05, 1.9864283e-05], dtype=float32), 1.7936666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_266.wav', 'Doch, der kommt mit.', 20, array([-8.7682038e-06, 3.3905403e-06, -2.5130439e-06, ...,\n", + " -7.3065071e-06, -4.2862930e-06, -2.6758978e-06], dtype=float32), 1.9898125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_324.wav', 'Du willst eine Revanche?', 24, array([ 7.33632942e-06, 5.97303369e-06, 5.83600695e-06, ...,\n", + " 1.49849775e-05, 1.08204476e-05, -3.58769762e-06], dtype=float32), 1.9875833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_359.wav', 'Achtung, Lebensgefahr!', 22, array([ 1.4763166e-05, 2.4559184e-05, -6.1735605e-06, ...,\n", + " -4.0966352e-06, -3.3091931e-06, -8.6383498e-06], dtype=float32), 1.9786666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_368.wav', 'Sag doch was!', 13, array([ 2.2444649e-06, 7.6022111e-06, 4.6965952e-06, ...,\n", + " -3.8131137e-05, -2.2596261e-05, -3.6410544e-05], dtype=float32), 1.6553333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_377.wav', 'Klar geht das!', 14, array([ 7.9997551e-07, 7.2854018e-06, 1.5502587e-06, ...,\n", + " 4.2983497e-06, 1.1067883e-06, -6.2062031e-06], dtype=float32), 1.6706666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_399.wav', 'Ganz wie ihre Mutter!', 21, array([-1.3625373e-05, -1.5324851e-05, -8.2329316e-06, ...,\n", + " -3.1325493e-05, -3.4243036e-05, -3.8296192e-05], dtype=float32), 1.664)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_404.wav', \"Und ab geht's!\", 14, array([-1.6434673e-05, -4.6597820e-06, -3.0193429e-05, ...,\n", + " 5.6945028e-06, 4.0367054e-06, 2.6991445e-06], dtype=float32), 1.7606666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_409.wav', 'Mahlzeit!', 9, array([-1.6801674e-05, -1.1057600e-05, -2.5246043e-05, ...,\n", + " -5.8098987e-08, -1.3756068e-05, 7.1873791e-07], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_417.wav', 'Was für ein Ding?', 18, array([ 6.9620419e-06, 2.2064933e-05, -7.5111966e-06, ...,\n", + " -2.0811036e-05, -7.9874835e-06, -4.7895933e-06], dtype=float32), 1.6473333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_468.wav', 'Genau einen.', 12, array([-7.29009771e-05, -8.52458907e-05, -1.06200605e-04, ...,\n", + " -5.32185413e-06, -1.07338547e-05, -8.40487064e-06], dtype=float32), 1.3666666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_514.wav', 'Zu Befehl!', 10, array([-2.3591008e-05, -3.5732090e-05, -3.4227767e-05, ...,\n", + " -2.8442626e-05, 1.2019399e-05, -1.3777444e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_611.wav', 'So viel dazu.', 13, array([ 7.4472086e-06, 7.6988908e-06, 1.9191646e-05, ...,\n", + " -3.9837760e-06, -5.9473659e-06, -1.5347923e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_633.wav', 'Doch nicht diese!', 17, array([-1.5188496e-05, -1.3384078e-05, -2.5278267e-05, ...,\n", + " -9.0744479e-06, -1.7723884e-05, -8.7737453e-06], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_637.wav', 'Da musste durch.', 16, array([-6.1405983e-05, -6.6703440e-05, -6.7519111e-05, ...,\n", + " -3.0437115e-05, -1.0807975e-05, -2.7072128e-05], dtype=float32), 1.752)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_660.wav', 'Bitte haben 
Sie Geduld.', 23, array([-5.3847558e-05, -7.3710136e-05, -6.7579982e-05, ...,\n", + " -1.0283680e-05, -3.1539796e-05, -2.2386694e-05], dtype=float32), 1.7706666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_668.wav', 'Na logo!', 8, array([-2.3636436e-05, -1.5810723e-05, -2.8241622e-05, ...,\n", + " -1.3751334e-06, 1.1204750e-05, 6.0684874e-06], dtype=float32), 0.992)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_676.wav', 'Ich bin Student.', 16, array([ 7.12830888e-06, -1.04677674e-05, 5.06380366e-06, ...,\n", + " 2.56778890e-06, 2.41716316e-06, 1.42220715e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_721.wav', 'Warum glaubst du ihm?', 21, array([-2.8855115e-05, -2.1601849e-05, -4.5714023e-05, ...,\n", + " 1.0700950e-06, -8.6324471e-06, -1.1586128e-05], dtype=float32), 1.888)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_767.wav', 'Alle Lichter einschalten', 24, array([ 3.82986327e-05, 4.59369221e-05, 5.11867729e-05, ...,\n", + " -3.22036831e-05, -1.03011635e-05, -3.75456489e-06], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_775.wav', 'Schlaf dich gesund!', 19, array([ 8.9927544e-06, 3.7294924e-07, 2.0666816e-07, ...,\n", + " -1.4574092e-05, 9.9155943e-07, -1.1447136e-05], dtype=float32), 1.8826666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_4_FINAL/4_785.wav', 'Wer spricht da?', 15, array([-5.0560098e-05, -5.3028423e-05, -5.4164509e-05, ...,\n", + " 1.4739732e-05, 9.2475852e-07, 2.9554553e-06], dtype=float32), 1.8953333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_9.wav', 'Kannst du häkeln?', 18, array([ 5.7386926e-05, 8.2160957e-05, 5.5038501e-05, ...,\n", + " -4.3172963e-06, 4.1677453e-05, 4.7943948e-05], dtype=float32), 1.6993333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_44.wav', 'Bitte kommen!', 13, array([1.0956727e-04, 1.5614097e-04, 
1.3331856e-04, ..., 1.3650022e-05,\n", + " 1.1109641e-05, 1.3527738e-06], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_53.wav', 'Hör zu!', 8, array([-6.0608932e-06, -4.1002470e-05, 2.2774377e-05, ...,\n", + " -8.5628499e-06, -1.7102975e-05, -5.2866948e-05], dtype=float32), 1.3013333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_54.wav', 'Bitte, bleib da.', 16, array([ 3.5020625e-05, 5.4955650e-05, 8.0653575e-05, ...,\n", + " -2.3735600e-05, 3.2219548e-05, -2.8188835e-05], dtype=float32), 1.3893333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_64.wav', 'Was piept hier so?', 18, array([4.8969712e-05, 1.0184415e-04, 1.0672094e-04, ..., 1.0047335e-04,\n", + " 8.2428909e-05, 7.4903524e-05], dtype=float32), 1.476)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_65.wav', 'Die Tränen sind echt.', 22, array([-2.5628888e-04, -3.2446094e-04, -2.8078147e-04, ...,\n", + " 6.0525483e-05, 4.5224155e-05, 3.3287215e-05], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_66.wav', 'Oh, wie ist das schön!', 23, array([-1.3561957e-04, -2.9620592e-04, -1.1127204e-04, ...,\n", + " -1.3441611e-05, -2.0591922e-05, -4.1845051e-05], dtype=float32), 1.9373333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_70.wav', 'Nein, die andere.', 17, array([1.08759763e-04, 2.17104956e-04, 2.50456098e-04, ...,\n", + " 1.99571132e-05, 1.15319264e-04, 1.09982837e-04], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_73.wav', 'Der Hunger treibt es hinein!', 28, array([-7.6006359e-04, -1.0618430e-03, -9.1635465e-04, ...,\n", + " -2.1929874e-05, -3.9133694e-05, -2.3749919e-05], dtype=float32), 1.8006666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_81.wav', 'Dann machen alle Mann kehrt.', 28, array([-1.5950583e-04, -1.6477516e-04, -1.3784993e-04, ...,\n", + " 6.2336148e-05, 
1.8180552e-05, 9.2034599e-05], dtype=float32), 1.952)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_90.wav', 'Komm mal klar.', 14, array([2.0439363e-04, 2.6905714e-04, 1.8548965e-04, ..., 3.1710202e-05,\n", + " 2.3530252e-05, 2.1564969e-05], dtype=float32), 1.4186666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_95.wav', 'Ist noch alles dran?', 20, array([-2.2047247e-04, -3.2201153e-04, -2.8738266e-04, ...,\n", + " -7.7452714e-05, -4.3362299e-05, 7.5945250e-06], dtype=float32), 1.632)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_99.wav', 'Nie glaubt sie mir.', 19, array([ 1.5801163e-05, 5.7899309e-05, 3.1942949e-05, ...,\n", + " -3.0608622e-05, -8.0015372e-05, -3.3063152e-05], dtype=float32), 1.5613333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_207.wav', 'Sperrt sie ein!', 15, array([1.7913821e-04, 3.0638310e-04, 2.4345164e-04, ..., 5.7913669e-05,\n", + " 2.3223187e-05, 5.4880878e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_249.wav', 'Ja was geht denn ab?', 20, array([-1.0661902e-04, -9.4065879e-05, -6.9818758e-05, ...,\n", + " -3.3508950e-05, 3.7770699e-06, 2.3758860e-06], dtype=float32), 1.9973333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_250.wav', 'Dümmste Ausrede ever!', 22, array([ 3.16905534e-05, 3.74705655e-06, -2.55898794e-05, ...,\n", + " 4.44019097e-05, 2.41961206e-05, 1.06514235e-05], dtype=float32), 1.9806666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_251.wav', 'Wir sind hier ja unter uns.', 27, array([-3.3862656e-04, -5.0057843e-04, -4.7798100e-04, ...,\n", + " 3.9128430e-05, -4.0246316e-05, -1.3086459e-05], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_278.wav', 'Er ist ein User!', 16, array([ 5.7516689e-05, 4.9558192e-05, 6.3942927e-05, ...,\n", + " -2.3214375e-06, 1.1798247e-05, 3.6477853e-05], dtype=float32), 
1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_282.wav', 'Zurückbleiben, bitte!', 22, array([ 1.8404999e-04, 2.6386097e-04, 3.0643051e-04, ...,\n", + " -6.5650514e-05, -5.8646885e-05, -6.5778695e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_287.wav', 'Gut getrollt.', 13, array([-3.0470208e-05, -6.1425657e-05, -3.8205933e-05, ...,\n", + " 6.9129404e-05, 1.1258064e-04, 1.2031732e-04], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_336.wav', 'Ganz sicher sogar.', 18, array([2.2912030e-04, 2.5114618e-04, 1.9525687e-04, ..., 8.7549386e-05,\n", + " 8.5029111e-05, 7.8950601e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_341.wav', 'Wohl kaum.', 10, array([ 1.6102573e-04, 1.7911245e-04, 1.5706589e-04, ...,\n", + " -2.9753184e-05, -4.4280365e-05, 3.1124373e-06], dtype=float32), 1.2586666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_405.wav', 'Wie geht das?', 13, array([-1.6796951e-04, -1.9163813e-04, -1.9830326e-04, ...,\n", + " -5.0582935e-06, 1.2309533e-05, -2.6891148e-05], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_407.wav', 'Befehl ist Befehl!', 18, array([ 9.3892188e-05, 1.0890782e-04, 9.6308002e-05, ...,\n", + " -3.0468544e-05, -2.8461071e-05, -7.1021976e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_412.wav', 'Mit wem spreche ich?', 20, array([ 7.7782068e-05, 9.2144561e-05, 2.8574361e-05, ...,\n", + " -1.1466493e-05, 5.7958755e-06, 6.2275390e-06], dtype=float32), 1.7813333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_422.wav', 'An schlechten Tagen ja.', 23, array([ 4.2690190e-05, -2.3120232e-05, -2.5523063e-05, ...,\n", + " 2.1898361e-05, -2.7946093e-05, 4.6620054e-05], dtype=float32), 1.9833333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_460.wav', 'Sie haben richtig geraten!', 26, array([-9.0950904e-05, -1.4647168e-04, -7.1847418e-05, ...,\n", + " 2.8589966e-05, -2.2244849e-05, 1.1577226e-05], dtype=float32), 1.9626666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_476.wav', 'Alle sprechen so leise.', 23, array([-6.9834332e-06, -3.1972188e-05, -3.9375213e-05, ...,\n", + " -2.6475973e-05, 1.4716678e-05, -4.5046556e-05], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_477.wav', 'Woher willst du das wissen?', 27, array([-2.12417421e-04, -2.56415573e-04, -2.42886104e-04, ...,\n", + " 9.67599408e-05, 9.51452384e-05, 1.15144765e-04], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_488.wav', 'Anders als man denkt.', 21, array([ 1.8948530e-04, 3.4113604e-04, 1.9700162e-04, ...,\n", + " -7.6619792e-05, -3.6041514e-05, -1.6451453e-06], dtype=float32), 1.9413333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_495.wav', 'Runter mit den Waffen!', 22, array([ 1.12369155e-04, 4.44092657e-05, 8.84383553e-05, ...,\n", + " -7.52444794e-06, -4.84231314e-05, -4.22670855e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_504.wav', 'Und jetzt?', 10, array([-5.6267181e-06, -5.9708807e-05, -3.4106170e-06, ...,\n", + " -1.0430286e-04, -1.2670284e-04, -1.4261479e-04], dtype=float32), 1.344)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_511.wav', 'Jein.', 5, array([ 5.89297160e-05, 1.19100565e-04, 6.77589633e-05, ...,\n", + " -1.61726966e-05, -7.95948727e-05, -2.88161173e-05], dtype=float32), 1.0453333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_522.wav', 'Vorsicht Stufe!', 15, array([ 6.2581657e-06, 4.7380847e-05, 8.6832886e-05, ...,\n", + " 6.6710568e-06, 2.2640632e-05, -3.9922857e-06], dtype=float32), 1.3866666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_526.wav', 'War ich zu zickig?', 18, array([1.6193213e-03, 2.2825657e-03, 2.0064272e-03, ..., 6.6650551e-05,\n", + " 7.2444294e-05, 8.5881074e-05], dtype=float32), 1.728)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_530.wav', 'Wo drückt der Schuh?', 21, array([-1.46389175e-05, 3.62552214e-06, -9.26516877e-05, ...,\n", + " -3.03967099e-05, -1.01135854e-04, 3.96938458e-06], dtype=float32), 1.536)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_534.wav', 'Kann das noch warten?', 21, array([1.74110639e-04, 1.80995979e-04, 2.26840231e-04, ...,\n", + " 1.18193166e-04, 7.83515134e-05, 5.11603030e-05], dtype=float32), 1.664)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_539.wav', 'Passen die Sätze so?', 21, array([-3.1769360e-04, -4.7089945e-04, -4.3369626e-04, ...,\n", + " 1.6810809e-04, 5.3649095e-05, 1.4577823e-04], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_542.wav', 'Ã\\x9cbermorgen.', 12, array([-2.4301407e-04, -3.5653665e-04, -2.1825638e-04, ...,\n", + " 6.1351508e-05, 9.2918686e-05, 8.8779299e-05], dtype=float32), 1.1306666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_589.wav', 'Ich mag deinen Mantel.', 22, array([-2.1532472e-04, -3.8814778e-04, -2.9697348e-04, ...,\n", + " -3.1324416e-05, -3.5802710e-05, 8.7614599e-06], dtype=float32), 1.6746666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_604.wav', 'Wie macht er das bloÃ\\x9f?', 23, array([-1.8150010e-04, -2.0398400e-04, -1.5460433e-04, ...,\n", + " -3.4698380e-05, -6.5080814e-05, -1.8794183e-06], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_666.wav', 'Was soll ich sagen?', 19, array([-8.8535160e-07, -7.4019059e-05, 7.4082243e-05, ...,\n", + " -6.2706102e-05, 2.9464120e-06, -1.1627621e-05], dtype=float32), 1.7493333333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_694.wav', 'Wie misst man das?', 18, array([ 2.5176766e-04, 1.8225121e-04, 3.6178919e-04, ...,\n", + " 2.0104897e-06, 5.5382880e-05, -2.6957323e-05], dtype=float32), 1.92)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_704.wav', 'Mir fehlen die Worte.', 21, array([ 1.7020236e-04, 3.3776514e-04, 3.4704659e-04, ...,\n", + " 4.7222587e-05, -1.5073445e-05, -1.6250522e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_714.wav', 'Gehen wir?', 10, array([ 1.5890028e-04, 1.6513607e-04, 1.7650245e-04, ...,\n", + " 1.3219027e-05, 3.1738135e-05, -9.3036484e-05], dtype=float32), 1.3226666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_723.wav', 'Ich komme noch mal dran.', 24, array([-4.6879621e-05, -1.1869792e-04, -5.2995206e-06, ...,\n", + " 1.0155864e-05, -8.1713588e-05, -3.8661747e-05], dtype=float32), 1.8773333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_742.wav', 'Bitte schön!', 13, array([-3.4623430e-04, -4.4416677e-04, -3.0297900e-04, ...,\n", + " 5.3006592e-05, 5.1509913e-05, 7.1368544e-05], dtype=float32), 1.1733333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_743.wav', 'Das haben sie gesagt.', 21, array([-2.3902958e-05, 4.5714452e-05, 7.7266725e-07, ...,\n", + " -5.0056198e-05, 3.0718882e-05, 6.8078203e-05], dtype=float32), 1.8346666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_799.wav', 'Ist der Kugelschreiber blau?', 28, array([-1.6907173e-04, -2.9390136e-04, -2.4633619e-04, ...,\n", + " 5.9892503e-05, 6.6163295e-05, 1.4039288e-04], dtype=float32), 1.984)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_21.wav', 'Alles nach vorne!', 17, array([2.0106880e-04, 3.4844220e-04, 2.3129249e-04, ..., 9.6451986e-05,\n", + " 7.4439027e-05, 9.3146300e-05], dtype=float32), 1.5786666666666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_55.wav', 'Nichts dergleichen.', 19, array([-2.7673854e-04, -3.7996779e-04, -2.6658855e-04, ...,\n", + " -4.9654176e-07, -4.3088527e-05, -2.0399790e-05], dtype=float32), 1.5786666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_66.wav', \"Langsam nervt's.\", 16, array([ 7.7058452e-05, 4.7672478e-05, 2.6094380e-05, ...,\n", + " -6.2562191e-05, 2.7688688e-07, -1.2926825e-05], dtype=float32), 1.7493333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_98.wav', 'Seid ihr verrückt?', 19, array([-1.3435316e-04, -1.8146966e-04, -1.6307829e-04, ...,\n", + " -3.7551112e-07, 1.6737657e-05, 1.7336246e-05], dtype=float32), 1.6426666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_110.wav', 'Gib mir fünf!', 14, array([1.9428060e-04, 2.9409130e-04, 2.5521498e-04, ..., 1.9916235e-05,\n", + " 3.7017526e-05, 2.2721317e-05], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_140.wav', 'Auch wieder wahr.', 17, array([-6.21244908e-05, -1.39888449e-04, -1.16935575e-04, ...,\n", + " -9.32170296e-05, -7.70114566e-05, -1.37492418e-04], dtype=float32), 1.3653333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_167.wav', 'Sicher ist sicher.', 18, array([ 1.6774700e-04, 2.7458806e-04, 1.3175888e-04, ...,\n", + " -3.9984116e-05, -4.5541576e-05, 2.3846082e-05], dtype=float32), 1.792)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_175.wav', 'Wie soll ich sagen?', 19, array([-2.0688836e-05, -6.4790765e-05, -1.1548823e-05, ...,\n", + " -1.0844359e-05, -3.6513706e-05, -4.4623717e-05], dtype=float32), 1.6213333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_200.wav', 'Ist doch Ehrensache!', 20, array([ 1.07319385e-04, 1.08591557e-04, 6.78624638e-05, ...,\n", + " 3.66282293e-05, -4.84154953e-05, -2.46383879e-05], dtype=float32), 1.92)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_258.wav', 'Jeder Mensch ist anders.', 24, array([ 9.4392788e-05, 1.3444535e-04, 1.5623294e-04, ...,\n", + " -9.0343368e-05, -1.2968398e-04, -2.8964683e-05], dtype=float32), 1.8986666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_262.wav', 'Nächstes Mal vielleicht.', 25, array([-4.9963495e-04, -7.3549181e-04, -5.7168922e-04, ...,\n", + " 5.7476438e-05, 8.7852583e-05, 6.3541149e-05], dtype=float32), 1.76)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_290.wav', 'Ich wollte nur nett sein.', 25, array([-3.0248266e-04, -4.1539475e-04, -4.3182663e-04, ...,\n", + " -6.8298694e-05, -3.5496461e-05, -8.2268067e-05], dtype=float32), 1.856)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_293.wav', 'Sie haben Post.', 15, array([7.0743052e-05, 1.5683858e-04, 7.2936782e-05, ..., 3.4985551e-05,\n", + " 2.5512374e-05, 4.4657580e-05], dtype=float32), 1.6)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_302.wav', 'Mit dem Raumschiff bitte!', 25, array([-3.3868386e-05, -4.2923082e-05, 2.2873657e-05, ...,\n", + " 2.9917417e-05, -9.9794874e-05, -1.3378082e-04], dtype=float32), 1.5470625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_309.wav', 'Hä, wieso das denn?', 20, array([-4.2834796e-05, -1.3094838e-04, -2.1130700e-05, ...,\n", + " -4.5203033e-05, -6.0939405e-05, -4.7152938e-05], dtype=float32), 1.9385)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_321.wav', 'Lass dich nicht so hängen!', 27, array([ 3.3312430e-05, 1.1557561e-04, 1.7304946e-04, ...,\n", + " -5.3516556e-05, -6.5977452e-05, -8.5248823e-05], dtype=float32), 1.6589166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_333.wav', 'Sehen wir uns in der Bib?', 25, array([1.8330962e-04, 1.0809512e-04, 2.0564985e-04, ..., 5.3472275e-05,\n", + " 1.1819158e-04, 1.3498007e-04], dtype=float32), 1.9571458333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_340.wav', 'Klingt logisch.', 15, array([-4.9080444e-07, -4.6037778e-05, -1.0552061e-04, ...,\n", + " -7.5399061e-05, -1.1574150e-04, -1.1011600e-04], dtype=float32), 1.137)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_361.wav', 'Lies mir etwas vor!', 19, array([-5.9860780e-05, -1.2714561e-04, -4.6063276e-05, ...,\n", + " 1.3993531e-04, 1.7140653e-04, 1.5545388e-04], dtype=float32), 1.5284375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_368.wav', 'Nö, nicht wirklich.', 20, array([1.4233610e-05, 5.8029418e-05, 2.2922040e-05, ..., 2.8016962e-04,\n", + " 1.9504840e-04, 1.6919435e-04], dtype=float32), 1.77075)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_401.wav', 'Besser als gar nichts.', 22, array([-1.9661777e-04, -3.8629526e-04, -3.8140707e-04, ...,\n", + " 4.2625456e-06, 9.6469674e-05, 2.5569330e-05], dtype=float32), 1.7055)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_402.wav', 'Lass mich doch mal träumen.', 28, array([5.1605228e-05, 2.0454232e-05, 5.4702823e-06, ..., 1.0539140e-04,\n", + " 9.8325436e-05, 6.1908002e-05], dtype=float32), 1.87325)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_407.wav', 'Wochenende!', 11, array([-7.1158116e-05, -1.3735623e-04, -1.4360537e-04, ...,\n", + " 7.2980845e-05, -2.7338607e-05, -2.3744215e-06], dtype=float32), 1.0251666666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_410.wav', 'Ich habe dich gewarnt.', 22, array([-2.9008405e-04, -3.9160642e-04, -3.8535651e-04, ...,\n", + " -8.1862388e-05, -2.1166212e-04, -1.1729619e-04], dtype=float32), 1.5563958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_420.wav', 'Kann schon sein.', 16, array([8.5848145e-04, 1.2030958e-03, 1.0428407e-03, ..., 9.0862151e-05,\n", + " 1.8885999e-04, 1.3144755e-04], dtype=float32), 1.2395208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_430.wav', 
'Schön gespielt.', 16, array([-3.1265599e-04, -3.5982658e-04, -3.4920897e-04, ...,\n", + " -5.9947542e-05, -2.8197737e-05, -8.6103646e-05], dtype=float32), 1.3606666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_432.wav', 'Gut geschlafen?', 15, array([-4.6266021e-05, -4.5735891e-05, -1.5800438e-04, ...,\n", + " -5.1101240e-05, -4.5094261e-05, -1.9669098e-05], dtype=float32), 1.2488333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_435.wav', 'Auf das Wetter natürlich auch.', 31, array([-3.5034932e-04, -4.7157385e-04, -4.0150300e-04, ...,\n", + " 1.4378574e-04, 3.5348174e-05, 1.3807646e-04], dtype=float32), 1.9664583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_437.wav', 'Komm, geh weg!', 14, array([ 3.15589714e-05, 1.08517845e-04, 6.59165744e-05, ...,\n", + " -1.43856349e-04, -9.36611250e-05, -1.37200404e-04], dtype=float32), 1.4119375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_469.wav', 'Schluss mit lustig!', 19, array([ 1.0199297e-04, 1.2600295e-04, 1.6211855e-04, ...,\n", + " -1.5054672e-04, -7.8931960e-05, 6.7272131e-06], dtype=float32), 1.4259166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_483.wav', 'Das spart Geschirr.', 19, array([ 6.6607544e-04, 7.1844418e-04, 6.1214896e-04, ...,\n", + " -3.3901462e-05, 1.3226962e-04, 3.8378406e-05], dtype=float32), 1.8080208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_486.wav', 'Das haben Recherchen ergeben.', 29, array([-9.0566078e-05, -2.1272554e-04, -1.9089306e-04, ...,\n", + " 9.4858078e-05, 8.9547662e-05, 7.4881907e-05], dtype=float32), 1.9571458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_487.wav', 'Frohes Schaffen!', 16, array([ 6.8461159e-05, 1.5294057e-04, 2.2618793e-04, ...,\n", + " -2.1603348e-05, -5.1863241e-05, -6.0653092e-06], dtype=float32), 1.337375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_500.wav', 'Sie 
sind ja noch blutjung!', 26, array([-0.00065145, -0.00103323, -0.00116705, ..., -0.0001188 ,\n", + " -0.00014697, -0.00013791], dtype=float32), 1.8639375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_516.wav', 'Lebe ich noch?', 14, array([-4.3064877e-04, -5.6503405e-04, -4.1817623e-04, ...,\n", + " -1.6641241e-04, -1.2653919e-04, -8.6205284e-05], dtype=float32), 1.1090416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_527.wav', 'Nicht dafür!', 13, array([ 3.5247151e-04, 4.8163909e-04, 3.9777748e-04, ...,\n", + " -5.2257688e-05, -3.3391923e-05, -1.8325276e-05], dtype=float32), 1.137)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_534.wav', 'Genau hundert Stück.', 21, array([-0.00059065, -0.00093307, -0.00079542, ..., 0.00016691,\n", + " 0.00026112, 0.00016139], dtype=float32), 1.8732708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_535.wav', 'Wie ist das möglich?', 21, array([ 3.7494919e-04, 5.0490367e-04, 3.7185123e-04, ...,\n", + " 4.3858363e-06, -5.6393877e-05, -6.9622547e-05], dtype=float32), 1.3886458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_536.wav', 'Alles wiederholt sich.', 22, array([-7.8303702e-03, -9.4565414e-03, 4.3799067e-03, ...,\n", + " -7.5256619e-05, -4.4781635e-05, -4.8768667e-05], dtype=float32), 1.37)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_538.wav', 'Der Klügere gibt nach.', 23, array([-3.3002507e-04, -4.8394629e-04, -4.5790782e-04, ...,\n", + " -1.5844591e-04, -3.2335000e-05, -1.1339883e-04], dtype=float32), 1.4259166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_559.wav', 'Schwing die Hufe!', 17, array([-0.00077766, -0.00118464, -0.00101971, ..., -0.00019519,\n", + " -0.00011075, -0.00013927], dtype=float32), 1.3233958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_561.wav', 'Was das wieder kostet!', 22, array([ 8.5937936e-04, 1.1237016e-03, 9.1907283e-04, 
...,\n", + " 2.4701139e-05, -1.2547316e-04, -5.1732359e-06], dtype=float32), 1.6775416666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_563.wav', 'Wieso immer ich?', 16, array([4.5056498e-04, 7.2014128e-04, 6.0793286e-04, ..., 8.4482606e-05,\n", + " 9.7867851e-05, 2.6745778e-05], dtype=float32), 1.5843541666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_582.wav', 'Dann gäbe es dich jetzt nicht.', 31, array([-2.4657813e-04, -3.9872411e-04, -3.3457237e-04, ...,\n", + " 1.6457469e-05, -1.5761821e-05, 1.1328906e-04], dtype=float32), 1.9944166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_585.wav', 'Dem werde ich Beine machen!', 27, array([0.00027461, 0.00040794, 0.00034263, ..., 0.00012492, 0.00024055,\n", + " 0.00019042], dtype=float32), 1.9850833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_588.wav', 'Wollen wir Ihn herein lassen?', 29, array([-3.2398489e-04, -4.3375781e-04, -3.6100275e-04, ...,\n", + " 1.1542152e-04, 9.4435090e-05, 1.1465035e-04], dtype=float32), 1.9198541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_590.wav', 'Richtig geraten!', 16, array([-2.6969259e-04, -4.4567345e-04, -5.3715584e-04, ...,\n", + " 6.1917281e-06, 1.5911644e-05, 3.0031568e-05], dtype=float32), 1.2954375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_634.wav', 'Nun sag schon!', 14, array([-0.00074525, -0.0010401 , -0.00091129, ..., 0.00015909,\n", + " 0.00022603, 0.00013058], dtype=float32), 1.0997291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_658.wav', 'Mit Vergnügen!', 15, array([-1.9300323e-04, -2.6942717e-04, -2.3031878e-04, ...,\n", + " 6.9992027e-05, 5.8482234e-05, 1.2584617e-04], dtype=float32), 1.1929166666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_659.wav', 'Komm sofort her!', 16, array([ 5.0228823e-04, 8.3419622e-04, 7.3006074e-04, ...,\n", + " 4.1768268e-05, -4.2891694e-05, 
-7.8192716e-05], dtype=float32), 1.4725208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_674.wav', 'Chill mal!', 10, array([ 3.6116564e-04, 5.9050595e-04, 4.8674442e-04, ...,\n", + " -1.4056740e-04, -6.9539550e-05, -1.2587184e-04], dtype=float32), 1.0624583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_677.wav', 'Jetzt mal Butter bei die Fische.', 32, array([-0.00017322, -0.00025202, -0.0003011 , ..., -0.00014372,\n", + " -0.00011187, -0.00014939], dtype=float32), 1.9198541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_705.wav', 'Das wird Macken geben.', 22, array([ 2.6667553e-06, 2.4150137e-05, 6.4756452e-05, ...,\n", + " -7.3486663e-05, -7.0459449e-05, 4.1346510e-05], dtype=float32), 1.7334583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_707.wav', 'Hilf mir mal auf die Sprünge.', 30, array([ 3.0066914e-04, 4.8592529e-04, 4.8968260e-04, ...,\n", + " -2.9595327e-05, -4.5949713e-05, -2.5512512e-05], dtype=float32), 1.8452916666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_747.wav', 'Versuch macht klug.', 19, array([-6.13919692e-04, -8.45544797e-04, -7.43770273e-04, ...,\n", + " 9.61075566e-05, -8.48421769e-05, -1.16592164e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_768.wav', 'Kapiere ich nicht.', 18, array([ 4.0008963e-04, 6.7968445e-04, 6.0982589e-04, ...,\n", + " -7.4681542e-05, 2.5036192e-05, -4.9270067e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_776.wav', 'Der ist ja mickrig!', 19, array([-1.7217337e-04, -2.9700578e-04, -2.6711932e-04, ...,\n", + " -1.2146128e-04, -3.9679853e-05, -5.6118748e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_777.wav', 'Ja, sogar mehrere.', 18, array([ 1.1276272e-03, 1.6285295e-03, 1.3798362e-03, ...,\n", + " -2.8823823e-05, 
3.4296296e-05, -5.9779604e-06], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_6_FINAL/6_778.wav', 'Fünf oder lieber sechs?', 24, array([-0.00051076, -0.00086243, -0.00095237, ..., -0.00015284,\n", + " -0.00011934, -0.00010978], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_1.wav', 'Wen interessiert das schon?', 27, array([-2.0386204e-04, -1.6595512e-04, -3.4064340e-04, ...,\n", + " -5.8528771e-05, -4.0259012e-05, -2.3960278e-05], dtype=float32), 1.9034583333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_4.wav', 'Das sieht man sofort.', 21, array([-4.7220071e-04, -6.1083253e-04, -5.2480790e-04, ...,\n", + " 3.0703570e-05, 5.0339484e-05, -4.0401741e-05], dtype=float32), 1.7007708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_9.wav', 'Kannst du ein Instrument spielen?', 33, array([-5.8206980e-04, -9.0975891e-04, -9.2016242e-04, ...,\n", + " -3.6644913e-05, -8.9309695e-05, 5.9820622e-06], dtype=float32), 1.9519166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_17.wav', 'Nein, hör mir zu!', 18, array([1.8352878e-04, 2.3541819e-04, 1.9473537e-04, ..., 3.8015917e-06,\n", + " 3.0260228e-05, 4.7941758e-05], dtype=float32), 1.6038333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_24.wav', 'Sowas ist schade.', 17, array([ 5.2204914e-04, 7.2680251e-04, 7.3363306e-04, ...,\n", + " -3.0053505e-05, -6.5714506e-05, -9.0218302e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_48.wav', 'Ich will zocken!', 16, array([-0.00016469, -0.00039593, -0.00179843, ..., 0.00018615,\n", + " 0.00012972, 0.00017355], dtype=float32), 1.5773958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_55.wav', 'Ein Insider berichtet.', 22, array([ 3.7575817e-05, 2.7695228e-04, 1.8994253e-04, ...,\n", + " 2.4524426e-05, 4.0446877e-05, -2.5534926e-05], 
dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_65.wav', 'Evelyn ist seekrank.', 20, array([0.00062829, 0.00093936, 0.0008276 , ..., 0.00017747, 0.00012535,\n", + " 0.00013539], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_66.wav', 'Zunächst der Blick aufs Wetter.', 32, array([-0.00092968, -0.00141539, -0.00128506, ..., 0.00019455,\n", + " 0.00034253, 0.00020309], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_77.wav', 'Was schmeckt am besten?', 23, array([6.4622820e-04, 1.0704662e-03, 1.1439651e-03, ..., 1.9296777e-04,\n", + " 9.2506059e-05, 4.9435432e-05], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_78.wav', 'Wir rufen Sie dann auf.', 23, array([-1.0261516e-03, -1.4563096e-03, -1.2881490e-03, ...,\n", + " 5.2330338e-06, 6.4821052e-06, -3.7749737e-06], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_84.wav', 'Das Essen war vorzüglich.', 26, array([-5.0324254e-04, -7.2285999e-04, -5.4835685e-04, ...,\n", + " -4.1776315e-05, -4.3907283e-05, 3.2214456e-07], dtype=float32), 1.9959791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_89.wav', 'SüÃ\\x9fes oder Saures!', 20, array([-2.1448301e-04, -3.2685092e-04, -1.9420320e-04, ...,\n", + " 5.3501964e-05, 3.9838564e-05, 9.8899181e-05], dtype=float32), 1.5641875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_96.wav', 'Woran das wohl liegt?', 21, array([7.9406239e-04, 1.0801835e-03, 8.6238224e-04, ..., 1.5784081e-04,\n", + " 1.3262879e-04, 7.3408869e-06], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_118.wav', 'Hier hast du deinen Fisch.', 26, array([0.00047934, 0.0008143 , 0.00071459, ..., 0.00040429, 0.00026866,\n", + " 0.00011292], dtype=float32), 1.7624583333333332)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_135.wav', 'Und zwar hochverdient!', 22, array([-3.4465449e-04, -5.7459215e-04, -4.8516967e-04, ...,\n", + " 2.8431052e-05, 9.6089265e-05, 2.6090011e-05], dtype=float32), 1.9475)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_144.wav', 'Da kräht kein Hahn nach.', 25, array([-2.4579404e-05, -2.7367115e-04, -1.3865142e-04, ...,\n", + " 6.7543602e-05, 4.0894251e-05, 2.7544003e-05], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_148.wav', 'Du zuerst.', 10, array([-8.7500273e-05, -8.8356370e-05, 3.9270883e-05, ...,\n", + " -1.0109833e-04, 5.8080084e-05, -1.4014350e-04], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_149.wav', 'Hier mal eine Faustregel.', 25, array([7.6173781e-04, 9.7895204e-04, 8.7399769e-04, ..., 5.2696447e-05,\n", + " 1.8836032e-06, 6.7383153e-06], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_150.wav', 'Ich sehe kein Leerzeichen.', 26, array([-2.0238354e-05, -3.9017228e-05, -1.8151976e-04, ...,\n", + " -2.8073411e-05, -8.1482809e-05, -9.7252036e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_151.wav', 'Hast du mal einen Fünfziger?', 29, array([-6.5894198e-04, -9.4568409e-04, -8.3610136e-04, ...,\n", + " -1.5597163e-04, -1.5190896e-04, -4.1842508e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_155.wav', 'Mit Pommes?', 11, array([ 0.0003422 , 0.0003448 , 0.00032375, ..., -0.00023719,\n", + " -0.00028336, -0.00012051], dtype=float32), 0.9252916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_172.wav', 'Noch fünf Minuten bitte, Schatz!', 33, array([-4.4656807e-04, -5.2705233e-04, -5.8281276e-04, ...,\n", + " -1.7271057e-05, 3.9541996e-05, 1.4292495e-05], dtype=float32), 1.9387083333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_174.wav', 'Es ist wie verhext.', 19, array([3.7680543e-04, 6.3684850e-04, 4.2467855e-04, ..., 1.3614137e-05,\n", + " 8.9109992e-05, 1.3674991e-04], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_179.wav', 'Unter uns ist ein Verräter.', 28, array([-2.2123450e-04, -3.2310621e-04, -2.8145462e-04, ...,\n", + " -1.0567834e-04, 3.1090029e-05, 6.3631160e-05], dtype=float32), 1.8682083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_190.wav', 'Nimm die Maske ab!', 18, array([4.6733877e-04, 6.9651386e-04, 5.4769457e-04, ..., 1.6475593e-04,\n", + " 7.5979711e-05, 7.9883583e-05], dtype=float32), 1.2337291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_193.wav', 'Nicht dass ich wüsste.', 23, array([ 0.0001971 , 0.00045662, 0.00023958, ..., -0.00011544,\n", + " -0.00016933, -0.00016841], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_194.wav', 'Der Tee zieht noch.', 19, array([-0.00024223, -0.00046848, -0.00045602, ..., -0.00014842,\n", + " -0.00016475, -0.00012201], dtype=float32), 1.6390833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_210.wav', 'Tu es für mich!', 16, array([4.4054058e-04, 7.1835978e-04, 6.8089634e-04, ..., 6.5819913e-05,\n", + " 6.3534033e-05, 2.4601215e-04], dtype=float32), 1.5685833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_216.wav', 'Bölken Sie woanders herum!', 27, array([-4.3733866e-04, -5.8234221e-04, -6.0285319e-04, ...,\n", + " -2.0549475e-04, -5.1659747e-05, -6.9836286e-05], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_217.wav', 'So, so.', 7, array([5.1622407e-04, 8.1000535e-04, 6.2310486e-04, ..., 1.1862206e-04,\n", + " 7.1799346e-05, 3.3523640e-06], dtype=float32), 1.3747291666666666)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_220.wav', 'Leicht verdientes Geld.', 23, array([ 1.47327999e-04, 1.87759506e-04, -1.56362767e-05, ...,\n", + " 1.08211556e-04, 8.50987126e-05, -3.97509648e-05], dtype=float32), 1.7360208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_226.wav', 'Wie lautet der Zwischenstand?', 29, array([ 5.1066454e-04, 7.2763517e-04, 6.3450093e-04, ...,\n", + " -8.1010330e-05, -1.8156270e-05, -5.7707053e-05], dtype=float32), 1.9827708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_273.wav', 'Was hat ihn geritten?', 21, array([-3.4532882e-04, -5.6787761e-04, -6.2309759e-04, ...,\n", + " -3.4597360e-05, -1.2706745e-05, -1.1419446e-04], dtype=float32), 1.6214583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_282.wav', 'So nicht, Freundchen!', 21, array([-2.2482723e-03, -3.3393281e-03, -3.0241525e-03, ...,\n", + " 8.9230271e-05, 8.0567042e-05, -1.7856433e-05], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_287.wav', 'So ein feiner Hund!', 19, array([-0.00024811, -0.00028893, -0.00043056, ..., -0.0001634 ,\n", + " -0.00015287, -0.00012142], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_297.wav', 'Ah, die Feuerwehr!', 18, array([-5.8479345e-05, 1.3606872e-06, -3.1950235e-04, ...,\n", + " 4.5466539e-04, 4.1461250e-04, 3.1427949e-04], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_298.wav', 'Nachricht bitte faxen!', 22, array([-3.4957391e-04, -4.1374876e-04, -4.3978900e-04, ...,\n", + " -1.4674234e-04, -2.0285949e-04, -3.0548752e-05], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_330.wav', 'Alter Verwalter!', 16, array([0.00058996, 0.00086262, 0.00074697, ..., 0.00030815, 0.00029123,\n", + " 0.00018931], dtype=float32), 1.8615833333333334)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_333.wav', 'Was will man mehr?', 18, array([-8.3821319e-04, -1.1214241e-03, -1.0474359e-03, ...,\n", + " -4.0887986e-05, 1.7188730e-05, 6.5576496e-05], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_362.wav', 'Ganz der Papa!', 14, array([ 1.0614250e-06, 1.0387501e-04, 2.6466480e-05, ...,\n", + " -3.6802659e-05, 4.0980707e-05, 7.8629993e-05], dtype=float32), 1.3042291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_368.wav', 'Es geht schon, danke!', 21, array([-3.1714016e-04, -4.7203674e-04, -3.6235168e-04, ...,\n", + " 7.8341058e-05, 4.7649206e-05, 1.9486140e-05], dtype=float32), 1.6919583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_404.wav', 'Notieren Sie sich das.', 22, array([-3.1276091e-04, -4.1585916e-04, -4.4194568e-04, ...,\n", + " -1.9349645e-04, -6.0014678e-05, 2.7422161e-07], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_416.wav', 'Mach das Licht an!', 18, array([ 7.4020500e-04, 9.9551259e-04, 7.7506527e-04, ...,\n", + " -9.4190882e-06, -5.5277683e-06, 6.0646169e-05], dtype=float32), 1.273375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_418.wav', 'Gebt mir ein O!', 15, array([ 2.55384133e-04, 2.99102190e-04, 3.85188963e-04, ...,\n", + " -6.97520736e-05, -1.12780595e-04, -5.84875634e-05], dtype=float32), 1.5641875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_425.wav', 'Wir haben unsere Vorschriften.', 30, array([-0.0014397 , -0.00206455, -0.00194661, ..., 0.00017973,\n", + " 0.00031227, 0.00029818], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_428.wav', 'Dort spielt die Musik!', 22, array([0.00064248, 0.00109204, 0.00095334, ..., 0.00016345, 0.00021933,\n", + " 0.00016792], dtype=float32), 1.9386875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_439.wav', 
'Runter von der Couch!', 21, array([0.00032077, 0.0003695 , 0.00031393, ..., 0.00016823, 0.00027614,\n", + " 0.00030219], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_440.wav', 'Geh, Martin. Geh!', 17, array([-0.0006147 , -0.00096355, -0.00084441, ..., -0.00019064,\n", + " -0.00014664, -0.0001376 ], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_442.wav', 'Dann ist doch alles paletti.', 28, array([-0.0003903 , -0.00051721, -0.00051659, ..., 0.00044963,\n", + " 0.00069829, 0.00057605], dtype=float32), 1.7915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_464.wav', 'Hören Sie erst einmal zu!', 26, array([-2.3209564e-03, -3.7553089e-03, -3.8581355e-03, ...,\n", + " 4.0617133e-06, 6.2217005e-05, 1.8342262e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_475.wav', 'Ich will die Hände sehen!', 26, array([-1.1517418e-03, -1.5774536e-03, -1.5022659e-03, ...,\n", + " 8.5659660e-05, 1.5909245e-04, 1.0823877e-04], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_481.wav', 'Du kennst doch Tessa.', 21, array([-9.7565542e-05, -8.4838466e-05, -2.1631434e-04, ...,\n", + " -9.0966016e-05, -9.0894253e-05, -1.5524645e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_487.wav', 'Angeber und Neidhammel.', 23, array([-4.2524905e-04, -5.5071624e-04, -4.9216941e-04, ...,\n", + " -9.1045105e-05, -3.0268184e-05, -1.0583480e-04], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_491.wav', 'Können diese Augen lügen?', 27, array([-1.04710832e-03, -1.57430710e-03, -1.43215503e-03, ...,\n", + " 1.43472225e-05, 1.20743534e-05, -1.07111417e-04], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_499.wav', 'Kann man hier 
denn nicht lüften?', 33, array([-9.1343711e-04, -1.1802320e-03, -9.9357730e-04, ...,\n", + " 7.8159035e-05, 2.3012167e-04, 3.3637294e-05], dtype=float32), 1.9871666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_500.wav', 'Der Mann ist vom Leben gezeichnet.', 34, array([ 1.06765685e-04, 2.15540877e-05, -9.11364405e-05, ...,\n", + " -5.42830057e-05, -9.09425871e-05, -3.43727625e-05], dtype=float32), 1.7712708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_506.wav', 'Wollen Sie mich aushorchen?', 27, array([ 0.00060325, 0.00087957, 0.00074186, ..., -0.00021219,\n", + " -0.00024823, -0.00017538], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_531.wav', 'Je eher, desto besser.', 22, array([5.7826861e-04, 7.7570765e-04, 6.1795511e-04, ..., 8.9765228e-05,\n", + " 4.5600675e-05, 1.4581751e-04], dtype=float32), 1.7800833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_538.wav', 'Och, komm schon her!', 20, array([-4.9066258e-04, -7.3491497e-04, -5.5824185e-04, ...,\n", + " 8.5976262e-06, 1.0786976e-04, 1.2791457e-04], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_542.wav', 'Nimm deine Maske endlich ab!', 28, array([ 5.4343470e-04, 7.2278164e-04, 7.2296784e-04, ...,\n", + " -3.4153378e-05, -3.6221893e-05, -8.8784982e-05], dtype=float32), 1.9386875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_574.wav', 'Wollt ihr mich ärgern?', 23, array([0.00089293, 0.00139316, 0.0012052 , ..., 0.00011375, 0.00022351,\n", + " 0.00014075], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_599.wav', 'Das ist knorke.', 15, array([-8.0439750e-06, -4.1563135e-06, -3.6478632e-05, ...,\n", + " -1.6141655e-04, -8.8675122e-05, -1.2264083e-04], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_603.wav', 'Suchst du 
Ã\\x84rger?', 17, array([ 1.8951594e-04, 3.2533749e-04, 2.3231433e-04, ...,\n", + " -1.0691231e-05, -6.9874281e-05, -4.5488341e-05], dtype=float32), 1.6038333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_615.wav', 'Hör nicht auf diese Schwätzer!', 32, array([ 0.00019477, 0.00020745, 0.00017311, ..., 0.00030501,\n", + " -0.00018354, 0.00024707], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_618.wav', \"Gleich geht's weiter!\", 21, array([-6.4648612e-04, -1.0017229e-03, -9.2825363e-04, ...,\n", + " -4.5593577e-05, -6.6424482e-06, 1.4339538e-05], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_641.wav', 'Herr, erbarme dich!', 19, array([ 9.9213721e-06, 1.8233144e-05, -3.5843041e-05, ...,\n", + " -5.0301041e-05, -1.3241796e-04, -2.0356404e-04], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_649.wav', 'Sammelt Holz für das Feuer!', 28, array([-0.00024918, -0.00046716, -0.00041068, ..., 0.00016901,\n", + " 0.0001653 , 0.00017449], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_692.wav', 'Erst will ich noch duschen.', 27, array([ 2.7669812e-04, 5.0494721e-04, 5.6616898e-04, ...,\n", + " 4.0362014e-05, -7.8570345e-05, 6.2029525e-05], dtype=float32), 1.6082291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_703.wav', 'Was kommt als nächstes?', 24, array([ 5.5248733e-04, 8.9842337e-04, 6.7765010e-04, ...,\n", + " -1.3254551e-04, -9.5152573e-05, -2.1063161e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_718.wav', 'Setzen, sechs!', 14, array([-1.2044140e-04, -2.0982703e-04, -2.7291384e-04, ...,\n", + " 1.7828704e-04, 9.6640695e-05, 1.3019536e-05], dtype=float32), 1.2689791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_720.wav', 'Dann nehmen wir meinen 
Wagen.', 29, array([ 1.2858727e-04, 1.7004457e-04, -5.1648447e-05, ...,\n", + " 2.5735653e-04, 2.8828968e-04, 1.9113944e-04], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_724.wav', 'Lach mal wieder.', 16, array([2.5169516e-04, 3.1780155e-04, 2.4175562e-04, ..., 1.8466891e-04,\n", + " 9.4025556e-05, 1.4185447e-04], dtype=float32), 1.3570833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_726.wav', 'Lass uns mal Fieber messen!', 27, array([ 4.6217057e-04, 7.1049004e-04, 5.8858085e-04, ...,\n", + " -2.7612457e-06, -4.4886579e-05, -1.3602876e-06], dtype=float32), 1.8858333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_733.wav', 'Ja, du hast ja Recht!', 21, array([-0.00065709, -0.00095549, -0.00067059, ..., 0.00023162,\n", + " 0.00042249, 0.00021008], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_745.wav', 'Kannst du bitte das Licht anlassen?', 35, array([-4.5024044e-05, -6.6272514e-05, -1.4942518e-04, ...,\n", + " -1.0059726e-04, -8.9730158e-05, -4.9335773e-05], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_7_FINAL/7_755.wav', 'Jetzt wird gefeiert!', 20, array([ 6.5074948e-04, 8.2373072e-04, 6.9322297e-04, ...,\n", + " 2.5613972e-05, -7.3600226e-05, 9.0847658e-05], dtype=float32), 1.4892708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_21.wav', 'Oh, ein Blechschaden!', 21, array([ 2.7968596e-05, 2.5622614e-05, 5.5850909e-05, ...,\n", + " -3.6388674e-06, -1.3192165e-05, -5.8324472e-06], dtype=float32), 1.7536458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_24.wav', 'Woran erkennt man sie?', 22, array([-1.6248678e-05, -2.0881544e-05, 2.2568598e-05, ...,\n", + " -1.0051125e-06, -4.4804568e-05, -3.8311518e-05], dtype=float32), 1.8770208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_57.wav', 'Wo hast Du 
den Ludenmantel her?', 31, array([ 9.4084098e-05, 6.2570427e-05, 8.1058839e-05, ...,\n", + " -3.1764132e-05, -4.2468575e-05, -3.3772998e-05], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_59.wav', 'Bingo!', 6, array([ 4.7897654e-05, 2.7239477e-05, 3.7255515e-05, ...,\n", + " -1.7023414e-05, -2.9687346e-05, -3.9503360e-05], dtype=float32), 1.1456041666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_64.wav', 'Schreibt man das so?', 20, array([-1.6650798e-04, -2.2954465e-04, -2.1082905e-04, ...,\n", + " 5.5576045e-05, 1.4893518e-05, 2.0421723e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_81.wav', 'Halt mal kurz mein Bier.', 24, array([-8.2688921e-06, -1.1980872e-05, -4.0169580e-06, ...,\n", + " 8.8575485e-05, 1.3926605e-04, 3.6588870e-05], dtype=float32), 1.8417708333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_83.wav', 'Doch Hilfe naht bereits.', 24, array([ 4.9734876e-06, 5.2194659e-06, 1.2122488e-05, ...,\n", + " -1.8982364e-05, -4.2752654e-05, -8.2323677e-05], dtype=float32), 1.98275)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_84.wav', 'Formation einnehmen!', 20, array([8.1898354e-05, 7.4887575e-05, 6.6653323e-05, ..., 7.7452451e-06,\n", + " 2.1070047e-05, 3.0395060e-05], dtype=float32), 1.8682083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_87.wav', 'Holt mich hier raus!', 20, array([ 6.53247334e-05, -2.15428197e-04, -5.42638707e-04, ...,\n", + " -1.15612675e-05, 2.72592151e-05, 1.50995202e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_88.wav', 'Da vorne kommt mein Ex.', 23, array([-2.0436737e-04, -7.5976342e-05, 9.7310134e-05, ...,\n", + " 8.3587765e-06, -3.2081423e-06, 1.7971579e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_113.wav', 'Glaube es mir 
einfach.', 22, array([-2.7944061e-05, 1.0844935e-05, -1.5047234e-05, ...,\n", + " -2.7743961e-05, 2.9569403e-06, -3.5605283e-06], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_115.wav', 'Was die Leute immer haben!', 26, array([-5.4329084e-05, -8.8018889e-05, -7.1306808e-05, ...,\n", + " 7.3982832e-05, 5.8832418e-05, 6.6730849e-05], dtype=float32), 1.9431041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_124.wav', 'BloÃ\\x9f nicht!', 12, array([1.01506448e-04, 1.75192414e-04, 1.12130554e-04, ...,\n", + " 3.55834927e-05, 4.65009398e-05, 5.75332670e-05], dtype=float32), 1.0310416666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_126.wav', \"Ich tu' immer nur rein.\", 23, array([ 3.21958287e-05, 2.19840458e-05, 1.46883485e-05, ...,\n", + " -8.37586867e-06, -5.43750639e-06, -1.22217643e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_138.wav', 'Wann lief diese Sendung?', 24, array([-1.7348650e-05, 1.9956657e-05, 3.1632226e-05, ...,\n", + " 1.5858004e-05, 1.8046559e-05, -4.8364400e-05], dtype=float32), 1.9563333333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_143.wav', 'Gehen Sie aus dem Weg!', 22, array([1.80967872e-05, 1.12411635e-05, 1.61865628e-05, ...,\n", + " 6.79703808e-05, 7.41552940e-05, 9.28417285e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_145.wav', 'Es geht drunter und drüber.', 28, array([ 5.3915655e-06, 8.5220972e-06, -3.3527529e-05, ...,\n", + " -1.0693114e-05, -6.3991156e-06, 1.2663132e-05], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_149.wav', 'Suchscheinwerfer einschalten!', 29, array([-4.3899286e-06, 1.1313143e-05, -7.2204307e-06, ...,\n", + " -3.3424400e-05, -1.3328722e-05, -2.6314769e-05], dtype=float32), 1.8858333333333333)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_152.wav', 'Ihre Uhr geht vor.', 18, array([ 1.1011517e-05, -3.0811309e-05, -2.2571772e-05, ...,\n", + " 8.1292972e-05, 7.4179443e-05, 7.1086802e-06], dtype=float32), 1.3394791666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_153.wav', 'Wir suchen noch Freiwillige.', 28, array([-5.6182507e-06, -3.0251003e-05, 5.1053936e-05, ...,\n", + " -5.0866500e-05, -1.7348602e-05, -4.6226152e-05], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_157.wav', 'Halten Sie sofort an!', 21, array([ 2.3082459e-04, 2.3086018e-04, -2.2280088e-05, ...,\n", + " -4.5649995e-05, -3.0157349e-05, -1.7121181e-05], dtype=float32), 1.6501041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_190.wav', 'Zeig uns mal, wo der Hammer hängt!', 35, array([ 4.6486733e-05, 5.3618060e-05, 4.0510302e-05, ...,\n", + " -1.0646369e-04, -7.5534314e-05, -1.2183484e-04], dtype=float32), 1.91225)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_196.wav', 'Da kann man auch parken.', 24, array([-1.0925556e-05, -3.7278984e-05, -1.0163063e-05, ...,\n", + " -6.9978710e-06, -3.4896555e-06, -6.6393928e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_204.wav', 'Eine gute Stunde ist rum.', 25, array([-2.2296244e-05, -5.8680125e-06, -5.0762057e-05, ...,\n", + " -4.8879232e-05, -8.5942098e-05, -6.8862631e-05], dtype=float32), 1.6214583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_205.wav', 'Oh ja, das fetzt!', 17, array([ 3.4871216e-06, -4.8185248e-06, 1.2310127e-05, ...,\n", + " -1.7998637e-04, -4.5437564e-04, -3.7538476e-04], dtype=float32), 1.5025)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_225.wav', 'Sag das Zauberwort!', 19, array([ 3.0607847e-05, 4.5160428e-05, 1.8997842e-05, ...,\n", + " -1.6968366e-05, 1.1446763e-05, -3.4663015e-05], dtype=float32), 
1.6743333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_242.wav', 'Die Arme!', 9, array([4.2858810e-05, 7.1904920e-05, 2.9656387e-05, ..., 5.8210357e-05,\n", + " 4.0901028e-05, 3.2474836e-05], dtype=float32), 0.8636041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_246.wav', 'Schläfst du schon?', 19, array([ 3.2191518e-05, 5.0761428e-05, 4.3220087e-05, ...,\n", + " -4.0423780e-07, 1.7892495e-05, 5.0407853e-06], dtype=float32), 1.1456041666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_247.wav', 'Ja da schau her!', 16, array([-3.9683233e-05, -9.2827155e-05, -5.1356539e-05, ...,\n", + " 8.5207663e-05, 5.3869204e-05, 8.1267404e-05], dtype=float32), 1.3394583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_251.wav', 'Moment, das ging anders.', 24, array([-7.3496245e-05, -9.7117241e-05, -9.9846256e-05, ...,\n", + " -2.2075654e-05, -5.6377292e-05, -3.1324758e-05], dtype=float32), 1.9475208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_254.wav', 'Weiter zum nächsten Kapitel.', 29, array([ 2.7818656e-05, 2.9083269e-05, 2.7292099e-05, ...,\n", + " -1.4497251e-05, 1.6704771e-05, 1.8156856e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_255.wav', 'Holla die Waldfee!', 18, array([ 3.2722608e-05, -3.4862321e-06, 2.1344584e-05, ...,\n", + " -3.5852513e-06, -1.3345180e-05, 1.8042003e-06], dtype=float32), 1.2777916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_278.wav', 'Lass mich nicht allein.', 23, array([-6.2487576e-05, -5.1307488e-05, 3.3147335e-05, ...,\n", + " -1.3666711e-06, -1.6965050e-05, 1.0842440e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_283.wav', 'Warst du beim Frisör?', 22, array([-5.1554598e-05, -2.8181448e-05, -2.1276550e-05, ...,\n", + " 5.1014787e-05, 6.0253118e-05, 4.9681836e-05], dtype=float32), 
1.60825)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_291.wav', 'Warum bin ich so fröhlich?', 27, array([-9.2893220e-05, -9.0468158e-05, -8.4269959e-05, ...,\n", + " 5.6945123e-06, 2.3743269e-05, -1.5906717e-07], dtype=float32), 1.5862083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_307.wav', 'Das klingt sehr gut.', 20, array([ 8.8375481e-07, 1.4093188e-06, -8.0541049e-06, ...,\n", + " -6.2088387e-05, -3.6809190e-05, -5.5097131e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_310.wav', 'Kann man das mitessen?', 22, array([-1.8419527e-05, -2.5431269e-05, -8.9255473e-06, ...,\n", + " 2.5581608e-05, 3.7564107e-05, 2.2521937e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_311.wav', 'Wo liegt das Problem?', 21, array([-1.7069402e-05, 2.2379625e-06, -8.6348446e-06, ...,\n", + " 2.4881610e-05, -2.6925150e-06, 1.8407424e-06], dtype=float32), 1.8065208333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_313.wav', 'Wo kann man sich ausloggen?', 27, array([-2.94713544e-07, -2.60781735e-06, 2.09315767e-05, ...,\n", + " -1.10319825e-05, -5.37709784e-05, -2.63888141e-05], dtype=float32), 1.7888958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_319.wav', 'Wo kommt das nur her?', 21, array([-3.2170439e-05, -2.5212325e-05, -3.7200436e-05, ...,\n", + " -9.3722010e-06, -3.0964005e-05, -1.5780270e-05], dtype=float32), 1.9298958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_337.wav', 'Und wenn man die nicht hat?', 27, array([-4.4960318e-05, 5.2144351e-05, -2.9507015e-05, ...,\n", + " -3.9032249e-05, 3.4188946e-05, -2.3692317e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_347.wav', 'Sag ihr das bloÃ\\x9f nicht!', 24, array([8.6986920e-06, 4.4441199e-06, 3.0283294e-05, ..., 9.9162316e-05,\n", + " 
7.8216704e-05, 9.9542762e-05], dtype=float32), 1.6126458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_348.wav', 'Wo bleibst du?', 14, array([ 3.3125209e-05, 5.7069548e-05, 3.6280937e-05, ...,\n", + " -2.4643228e-05, -2.7121812e-05, -1.5307731e-05], dtype=float32), 1.2998125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_352.wav', 'Jetzt oder nie!', 15, array([-1.7882267e-05, 1.5871639e-05, -7.5667369e-05, ...,\n", + " -3.7708491e-05, 7.9740630e-06, -7.9073770e-06], dtype=float32), 1.3747291666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_368.wav', 'Hinsetzen und FüÃ\\x9fe hoch!', 26, array([-3.0999392e-05, -7.2621566e-05, -4.7179296e-05, ...,\n", + " -2.5928295e-05, -3.2266624e-05, 1.4868124e-05], dtype=float32), 1.7624583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_394.wav', 'Nimm dir mal eine Pause!', 24, array([-2.7986377e-04, -3.0645030e-04, -2.3860915e-04, ...,\n", + " -3.2176635e-05, -4.1073359e-05, -1.7371191e-05], dtype=float32), 1.9519166666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_427.wav', 'Vom Kinde verschmäht.', 22, array([2.4927745e-05, 5.9401387e-05, 5.5517099e-05, ..., 8.8263223e-05,\n", + " 3.5481713e-05, 1.4234082e-05], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_435.wav', 'Kann ich dir helfen?', 20, array([-7.2613778e-04, 1.4254064e-03, 4.3165400e-03, ...,\n", + " 9.7870041e-05, 6.2070317e-06, 1.0954802e-04], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_449.wav', 'Woher will sie das wissen?', 26, array([ 6.6095758e-08, -2.7216944e-05, -1.6521408e-05, ...,\n", + " 3.0345358e-05, -5.6843191e-06, -4.2101074e-05], dtype=float32), 1.7272083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_468.wav', 'Das lass mal meine Sorge sein.', 30, array([-4.7126541e-05, -5.9281327e-05, -3.5599784e-05, ...,\n", + " 
2.0367926e-05, 4.0726398e-05, 1.8718367e-05], dtype=float32), 1.9342916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_500.wav', 'Ich habe heute Geburtstag.', 26, array([-5.1001366e-06, 4.8161728e-05, 1.0626727e-05, ...,\n", + " -8.0793325e-05, -6.0714734e-05, -7.9644029e-05], dtype=float32), 1.9387083333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_513.wav', 'Fertig werden!', 14, array([-2.4789182e-05, -1.4137984e-05, -4.8843711e-05, ...,\n", + " 2.4393246e-05, 2.7856760e-05, 6.9619755e-06], dtype=float32), 1.3615)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_525.wav', 'Jeder trauert anders.', 21, array([-6.3906264e-06, -2.4861220e-05, -3.1557371e-05, ...,\n", + " -5.3394677e-05, 5.5594451e-06, -4.3505042e-05], dtype=float32), 1.8593958333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_530.wav', 'Wir sprechen uns später.', 25, array([ 1.9607371e-05, 1.2742041e-05, 5.9507223e-05, ...,\n", + " -1.0580019e-06, -1.0849526e-05, -2.2735680e-05], dtype=float32), 1.5950208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_534.wav', 'Den Schuss nicht hören.', 24, array([1.0162838e-04, 1.3316146e-04, 1.3368837e-04, ..., 5.8495625e-06,\n", + " 7.8353441e-05, 3.3752654e-05], dtype=float32), 1.8726041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_543.wav', 'Wer tut das nicht?', 18, array([-1.5056261e-05, -2.7894443e-05, -8.4756257e-06, ...,\n", + " -4.3981410e-05, -3.8667356e-05, -4.8794256e-05], dtype=float32), 1.5773958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_547.wav', 'Zu so später Stunde?', 21, array([-1.2750152e-04, 1.9311530e-05, -6.8482601e-05, ...,\n", + " -8.0274267e-06, 3.7486578e-05, -4.1844236e-05], dtype=float32), 1.6478958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_569.wav', 'Ein Zirkus ohne Tiere?', 22, array([-2.8725301e-05, -5.8967784e-05, -4.7625667e-06, ...,\n", + 
" 5.3123777e-06, -7.1301661e-06, -2.9527286e-05], dtype=float32), 1.8461666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_570.wav', 'Sag schon, was ist drin?', 24, array([ 1.10985304e-04, 5.97430153e-05, 9.55062278e-05, ...,\n", + " 6.52888993e-05, -5.82730863e-05, 6.85385385e-05], dtype=float32), 1.8373541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_574.wav', 'Wir zählen auf dich.', 21, array([-7.5106524e-05, -9.9009638e-05, -7.9571801e-05, ...,\n", + " 3.8461326e-06, 8.2744657e-05, 5.6746823e-05], dtype=float32), 1.9210833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_585.wav', 'Das funktioniert auch.', 22, array([ 5.4934342e-05, 1.7679840e-05, -5.7660582e-05, ...,\n", + " 4.9520886e-06, -2.5478117e-05, -6.3567706e-05], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_589.wav', 'Was drauf?', 10, array([ 1.5172187e-05, 3.5768371e-05, -4.6845405e-05, ...,\n", + " 2.3743922e-05, -3.8076912e-05, 2.2450782e-05], dtype=float32), 1.2072916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_594.wav', 'Einer zur Zeit!', 15, array([-1.9068037e-05, -2.0037192e-05, -8.8215660e-05, ...,\n", + " -1.8433493e-05, -3.3125831e-05, 3.5209345e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_606.wav', 'Okay, und nun?', 14, array([-1.2366170e-05, 2.3954278e-06, -1.8647337e-05, ...,\n", + " -2.4212586e-06, 6.3337334e-06, -2.5126603e-06], dtype=float32), 1.5597708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_607.wav', 'Das verstehst du noch nicht.', 28, array([ 1.6215906e-04, 2.5805720e-04, 2.2398161e-04, ...,\n", + " -5.9032095e-06, -1.2547288e-06, -1.8913257e-05], dtype=float32), 1.7095833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_609.wav', 'Wie uncool!', 11, array([-1.1241895e-05, -3.2969092e-05, -5.8745212e-05, ...,\n", + " 
8.5234688e-06, 1.9909365e-05, 1.7495377e-05], dtype=float32), 1.0927291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_624.wav', 'Setzt dich gerade!', 18, array([-1.7491520e-05, 6.7394591e-05, 5.0117076e-05, ...,\n", + " -2.1143003e-05, -1.6165326e-05, -1.6601503e-05], dtype=float32), 1.3835208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_638.wav', 'Nicht schlecht der Specht!', 26, array([-1.0250892e-05, 1.4861113e-05, -5.1604333e-05, ...,\n", + " 7.6938113e-06, 2.0211788e-05, 4.5162437e-06], dtype=float32), 1.8153333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_647.wav', 'Was haben die vor?', 18, array([ 1.2927976e-06, -4.4330540e-05, -4.2087355e-05, ...,\n", + " 1.2652035e-04, -7.1286093e-05, -1.9011653e-06], dtype=float32), 1.4628333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_648.wav', 'Ich habe gar nichts mitbekommen.', 32, array([ 1.6062468e-05, 4.4314598e-05, 1.1317232e-05, ...,\n", + " -8.4248430e-05, -4.8613791e-05, -4.1891144e-05], dtype=float32), 1.9915833333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_664.wav', 'Je mehr, desto besser.', 22, array([-1.0978662e-05, 2.8232571e-06, -2.7930673e-05, ...,\n", + " 5.0805535e-05, 3.9726485e-05, 6.7175766e-05], dtype=float32), 1.8505833333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_677.wav', 'Da vorne links!', 15, array([ 3.8325859e-05, 3.2421449e-05, 1.5961947e-05, ...,\n", + " 2.6722651e-05, -3.3873417e-05, 3.2344939e-05], dtype=float32), 1.4363958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_682.wav', 'Jetzt mal halblang!', 19, array([-2.0417360e-06, -1.3626728e-05, -2.8990502e-05, ...,\n", + " -2.2435464e-05, -3.3464916e-05, 2.5530893e-05], dtype=float32), 1.4892708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_687.wav', 'Gib uns ein Beispiel!', 21, array([-8.4907850e-05, -5.6986839e-05, 3.7472455e-06, 
...,\n", + " -1.4217812e-05, -2.3697576e-05, -2.4605337e-05], dtype=float32), 1.6567083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_689.wav', 'Von wegen!', 10, array([-2.7207323e-05, -6.9836324e-06, -9.1906164e-05, ...,\n", + " 6.5761873e-05, 5.3384709e-05, 3.5098144e-06], dtype=float32), 0.8547916666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_691.wav', 'Das finde ich ziemlich doof.', 28, array([-2.9834633e-05, 5.6474819e-06, -2.5375591e-06, ...,\n", + " -3.2603730e-06, -5.9017879e-05, -9.6670803e-05], dtype=float32), 1.7977083333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_697.wav', 'Das trifft sich gut.', 20, array([ 7.9529818e-06, 3.9593842e-06, 3.0517844e-05, ...,\n", + " -4.2052940e-05, -3.0681629e-05, -2.6093589e-05], dtype=float32), 1.8241458333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_704.wav', 'Jetzt gibt es Zoff.', 19, array([ 1.7251841e-05, 3.0525447e-05, 4.0081544e-05, ...,\n", + " -2.7181366e-05, -6.4996988e-05, -2.0187828e-05], dtype=float32), 1.6655208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_708.wav', 'Liebe ist kein Verbrechen.', 26, array([ 1.3942296e-03, 2.0183886e-03, 1.7392144e-03, ...,\n", + " 4.2136421e-06, 1.5667934e-05, -1.1447505e-05], dtype=float32), 1.8329583333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_725.wav', 'Auch das noch!', 14, array([ 6.9235853e-06, 1.0541713e-05, -6.9821567e-06, ...,\n", + " -6.0647875e-05, -3.7899004e-05, 1.4291401e-05], dtype=float32), 1.2337083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_728.wav', 'Toller Hengst!', 14, array([ 1.5415973e-05, 1.2052349e-05, 2.2745300e-05, ...,\n", + " -5.1455394e-05, -8.6221211e-05, -2.3398878e-05], dtype=float32), 1.1632291666666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_748.wav', 'Reine Gewöhungssache.', 22, array([-7.46887818e-05, 3.63702893e-05, 
2.65028193e-05, ...,\n", + " 1.14920855e-04, 8.75776823e-05, 7.50372201e-05], dtype=float32), 1.4452083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_764.wav', 'Siehe weiter unten.', 19, array([ 1.0315010e-04, 1.2668683e-04, 1.3160890e-04, ...,\n", + " 3.5362529e-05, -4.0091851e-05, 3.1800329e-05], dtype=float32), 1.5509583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_778.wav', 'Hilfe ein Ã\\x9cberfall!', 20, array([-9.1011774e-05, -1.6054764e-04, -6.9503607e-05, ...,\n", + " -3.2605390e-06, -1.1628125e-05, -4.9398786e-05], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_8_FINAL/8_804.wav', \"Wen wundert's?\", 14, array([ 1.8174978e-05, 1.0757233e-05, 1.4760263e-05, ...,\n", + " -4.7010188e-05, -6.0861544e-06, -1.5782018e-05], dtype=float32), 1.2601666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_19.wav', 'Bis die Schwarte kracht.', 24, array([ 3.15900324e-05, -1.30308879e-04, 3.94875406e-06, ...,\n", + " 3.35644108e-05, 1.02667604e-04, 4.54106703e-05], dtype=float32), 1.7536354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_26.wav', 'Auch das wäre möglich.', 24, array([-4.5410670e-05, 1.9743770e-06, -1.9743769e-05, ...,\n", + " 4.3436296e-05, -1.9743770e-06, 3.5538786e-05], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_28.wav', 'Was geht gar nicht?', 19, array([ 1.9743770e-06, 9.8718847e-06, -3.3564411e-05, ...,\n", + " 1.2241138e-04, -4.5410670e-05, 0.0000000e+00], dtype=float32), 1.4065104166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_33.wav', 'Die Geschichte geht anders.', 27, array([-3.3564411e-05, -7.7000703e-05, -8.2923834e-05, ...,\n", + " 3.3564411e-05, -3.9487541e-06, 3.1590032e-05], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_42.wav', 'Welche SchuhgröÃ\\x9fe?', 20, array([ 
0.0000000e+00, -1.1846262e-05, -5.9231311e-06, ...,\n", + " 4.5410670e-05, -3.9487539e-05, 2.9615656e-05], dtype=float32), 1.5685833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_51.wav', 'Mediathek aufrufen!', 19, array([-3.6328536e-04, 1.9941208e-04, -8.4898209e-05, ...,\n", + " 5.9231311e-06, -5.7256933e-05, -4.9359427e-05], dtype=float32), 1.9739479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_54.wav', 'Es tut ihr furchtbar leid.', 26, array([-3.3564411e-05, -4.9359427e-05, 1.1846262e-05, ...,\n", + " 5.1333802e-05, -8.8846966e-05, 5.7256933e-05], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_62.wav', 'Noch mal von vorne, bitte.', 26, array([ 2.5666901e-05, -2.9615656e-05, -3.7513164e-05, ...,\n", + " 8.6872591e-05, -5.7256933e-05, 6.9103196e-05], dtype=float32), 1.8417604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_63.wav', 'Oh jemine!', 10, array([-1.7769393e-05, -6.9103196e-05, -3.7513164e-05, ...,\n", + " 5.7256933e-05, 5.1333802e-05, 3.9487539e-05], dtype=float32), 1.20728125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_84.wav', 'Reich mir den mal rüber.', 25, array([-1.5795016e-05, -9.8718847e-06, 6.7128822e-05, ...,\n", + " 0.0000000e+00, -1.2833450e-04, 3.3564411e-05], dtype=float32), 1.7448333333333332)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_101.wav', 'Findest du nicht auch?', 22, array([-7.3051953e-05, -9.8718847e-06, 5.9231311e-06, ...,\n", + " 2.5666901e-05, -5.3308180e-05, 1.1451387e-04], dtype=float32), 1.32184375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_113.wav', 'Alles korrekt.', 14, array([ 1.1846262e-05, 2.9615656e-05, 1.2833450e-04, ...,\n", + " -1.9743769e-05, 2.7641279e-05, -1.7769393e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_128.wav', 'Alles wird gut.', 15, array([-9.2795723e-05, 
-3.1590032e-05, 8.2923834e-05, ...,\n", + " 1.3820640e-05, -4.7385049e-05, 1.1846262e-05], dtype=float32), 1.6038333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_156.wav', 'Würde ich auch machen.', 23, array([-6.1205690e-05, -5.3308180e-05, -5.5282559e-05, ...,\n", + " -9.8718847e-06, -1.1648824e-04, -6.1205690e-05], dtype=float32), 1.3747083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_173.wav', 'Gib Gas!', 8, array([ 3.7513164e-05, 7.1077571e-05, -1.9743770e-06, ...,\n", + " 5.9231312e-05, -3.0405406e-04, 4.5410672e-04], dtype=float32), 1.03984375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_221.wav', 'Weil er es kann.', 16, array([ 3.9487541e-06, -3.1590032e-05, 2.1718148e-05, ...,\n", + " -9.6744472e-05, -3.9487539e-05, -6.3180065e-05], dtype=float32), 1.4011458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_225.wav', 'Was ist der Sinn des Lebens?', 28, array([-4.7385049e-05, -9.0821341e-05, 8.6872591e-05, ...,\n", + " 7.8975081e-06, -1.3820640e-05, -2.0730958e-04], dtype=float32), 1.9563229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_247.wav', 'Es ist kalt.', 12, array([-5.33081802e-05, -1.14513867e-04, 2.36925243e-05, ...,\n", + " -4.34362955e-05, 5.92313108e-06, -1.08590735e-04], dtype=float32), 1.17203125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_259.wav', 'ScheiÃ\\x9f drauf!', 14, array([ 9.8718854e-05, 3.9487541e-06, 5.9231312e-05, ...,\n", + " -3.3564411e-05, -1.7769393e-05, -1.1253949e-04], dtype=float32), 1.19846875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_271.wav', 'Katzen haben sieben Leben.', 26, array([-1.57950162e-05, 7.89750811e-06, -6.12056901e-05, ...,\n", + " -1.04641986e-04, -7.30519532e-05, -5.92313108e-06], dtype=float32), 1.8593854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_285.wav', 'Nicht so lasch!', 15, array([ 3.3564411e-05, 1.4412952e-04, 
-8.8846966e-05, ...,\n", + " 5.9231311e-06, -1.4610391e-04, -3.1590032e-05], dtype=float32), 1.4187708333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_309.wav', 'Ich gehe dann mal kicken.', 25, array([-7.8975077e-05, -5.1333802e-05, 2.1718148e-05, ...,\n", + " -9.8718847e-06, 0.0000000e+00, 1.7769393e-05], dtype=float32), 1.6743229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_313.wav', 'Versuchen Sie es später noch einmal!', 37, array([1.7769393e-04, 1.5597578e-04, 7.7000703e-05, ..., 1.7769393e-05,\n", + " 2.5666901e-05, 0.0000000e+00], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_327.wav', 'Ulrike muss es ja wissen.', 25, array([ 1.3820640e-05, -4.3436296e-05, -2.5666901e-05, ...,\n", + " 8.0949460e-05, 3.1590032e-05, -1.5795016e-05], dtype=float32), 1.5553645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_329.wav', 'So viele schon?', 15, array([-1.1451387e-04, -9.4770097e-05, 1.3820640e-05, ...,\n", + " -9.8718847e-06, 7.8975081e-06, 3.3564411e-05], dtype=float32), 1.32625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_339.wav', 'Noch nicht.', 11, array([-1.1569849e-03, -1.1234205e-03, -1.1056511e-03, ...,\n", + " -4.1461917e-05, -1.9743770e-06, -2.3692524e-05], dtype=float32), 0.9473229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_351.wav', 'So alt bin ich dann auch wieder nicht.', 38, array([ 1.0957792e-03, 8.6082838e-04, 5.8836438e-04, ...,\n", + " -7.7000703e-05, -1.0661636e-04, -5.3308180e-05], dtype=float32), 1.9100520833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_352.wav', 'Diese Gelegenheit kann man nutzen.', 34, array([ 1.2043700e-04, 1.9743769e-05, 7.5026328e-05, ...,\n", + " 3.1590032e-05, 6.5154440e-05, -5.1333802e-05], dtype=float32), 1.9981770833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_361.wav', 'Bist neidisch, was?', 19, array([ 
3.9487539e-05, 1.7769393e-05, -2.9615656e-05, ...,\n", + " -5.9231311e-06, 1.9743770e-06, -3.1590032e-05], dtype=float32), 1.4848645833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_364.wav', 'Puh, das Quiz ist schwer!', 25, array([-5.9231312e-05, -6.9103196e-05, -8.2923834e-05, ...,\n", + " 1.3623202e-04, 1.3030888e-04, 2.1520710e-04], dtype=float32), 1.9497083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_371.wav', 'Alles klärchen!', 16, array([-3.9487541e-06, 3.3564411e-05, 1.5795016e-05, ...,\n", + " 5.1333802e-05, 6.1205690e-05, 3.5538786e-05], dtype=float32), 1.5068854166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_392.wav', 'Zeig mal dein Piercing!', 23, array([ 5.7256933e-05, 1.3820640e-05, 3.5538786e-05, ...,\n", + " -6.1205690e-05, -9.8718847e-06, 5.5282559e-05], dtype=float32), 1.7712604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_393.wav', 'Parlieren Sie doch im Park!', 27, array([-7.1077571e-05, -5.3308180e-05, -5.7256933e-05, ...,\n", + " 1.5795015e-04, 1.1253949e-04, 1.0069323e-04], dtype=float32), 1.8505729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_400.wav', 'Hä, was? 
400', 13, array([ 3.5538786e-04, 2.7443841e-04, 2.5469463e-04, ...,\n", + " -6.3180065e-05, -1.7769393e-05, -5.9231311e-06], dtype=float32), 0.98696875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_431.wav', 'Tun Sie nicht so überrascht!', 29, array([-2.2310461e-04, -2.6259213e-04, -3.0800281e-04, ...,\n", + " -7.7000703e-05, -1.0661636e-04, -1.1451387e-04], dtype=float32), 1.6743229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_438.wav', 'Was lernen wir daraus?', 22, array([ 3.35644108e-05, -7.70007027e-05, -7.30519532e-05, ...,\n", + " -1.02667604e-04, -8.68725911e-05, -2.76412793e-05], dtype=float32), 1.9078541666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_439.wav', 'Was will sie denn noch?', 23, array([ 5.3308180e-05, 5.7256933e-05, -3.9487539e-05, ...,\n", + " -3.9487541e-06, 5.5282559e-05, 6.9103196e-05], dtype=float32), 1.7624479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_441.wav', 'Ich stecke fest.', 16, array([-1.6584767e-04, -1.5795015e-04, -1.3030888e-04, ...,\n", + " 9.2795723e-05, 7.5026328e-05, 7.5026328e-05], dtype=float32), 1.2976041666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_451.wav', 'Es riecht nach Sonnencreme.', 27, array([-4.1461917e-05, -3.7513164e-05, 2.1718148e-05, ...,\n", + " -2.7641279e-05, -1.0661636e-04, -1.0069323e-04], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_452.wav', 'Da entlang!', 11, array([-1.7769393e-05, 5.9231311e-06, 1.7769393e-05, ...,\n", + " -7.8975081e-06, 7.8975081e-06, 0.0000000e+00], dtype=float32), 0.97815625)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_453.wav', 'Tja, Thaddäus!', 15, array([-9.87188541e-05, -1.46103906e-04, -1.24385755e-04, ...,\n", + " 1.02667604e-04, 1.97437703e-06, -2.76412793e-05], dtype=float32), 1.6787395833333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_458.wav', 'Das Leben 
ist voller Ã\\x9cberraschungen.', 37, array([-3.9487539e-05, 1.1846262e-05, -1.3820640e-05, ...,\n", + " 6.1205690e-05, 3.1590032e-05, 1.9743770e-06], dtype=float32), 1.9739375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_460.wav', 'Danke der Nachfrage!460', 23, array([ 8.29238343e-05, 1.16488241e-04, 9.67444721e-05, ...,\n", + " -1.12539492e-04, -1.08590735e-04, -1.42155142e-04], dtype=float32), 1.4275833333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_465.wav', 'Ich will auch so ein Pferd.', 27, array([ 0.0000000e+00, 1.9743770e-06, -3.1590032e-05, ...,\n", + " 7.8975077e-05, -3.9487539e-05, -5.7256933e-05], dtype=float32), 1.5404791666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_473.wav', 'Was hat das zu bedeuten?', 24, array([-1.26360130e-04, -1.08590735e-04, -1.16488241e-04, ...,\n", + " 8.29238343e-05, 2.36925243e-05, -1.57950162e-05], dtype=float32), 1.8241354166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_492.wav', 'Die Narkose wirkt nicht.', 24, array([ 1.7571956e-04, 1.6782204e-04, 7.8975077e-05, ...,\n", + " 3.1590032e-05, -2.1718148e-05, -2.7641279e-05], dtype=float32), 1.965125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_495.wav', 'Dein Bruder ist echt krass drauf.', 33, array([-7.10775712e-05, -3.35644108e-05, -2.17181478e-05, ...,\n", + " 1.16488241e-04, 1.02667604e-04, 7.89750775e-05], dtype=float32), 1.9563125)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_498.wav', 'Das behaupten alle.', 19, array([-1.1253949e-04, -1.1846262e-04, -9.8718854e-05, ...,\n", + " 5.5282559e-05, -1.1846262e-05, 4.5410670e-05], dtype=float32), 1.3658958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_500.wav', 'So einfach ist es nicht.', 24, array([-0.00019349, -0.00019744, -0.00022113, ..., -0.00021521,\n", + " -0.0002231 , -0.00020534], dtype=float32), 1.7976979166666667)\n", + 
"('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_503.wav', 'Das gibt ihm den Rest.', 22, array([-3.94875406e-06, -6.71288217e-05, -1.20436998e-04, ...,\n", + " 1.04641986e-04, 1.24385755e-04, 1.14513867e-04], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_507.wav', 'Was fällt euch ein?', 20, array([9.8718854e-05, 9.0821341e-05, 6.7128822e-05, ..., 1.7374518e-04,\n", + " 2.0730958e-04, 1.5795015e-04], dtype=float32), 1.5421458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_511.wav', 'Lass die Glucke in Ruhe!', 24, array([2.6851529e-04, 2.3692525e-04, 8.2923834e-05, ..., 9.2795723e-05,\n", + " 6.3180065e-05, 6.1205690e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_512.wav', 'Wieso denn das nicht?', 21, array([-3.9487541e-06, -2.5666901e-05, -6.9103196e-05, ...,\n", + " 3.1590032e-05, -1.9743770e-06, 1.3820640e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_542.wav', 'Na gut, ich komme mit.', 22, array([ 6.3180065e-05, -3.9487541e-06, 4.3436296e-05, ...,\n", + " -6.9103196e-05, -6.5154440e-05, 7.8975081e-06], dtype=float32), 1.8329479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_543.wav', 'Entschuldige dich bei ihr.', 26, array([-9.6744472e-05, -7.8975077e-05, -5.1333802e-05, ...,\n", + " -7.7000703e-05, -1.2241138e-04, -5.9231312e-05], dtype=float32), 1.7624479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_544.wav', 'Das reimt sich ja.', 18, array([ 1.1056512e-04, 8.4898209e-05, 1.1648824e-04, ...,\n", + " -9.0821341e-05, -1.1451387e-04, -1.1253949e-04], dtype=float32), 1.25134375)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_549.wav', 'Sie kamen, um zu bleiben.', 25, array([ 3.9487539e-05, 7.8975081e-06, 3.3564411e-05, ...,\n", + " 2.1718148e-05, -2.7641279e-05, -9.6744472e-05], dtype=float32), 
1.8593854166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_555.wav', 'Sie nimmt kein Blatt vor den Mund.', 34, array([-1.1569849e-03, -7.0287823e-04, -5.3308180e-05, ...,\n", + " 2.5666901e-05, 1.5795016e-05, -1.9743769e-05], dtype=float32), 1.8021041666666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_556.wav', 'Hoffentlich geht es ihm gut.', 28, array([1.0187785e-03, 1.1372411e-03, 1.2616270e-03, ..., 3.5538786e-05,\n", + " 7.8975081e-06, 5.9231312e-05], dtype=float32), 1.9342916666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_557.wav', 'Vergiss deine Schoner nicht!', 28, array([ 3.9487539e-05, -5.5282559e-05, -2.0336083e-04, ...,\n", + " -6.9103196e-05, -7.1077571e-05, -7.1077571e-05], dtype=float32), 1.8461666666666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_590.wav', 'Wenn du schon so fragst!', 24, array([ 1.4610391e-04, 1.4807828e-04, 1.7966831e-04, ...,\n", + " 1.7769393e-05, -4.3436296e-05, -2.7641279e-05], dtype=float32), 1.8329479166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_595.wav', 'Was muss ich einkaufen?', 23, array([ 0.00016387, 0.00012636, 0.00011254, ..., -0.00010464,\n", + " -0.00011649, -0.00010464], dtype=float32), 1.6038229166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_599.wav', 'Der tut nichts!', 15, array([ 9.2795723e-05, 6.1205690e-05, 2.5666901e-05, ...,\n", + " -1.1648824e-04, -9.8718854e-05, -7.8975077e-05], dtype=float32), 1.16321875)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_605.wav', 'Natürlich war es das.', 22, array([3.1590032e-05, 1.9743769e-05, 7.8975077e-05, ..., 1.4610391e-04,\n", + " 1.6782204e-04, 1.4412952e-04], dtype=float32), 1.5157083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_612.wav', 'Sprechen Sie deutsch?', 21, array([ 2.05335207e-04, 1.91514569e-04, 1.57950155e-04, ...,\n", + " 2.96156559e-05, -6.31800649e-05, -1.02667604e-04], 
dtype=float32), 1.5157083333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_616.wav', 'Gleich hole ich Anne ab.', 24, array([6.3180065e-05, 7.1077571e-05, 1.2636013e-04, ..., 2.1718148e-05,\n", + " 3.1590032e-05, 1.3820640e-05], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_646.wav', 'Ich bin ganz hin und weg!', 25, array([-3.5143911e-04, -2.5666901e-04, -1.6979642e-04, ...,\n", + " -4.3436296e-05, -6.1205690e-05, 4.3436296e-05], dtype=float32), 1.7800729166666667)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_647.wav', 'Frische Luft tut gut.', 21, array([ 1.04641986e-04, 1.97437703e-06, -7.89750811e-06, ...,\n", + " 8.48982090e-05, 1.38206397e-05, -7.89750811e-06], dtype=float32), 1.6655104166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_652.wav', 'Nein, du Genie!', 15, array([1.3623202e-04, 1.2833450e-04, 1.2833450e-04, ..., 2.7641279e-05,\n", + " 4.5410670e-05, 5.1333802e-05], dtype=float32), 1.5068958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_675.wav', 'Das kann doch wohl nicht wahr sein!', 35, array([-1.9743769e-05, -3.3564411e-05, 3.1590032e-05, ...,\n", + " 9.2795723e-05, 9.6744472e-05, 1.2043700e-04], dtype=float32), 1.9827604166666666)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_684.wav', 'Jedes Kind weiÃ\\x9f das.', 21, array([-1.204370e-04, -1.382064e-04, -9.674447e-05, ..., -1.461039e-04,\n", + " -1.382064e-04, -8.489821e-05], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_711.wav', 'Die werden ja nicht schlecht.', 29, array([-1.3030888e-04, -1.0069323e-04, -8.2923834e-05, ...,\n", + " -1.3228325e-04, -1.1253949e-04, -9.6744472e-05], dtype=float32), 1.4716458333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_719.wav', 'Das Leben ist schön!', 21, array([ 2.5666901e-05, 4.7385049e-05, 2.9615656e-05, ...,\n", + " 
-3.3564411e-05, 3.3564411e-05, 7.8975077e-05], dtype=float32), 1.2953958333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_720.wav', 'Was machst du jetzt?', 20, array([-1.5597578e-04, -1.2833450e-04, -1.3425764e-04, ...,\n", + " -5.3308180e-05, -3.9487541e-06, 3.9487541e-06], dtype=float32), 1.4099583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_728.wav', 'Falsche Antwort.', 16, array([-5.1333802e-05, -1.1846262e-05, 9.8718847e-06, ...,\n", + " -9.8718847e-06, -4.7385049e-05, -5.3308180e-05], dtype=float32), 1.3923333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_743.wav', 'Oder wir gehen Burger essen.', 28, array([-2.9615656e-05, -4.7385049e-05, -3.1590032e-05, ...,\n", + " -8.2923834e-05, -5.1333802e-05, 6.3180065e-05], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_744.wav', 'Lasst mich allein!', 18, array([-0.00018757, -0.00018757, -0.00024877, ..., -0.00011846,\n", + " -0.00011057, -0.00013031], dtype=float32), 1.7007604166666668)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_745.wav', 'Da sind wir wieder.', 19, array([7.2459638e-04, 7.7395578e-04, 8.3911023e-04, ..., 0.0000000e+00,\n", + " 1.3820640e-05, 1.7769393e-05], dtype=float32), 1.4804583333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_754.wav', 'So weit, so gut.', 16, array([-1.7769393e-05, -3.5538786e-05, 3.5538786e-05, ...,\n", + " 9.8718847e-06, -5.3308180e-05, -4.3436296e-05], dtype=float32), 1.5245208333333333)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_756.wav', 'Alles war voller Qualm.', 23, array([ 6.3180065e-05, 2.9615656e-05, 3.7513164e-05, ...,\n", + " -3.1590032e-05, -3.3564411e-05, 2.1718148e-05], dtype=float32), 1.5333333333333334)\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_9_FINAL/9_765.wav', 'Fick dich!', 10, array([-3.9487539e-05, -8.0949460e-05, -5.7256933e-05, ...,\n", + " 3.9487539e-05, 
7.8975077e-05, 9.4770097e-05], dtype=float32), 0.9076666666666666)\n" + ] + } + ], + "source": [ + "# print clips shorter than 2 sec\n", + "for item in data:\n", + " if item[-1] < 2:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sec_per_chars = []\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " sec_per_chars.append(sec_per_char)\n", + "# sec_per_char /= len(data)\n", + "# print(sec_per_char)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Average durations per char: 0.07641993439576344\n", + " > STD duration per char: 0.015251748851166484\n" + ] + } + ], + "source": [ + "mean = np.mean(sec_per_chars)\n", + "std = np.std(sec_per_chars)\n", + "print(\" > Average durations per char: \", mean)\n", + "print(\" > STD duration per char: \", std)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# fit a distribution\n", + "dist = norm(mean, std)\n", + "\n", + "# find irregular instances long or short voice durations\n", + "items =[]\n", + "pdfs = []\n", + "for item in data:\n", + " text = item[1]\n", + " dur = item[-1]\n", + " sec_per_char = dur / len(text)\n", + " pdf = norm.pdf(sec_per_char)\n", + " pdfs.append(pdf)\n", + " items.append(item)\n", + "# if pdf < 0.395:\n", + "# print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot pdf values too see outliers\n", + "plt.figure(figsize=[16,16])\n", + "plt.plot(pdfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_11_FINAL/11_227.wav', 'Q-R-S-T-U-V-W-X-Y-Z macht es komplett!', 38, array([-4.0032621e-04, -3.3042193e-04, -3.4537757e-04, ...,\n", + " 7.7704317e-06, 2.7401828e-05, 7.1041533e-05], dtype=float32), 11.323739583333333) 0.38161673291429454\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_14_FINAL/14_496.wav', 'Ist der Kuli blau?', 18, array([ 1.2363373e-05, -3.6298752e-05, 2.1456377e-05, ...,\n", + " 3.9692618e-06, -6.7328816e-05, -9.5399046e-05], dtype=float32), 5.530666666666667) 0.38054811432758695\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_17_FINAL/17_426.wav', 'H-I-J-K-L-M-N-O-P!', 18, array([ 4.7872534e-05, -3.4164757e-05, -2.1835160e-04, ...,\n", + " -4.3899294e-05, -7.5021897e-05, -3.4489829e-05], dtype=float32), 11.167979166666667) 0.32909346861901806\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav', 'Kann ich mich irgendwie revanchieren?', 37, array([-5.1586820e-05, -9.1837741e-05, -9.9342957e-05, ...,\n", + " -1.4234778e-04, -1.2327779e-04, -1.4810068e-04], dtype=float32), 9.728) 0.3853891360487213\n", + "('/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_5_FINAL/5_41.wav', 'Ja, eben.', 9, array([ 8.6438486e-05, 1.5554321e-04, 1.1511238e-04, ...,\n", + " -1.3761004e-05, -2.3534812e-05, -5.6318945e-06], dtype=float32), 2.1033333333333335) 0.38819509492217963\n" + ] + } + ], + "source": [ + "# print outliers\n", + "threshold = 0.39\n", + "for item, pdf in zip(items, pdfs):\n", + " if pdf < threshold:\n", + " print(item, pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + 
"metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Audio\n", + "Audio(\"/home/erogol/Data/Mozilla_DE_Thomas3/BATCH_2_FINAL/2_119.wav\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Plot Dataset Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs mean audio duration\")\n", + "plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs median audio duration\")\n", + "plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs STD\")\n", + "plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"text length vs # instances\")\n", + "plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Check words frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "w_count_df = pd.DataFrame.from_dict(w_count, orient='index')\n", + "w_count_df.sort_values(0, ascending=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
die3066
der2362
das1794
ist1767
nicht1467
......
wertvollsten,1
blutgruppe1
gelenkschmerzen1
entgeltbefreiung1
anrã¼cken.1
\n", + "

27102 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " 0\n", + "die 3066\n", + "der 2362\n", + "das 1794\n", + "ist 1767\n", + "nicht 1467\n", + "... ...\n", + "wertvollsten, 1\n", + "blutgruppe 1\n", + "gelenkschmerzen 1\n", + "entgeltbefreiung 1\n", + "anrã¼cken. 1\n", + "\n", + "[27102 rows x 1 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w_count_df" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "18" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check a certain word\n", + "w_count_df.at['auto', 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "Collapsed": "false" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:214: RuntimeWarning: Glyph 159 missing from current font.\n", + " font.set_text(s, 0.0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:214: RuntimeWarning: Glyph 156 missing from current font.\n", + " font.set_text(s, 0.0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:183: RuntimeWarning: Glyph 159 missing from current font.\n", + " font.set_text(s, 0, flags=flags)\n", + "/home/erogol/miniconda3/lib/python3.7/site-packages/matplotlib-3.2.0rc3-py3.7-linux-x86_64.egg/matplotlib/backends/backend_agg.py:183: RuntimeWarning: Glyph 156 missing from current font.\n", + " font.set_text(s, 0, flags=flags)\n" + ] 
+ }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# fequency bar plot - it takes time!!\n", + "w_count_df.plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb similarity index 99% rename from dataset_analysis/AnalyzeDataset.ipynb rename to notebooks/dataset_analysis/AnalyzeDataset.ipynb index 3ed54ded..62870fdc 100644 --- a/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -27,7 +27,7 @@ "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", - "from TTS.datasets.preprocess import *\n", + "from TTS.tts.datasets.preprocess import *\n", "%matplotlib inline" ] }, diff --git a/dataset_analysis/CheckDatasetSNR.ipynb b/notebooks/dataset_analysis/CheckDatasetSNR.ipynb similarity index 100% rename from dataset_analysis/CheckDatasetSNR.ipynb rename to notebooks/dataset_analysis/CheckDatasetSNR.ipynb diff --git a/dataset_analysis/README.md b/notebooks/dataset_analysis/README.md similarity index 100% rename from dataset_analysis/README.md rename to notebooks/dataset_analysis/README.md diff --git a/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py similarity index 100% rename from dataset_analysis/analyze.py rename to notebooks/dataset_analysis/analyze.py diff --git 
a/run_tests.sh b/run_tests.sh new file mode 100755 index 00000000..e5e46476 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,10 @@ +# tests +nosetests tests -x + +# runtime tests +./tests/test_server_package.sh +./tests/test_tts_train.sh +./tests/test_vocoder_train.sh + +# linter check +cardboardlinter --refspec master \ No newline at end of file diff --git a/server/README.md b/server/README.md deleted file mode 100644 index 3c65c961..00000000 --- a/server/README.md +++ /dev/null @@ -1,47 +0,0 @@ -## TTS example web-server - -You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model). - -Instructions below are based on a Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work. - -#### Development server: - -##### Using server.py -If you have the environment set already for TTS, then you can directly call ```server.py```. - -##### Using .whl -1. apt-get install -y espeak libsndfile1 python3-venv -2. python3 -m venv /tmp/venv -3. source /tmp/venv/bin/activate -4. pip install -U pip setuptools wheel -5. pip install -U https//example.com/url/to/python/package.whl -6. python -m TTS.server.server - -You can now open http://localhost:5002 in a browser - -#### Running with nginx/uwsgi: - -1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv -2. python3 -m venv /tmp/venv -3. source /tmp/venv/bin/activate -4. pip install -U pip setuptools wheel -5. pip install -U https//example.com/url/to/python/package.whl -6. 
curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip -7. unzip *-nginx-uwsgi.zip -8. cp tts_site_nginx /etc/nginx/sites-enabled/default -9. service nginx restart -10. uwsgi --ini uwsgi.ini - -You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx). -Configure number of workers (number of requests that will be processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting. - -#### Creating a server package with an embedded model - -[setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command: - -- `--checkpoint ` - path to model checkpoint file you want to embed in the package -- `--model_config ` - path to corresponding config.json file for the checkpoint - -To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`. - -A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it. diff --git a/server/__init__.py b/server/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/server/conf.json b/server/conf.json deleted file mode 100644 index 00045365..00000000 --- a/server/conf.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder - "tts_file":"best_model.pth.tar", // tts checkpoint file - "tts_config":"config.json", // tts config.json file - "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. - "vocoder_config":null, - "vocoder_file": null, - "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. 
- "wavernn_path":null, // wavernn model root path - "wavernn_file":null, // wavernn checkpoint file name - "wavernn_config": null, // wavernn config file - "is_wavernn_batched":true, - "port": 5002, - "use_cuda": true, - "debug": true -} diff --git a/server/server.py b/server/server.py deleted file mode 100644 index bd23ea9c..00000000 --- a/server/server.py +++ /dev/null @@ -1,86 +0,0 @@ -#!flask/bin/python -import argparse -import os - -from flask import Flask, request, render_template, send_file -from TTS.server.synthesizer import Synthesizer - - -def create_argparser(): - def convert_boolean(x): - return x.lower() in ['true', '1', 'yes'] - - parser = argparse.ArgumentParser() - parser.add_argument('--tts_checkpoint', type=str, help='path to TTS checkpoint file') - parser.add_argument('--tts_config', type=str, help='path to TTS config.json file') - parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model') - parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. 
If this is not passed, model uses Griffin-Lim for synthesis.') - parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.') - parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.') - parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.') - parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.') - parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.') - parser.add_argument('--port', type=int, default=5002, help='port to listen on.') - parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.') - parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.') - return parser - - -synthesizer = None - -embedded_models_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model') - -embedded_tts_folder = os.path.join(embedded_models_folder, 'tts') -tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar') -tts_config_file = os.path.join(embedded_tts_folder, 'config.json') - -embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder') -vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar') -vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json') - -# These models are soon to be deprecated -embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn') -wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar') -wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json') - -args = create_argparser().parse_args() - -# If these were not specified in the CLI args, use default values with embedded model files -if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): - 
args.tts_checkpoint = tts_checkpoint_file -if not args.tts_config and os.path.isfile(tts_config_file): - args.tts_config = tts_config_file - -if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file): - args.vocoder_checkpoint = vocoder_checkpoint_file -if not args.vocoder_config and os.path.isfile(vocoder_config_file): - args.vocoder_config = vocoder_config_file - -if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file): - args.wavernn_checkpoint = wavernn_checkpoint_file -if not args.wavernn_config and os.path.isfile(wavernn_config_file): - args.wavernn_config = wavernn_config_file - -synthesizer = Synthesizer(args) - -app = Flask(__name__) - -@app.route('/') -def index(): - return render_template('index.html') - - -@app.route('/api/tts', methods=['GET']) -def tts(): - text = request.args.get('text') - print(" > Model input: {}".format(text)) - data = synthesizer.tts(text) - return send_file(data, mimetype='audio/wav') - - -def main(): - app.run(debug=args.debug, host='0.0.0.0', port=args.port) - - -if __name__ == '__main__': - main() diff --git a/server/synthesizer.py b/server/synthesizer.py deleted file mode 100644 index 0f743d87..00000000 --- a/server/synthesizer.py +++ /dev/null @@ -1,194 +0,0 @@ -import io -import sys -import time - -import numpy as np -import torch -import yaml -import pysbd - -from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_config -from TTS.utils.generic_utils import setup_model -from TTS.utils.speakers import load_speaker_mapping -from TTS.vocoder.utils.generic_utils import setup_generator -# pylint: disable=unused-wildcard-import -# pylint: disable=wildcard-import -from TTS.utils.synthesis import * - -from TTS.utils.text import make_symbols, phonemes, symbols - - -class Synthesizer(object): - def __init__(self, config): - self.wavernn = None - self.vocoder_model = None - self.config = config - print(config) - self.seg = self.get_segmenter("en") - self.use_cuda = 
self.config.use_cuda - if self.use_cuda: - assert torch.cuda.is_available(), "CUDA is not availabe on this machine." - self.load_tts(self.config.tts_checkpoint, self.config.tts_config, - self.config.use_cuda) - if self.config.vocoder_checkpoint: - self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda) - if self.config.wavernn_lib_path: - self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint, - self.config.wavernn_config, self.config.use_cuda) - - @staticmethod - def get_segmenter(lang): - return pysbd.Segmenter(language=lang, clean=True) - - def load_tts(self, tts_checkpoint, tts_config, use_cuda): - # pylint: disable=global-statement - global symbols, phonemes - - print(" > Loading TTS model ...") - print(" | > model config: ", tts_config) - print(" | > checkpoint file: ", tts_checkpoint) - - self.tts_config = load_config(tts_config) - self.use_phonemes = self.tts_config.use_phonemes - self.ap = AudioProcessor(**self.tts_config.audio) - - if 'characters' in self.tts_config.keys(): - symbols, phonemes = make_symbols(**self.tts_config.characters) - - if self.use_phonemes: - self.input_size = len(phonemes) - else: - self.input_size = len(symbols) - # TODO: fix this for multi-speaker model - load speakers - if self.config.tts_speakers is not None: - self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) - num_speakers = len(self.tts_speakers) - else: - num_speakers = 0 - self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) - # load model state - cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) - # load the model - self.tts_model.load_state_dict(cp['model']) - if use_cuda: - self.tts_model.cuda() - self.tts_model.eval() - self.tts_model.decoder.max_decoder_steps = 3000 - if 'r' in cp: - self.tts_model.decoder.set_r(cp['r']) - print(f" > model reduction factor: {cp['r']}") - - def load_vocoder(self, model_file, model_config, 
use_cuda): - self.vocoder_config = load_config(model_config) - self.vocoder_model = setup_generator(self.vocoder_config) - self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"]) - self.vocoder_model.remove_weight_norm() - self.vocoder_model.inference_padding = 0 - self.vocoder_config = load_config(model_config) - - if use_cuda: - self.vocoder_model.cuda() - self.vocoder_model.eval() - - def load_wavernn(self, lib_path, model_file, model_config, use_cuda): - # TODO: set a function in wavernn code base for model setup and call it here. - sys.path.append(lib_path) # set this if WaveRNN is not installed globally - #pylint: disable=import-outside-toplevel - from WaveRNN.models.wavernn import Model - print(" > Loading WaveRNN model ...") - print(" | > model config: ", model_config) - print(" | > model file: ", model_file) - self.wavernn_config = load_config(model_config) - # This is the default architecture we use for our models. - # You might need to update it - self.wavernn = Model( - rnn_dims=512, - fc_dims=512, - mode=self.wavernn_config.mode, - mulaw=self.wavernn_config.mulaw, - pad=self.wavernn_config.pad, - use_aux_net=self.wavernn_config.use_aux_net, - use_upsample_net=self.wavernn_config.use_upsample_net, - upsample_factors=self.wavernn_config.upsample_factors, - feat_dims=80, - compute_dims=128, - res_out_dims=128, - res_blocks=10, - hop_length=self.ap.hop_length, - sample_rate=self.ap.sample_rate, - ).cuda() - - check = torch.load(model_file, map_location="cpu") - self.wavernn.load_state_dict(check['model']) - if use_cuda: - self.wavernn.cuda() - self.wavernn.eval() - - def save_wav(self, wav, path): - # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) - wav = np.array(wav) - self.ap.save_wav(wav, path) - - def split_into_sentences(self, text): - return self.seg.segment(text) - - def tts(self, text, speaker_id=None): - start_time = time.time() - wavs = [] - sens = self.split_into_sentences(text) - print(sens) - speaker_id = 
id_to_torch(speaker_id) - if speaker_id is not None and self.use_cuda: - speaker_id = speaker_id.cuda() - - for sen in sens: - # preprocess the given text - inputs = text_to_seqvec(sen, self.tts_config) - inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda) - inputs = inputs.unsqueeze(0) - # synthesize voice - _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None) - if self.vocoder_model: - # use native vocoder model - vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0) - wav = self.vocoder_model.inference(vocoder_input) - if self.use_cuda: - wav = wav.cpu().numpy() - else: - wav = wav.numpy() - wav = wav.flatten() - elif self.wavernn: - # use 3rd paty wavernn - vocoder_input = None - if self.tts_config.model == "Tacotron": - vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0) - else: - vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0) - if self.use_cuda: - vocoder_input.cuda() - wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) - else: - # use GL - if self.use_cuda: - postnet_output = postnet_output[0].cpu() - else: - postnet_output = postnet_output[0] - postnet_output = postnet_output.numpy() - wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) - - # trim silence - wav = trim_silence(wav, self.ap) - - wavs += list(wav) - wavs += [0] * 10000 - - out = io.BytesIO() - self.save_wav(wavs, out) - - # compute stats - process_time = time.time() - start_time - audio_time = len(wavs) / self.tts_config.audio['sample_rate'] - print(f" > Processing time: {process_time}") - print(f" > Real-time factor: {process_time / audio_time}") - return out diff --git a/server/templates/index.html b/server/templates/index.html deleted file mode 100644 index 45b874a9..00000000 --- a/server/templates/index.html +++ /dev/null @@ -1,111 +0,0 @@ - - - - - - - - - - - Mozilla - 
Text2Speech engine - - - - - - - - - - Fork me on GitHub - - - - - -
-
-
- -

Mozilla TTS

-
    -
- -

- -

-
-
-
- - - - - - - diff --git a/setup.py b/setup.py index 3f02dd09..260aa20f 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.3' +version = '0.0.4' # Adapted from https://github.com/pytorch/pytorch cwd = os.path.dirname(os.path.abspath(__file__)) @@ -112,6 +112,8 @@ setup( name='TTS', version=version, url='https://github.com/mozilla/TTS', + author='Eren Gölge', + author_email='egolge@mozilla.com', description='Text to Speech with Deep Learning', license='MPL-2.0', entry_points={ @@ -119,11 +121,7 @@ setup( 'tts-server = TTS.server.server:main' ] }, - package_dir={'': 'tts_namespace'}, - packages=find_packages('tts_namespace'), - package_data={ - 'TTS': package_data, - }, + packages=find_packages(include=['TTS*']), project_urls={ 'Documentation': 'https://github.com/mozilla/TTS/wiki', 'Tracker': 'https://github.com/mozilla/TTS/issues', diff --git a/speaker_encoder/README.md b/speaker_encoder/README.md deleted file mode 100644 index b6f541f8..00000000 --- a/speaker_encoder/README.md +++ /dev/null @@ -1,18 +0,0 @@ -### Speaker Encoder - -This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. - -With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. - -Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). - -![](umap.png) - -Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. - -To run the code, you need to follow the same flow as in TTS. - -- Define 'config.json' for your needs. 
Note that, audio parameters should match your TTS model. -- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` -- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. -- Watch training on Tensorboard as in TTS diff --git a/speaker_encoder/__init__.py b/speaker_encoder/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/speaker_encoder/compute_embeddings.py b/speaker_encoder/compute_embeddings.py deleted file mode 100644 index bfa377e3..00000000 --- a/speaker_encoder/compute_embeddings.py +++ /dev/null @@ -1,88 +0,0 @@ -import argparse -import glob -import os - -import numpy as np -from tqdm import tqdm - -import torch -from TTS.speaker_encoder.model import SpeakerEncoder -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import load_config - -parser = argparse.ArgumentParser( - description='Compute embedding vectors for each wav file in a dataset. 
') -parser.add_argument( - 'model_path', - type=str, - help='Path to model outputs (checkpoint, tensorboard etc.).') -parser.add_argument( - 'config_path', - type=str, - help='Path to config file for training.', -) -parser.add_argument( - 'data_path', - type=str, - help='Data path for wav files - directory or CSV file') -parser.add_argument( - 'output_path', - type=str, - help='path for training outputs.') -parser.add_argument( - '--use_cuda', type=bool, help='flag to set cuda.', default=False -) -parser.add_argument( - '--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|' -) -args = parser.parse_args() - - -c = load_config(args.config_path) -ap = AudioProcessor(**c['audio']) - -data_path = args.data_path -split_ext = os.path.splitext(data_path) -sep = args.separator - -if len(split_ext) > 0 and split_ext[1].lower() == '.csv': - # Parse CSV - print(f'CSV file: {data_path}') - with open(data_path) as f: - wav_path = os.path.join(os.path.dirname(data_path), 'wavs') - wav_files = [] - print(f'Separator is: {sep}') - for line in f: - components = line.split(sep) - if len(components) != 2: - print("Invalid line") - continue - wav_file = os.path.join(wav_path, components[0] + '.wav') - #print(f'wav_file: {wav_file}') - if os.path.exists(wav_file): - wav_files.append(wav_file) - print(f'Count of wavs imported: {len(wav_files)}') -else: - # Parse all wav files in data_path - wav_path = data_path - wav_files = glob.glob(data_path + '/**/*.wav', recursive=True) - -output_files = [wav_file.replace(wav_path, args.output_path).replace( - '.wav', '.npy') for wav_file in wav_files] - -for output_file in output_files: - os.makedirs(os.path.dirname(output_file), exist_ok=True) - -model = SpeakerEncoder(**c.model) -model.load_state_dict(torch.load(args.model_path)['model']) -model.eval() -if args.use_cuda: - model.cuda() - -for idx, wav_file in enumerate(tqdm(wav_files)): - mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T - mel_spec 
= torch.FloatTensor(mel_spec[None, :, :]) - if args.use_cuda: - mel_spec = mel_spec.cuda() - embedd = model.compute_embedding(mel_spec) - np.save(output_files[idx], embedd.detach().cpu().numpy()) diff --git a/speaker_encoder/config.json b/speaker_encoder/config.json deleted file mode 100644 index 0d0f8f68..00000000 --- a/speaker_encoder/config.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "run_name": "libritts_360-half", - "run_description": "train speaker encoder for libritts 360", - "audio": { - // Audio processing parameters - "num_mels": 40, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - }, - "reinit_layers": [], - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. 
If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. - "output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. 
- "model": { - "input_dim": 40, - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 - }, - "datasets": - [ - { - "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-360/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-100/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/speaker_encoder/dataset.py b/speaker_encoder/dataset.py deleted file mode 100644 index 913b7a6d..00000000 --- a/speaker_encoder/dataset.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import random -from torch.utils.data import Dataset - - -class MyDataset(Dataset): - def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64, - num_utter_per_speaker=10, skip_speakers=False, verbose=False): - """ - Args: - ap (TTS.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - seq_len (int): voice segment length in seconds. - verbose (bool): print diagnostic information. 
- """ - self.items = meta_data - self.sample_rate = ap.sample_rate - self.voice_len = voice_len - self.seq_len = int(voice_len * self.sample_rate) - self.num_speakers_in_batch = num_speakers_in_batch - self.num_utter_per_speaker = num_utter_per_speaker - self.skip_speakers = skip_speakers - self.ap = ap - self.verbose = verbose - self.__parse_items() - if self.verbose: - print("\n > DataLoader initialization") - print(f" | > Number of instances : {len(self.items)}") - print(f" | > Sequence length: {self.seq_len}") - print(f" | > Num speakers: {len(self.speakers)}") - - def load_wav(self, filename): - audio = self.ap.load_wav(filename) - return audio - - def load_data(self, idx): - text, wav_file, speaker_name = self.items[idx] - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - # sample seq_len - - assert text.size > 0, self.items[idx][1] - assert wav.size > 0, self.items[idx][1] - - sample = { - "mel": mel, - "item_idx": self.items[idx][1], - "speaker_name": speaker_name, - } - return sample - - def __parse_items(self): - """ - Find unique speaker ids and create a dict mapping utterances from speaker id - """ - speakers = list({item[-1] for item in self.items}) - self.speaker_to_utters = {} - self.speakers = [] - for speaker in speakers: - speaker_utters = [item[1] for item in self.items if item[2] == speaker] - if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers: - print( - f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}." 
- ) - else: - self.speakers.append(speaker) - self.speaker_to_utters[speaker] = speaker_utters - - def __len__(self): - return int(1e10) - - def __sample_speaker(self): - speaker = random.sample(self.speakers, 1)[0] - if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): - utters = random.choices( - self.speaker_to_utters[speaker], k=self.num_utter_per_speaker - ) - else: - utters = random.sample( - self.speaker_to_utters[speaker], self.num_utter_per_speaker - ) - return speaker, utters - - def __sample_speaker_utterances(self, speaker): - """ - Sample all M utterances for the given speaker. - """ - feats = [] - labels = [] - for _ in range(self.num_utter_per_speaker): - # TODO:dummy but works - while True: - if len(self.speaker_to_utters[speaker]) > 0: - utter = random.sample(self.speaker_to_utters[speaker], 1)[0] - else: - self.speakers.remove(speaker) - speaker, _ = self.__sample_speaker() - continue - wav = self.load_wav(utter) - if wav.shape[0] - self.seq_len > 0: - break - self.speaker_to_utters[speaker].remove(utter) - - offset = random.randint(0, wav.shape[0] - self.seq_len) - mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len]) - feats.append(torch.FloatTensor(mel)) - labels.append(speaker) - return feats, labels - - def __getitem__(self, idx): - speaker, _ = self.__sample_speaker() - return speaker - - def collate_fn(self, batch): - labels = [] - feats = [] - for speaker in batch: - feats_, labels_ = self.__sample_speaker_utterances(speaker) - labels.append(labels_) - feats.extend(feats_) - feats = torch.stack(feats) - return feats.transpose(1, 2), labels diff --git a/speaker_encoder/generic_utils.py b/speaker_encoder/generic_utils.py deleted file mode 100644 index c568d129..00000000 --- a/speaker_encoder/generic_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import datetime -import torch - - -def save_checkpoint(model, optimizer, model_loss, out_path, - current_step, epoch): - checkpoint_path = 
'checkpoint_{}.pth.tar'.format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - 'model': new_state_dict, - 'optimizer': optimizer.state_dict() if optimizer is not None else None, - 'step': current_step, - 'epoch': epoch, - 'GE2Eloss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y"), - } - torch.save(state, checkpoint_path) - - -def save_best_model(model, optimizer, model_loss, best_loss, out_path, - current_step): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - 'model': new_state_dict, - 'optimizer': optimizer.state_dict(), - 'step': current_step, - 'GE2Eloss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = 'best_model.pth.tar' - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format( - model_loss, bestmodel_path)) - torch.save(state, bestmodel_path) - return best_loss \ No newline at end of file diff --git a/speaker_encoder/loss.py b/speaker_encoder/loss.py deleted file mode 100644 index ab290547..00000000 --- a/speaker_encoder/loss.py +++ /dev/null @@ -1,121 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - - -# adapted from https://github.com/cvqluu/GE2E-Loss -class GE2ELoss(nn.Module): - def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"): - """ - Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1] - Accepts an input of size (N, M, D) - where N is the number of speakers in the batch, - M is the number of utterances per speaker, - and D is the dimensionality of the embedding vector (e.g. 
d-vector) - Args: - - init_w (float): defines the initial value of w in Equation (5) of [1] - - init_b (float): definies the initial value of b in Equation (5) of [1] - """ - super(GE2ELoss, self).__init__() - # pylint: disable=E1102 - self.w = nn.Parameter(torch.tensor(init_w)) - # pylint: disable=E1102 - self.b = nn.Parameter(torch.tensor(init_b)) - self.loss_method = loss_method - - assert self.loss_method in ["softmax", "contrast"] - - if self.loss_method == "softmax": - self.embed_loss = self.embed_loss_softmax - if self.loss_method == "contrast": - self.embed_loss = self.embed_loss_contrast - - # pylint: disable=R0201 - def calc_new_centroids(self, dvecs, centroids, spkr, utt): - """ - Calculates the new centroids excluding the reference utterance - """ - excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :])) - excl = torch.mean(excl, 0) - new_centroids = [] - for i, centroid in enumerate(centroids): - if i == spkr: - new_centroids.append(excl) - else: - new_centroids.append(centroid) - return torch.stack(new_centroids) - - def calc_cosine_sim(self, dvecs, centroids): - """ - Make the cosine similarity matrix with dims (N,M,N) - """ - cos_sim_matrix = [] - for spkr_idx, speaker in enumerate(dvecs): - cs_row = [] - for utt_idx, utterance in enumerate(speaker): - new_centroids = self.calc_new_centroids( - dvecs, centroids, spkr_idx, utt_idx - ) - # vector based cosine similarity for speed - cs_row.append( - torch.clamp( - torch.mm( - utterance.unsqueeze(1).transpose(0, 1), - new_centroids.transpose(0, 1), - ) - / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), - 1e-6, - ) - ) - cs_row = torch.cat(cs_row, dim=0) - cos_sim_matrix.append(cs_row) - return torch.stack(cos_sim_matrix) - - # pylint: disable=R0201 - def embed_loss_softmax(self, dvecs, cos_sim_matrix): - """ - Calculates the loss on each embedding $L(e_{ji})$ by taking softmax - """ - N, M, _ = dvecs.shape - L = [] - for j in range(N): - L_row = [] - for i in range(M): - 
L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j]) - L_row = torch.stack(L_row) - L.append(L_row) - return torch.stack(L) - - # pylint: disable=R0201 - def embed_loss_contrast(self, dvecs, cos_sim_matrix): - """ - Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid - """ - N, M, _ = dvecs.shape - L = [] - for j in range(N): - L_row = [] - for i in range(M): - centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i]) - excl_centroids_sigmoids = torch.cat( - (centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]) - ) - L_row.append( - 1.0 - - torch.sigmoid(cos_sim_matrix[j, i, j]) - + torch.max(excl_centroids_sigmoids) - ) - L_row = torch.stack(L_row) - L.append(L_row) - return torch.stack(L) - - def forward(self, dvecs): - """ - Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) - """ - centroids = torch.mean(dvecs, 1) - cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids) - torch.clamp(self.w, 1e-6) - cos_sim_matrix = self.w * cos_sim_matrix + self.b - L = self.embed_loss(dvecs, cos_sim_matrix) - return L.mean() diff --git a/speaker_encoder/model.py b/speaker_encoder/model.py deleted file mode 100644 index b3bd71ff..00000000 --- a/speaker_encoder/model.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -from torch import nn - - -class LSTMWithProjection(nn.Module): - def __init__(self, input_size, hidden_size, proj_size): - super().__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.proj_size = proj_size - self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) - self.linear = nn.Linear(hidden_size, proj_size, bias=False) - - def forward(self, x): - self.lstm.flatten_parameters() - o, (_, _) = self.lstm(x) - return self.linear(o) - - -class SpeakerEncoder(nn.Module): - def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3): - super().__init__() - layers = [] - layers.append(LSTMWithProjection(input_dim, lstm_dim, 
proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) - self._init_layers() - - def _init_layers(self): - for name, param in self.layers.named_parameters(): - if "bias" in name: - nn.init.constant_(param, 0.0) - elif "weight" in name: - nn.init.xavier_normal_(param) - - def forward(self, x): - # TODO: implement state passing for lstms - d = self.layers(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) - return d - - def inference(self, x): - d = self.layers.forward(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) - return d - - def compute_embedding(self, x, num_frames=160, overlap=0.5): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - num_overlap = int(num_frames * overlap) - max_len = x.shape[1] - embed = None - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - frames = x[:, offset:end_offset] - if embed is None: - embed = self.inference(frames) - else: - embed += self.inference(frames) - return embed / cur_iter - - def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): - """ - Generate embeddings for a batch of utterances - x: BxTxD - """ - num_overlap = num_frames * overlap - max_len = x.shape[1] - embed = None - num_iters = seq_lens / (num_frames - num_overlap) - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - frames = x[:, offset:end_offset] - if embed is None: - embed = self.inference(frames) - else: - embed[cur_iter <= num_iters, :] += self.inference( - frames[cur_iter <= num_iters, :, :] - ) - return embed / num_iters - diff --git a/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb b/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb deleted file mode 100644 index 159f040c..00000000 --- 
a/speaker_encoder/notebooks/PlotUmapLibriTTS.ipynb +++ /dev/null @@ -1,325 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Overview\n", - "\n", - "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", - "\n", - "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import glob\n", - "import random\n", - "import numpy as np\n", - "import torch\n", - "import umap\n", - "\n", - "from TTS.speaker_encoder.model import SpeakerEncoder\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config\n", - "\n", - "from bokeh.io import output_notebook, show\n", - "from bokeh.plotting import figure\n", - "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", - "from bokeh.transform import factor_cmap, factor_mark\n", - "from bokeh.palettes import Category10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", - "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", - "\n", - "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You should also adjust all the path 
constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", - "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls -1 $MODEL_RUN_PATH" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check that we did indeed find an embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embed_files[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated 
like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up the embeddings\n", - "\n", - "Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embeds = []\n", - "labels = []\n", - "locations = []\n", - "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", - "\n", - "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", - "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", - "\n", - "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", - " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"Load embeddings with UMAP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = umap.UMAP()\n", - "projection = model.fit_transform(embeds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Interactively charting the data in Bokeh\n", - "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", - "\n", - "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", - "\n", - "File location in the tooltip is given relative to **AUDIO_PATH**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", - " )\n", - "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", - "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", - "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", - "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", - "\n", - "factors = list(set(labels))\n", - "pal_size = max(len(factors), 3)\n", - "pal = Category10[pal_size]\n", - "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", - "\n", - "url = \"http://localhost:8000/@desc\"\n", - "taptool = 
p.select(type=TapTool)\n", - "taptool.callback = OpenURL(url=url)\n", - "\n", - "show(p)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Local server to serve wav files from corpus\n", - "\n", - "This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n", - "\n", - "There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n", - "\n", - "The server will continue to run until stopped. To stop it simply interupt the kernel (ie square button or under Kernel menu)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%cd $AUDIO_PATH\n", - "%pwd\n", - "!python -m http.server" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/speaker_encoder/requirements.txt b/speaker_encoder/requirements.txt deleted file mode 100644 index a486cc45..00000000 --- a/speaker_encoder/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -umap-learn -numpy>=1.17.0 diff --git a/speaker_encoder/train.py b/speaker_encoder/train.py deleted file mode 100644 index f74d0880..00000000 --- a/speaker_encoder/train.py +++ /dev/null @@ -1,252 +0,0 @@ -import argparse -import os -import sys -import time -import traceback - -import torch -from torch.utils.data import DataLoader -from TTS.datasets.preprocess import load_meta_data -from TTS.speaker_encoder.dataset import MyDataset -from TTS.speaker_encoder.loss import GE2ELoss -from TTS.speaker_encoder.model import SpeakerEncoder -from TTS.speaker_encoder.visual import 
plot_embeddings -from TTS.speaker_encoder.generic_utils import save_best_model -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import (create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) -from TTS.utils.io import load_config, copy_config_file -from TTS.utils.training import check_update, NoamLR -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.utils.radam import RAdam - -torch.backends.cudnn.enabled = True -torch.backends.cudnn.benchmark = True -torch.manual_seed(54321) -use_cuda = torch.cuda.is_available() -num_gpus = torch.cuda.device_count() -print(" > Using CUDA: ", use_cuda) -print(" > Number of GPUs: ", num_gpus) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val: - loader = None - else: - dataset = MyDataset(ap, - meta_data_eval if is_val else meta_data_train, - voice_len=1.6, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=verbose) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=c.num_speakers_in_batch, - shuffle=False, - num_workers=c.num_loader_workers, - collate_fn=dataset.collate_fn) - return loader - - -def train(model, criterion, optimizer, scheduler, ap, global_step): - data_loader = setup_loader(ap, is_val=False, verbose=True) - model.train() - epoch_time = 0 - best_loss = float('inf') - avg_loss = 0 - end_time = time.time() - for _, data in enumerate(data_loader): - start_time = time.time() - - # setup input data - inputs = data[0] - loader_time = time.time() - end_time - global_step += 1 - - # setup lr - if c.lr_decay: - scheduler.step() - optimizer.zero_grad() - - # dispatch data to GPU - if use_cuda: - inputs = inputs.cuda(non_blocking=True) - # labels = labels.cuda(non_blocking=True) - - # forward pass model - outputs = model(inputs) - - # loss computation - loss = criterion( - outputs.view(c.num_speakers_in_batch, - outputs.shape[0] // c.num_speakers_in_batch, -1)) - 
loss.backward() - grad_norm, _ = check_update(model, c.grad_clip) - optimizer.step() - - step_time = time.time() - start_time - epoch_time += step_time - - avg_loss = 0.01 * loss.item( - ) + 0.99 * avg_loss if avg_loss != 0 else loss.item() - current_lr = optimizer.param_groups[0]['lr'] - - if global_step % c.steps_plot_stats == 0: - # Plot Training Epoch Stats - train_stats = { - "GE2Eloss": avg_loss, - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time - } - tb_logger.tb_train_epoch_stats(global_step, train_stats) - figures = { - # FIXME: not constant - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), - 10), - } - tb_logger.tb_train_figures(global_step, figures) - - if global_step % c.print_step == 0: - print( - " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), avg_loss, grad_norm, step_time, - loader_time, current_lr), - flush=True) - - # save best model - best_loss = save_best_model(model, optimizer, avg_loss, best_loss, - OUT_PATH, global_step) - - end_time = time.time() - return avg_loss, global_step - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train - global meta_data_eval - - ap = AudioProcessor(**c.audio) - model = SpeakerEncoder(input_dim=40, - proj_dim=128, - lstm_dim=384, - num_lstm_layers=3) - optimizer = RAdam(model.parameters(), lr=c.lr) - criterion = GE2ELoss(loss_method='softmax') - - if args.restore_path: - checkpoint = torch.load(args.restore_path) - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - # optimizer.load_state_dict(checkpoint['optimizer']) - if c.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint['model']) - except KeyError: - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint, c) - 
model.load_state_dict(model_dict) - del model_dict - for group in optimizer.param_groups: - group['lr'] = c.lr - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] - else: - args.restore_step = 0 - - if use_cuda: - model = model.cuda() - criterion.cuda() - - if c.lr_decay: - scheduler = NoamLR(optimizer, - warmup_steps=c.warmup_steps, - last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_meta_data(c.datasets) - - global_step = args.restore_step - train_loss, global_step = train(model, criterion, optimizer, scheduler, ap, - global_step) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--restore_path', - type=str, - help='Path to model outputs (checkpoint, tensorboard etc.).', - default=0) - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - ) - parser.add_argument('--debug', - type=bool, - default=True, - help='Do not verify commit integrity to run training.') - parser.add_argument( - '--data_path', - type=str, - default='', - help='Defines the data path. 
It overwrites config.json.') - parser.add_argument('--output_path', - type=str, - help='path for training outputs.', - default='') - parser.add_argument('--output_folder', - type=str, - default='', - help='folder name for training outputs.') - args = parser.parse_args() - - # setup output paths and read configs - c = load_config(args.config_path) - _ = os.path.dirname(os.path.realpath(__file__)) - if args.data_path != '': - c.data_path = args.data_path - - if args.output_path == '': - OUT_PATH = os.path.join(_, c.output_path) - else: - OUT_PATH = args.output_path - - if args.output_folder == '': - OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug) - else: - OUT_PATH = os.path.join(OUT_PATH, args.output_folder) - - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), - new_fields) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/speaker_encoder/umap.png b/speaker_encoder/umap.png deleted file mode 100644 index ca8aefea..00000000 Binary files a/speaker_encoder/umap.png and /dev/null differ diff --git a/speaker_encoder/visual.py b/speaker_encoder/visual.py deleted file mode 100644 index 68c48f12..00000000 --- a/speaker_encoder/visual.py +++ /dev/null @@ -1,46 +0,0 @@ -import umap -import numpy as np -import matplotlib -import matplotlib.pyplot as plt - -matplotlib.use("Agg") - - -colormap = ( - np.array( - [ - [76, 255, 0], - [0, 127, 70], - [255, 0, 0], - [255, 217, 38], - [0, 135, 255], - [165, 0, 165], - [255, 167, 255], - [0, 255, 255], - [255, 96, 38], - [142, 
76, 0], - [33, 0, 127], - [0, 0, 0], - [183, 183, 183], - ], - dtype=np.float, - ) - / 255 -) - - -def plot_embeddings(embeddings, num_utter_per_speaker): - embeddings = embeddings[: 10 * num_utter_per_speaker] - model = umap.UMAP() - projection = model.fit_transform(embeddings) - num_speakers = embeddings.shape[0] // num_utter_per_speaker - ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker) - colors = [colormap[i] for i in ground_truth] - - fig, ax = plt.subplots(figsize=(16, 10)) - _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) - plt.gca().set_aspect("equal", "datalim") - plt.title("UMAP projection") - plt.tight_layout() - plt.savefig("umap") - return fig diff --git a/synthesize.py b/synthesize.py deleted file mode 100644 index 18048c2f..00000000 --- a/synthesize.py +++ /dev/null @@ -1,182 +0,0 @@ -# pylint: disable=redefined-outer-name, unused-argument -import os -import time -import argparse -import torch -import json -import string - -from TTS.utils.synthesis import synthesis -from TTS.utils.generic_utils import setup_model -from TTS.utils.io import load_config -from TTS.utils.text.symbols import make_symbols, symbols, phonemes -from TTS.utils.audio import AudioProcessor - - -def tts(model, - vocoder_model, - C, - VC, - text, - ap, - ap_vocoder, - use_cuda, - batched_vocoder, - speaker_id=None, - figures=False): - t_1 = time.time() - use_vocoder_model = vocoder_model is not None - waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis( - model, text, C, use_cuda, ap, speaker_id, style_wav=False, - truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, - use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) - - if C.model == "Tacotron" and use_vocoder_model: - postnet_output = ap.out_linear_to_mel(postnet_output.T).T - # correct if there is a scale difference b/w two models - if use_vocoder_model: - postnet_output = ap._denormalize(postnet_output) - postnet_output = 
ap_vocoder._normalize(postnet_output) - vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) - waveform = vocoder_model.generate( - vocoder_input.cuda() if use_cuda else vocoder_input, - batched=batched_vocoder, - target=8000, - overlap=400) - print(" > Run-time: {}".format(time.time() - t_1)) - return alignment, postnet_output, stop_tokens, waveform - - -if __name__ == "__main__": - - global symbols, phonemes - - parser = argparse.ArgumentParser() - parser.add_argument('text', type=str, help='Text to generate speech.') - parser.add_argument('config_path', - type=str, - help='Path to model config file.') - parser.add_argument( - 'model_path', - type=str, - help='Path to model file.', - ) - parser.add_argument( - 'out_path', - type=str, - help='Path to save final wav file. Wav file will be names as the text given.', - ) - parser.add_argument('--use_cuda', - type=bool, - help='Run model on CUDA.', - default=False) - parser.add_argument( - '--vocoder_path', - type=str, - help= - 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).', - default="", - ) - parser.add_argument('--vocoder_config_path', - type=str, - help='Path to vocoder model config file.', - default="") - parser.add_argument( - '--batched_vocoder', - type=bool, - help="If True, vocoder model uses faster batch processing.", - default=True) - parser.add_argument('--speakers_json', - type=str, - help="JSON file for multi-speaker model.", - default="") - parser.add_argument( - '--speaker_id', - type=int, - help="target speaker_id if the model is multi-speaker.", - default=None) - args = parser.parse_args() - - if args.vocoder_path != "": - assert args.use_cuda, " [!] Enable cuda for vocoder." 
- from WaveRNN.models.wavernn import Model as VocoderModel - - # load the config - C = load_config(args.config_path) - C.forward_attn_mask = True - - # load the audio processor - ap = AudioProcessor(**C.audio) - - # if the vocabulary was passed, replace the default - if 'characters' in C.keys(): - symbols, phonemes = make_symbols(**C.characters) - - # load speakers - if args.speakers_json != '': - speakers = json.load(open(args.speakers_json, 'r')) - num_speakers = len(speakers) - else: - num_speakers = 0 - - # load the model - num_chars = len(phonemes) if C.use_phonemes else len(symbols) - model = setup_model(num_chars, num_speakers, C) - cp = torch.load(args.model_path) - model.load_state_dict(cp['model']) - model.eval() - if args.use_cuda: - model.cuda() - model.decoder.set_r(cp['r']) - - # load vocoder model - if args.vocoder_path != "": - VC = load_config(args.vocoder_config_path) - ap_vocoder = AudioProcessor(**VC.audio) - bits = 10 - vocoder_model = VocoderModel(rnn_dims=512, - fc_dims=512, - mode=VC.mode, - mulaw=VC.mulaw, - pad=VC.pad, - upsample_factors=VC.upsample_factors, - feat_dims=VC.audio["num_mels"], - compute_dims=128, - res_out_dims=128, - res_blocks=10, - hop_length=ap.hop_length, - sample_rate=ap.sample_rate, - use_aux_net=True, - use_upsample_net=True) - - check = torch.load(args.vocoder_path) - vocoder_model.load_state_dict(check['model']) - vocoder_model.eval() - if args.use_cuda: - vocoder_model.cuda() - else: - vocoder_model = None - VC = None - ap_vocoder = None - - # synthesize voice - print(" > Text: {}".format(args.text)) - _, _, _, wav = tts(model, - vocoder_model, - C, - VC, - args.text, - ap, - ap_vocoder, - args.use_cuda, - args.batched_vocoder, - speaker_id=args.speaker_id, - figures=False) - - # save the results - file_name = args.text.replace(" ", "_") - file_name = file_name.translate( - str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' - out_path = os.path.join(args.out_path, file_name) - print(" > Saving 
output to {}".format(out_path)) - ap.save_wav(wav, out_path) diff --git a/tests/generic_utils_text.py b/tests/generic_utils_text.py index 228df2df..19c48647 100644 --- a/tests/generic_utils_text.py +++ b/tests/generic_utils_text.py @@ -1,8 +1,8 @@ import unittest import torch as T -from TTS.utils.generic_utils import save_checkpoint, save_best_model -from TTS.layers.tacotron import Prenet +from TTS.tts.utils.generic_utils import save_checkpoint, save_best_model +from TTS.tts.layers.tacotron import Prenet OUT_PATH = '/tmp/test.pth.tar' diff --git a/tests/test_config.json b/tests/inputs/test_config.json similarity index 100% rename from tests/test_config.json rename to tests/inputs/test_config.json diff --git a/config.json b/tests/inputs/test_train_config.json similarity index 83% rename from config.json rename to tests/inputs/test_train_config.json index 23868a33..e43903ce 100644 --- a/config.json +++ b/tests/inputs/test_train_config.json @@ -1,149 +1,151 @@ -{ - "model": "Tacotron2", - "run_name": "ljspeech-ddc-bn", - "run_description": "tacotron2 with ddc and batch-normalization", - - // AUDIO PARAMETERS - "audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. 
LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - // "characters":{ - // "pad": "_", - // "eos": "~", - // "bos": "^", - // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - // "punctuations":"!'(),-.:;? ", - // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" - // }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. 
If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "bn", // "original" or "bn". 
- "prenet_dropout": false, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "original", // 'original' or 'graves' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log training on console. - "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
- - // DATA LOADING - "text_cleaner": "phoneme_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 0, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - - // PATHS - "output_path": "/home/erogol/Models/LJSpeech/", - - // PHONEMES - "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. - "use_gst": false, // TACOTRON ONLY: use global style tokens - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. - [ - { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", - "meta_file_val": null - } - ] - -} - +{ + "model": "Tacotron2", + "run_name": "test_sample_dataset_run", + "run_description": "sample dataset test run", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. 
+ "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of silence of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for trimming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'.
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. 
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable/disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. 
It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 1, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evaluation. + "save_step": 10000, // Number of training steps expected to save training stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/train_outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. + "use_gst": false, // TACOTRON ONLY: use global style tokens + + // DATASETS + "train_portion": 0.1, // dataset portion used for training. 
It is mainly for internal experiments. + "eval_portion": 0.1, // dataset portion used for evaluation. It is mainly for internal experiments. + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] + +} + diff --git a/vocoder/tests/test_config.json b/tests/inputs/test_vocoder_audio_config.json similarity index 100% rename from vocoder/tests/test_config.json rename to tests/inputs/test_vocoder_audio_config.json diff --git a/vocoder/configs/multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json similarity index 89% rename from vocoder/configs/multiband_melgan_config.json rename to tests/inputs/test_vocoder_multiband_melgan_config.json index a89d43bb..c0f552a4 100644 --- a/vocoder/configs/multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -31,7 +31,7 @@ "symmetric_norm": true, // move normalization to range [-1, 1] "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored }, // DISTRIBUTED TRAINING @@ -90,7 +90,7 @@ }, // DATASET - "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", + "data_path": "tests/data/ljspeech/wavs/", "feature_path": null, "seq_len": 16384, "pad_short": 2000, @@ -101,7 +101,7 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. // VALIDATION "run_eval": true, @@ -109,7 +109,7 @@ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. // OPTIMIZER - "epochs": 10000, // total number of epochs to train. + "epochs": 1, // total number of epochs to train. "wd": 0.0, // Weight decay weight. "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 "disc_clip_grad": -1, // Discriminator gradient clipping threshold. @@ -127,7 +127,7 @@ "lr_disc": 1e-4, // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. + "print_step": 1, // Number of steps to log traning on console. "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" @@ -139,6 +139,6 @@ "eval_split_size": 10, // PATHS - "output_path": "/home/erogol/Models/LJSpeech/" + "output_path": "tests/outputs/train_outputs/" } diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json deleted file mode 100644 index 36fac3e5..00000000 --- a/tests/outputs/dummy_model_config.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking", - "run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP", - - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "hop_length": 256, - "win_length": 1024, - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": false, // move normalization to range [-1, 1] - "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! 
- "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - }, - - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], - - "model": "Tacotron2", // one of the model in models/ - "grad_clip": 1, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "windowing": false, // Enables attention windowing. Used only in eval mode. - "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. - "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, - "attention_type": "original", - "attention_heads": 5, - "bidirectional_decoder": false, - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. 
It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "use_gst": false, - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "eval_batch_size":16, - "r": 1, // Number of frames to predict for step. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 10, // Number of steps to log traning on console. - "batch_group_size": 0, //Number of batches to shuffle after bucketing. - - "run_eval": true, - "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument - "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader. - "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader. - "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py - "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. 
- "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers -} - diff --git a/tests/symbols_tests.py b/tests/symbols_tests.py index 4c32c7d6..87ec4a8a 100644 --- a/tests/symbols_tests.py +++ b/tests/symbols_tests.py @@ -1,6 +1,6 @@ import unittest -from TTS.utils.text import phonemes +from TTS.tts.utils.text import phonemes class SymbolsTest(unittest.TestCase): def test_uniqueness(self): #pylint: disable=no-self-use diff --git a/tests/test_audio.py b/tests/test_audio.py index 4b8ee276..a4d69de5 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,7 +1,7 @@ import os import unittest -from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path +from tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_config @@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) -conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) +conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) # pylint: disable=protected-access diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 5404304b..e8a86094 100644 --- a/tests/test_demo_server.py +++ 
b/tests/test_demo_server.py @@ -4,10 +4,11 @@ import unittest import torch as T from TTS.server.synthesizer import Synthesizer -from TTS.tests import get_tests_input_path, get_tests_output_path -from TTS.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.utils.generic_utils import setup_model -from TTS.utils.io import load_config, save_checkpoint +from tests import get_tests_input_path, get_tests_output_path +from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.utils.io import save_checkpoint +from TTS.utils.io import load_config class DemoServerTest(unittest.TestCase): diff --git a/speaker_encoder/tests.py b/tests/test_encoder.py similarity index 97% rename from speaker_encoder/tests.py rename to tests/test_encoder.py index 039833fc..c713a1f1 100644 --- a/speaker_encoder/tests.py +++ b/tests/test_encoder.py @@ -2,12 +2,13 @@ import os import unittest import torch as T +from tests import get_tests_path, get_tests_input_path from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.loss import GE2ELoss from TTS.utils.io import load_config -file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/" +file_path = get_tests_input_path() c = load_config(os.path.join(file_path, "test_config.json")) diff --git a/tests/test_layers.py b/tests/test_layers.py index d7c8829f..e9a36e35 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -1,9 +1,9 @@ import unittest import torch as T -from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder -from TTS.layers.losses import L1LossMasked -from TTS.utils.generic_utils import sequence_mask +from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder +from TTS.tts.layers.losses import L1LossMasked +from TTS.tts.utils.generic_utils import sequence_mask # pylint: disable=unused-variable diff --git a/tests/test_loader.py b/tests/test_loader.py index 52d24c7a..978b29b7 100644 --- 
a/tests/test_loader.py +++ b/tests/test_loader.py @@ -4,18 +4,18 @@ import shutil import torch import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path from torch.utils.data import DataLoader from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor -from TTS.datasets import TTSDataset -from TTS.datasets.preprocess import ljspeech +from TTS.tts.datasets import TTSDataset +from TTS.tts.datasets.preprocess import ljspeech #pylint: disable=unused-variable -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(file_path, "outputs/loader_tests/") +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) ok_ljspeech = os.path.exists(c.data_path) DATA_EXIST = True diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 993ee495..56f79402 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -1,8 +1,8 @@ import unittest import os -from TTS.tests import get_tests_input_path +from tests import get_tests_input_path -from TTS.datasets.preprocess import common_voice +from TTS.tts.datasets.preprocess import common_voice class TestPreprocessors(unittest.TestCase): diff --git a/__init__.py b/tests/test_stft_torch.py similarity index 100% rename from __init__.py rename to tests/test_stft_torch.py diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index ae9f20a2..41b5f039 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -1,14 +1,14 @@ -import os import copy -import torch +import os import unittest -import numpy as np -from torch import optim -from torch import nn +import torch +from tests import get_tests_input_path +from torch import nn, optim + +from TTS.tts.layers.losses import MSELossMasked +from 
TTS.tts.models.tacotron2 import Tacotron2 from TTS.utils.io import load_config -from TTS.layers.losses import MSELossMasked -from TTS.models.tacotron2 import Tacotron2 #pylint: disable=unused-variable @@ -16,8 +16,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) class TacotronTrainTest(unittest.TestCase): diff --git a/tf/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py similarity index 95% rename from tf/tests/test_tacotron2_tf_model.py rename to tests/test_tacotron2_tf_model.py index 03db194a..472c1ebf 100644 --- a/tf/tests/test_tacotron2_tf_model.py +++ b/tests/test_tacotron2_tf_model.py @@ -5,9 +5,11 @@ import numpy as np import tensorflow as tf tf.get_logger().setLevel('INFO') +from tests import get_tests_path, get_tests_input_path, get_tests_output_path + from TTS.utils.io import load_config -from TTS.tf.models.tacotron2 import Tacotron2 -from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model +from TTS.tts.tf.models.tacotron2 import Tacotron2 +from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model #pylint: disable=unused-variable @@ -15,8 +17,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/') -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) class TacotronTFTrainTest(unittest.TestCase): diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 2bbb3c8d..f8d4a4d7 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -1,13 
+1,14 @@ -import os import copy -import torch +import os import unittest -from torch import optim -from torch import nn +import torch +from tests import get_tests_input_path +from torch import nn, optim + +from TTS.tts.layers.losses import L1LossMasked +from TTS.tts.models.tacotron import Tacotron from TTS.utils.io import load_config -from TTS.layers.losses import L1LossMasked -from TTS.models.tacotron import Tacotron #pylint: disable=unused-variable @@ -15,8 +16,7 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -file_path = os.path.dirname(os.path.realpath(__file__)) -c = load_config(os.path.join(file_path, 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) def count_parameters(model): diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 93edabe7..992f0a17 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -3,12 +3,12 @@ import os # pylint: disable=wildcard-import # pylint: disable=unused-import import unittest -from TTS.utils.text import * -from TTS.tests import get_tests_path +from tests import get_tests_input_path +from TTS.tts.utils.text import * +from tests import get_tests_path from TTS.utils.io import load_config -TESTS_PATH = get_tests_path() -conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) +conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) def test_phoneme_to_sequence(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" 
@@ -19,7 +19,7 @@ def test_phoneme_to_sequence(): sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" - assert text_hat == text_hat_with_params == gt + assert text_hat == text_hat_with_params == gt # multiple punctuations text = "Be a voice, not an! echo?" diff --git a/datasets/__init__.py b/tests/test_train_tts.py similarity index 100% rename from datasets/__init__.py rename to tests/test_train_tts.py diff --git a/tests/test_tts_train.sh b/tests/test_tts_train.sh new file mode 100755 index 00000000..55379a1e --- /dev/null +++ b/tests/test_tts_train.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --config_path $BASEDIR/inputs/test_train_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ diff --git a/vocoder/tests/test_datasets.py b/tests/test_vocoder_datasets.py similarity index 93% rename from vocoder/tests/test_datasets.py rename to tests/test_vocoder_datasets.py index 43d0d3de..2a487d9a 100644 --- a/vocoder/tests/test_datasets.py +++ b/tests/test_vocoder_datasets.py @@ -1,24 +1,24 @@ import os + import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path from torch.utils.data import DataLoader -from TTS.vocoder.datasets.gan_dataset import GANDataset -from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.utils.audio import AudioProcessor from 
TTS.utils.io import load_config - +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.datasets.preprocess import load_wav_data file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/") +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -C = load_config(os.path.join(file_path, 'test_config.json')) +C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) -test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/") +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") ok_ljspeech = os.path.exists(test_data_path) - def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers): ''' run dataloader with given parameters and check conditions ''' ap = AudioProcessor(**C.audio) diff --git a/vocoder/tests/test_losses.py b/tests/test_vocoder_losses.py similarity index 85% rename from vocoder/tests/test_losses.py rename to tests/test_vocoder_losses.py index 68e42e89..965e68ad 100644 --- a/vocoder/tests/test_losses.py +++ b/tests/test_vocoder_losses.py @@ -1,11 +1,11 @@ import os + import torch +from tests import get_tests_input_path, get_tests_output_path, get_tests_path -from TTS.vocoder.layers.losses import TorchSTFT, STFTLoss, MultiScaleSTFTLoss - -from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_config +from TTS.vocoder.layers.losses import MultiScaleSTFTLoss, STFTLoss, TorchSTFT TESTS_PATH = get_tests_path() @@ -14,8 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -file_path = os.path.dirname(os.path.realpath(__file__)) -C = load_config(os.path.join(file_path, 'test_config.json')) +C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) ap = 
AudioProcessor(**C.audio) @@ -53,9 +52,3 @@ def test_multiscale_stft_loss(): loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav)) assert loss_sc < 1.0 assert loss_m + loss_sc > 0 - - - - - - diff --git a/vocoder/tests/test_melgan_discriminator.py b/tests/test_vocoder_melgan_discriminator.py similarity index 100% rename from vocoder/tests/test_melgan_discriminator.py rename to tests/test_vocoder_melgan_discriminator.py diff --git a/vocoder/tests/test_melgan_generator.py b/tests/test_vocoder_melgan_generator.py similarity index 100% rename from vocoder/tests/test_melgan_generator.py rename to tests/test_vocoder_melgan_generator.py diff --git a/vocoder/tests/test_pqmf.py b/tests/test_vocoder_pqmf.py similarity index 90% rename from vocoder/tests/test_pqmf.py rename to tests/test_vocoder_pqmf.py index a26bdd59..8924fea8 100644 --- a/vocoder/tests/test_pqmf.py +++ b/tests/test_vocoder_pqmf.py @@ -4,7 +4,7 @@ import torch import soundfile as sf from librosa.core import load -from TTS.tests import get_tests_path, get_tests_input_path +from tests import get_tests_path, get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF diff --git a/vocoder/tests/test_rwd.py b/tests/test_vocoder_rwd.py similarity index 100% rename from vocoder/tests/test_rwd.py rename to tests/test_vocoder_rwd.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_train.sh new file mode 100755 index 00000000..b4a9b9fa --- /dev/null +++ b/tests/test_vocoder_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create run dir +mkdir $BASEDIR/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/outputs/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path 
$BASEDIR/outputs/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ diff --git a/tf/README.md b/tf/README.md deleted file mode 100644 index 0f9d58e9..00000000 --- a/tf/README.md +++ /dev/null @@ -1,20 +0,0 @@ -## Utilities to Convert Models to Tensorflow2 -Here there are experimental utilities to convert trained Torch models to Tensorflow (2.2>=). - -Converting Torch models to TF enables all the TF toolkit to be used for better deployment and device specific optimizations. - -Note that we do not plan to share training scripts for Tensorflow in near future. But any contribution in that direction would be more than welcome. - -To see how you can use TF model at inference, check the notebook. - -This is an experimental release. If you encounter an error, please put an issue or in the best send a PR but you are mostly on your own. - - -### Converting a Model -- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments. - -### Known issues ans limitations -- We use a custom model load/save mechanism which enables us to store model related information with models weights. (Similar to Torch). However, it is prone to random errors. -- Current TF model implementation is slightly slower than Torch model. Hopefully, it'll get better with improving TF support for eager mode and ```tf.function```. -- TF implementation of Tacotron2 only supports regular Tacotron2 as in the paper. -- You can only convert models trained after TF model implementation since model layers has been updated in Torch model. 
diff --git a/tf/__init__.py b/tf/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tf/convert_tacotron2_tflite.py b/tf/convert_tacotron2_tflite.py deleted file mode 100644 index fc46cc79..00000000 --- a/tf/convert_tacotron2_tflite.py +++ /dev/null @@ -1,37 +0,0 @@ -# Convert Tensorflow Tacotron2 model to TF-Lite binary - -import argparse - -from TTS.utils.io import load_config -from TTS.utils.text.symbols import symbols, phonemes -from TTS.tf.utils.generic_utils import setup_model -from TTS.tf.utils.io import load_checkpoint -from TTS.tf.utils.tflite import convert_tacotron2_to_tflite - - -parser = argparse.ArgumentParser() -parser.add_argument('--tf_model', - type=str, - help='Path to target torch model to be converted to TF.') -parser.add_argument('--config_path', - type=str, - help='Path to config file of torch model.') -parser.add_argument('--output_path', - type=str, - help='path to tflite output binary.') -args = parser.parse_args() - -# Set constants -CONFIG = load_config(args.config_path) - -# load the model -c = CONFIG -num_speakers = 0 -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c, enable_tflite=True) -model.build_inference() -model = load_checkpoint(model, args.tf_model) -model.decoder.set_max_decoder_steps(1000) - -# create tflite model -tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path) \ No newline at end of file diff --git a/tf/convert_tacotron2_torch_to_tf.py b/tf/convert_tacotron2_torch_to_tf.py deleted file mode 100644 index dfc42250..00000000 --- a/tf/convert_tacotron2_torch_to_tf.py +++ /dev/null @@ -1,210 +0,0 @@ -# %% -import sys -sys.path.append('/home/erogol/Projects') -import os -os.environ['CUDA_VISIBLE_DEVICES'] = '' -# %% -import argparse -import numpy as np -import torch -import tensorflow as tf -from fuzzywuzzy import fuzz - -from TTS.utils.text.symbols import phonemes, symbols -from TTS.utils.generic_utils import 
setup_model -from TTS.utils.io import load_config -from TTS.tf.models.tacotron2 import Tacotron2 -from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name -from TTS.tf.utils.generic_utils import save_checkpoint - -parser = argparse.ArgumentParser() -parser.add_argument('--torch_model_path', - type=str, - help='Path to target torch model to be converted to TF.') -parser.add_argument('--config_path', - type=str, - help='Path to config file of torch model.') -parser.add_argument('--output_path', - type=str, - help='path to output file including file name to save TF model.') -args = parser.parse_args() - -# load model config -config_path = args.config_path -c = load_config(config_path) -num_speakers = 0 - -# init torch model -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c) -checkpoint = torch.load(args.torch_model_path, - map_location=torch.device('cpu')) -state_dict = checkpoint['model'] -model.load_state_dict(state_dict) - -# init tf model -model_tf = Tacotron2(num_chars=num_chars, - num_speakers=num_speakers, - r=model.decoder.r, - postnet_output_dim=c.audio['num_mels'], - decoder_output_dim=c.audio['num_mels'], - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder) - -# set initial layer mapping - these are not captured by the below heuristic approach -# TODO: set layer names so that we can remove these manual matching -common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE' -var_map = [ - ('embedding/embeddings:0', 'embedding.weight'), - ('encoder/lstm/forward_lstm/lstm_cell_1/kernel:0', 
- 'encoder.lstm.weight_ih_l0'), - ('encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0', - 'encoder.lstm.weight_hh_l0'), - ('encoder/lstm/backward_lstm/lstm_cell_2/kernel:0', - 'encoder.lstm.weight_ih_l0_reverse'), - ('encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0', - 'encoder.lstm.weight_hh_l0_reverse'), - ('encoder/lstm/forward_lstm/lstm_cell_1/bias:0', - ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')), - ('encoder/lstm/backward_lstm/lstm_cell_2/bias:0', - ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')), - ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'), - ('decoder/linear_projection/kernel:0', - 'decoder.linear_projection.linear_layer.weight'), - ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight') -] - -# %% -# get tf_model graph -mel_pred = model_tf.build_inference() - -# get tf variables -tf_vars = model_tf.weights - -# match variable names with fuzzy logic -torch_var_names = list(state_dict.keys()) -tf_var_names = [we.name for we in model_tf.weights] -for tf_name in tf_var_names: - # skip re-mapped layer names - if tf_name in [name[0] for name in var_map]: - continue - tf_name_edited = convert_tf_name(tf_name) - ratios = [ - fuzz.ratio(torch_name, tf_name_edited) - for torch_name in torch_var_names - ] - max_idx = np.argmax(ratios) - matching_name = torch_var_names[max_idx] - del torch_var_names[max_idx] - var_map.append((tf_name, matching_name)) - -# %% -# print variable match -from pprint import pprint -pprint(var_map) -pprint(torch_var_names) - -# pass weights -tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict) - -# Compare TF and TORCH models -# %% -# check embedding outputs -model.eval() -input_ids = torch.randint(0, 24, (1, 128)).long() - -o_t = model.embedding(input_ids) -o_tf = model_tf.embedding(input_ids.detach().numpy()) -assert abs(o_t.detach().numpy() - - o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - - o_tf.numpy()).sum() - -# 
compare encoder outputs -oo_en = model.encoder.inference(o_t.transpose(1, 2)) -ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False) -assert compare_torch_tf(oo_en, ooo_en) < 1e-5 - -#pylint: disable=redefined-builtin -# compare decoder.attention_rnn -inp = torch.rand([1, 768]) -inp_tf = inp.numpy() -model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access -output, cell_state = model.decoder.attention_rnn(inp) -states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, - states[2], - training=False) -assert compare_torch_tf(output, output_tf).mean() < 1e-5 - -query = output -inputs = torch.rand([1, 128, 512]) -query_tf = query.detach().numpy() -inputs_tf = inputs.numpy() - -# compare decoder.attention -model.decoder.attention.init_states(inputs) -processes_inputs = model.decoder.attention.preprocess_inputs(inputs) -loc_attn, proc_query = model.decoder.attention.get_location_attention( - query, processes_inputs) -context = model.decoder.attention(query, inputs, processes_inputs, None) - -attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1] -model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf)) -loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states) -context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False) - -assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5 -assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5 -assert compare_torch_tf(context, context_tf) < 1e-5 - -# compare decoder.decoder_rnn -input = torch.rand([1, 1536]) -input_tf = input.numpy() -model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access -output, cell_state = model.decoder.decoder_rnn( - input, [model.decoder.decoder_hidden, model.decoder.decoder_cell]) -states = 
model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, - states[3], - training=False) -assert abs(input - input_tf).mean() < 1e-5 -assert compare_torch_tf(output, output_tf).mean() < 1e-5 - -# compare decoder.linear_projection -input = torch.rand([1, 1536]) -input_tf = input.numpy() -output = model.decoder.linear_projection(input) -output_tf = model_tf.decoder.linear_projection(input_tf, training=False) -assert compare_torch_tf(output, output_tf) < 1e-5 - -# compare decoder outputs -model.decoder.max_decoder_steps = 100 -model_tf.decoder.set_max_decoder_steps(100) -output, align, stop = model.decoder.inference(oo_en) -states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) -output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False) -assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4 - -# compare the whole model output -outputs_torch = model.inference(input_ids) -outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy())) -print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean()) -assert compare_torch_tf(outputs_torch[2][:, 50, :], - outputs_tf[2][:, 50, :]) < 1e-5 -assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4 - -# %% -# save tf model -save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'], - checkpoint['r'], args.output_path) -print(' > Model conversion is successfully completed :).') diff --git a/tf/layers/common_layers.py b/tf/layers/common_layers.py deleted file mode 100644 index f2353a93..00000000 --- a/tf/layers/common_layers.py +++ /dev/null @@ -1,285 +0,0 @@ -import tensorflow as tf -from tensorflow import keras -from tensorflow.python.ops import math_ops -# from tensorflow_addons.seq2seq import BahdanauAttention - - -class Linear(keras.layers.Layer): - def __init__(self, units, use_bias, **kwargs): - super(Linear, self).__init__(**kwargs) - self.linear_layer = 
keras.layers.Dense(units, use_bias=use_bias, name='linear_layer') - self.activation = keras.layers.ReLU() - - def call(self, x): - """ - shapes: - x: B x T x C - """ - return self.activation(self.linear_layer(x)) - - -class LinearBN(keras.layers.Layer): - def __init__(self, units, use_bias, **kwargs): - super(LinearBN, self).__init__(**kwargs) - self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer') - self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization') - self.activation = keras.layers.ReLU() - - def call(self, x, training=None): - """ - shapes: - x: B x T x C - """ - out = self.linear_layer(x) - out = self.batch_normalization(out, training=training) - return self.activation(out) - - -class Prenet(keras.layers.Layer): - def __init__(self, - prenet_type, - prenet_dropout, - units, - bias, - **kwargs): - super(Prenet, self).__init__(**kwargs) - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.linear_layers = [] - if prenet_type == "bn": - self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)] - elif prenet_type == "original": - self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)] - else: - raise RuntimeError(' [!] 
Unknown prenet type.') - if prenet_dropout: - self.dropout = keras.layers.Dropout(rate=0.5) - - def call(self, x, training=None): - """ - shapes: - x: B x T x C - """ - for linear in self.linear_layers: - if self.prenet_dropout: - x = self.dropout(linear(x), training=training) - else: - x = linear(x) - return x - - -def _sigmoid_norm(score): - attn_weights = tf.nn.sigmoid(score) - attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True) - return attn_weights - - -class Attention(keras.layers.Layer): - """TODO: implement forward_attention - TODO: location sensitive attention - TODO: implement attention windowing """ - def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters, - loc_attn_kernel_size, use_windowing, norm, use_forward_attn, - use_trans_agent, use_forward_attn_mask, **kwargs): - super(Attention, self).__init__(**kwargs) - self.use_loc_attn = use_loc_attn - self.loc_attn_n_filters = loc_attn_n_filters - self.loc_attn_kernel_size = loc_attn_kernel_size - self.use_windowing = use_windowing - self.norm = norm - self.use_forward_attn = use_forward_attn - self.use_trans_agent = use_trans_agent - self.use_forward_attn_mask = use_forward_attn_mask - self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer') - self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer') - self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer') - if use_loc_attn: - self.location_conv1d = keras.layers.Conv1D( - filters=loc_attn_n_filters, - kernel_size=loc_attn_kernel_size, - padding='same', - use_bias=False, - name='location_layer/location_conv1d') - self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense') - if norm == 'softmax': - self.norm_func = tf.nn.softmax - elif norm == 'sigmoid': - self.norm_func = _sigmoid_norm - else: - raise ValueError("Unknown value for attention norm type") - - def 
init_states(self, batch_size, value_length): - states = [] - if self.use_loc_attn: - attention_cum = tf.zeros([batch_size, value_length]) - attention_old = tf.zeros([batch_size, value_length]) - states = [attention_cum, attention_old] - if self.use_forward_attn: - alpha = tf.concat([ - tf.ones([batch_size, 1]), - tf.zeros([batch_size, value_length])[:, :-1] + 1e-7 - ], 1) - states.append(alpha) - return tuple(states) - - def process_values(self, values): - """ cache values for decoder iterations """ - #pylint: disable=attribute-defined-outside-init - self.processed_values = self.inputs_layer(values) - self.values = values - - def get_loc_attn(self, query, states): - """ compute location attention, query layer and - unnorm. attention weights""" - attention_cum, attention_old = states[:2] - attn_cat = tf.stack([attention_old, attention_cum], axis=2) - - processed_query = self.query_layer(tf.expand_dims(query, 1)) - processed_attn = self.location_dense(self.location_conv1d(attn_cat)) - score = self.v( - tf.nn.tanh(self.processed_values + processed_query + - processed_attn)) - score = tf.squeeze(score, axis=2) - return score, processed_query - - def get_attn(self, query): - """ compute query layer and unnormalized attention weights """ - processed_query = self.query_layer(tf.expand_dims(query, 1)) - score = self.v(tf.nn.tanh(self.processed_values + processed_query)) - score = tf.squeeze(score, axis=2) - return score, processed_query - - def apply_score_masking(self, score, mask): #pylint: disable=no-self-use - """ ignore sequence paddings """ - padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) - # Bias so padding positions do not contribute to attention distribution. 
- score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) - return score - - def apply_forward_attention(self, alignment, alpha): #pylint: disable=no-self-use - # forward attention - fwd_shifted_alpha = tf.pad(alpha[:, :-1], ((0, 0), (1, 0)), constant_values=0.0) - # compute transition potentials - new_alpha = ((1 - 0.5) * alpha + 0.5 * fwd_shifted_alpha + 1e-8) * alignment - # renormalize attention weights - new_alpha = new_alpha / tf.reduce_sum(new_alpha, axis=1, keepdims=True) - return new_alpha - - def update_states(self, old_states, scores_norm, attn_weights, new_alpha=None): - states = [] - if self.use_loc_attn: - states = [old_states[0] + scores_norm, attn_weights] - if self.use_forward_attn: - states.append(new_alpha) - return tuple(states) - - def call(self, query, states): - """ - shapes: - query: B x D - """ - if self.use_loc_attn: - score, _ = self.get_loc_attn(query, states) - else: - score, _ = self.get_attn(query) - - # TODO: masking - # if mask is not None: - # self.apply_score_masking(score, mask) - # attn_weights shape == (batch_size, max_length, 1) - - # normalize attention scores - scores_norm = self.norm_func(score) - attn_weights = scores_norm - - # apply forward attention - new_alpha = None - if self.use_forward_attn: - new_alpha = self.apply_forward_attention(attn_weights, states[-1]) - attn_weights = new_alpha - - # update states tuple - # states = (cum_attn_weights, attn_weights, new_alpha) - states = self.update_states(states, scores_norm, attn_weights, new_alpha) - - # context_vector shape after sum == (batch_size, hidden_size) - context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False) - context_vector = tf.squeeze(context_vector, axis=1) - return context_vector, attn_weights, states - - -# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b): -# dtype = processed_query.dtype -# num_units = keys.shape[-1].value or 
array_ops.shape(keys)[-1] -# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2]) - - -# class LocationSensitiveAttention(BahdanauAttention): -# def __init__(self, -# units, -# memory=None, -# memory_sequence_length=None, -# normalize=False, -# probability_fn="softmax", -# kernel_initializer="glorot_uniform", -# dtype=None, -# name="LocationSensitiveAttention", -# location_attention_filters=32, -# location_attention_kernel_size=31): - -# super(LocationSensitiveAttention, -# self).__init__(units=units, -# memory=memory, -# memory_sequence_length=memory_sequence_length, -# normalize=normalize, -# probability_fn='softmax', ## parent module default -# kernel_initializer=kernel_initializer, -# dtype=dtype, -# name=name) -# if probability_fn == 'sigmoid': -# self.probability_fn = lambda score, _: self._sigmoid_normalization(score) -# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False) -# self.location_dense = keras.layers.Dense(units, use_bias=False) -# # self.v = keras.layers.Dense(1, use_bias=True) - -# def _location_sensitive_score(self, processed_query, keys, processed_loc): -# processed_query = tf.expand_dims(processed_query, 1) -# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2]) - -# def _location_sensitive(self, alignment_cum, alignment_old): -# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2) -# return self.location_dense(self.location_conv(alignment_cat)) - -# def _sigmoid_normalization(self, score): -# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True) - -# # def _apply_masking(self, score, mask): -# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) -# # # Bias so padding positions do not contribute to attention distribution. 
-# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) -# # return score - -# def _calculate_attention(self, query, state): -# alignment_cum, alignment_old = state[:2] -# processed_query = self.query_layer( -# query) if self.query_layer else query -# processed_loc = self._location_sensitive(alignment_cum, alignment_old) -# score = self._location_sensitive_score( -# processed_query, -# self.keys, -# processed_loc) -# alignment = self.probability_fn(score, state) -# alignment_cum = alignment_cum + alignment -# state[0] = alignment_cum -# state[1] = alignment -# return alignment, state - -# def compute_context(self, alignments): -# expanded_alignments = tf.expand_dims(alignments, 1) -# context = tf.matmul(expanded_alignments, self.values) -# context = tf.squeeze(context, [1]) -# return context - -# # def call(self, query, state): -# # alignment, next_state = self._calculate_attention(query, state) -# # return alignment, next_state diff --git a/tf/layers/tacotron2.py b/tf/layers/tacotron2.py deleted file mode 100644 index 7cef9eac..00000000 --- a/tf/layers/tacotron2.py +++ /dev/null @@ -1,300 +0,0 @@ -import tensorflow as tf -from tensorflow import keras -from TTS.tf.utils.tf_utils import shape_list -from TTS.tf.layers.common_layers import Prenet, Attention -# from tensorflow_addons.seq2seq import AttentionWrapper - - -class ConvBNBlock(keras.layers.Layer): - def __init__(self, filters, kernel_size, activation, **kwargs): - super(ConvBNBlock, self).__init__(**kwargs) - self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d') - self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization') - self.dropout = keras.layers.Dropout(rate=0.5, name='dropout') - self.activation = keras.layers.Activation(activation, name='activation') - - def call(self, x, training=None): - o = self.convolution1d(x) - o = self.batch_normalization(o, training=training) - o = 
self.activation(o) - o = self.dropout(o, training=training) - return o - - -class Postnet(keras.layers.Layer): - def __init__(self, output_filters, num_convs, **kwargs): - super(Postnet, self).__init__(**kwargs) - self.convolutions = [] - self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0')) - for idx in range(1, num_convs - 1): - self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}')) - self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}')) - - def call(self, x, training=None): - o = x - for layer in self.convolutions: - o = layer(o, training=training) - return o - - -class Encoder(keras.layers.Layer): - def __init__(self, output_input_dim, **kwargs): - super(Encoder, self).__init__(**kwargs) - self.convolutions = [] - for idx in range(3): - self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}')) - self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm') - - def call(self, x, training=None): - o = x - for layer in self.convolutions: - o = layer(o, training=training) - o = self.lstm(o) - return o - - -class Decoder(keras.layers.Layer): - #pylint: disable=unused-argument - def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type, - prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask, - use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, enable_tflite, **kwargs): - super(Decoder, self).__init__(**kwargs) - self.frame_dim = frame_dim - self.r_init = tf.constant(r, dtype=tf.int32) - self.r = tf.constant(r, dtype=tf.int32) - self.output_dim = r * self.frame_dim - self.separate_stopnet = separate_stopnet - self.enable_tflite = enable_tflite - - # layer constants - self.max_decoder_steps = tf.constant(1000, dtype=tf.int32) - self.stop_thresh = tf.constant(0.5, dtype=tf.float32) - - # model dimensions - self.query_dim 
= 1024 - self.decoder_rnn_dim = 1024 - self.prenet_dim = 256 - self.attn_dim = 128 - self.p_attention_dropout = 0.1 - self.p_decoder_dropout = 0.1 - - self.prenet = Prenet(prenet_type, - prenet_dropout, - [self.prenet_dim, self.prenet_dim], - bias=False, - name='prenet') - self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name='attention_rnn', ) - self.attention_rnn_dropout = keras.layers.Dropout(0.5) - - # TODO: implement other attn options - self.attention = Attention(attn_dim=self.attn_dim, - use_loc_attn=True, - loc_attn_n_filters=32, - loc_attn_kernel_size=31, - use_windowing=False, - norm=attn_norm, - use_forward_attn=use_forward_attn, - use_trans_agent=use_trans_agent, - use_forward_attn_mask=use_forward_attn_mask, - name='attention') - self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name='decoder_rnn') - self.decoder_rnn_dropout = keras.layers.Dropout(0.5) - self.linear_projection = keras.layers.Dense(self.frame_dim * r, name='linear_projection/linear_layer') - self.stopnet = keras.layers.Dense(1, name='stopnet/linear_layer') - - - def set_max_decoder_steps(self, new_max_steps): - self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32) - - def set_r(self, new_r): - self.r = tf.constant(new_r, dtype=tf.int32) - self.output_dim = self.frame_dim * new_r - - def build_decoder_initial_states(self, batch_size, memory_dim, memory_length): - zero_frame = tf.zeros([batch_size, self.frame_dim]) - zero_context = tf.zeros([batch_size, memory_dim]) - attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) - decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) - attention_states = self.attention.init_states(batch_size, memory_length) - return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states - - def step(self, prenet_next, states, - memory_seq_length=None, training=None): - _, 
context_next, attention_rnn_state, decoder_rnn_state, attention_states = states - attention_rnn_input = tf.concat([prenet_next, context_next], -1) - attention_rnn_output, attention_rnn_state = \ - self.attention_rnn(attention_rnn_input, - attention_rnn_state, training=training) - attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training) - context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training) - decoder_rnn_input = tf.concat([attention_rnn_output, context], -1) - decoder_rnn_output, decoder_rnn_state = \ - self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training) - decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training) - linear_projection_input = tf.concat([decoder_rnn_output, context], -1) - output_frame = self.linear_projection(linear_projection_input, training=training) - stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1) - stopnet_output = self.stopnet(stopnet_input, training=training) - output_frame = output_frame[:, :self.r * self.frame_dim] - states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states) - return output_frame, stopnet_output, states, attention - - def decode(self, memory, states, frames, memory_seq_length=None): - B, _, _ = shape_list(memory) - num_iter = shape_list(frames)[1] // self.r - # init states - frame_zero = tf.expand_dims(states[0], 1) - frames = tf.concat([frame_zero, frames], axis=1) - outputs = tf.TensorArray(dtype=tf.float32, size=num_iter) - attentions = tf.TensorArray(dtype=tf.float32, size=num_iter) - stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter) - # pre-computes - self.attention.process_values(memory) - prenet_output = self.prenet(frames, training=True) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions): - prenet_next = 
prenet_output[:, step] - output, stop_token, states, attention = self.step(prenet_next, - states, - memory_seq_length) - outputs = outputs.write(step, output) - attentions = attentions.write(step, attention) - stop_tokens = stop_tokens.write(step, stop_token) - return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions - _, memory, _, states, outputs, stop_tokens, attentions = \ - tf.while_loop(lambda *arg: True, - _body, - loop_vars=(step_count, memory, prenet_output, - states, outputs, stop_tokens, attentions), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=num_iter) - - outputs = outputs.stack() - attentions = attentions.stack() - stop_tokens = stop_tokens.stack() - outputs = tf.transpose(outputs, [1, 0, 2]) - attentions = tf.transpose(attentions, [1, 0, 2]) - stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) - stop_tokens = tf.squeeze(stop_tokens, axis=2) - outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - def decode_inference(self, memory, states): - B, _, _ = shape_list(memory) - # init states - outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) - - # pre-computes - self.attention.process_values(memory) - - # iter vars - stop_flag = tf.constant(False, dtype=tf.bool) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag): - frame_next = states[0] - prenet_next = self.prenet(frame_next, training=False) - output, stop_token, states, attention = self.step(prenet_next, - states, - None, - training=False) - stop_token = tf.math.sigmoid(stop_token) - outputs = outputs.write(step, output) - attentions = attentions.write(step, attention) - stop_tokens = 
stop_tokens.write(step, stop_token) - stop_flag = tf.greater(stop_token, self.stop_thresh) - stop_flag = tf.reduce_all(stop_flag) - return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag - - cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) - _, memory, states, outputs, stop_tokens, attentions, stop_flag = \ - tf.while_loop(cond, - _body, - loop_vars=(step_count, memory, states, outputs, - stop_tokens, attentions, stop_flag), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=self.max_decoder_steps) - - outputs = outputs.stack() - attentions = attentions.stack() - stop_tokens = stop_tokens.stack() - - outputs = tf.transpose(outputs, [1, 0, 2]) - attentions = tf.transpose(attentions, [1, 0, 2]) - stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) - stop_tokens = tf.squeeze(stop_tokens, axis=2) - outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - def decode_inference_tflite(self, memory, states): - """Inference with TF-Lite compatibility. 
It assumes - batch_size is 1""" - # init states - # dynamic_shape is not supported in TFLite - outputs = tf.TensorArray(dtype=tf.float32, - size=self.max_decoder_steps, - element_shape=tf.TensorShape( - [self.output_dim]), - clear_after_read=False, - dynamic_size=False) - # stop_flags = tf.TensorArray(dtype=tf.bool, - # size=self.max_decoder_steps, - # element_shape=tf.TensorShape( - # []), - # clear_after_read=False, - # dynamic_size=False) - attentions = () - stop_tokens = () - - # pre-computes - self.attention.process_values(memory) - - # iter vars - stop_flag = tf.constant(False, dtype=tf.bool) - step_count = tf.constant(0, dtype=tf.int32) - - def _body(step, memory, states, outputs, stop_flag): - frame_next = states[0] - prenet_next = self.prenet(frame_next, training=False) - output, stop_token, states, _ = self.step(prenet_next, - states, - None, - training=False) - stop_token = tf.math.sigmoid(stop_token) - stop_flag = tf.greater(stop_token, self.stop_thresh) - stop_flag = tf.reduce_all(stop_flag) - # stop_flags = stop_flags.write(step, tf.logical_not(stop_flag)) - - outputs = outputs.write(step, tf.reshape(output, [-1])) - return step + 1, memory, states, outputs, stop_flag - - cond = lambda step, m, s, o, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) - step_count, memory, states, outputs, stop_flag = \ - tf.while_loop(cond, - _body, - loop_vars=(step_count, memory, states, outputs, - stop_flag), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=self.max_decoder_steps) - - - outputs = outputs.stack() - outputs = tf.gather(outputs, tf.range(step_count)) # pylint: disable=no-value-for-parameter - outputs = tf.expand_dims(outputs, axis=[0]) - outputs = tf.transpose(outputs, [1, 0, 2]) - outputs = tf.reshape(outputs, [1, -1, self.frame_dim]) - return outputs, stop_tokens, attentions - - - def call(self, memory, states, frames=None, memory_seq_length=None, training=False): - if training: - return self.decode(memory, states, 
frames, memory_seq_length) - if self.enable_tflite: - return self.decode_inference_tflite(memory, states) - return self.decode_inference(memory, states) diff --git a/tf/models/tacotron2.py b/tf/models/tacotron2.py deleted file mode 100644 index 70d725e2..00000000 --- a/tf/models/tacotron2.py +++ /dev/null @@ -1,108 +0,0 @@ -import tensorflow as tf -from tensorflow import keras - -from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet -from TTS.tf.utils.tf_utils import shape_list - - -#pylint: disable=too-many-ancestors -class Tacotron2(keras.models.Model): - def __init__(self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type='original', - attn_win=False, - attn_norm="softmax", - attn_K=4, - prenet_type="original", - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - separate_stopnet=True, - bidirectional_decoder=False, - enable_tflite=False): - super(Tacotron2, self).__init__() - self.r = r - self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim - self.bidirectional_decoder = bidirectional_decoder - self.num_speakers = num_speakers - self.speaker_embed_dim = 256 - self.enable_tflite = enable_tflite - - self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding') - self.encoder = Encoder(512, name='encoder') - # TODO: most of the decoder args have no use at the momment - self.decoder = Decoder(decoder_output_dim, - r, - attn_type=attn_type, - use_attn_win=attn_win, - attn_norm=attn_norm, - prenet_type=prenet_type, - prenet_dropout=prenet_dropout, - use_forward_attn=forward_attn, - use_trans_agent=trans_agent, - use_forward_attn_mask=forward_attn_mask, - use_location_attn=location_attn, - attn_K=attn_K, - separate_stopnet=separate_stopnet, - speaker_emb_dim=self.speaker_embed_dim, - name='decoder', - enable_tflite=enable_tflite) - self.postnet = Postnet(postnet_output_dim, 5, name='postnet') - - 
@tf.function(experimental_relax_shapes=True) - def call(self, characters, text_lengths=None, frames=None, training=None): - if training: - return self.training(characters, text_lengths, frames) - if not training: - return self.inference(characters) - raise RuntimeError(' [!] Set model training mode True or False') - - def training(self, characters, text_lengths, frames): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=True) - encoder_output = self.encoder(embedding_vectors, training=True) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True) - postnet_frames = self.postnet(decoder_frames, training=True) - output_frames = decoder_frames + postnet_frames - return decoder_frames, output_frames, attentions, stop_tokens - - def inference(self, characters): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=False) - encoder_output = self.encoder(embedding_vectors, training=False) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False) - postnet_frames = self.postnet(decoder_frames, training=False) - output_frames = decoder_frames + postnet_frames - print(output_frames.shape) - return decoder_frames, output_frames, attentions, stop_tokens - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, None], dtype=tf.int32), - ],) - def inference_tflite(self, characters): - B, T = shape_list(characters) - embedding_vectors = self.embedding(characters, training=False) - encoder_output = self.encoder(embedding_vectors, training=False) - decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) - decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, 
training=False) - postnet_frames = self.postnet(decoder_frames, training=False) - output_frames = decoder_frames + postnet_frames - print(output_frames.shape) - return decoder_frames, output_frames, attentions, stop_tokens - - def build_inference(self, ): - # TODO: issue https://github.com/PyCQA/pylint/issues/3613 - input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg - self(input_ids) - diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb deleted file mode 100644 index 4a21ae17..00000000 --- a/tf/notebooks/Benchmark-TTS_tf.ipynb +++ /dev/null @@ -1,714 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "This is to test TTS tensorflow models with benchmark sentences.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related models.\n", - " - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n", - "- download or clone related repos, linked below.\n", - "- setup the repositories. ```python setup.py install```\n", - "- to checkout right commit versions (given next to the model in the models page).\n", - "- to set the file paths below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n", - "\n", - "Known Issues:\n", - "- To load the model second time you need to restart the notebook kernel. \n", - "- Some of the advance methods are not yet implemented for Tensorflow." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "\n", - "import sys\n", - "import io\n", - "import torch \n", - "import tensorflow as tf\n", - "print(tf.config.list_physical_devices('GPU'))\n", - "\n", - "import time\n", - "import json\n", - "import yaml\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "import matplotlib.pyplot as plt\n", - "plt.rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.tf.models.tacotron2 import Tacotron2\n", - "from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # coorect the normalization differences b/w TTS and the Vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " print(mel_postnet_spec.shape)\n", - " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", - " if not use_gl:\n", - " waveform = 
vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n", - " if use_cuda and not use_gl:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " waveform = waveform.squeeze()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=True)) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# Set constants\n", - "ROOT_PATH = '../torch_model/'\n", - "MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "# Run FLAGs\n", - "use_cuda = True # use the available GPU (only for torch)\n", - "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "BACKEND = 'tf' # set the backend for inference " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n", - "from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n", - "c = CONFIG\n", - "num_speakers 
= 0\n", - "r = 1\n", - "num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, num_speakers, c)\n", - "\n", - "# before loading weights you need to run the model once to generate the variables\n", - "input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n", - "mel_pred = model(input_ids, training=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "scrolled": true - }, - "outputs": [], - "source": [ - "model = load_checkpoint(model, MODEL_PATH)\n", - "# model = tf.function(model, experimental_relax_shapes=True)\n", - "ap = AudioProcessor(**CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# wrapper class to use tf.function\n", - "class ModelInference(tf.keras.Model):\n", - " def __init__(self, model):\n", - " super(ModelInference, self).__init__()\n", - " self.model = model\n", - " \n", - " @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n", - " def call(self, characters):\n", - " return self.model(characters, training=False)\n", - " \n", - "model = ModelInference(model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# LOAD WAVERNN\n", - "if use_gl == False:\n", - " from parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n", - " \n", - " vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", - " vocoder_model.remove_weight_norm()\n", - " ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", - " if use_cuda:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval();\n", - " print(count_parameters(vocoder_model))" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparision with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### https://espnet.github.io/icassp2020-tts/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The Commission also recommends\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"they seem unduly restrictive in continuing to require some manifestation of 
animus against a Government official.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Other examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. 
I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, 
- { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, 
CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, 
use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, 
figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "wavs = []\n", - "model.eval()\n", - "model.decoder.prenet.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "# model.decoder.prenet.train()\n", - "speaker_id = None\n", - "sentence = '''This is App Store Optimization report.\n", - "The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and what’s new section.\n", - "In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n", - "Product name is the actual app name on the AppStore or Google Play Store.\n", - "Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n", - "Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n", - "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", - "If you scroll down in the widget, you can see the older app versions for the same apps. 
Or you can filter Datetime to see a specific timeframe and the apps’ Store presence back then.\n", - "You can also filter for a specific app using Product Name.\n", - "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", - "'''\n", - "\n", - "for s in sentence.split('\\n'):\n", - " print(s)\n", - " align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n", - " wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tf/tests/__init__.py b/tf/tests/__init__.py deleted file mode 100644 index 8b137891..00000000 --- a/tf/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tf/tests/test_layers_tf.py b/tf/tests/test_layers_tf.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tf/utils/convert_torch_to_tf_utils.py b/tf/utils/convert_torch_to_tf_utils.py deleted file mode 100644 index e9e1e8a3..00000000 --- a/tf/utils/convert_torch_to_tf_utils.py +++ /dev/null @@ -1,81 +0,0 @@ -import numpy as np -import tensorflow as tf - - -def tf_create_dummy_inputs(): - """ Create dummy inputs for TF Tacotron2 model """ - batch_size = 4 - max_input_length = 32 - max_mel_length = 128 - pad = 1 - n_chars = 24 - input_ids = tf.random.uniform([batch_size, 
max_input_length + pad], maxval=n_chars, dtype=tf.int32) - input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size]) - input_lengths[-1] = max_input_length - input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32) - mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80]) - mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size]) - mel_lengths[-1] = max_mel_length - mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32) - return input_ids, input_lengths, mel_outputs, mel_lengths - - -def compare_torch_tf(torch_tensor, tf_tensor): - """ Compute the average absolute difference b/w torch and tf tensors """ - return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean() - - -def convert_tf_name(tf_name): - """ Convert certain patterns in TF layer names to Torch patterns """ - tf_name_tmp = tf_name - tf_name_tmp = tf_name_tmp.replace(':0', '') - tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0') - tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1') - tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh') - tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight') - tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight') - tf_name_tmp = tf_name_tmp.replace('/beta', '/bias') - tf_name_tmp = tf_name_tmp.replace('/', '.') - return tf_name_tmp - - -def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): - """ Transfer weigths from torch state_dict to TF variables """ - print(" > Passing weights from Torch to TF ...") - for tf_var in tf_vars: - torch_var_name = var_map_dict[tf_var.name] - print(f' | > {tf_var.name} <-- {torch_var_name}') - # if tuple, it is a bias variable - if not isinstance(torch_var_name, tuple): - torch_layer_name = '.'.join(torch_var_name.split('.')[-2:]) - torch_weight = state_dict[torch_var_name] - if 'convolution1d/kernel' in tf_var.name or 
'conv1d/kernel' in tf_var.name: - # out_dim, in_dim, filter -> filter, in_dim, out_dim - numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy() - elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - # if variable is for bidirectional lstm and it is a bias vector there - # needs to be pre-defined two matching torch bias vectors - elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name: - bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name] - assert len(bias_vectors) == 2 - numpy_weight = bias_vectors[0] + bias_vectors[1] - elif 'rnn' in tf_var.name and 'kernel' in tf_var.name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - elif 'rnn' in tf_var.name and 'bias' in tf_var.name: - bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key] - assert len(bias_vectors) == 2 - numpy_weight = bias_vectors[0] + bias_vectors[1] - elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name: - numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() - else: - numpy_weight = torch_weight.detach().cpu().numpy() - assert np.all(tf_var.shape == numpy_weight.shape), f" [!] 
weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" - tf.keras.backend.set_value(tf_var, numpy_weight) - return tf_vars - - -def load_tf_vars(model_tf, tf_vars): - for tf_var in tf_vars: - model_tf.get_layer(tf_var.name).set_weights(tf_var) - return model_tf diff --git a/tf/utils/generic_utils.py b/tf/utils/generic_utils.py deleted file mode 100644 index 1fea4cbb..00000000 --- a/tf/utils/generic_utils.py +++ /dev/null @@ -1,104 +0,0 @@ -import os -import datetime -import importlib -import pickle -import numpy as np -import tensorflow as tf - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): - state = { - 'model': model.weights, - 'optimizer': optimizer, - 'step': current_step, - 'epoch': epoch, - 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': r - } - state.update(kwargs) - pickle.dump(state, open(output_path, 'wb')) - - -def load_checkpoint(model, checkpoint_path): - checkpoint = pickle.load(open(checkpoint_path, 'rb')) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - try: - chkp_var_value = chkp_var_dict[layer_name] - except KeyError: - class_name = list(chkp_var_dict.keys())[0].split("/")[0] - layer_name = f"{class_name}/{layer_name}" - chkp_var_value = chkp_var_dict[layer_name] - - tf.keras.backend.set_value(tf_var, chkp_var_value) - if 'r' in checkpoint.keys(): - model.decoder.set_r(checkpoint['r']) - return model - - -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.max() - batch_size = sequence_length.size(0) - seq_range = np.empty([0, max_len], dtype=np.int8) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - if sequence_length.is_cuda: - seq_range_expand = seq_range_expand.cuda() - seq_length_expand = ( - sequence_length.unsqueeze(1).expand_as(seq_range_expand)) - # B x T_max 
- return seq_range_expand < seq_length_expand - - -# @tf.custom_gradient -def check_gradient(x, grad_clip): - x_normed = tf.clip_by_norm(x, grad_clip) - grad_norm = tf.norm(grad_clip) - return x_normed, grad_norm - - -def count_parameters(model, c): - try: - return model.count_params() - except RuntimeError: - input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32')) - input_lengths = np.random.randint(100, 129, (8, )) - input_lengths[-1] = 128 - input_lengths = tf.convert_to_tensor(input_lengths.astype('int32')) - mel_spec = np.random.rand(8, 2 * c.r, - c.audio['num_mels']).astype('float32') - mel_spec = tf.convert_to_tensor(mel_spec) - speaker_ids = np.random.randint( - 0, 5, (8, )) if c.use_speaker_embedding else None - _ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids) - return model.count_params() - - -def setup_model(num_chars, num_speakers, c, enable_tflite=False): - print(" > Using model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower()) - MyModel = getattr(MyModel, c.model) - if c.model.lower() in "tacotron": - raise NotImplementedError(' [!] 
Tacotron model is not ready.') - # tacotron2 - model = MyModel(num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio['num_mels'], - decoder_output_dim=c.audio['num_mels'], - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - enable_tflite=enable_tflite) - return model diff --git a/tf/utils/io.py b/tf/utils/io.py deleted file mode 100644 index 78a56de4..00000000 --- a/tf/utils/io.py +++ /dev/null @@ -1,42 +0,0 @@ -import pickle -import datetime -import tensorflow as tf - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwargs): - state = { - 'model': model.weights, - 'optimizer': optimizer, - 'step': current_step, - 'epoch': epoch, - 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': r - } - state.update(kwargs) - pickle.dump(state, open(output_path, 'wb')) - - -def load_checkpoint(model, checkpoint_path): - checkpoint = pickle.load(open(checkpoint_path, 'rb')) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - try: - chkp_var_value = chkp_var_dict[layer_name] - except KeyError: - class_name = list(chkp_var_dict.keys())[0].split("/")[0] - layer_name = f"{class_name}/{layer_name}" - chkp_var_value = chkp_var_dict[layer_name] - - tf.keras.backend.set_value(tf_var, chkp_var_value) - if 'r' in checkpoint.keys(): - model.decoder.set_r(checkpoint['r']) - return model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model - diff --git 
a/tf/utils/tf_utils.py b/tf/utils/tf_utils.py deleted file mode 100644 index 558936d5..00000000 --- a/tf/utils/tf_utils.py +++ /dev/null @@ -1,8 +0,0 @@ -import tensorflow as tf - - -def shape_list(x): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] diff --git a/tf/utils/tflite.py b/tf/utils/tflite.py deleted file mode 100644 index 5e684b30..00000000 --- a/tf/utils/tflite.py +++ /dev/null @@ -1,31 +0,0 @@ -import tensorflow as tf - - -def convert_tacotron2_to_tflite(model, - output_path=None, - experimental_converter=True): - """Convert Tensorflow Tacotron2 model to TFLite. Save a binary file if output_path is - provided, else return TFLite model.""" - - concrete_function = model.inference_tflite.get_concrete_function() - converter = tf.lite.TFLiteConverter.from_concrete_functions( - [concrete_function]) - converter.experimental_new_converter = experimental_converter - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_ops = [ - tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS - ] - tflite_model = converter.convert() - print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.') - if output_path is not None: - # same model binary if outputpath is provided - with open(output_path, 'wb') as f: - f.write(tflite_model) - return None - return tflite_model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model \ No newline at end of file diff --git a/train.py b/train.py deleted file mode 100644 index 189a6baa..00000000 --- a/train.py +++ /dev/null @@ -1,641 +0,0 @@ -import argparse -import os -import sys -import glob -import time -import traceback - -import numpy as np -import torch -from torch.utils.data import DataLoader - -from TTS.datasets.TTSDataset import MyDataset -from 
distribute import (DistributedSampler, apply_gradient_allreduce, - init_distributed, reduce_tensor) -from TTS.layers.losses import TacotronLoss -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder, - get_git_branch, set_init_dict, - setup_model, KeepAverage, check_config) -from TTS.utils.io import (save_best_model, save_checkpoint, - load_config, copy_config_file) -from TTS.utils.training import (NoamLR, check_update, adam_weight_decay, - gradual_training_scheduler, set_weight_decay, - setup_torch_training_env) -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ - get_speakers -from TTS.utils.synthesis import synthesis -from TTS.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.utils.visual import plot_alignment, plot_spectrogram -from TTS.datasets.preprocess import load_meta_data -from TTS.utils.radam import RAdam -from TTS.utils.measures import alignment_diagonal_score - - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = MyDataset( - r, - c.text_cleaner, - compute_linear_spec=True if c.model.lower() == 'tacotron' else False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=c.characters if 'characters' in c.keys() else None, - batch_group_size=0 if is_val else c.batch_group_size * - c.batch_size, - min_seq_len=c.min_seq_len, - max_seq_len=c.max_seq_len, - phoneme_cache_path=c.phoneme_cache_path, - use_phonemes=c.use_phonemes, - phoneme_language=c.phoneme_language, - enable_eos_bos=c.enable_eos_bos_chars, - verbose=verbose) - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.eval_batch_size if 
is_val else c.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) - return loader - - -def format_data(data): - if c.use_speaker_embedding: - speaker_mapping = load_speaker_mapping(OUT_PATH) - - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - linear_input = data[3] if c.model in ["Tacotron"] else None - mel_input = data[4] - mel_lengths = data[5] - stop_targets = data[6] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if c.use_speaker_embedding: - speaker_ids = [ - speaker_mapping[speaker_name] for speaker_name in speaker_names - ] - speaker_ids = torch.LongTensor(speaker_ids) - else: - speaker_ids = None - - # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - if speaker_ids is not None: - speaker_ids = speaker_ids.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length - - -def train(model, criterion, optimizer, optimizer_st, scheduler, - ap, global_step, epoch): - data_loader = setup_loader(ap, model.decoder.r, is_val=False, - verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int( 
- len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # setup lr - if c.noam_schedule: - scheduler.step() - optimizer.zero_grad() - if optimizer_st: - optimizer_st.zero_grad() - - # forward pass model - if c.bidirectional_decoder or c.double_decoder_consistency: - decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids) - decoder_backward_output = None - alignments_backward = None - - # set the alignment lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) - - # backward pass - loss_dict['loss'].backward() - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True) - optimizer.step() - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict['align_error'] = align_error - - # backpass 
and check the grad norm for stop loss - if c.separate_stopnet: - loss_dict['stopnet_loss'].backward() - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - optimizer_st.step() - else: - grad_norm_st = 0 - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) - loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) - loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus) - loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss'] - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % c.print_step == 0: - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - avg_spec_length, avg_text_length, - step_time, loader_time, current_lr, - loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % c.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time - } - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH, - 
optimizer_st=optimizer_st, - model_loss=loss_dict['postnet_loss']) - - # Diagnostic visualizations - const_spec = postnet_output[0].data.cpu().numpy() - gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" - ] else mel_input[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - if c.bidirectional_decoder or c.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy()) - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - if c.model in ["Tacotron", "TacotronGST"]: - train_audio = ap.inv_spectrogram(const_spec.T) - else: - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, - {'TrainAudio': train_audio}, - c.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if c.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, model.decoder.r, is_val=True) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 - - # forward pass model - if c.bidirectional_decoder or 
c.double_decoder_consistency: - decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) - decoder_backward_output = None - alignments_backward = None - - # set the alignment lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict['align_error'] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) - loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) - if c.stopnet: - loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - keep_avg.update_values(update_train_values) - - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if 
args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" - ] else mel_input[idx].data.cpu().numpy() - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img) - } - - # Sample audio - if c.model in ["Tacotron", "TacotronGST"]: - eval_audio = ap.inv_spectrogram(const_spec.T) - else: - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, - c.audio["sample_rate"]) - - # Plot Validation Stats - - if c.bidirectional_decoder or c.double_decoder_consistency: - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures['alignment2'] = plot_alignment(align_b_img) - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch > c.test_delay_epochs: - if c.test_sentences_file is None: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." 
- ] - else: - with open(c.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - speaker_id = 0 if c.use_speaker_embedding else None - style_wav = c.get("style_wav_for_test") - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis( - model, - test_sentence, - c, - use_cuda, - ap, - speaker_id=speaker_id, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, - "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios['{}-audio'.format(idx)] = wav - test_figures['{}-prediction'.format(idx)] = plot_spectrogram( - postnet_output, ap) - test_figures['{}-alignment'.format(idx)] = plot_alignment( - alignment) - except: - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, - c.audio['sample_rate']) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? 
-def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes - # Audio processor - ap = AudioProcessor(**c.audio) - if 'characters' in c.keys(): - symbols, phonemes = make_symbols(**c.characters) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, - c.distributed["backend"], c.distributed["url"]) - num_chars = len(phonemes) if c.use_phonemes else len(symbols) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(c.datasets) - - # parse speakers - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - assert all([speaker in speaker_mapping - for speaker in speakers]), "As of now you, you cannot " \ - "introduce new speakers to " \ - "a previously trained model." - else: - speaker_mapping = {name: i for i, name in enumerate(speakers)} - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print("Training with {} speakers: {}".format(num_speakers, - ", ".join(speakers))) - else: - num_speakers = 0 - - model = setup_model(num_chars, num_speakers, c) - - params = set_weight_decay(model, c.wd) - optimizer = RAdam(params, lr=c.lr, weight_decay=0) - if c.stopnet and c.separate_stopnet: - optimizer_st = RAdam(model.decoder.stopnet.parameters(), - lr=c.lr, - weight_decay=0) - else: - optimizer_st = None - - # setup criterion - criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - - if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - # optimizer.load_state_dict(checkpoint['optimizer']) - if c.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint['model']) - except: - 
print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) - model.load_state_dict(model_dict) - del model_dict - for group in optimizer.param_groups: - group['lr'] = c.lr - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = apply_gradient_allreduce(model) - - if c.noam_schedule: - scheduler = NoamLR(optimizer, - warmup_steps=c.warmup_steps, - last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if 'best_loss' not in locals(): - best_loss = float('inf') - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - # set gradual training - if c.gradual_training is not None: - r, c.batch_size = gradual_training_scheduler(global_step, c) - c.r = r - model.decoder.set_r(r) - if c.bidirectional_decoder: - model.decoder_backward.set_r(r) - print("\n > Number of output frames:", model.decoder.r) - train_avg_loss_dict, global_step = train(model, criterion, optimizer, - optimizer_st, scheduler, ap, - global_step, epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict['avg_postnet_loss'] - if c.run_eval: - target_loss = eval_avg_loss_dict['avg_postnet_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help='Training output folder to continue training. Use to continue a training. 
If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument( - '--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv - ) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob(args.continue_path + "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS') - - 
# write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/tts_namespace/README.md b/tts_namespace/README.md deleted file mode 100644 index c5b2ddbf..00000000 --- a/tts_namespace/README.md +++ /dev/null @@ -1,29 +0,0 @@ -This folder contains a symlink called TTS to the parent folder: - - lrwxr-xr-x TTS -> .. - -This is used to appease the distribute/setuptools gods. When the project was -initially set up, the repository folder itself was considered a namespace, and -development was done with `sys.path` hacks. This means if you tried to install -TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of - `TTS.models`, `TTS.utils`... - -Installing TTS would then pollute the package namespace with generic names like -those above. In order to make things installable in both install and development -modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed -to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect -using `packages_dir` in `setup.py` is not enough because it breaks the editable -installation, which can only handle the simplest of `package_dir` redirects. - -Our solution is to use a symlink in order to add the extra `TTS` namespace. In -`setup.py`, we only look for packages inside `tts_namespace` (this folder), -which contains a symlink called TTS pointing to the repository root. The final -result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`... - -With this hack, `pip install -e` will then add a symlink to the `tts_namespace` -in your `site-packages` folder, which works properly. 
It's important not to add -anything else in this folder because it will pollute the package namespace when -installing the project. - -This does not work if you check out your project on a filesystem that does not -support symlinks. \ No newline at end of file diff --git a/tts_namespace/TTS b/tts_namespace/TTS deleted file mode 120000 index a96aa0ea..00000000 --- a/tts_namespace/TTS +++ /dev/null @@ -1 +0,0 @@ -.. \ No newline at end of file diff --git a/utils/.generic_utils.py.swo b/utils/.generic_utils.py.swo deleted file mode 100644 index ab1b3870..00000000 Binary files a/utils/.generic_utils.py.swo and /dev/null differ diff --git a/utils/.model.py.swp b/utils/.model.py.swp deleted file mode 100644 index 24a8152e..00000000 Binary files a/utils/.model.py.swp and /dev/null differ diff --git a/utils/__init__.py b/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/utils/audio.py b/utils/audio.py deleted file mode 100644 index 5b5b5059..00000000 --- a/utils/audio.py +++ /dev/null @@ -1,356 +0,0 @@ -import librosa -import soundfile as sf -import numpy as np -import scipy.io -import scipy.signal - -from TTS.utils.data import StandardScaler - - -class AudioProcessor(object): - def __init__(self, - sample_rate=None, - num_mels=None, - min_level_db=None, - frame_shift_ms=None, - frame_length_ms=None, - hop_length=None, - win_length=None, - ref_level_db=None, - fft_size=1024, - power=None, - preemphasis=0.0, - signal_norm=None, - symmetric_norm=None, - max_norm=None, - mel_fmin=None, - mel_fmax=None, - spec_gain=20, - stft_pad_mode='reflect', - clip_norm=True, - griffin_lim_iters=None, - do_trim_silence=False, - trim_db=60, - do_sound_norm=False, - stats_path=None, - **_): - - print(" > Setting up Audio Processor...") - # setup class attributed - self.sample_rate = sample_rate - self.num_mels = num_mels - self.min_level_db = min_level_db or 0 - self.frame_shift_ms = frame_shift_ms - self.frame_length_ms = frame_length_ms - self.ref_level_db = 
ref_level_db - self.fft_size = fft_size - self.power = power - self.preemphasis = preemphasis - self.griffin_lim_iters = griffin_lim_iters - self.signal_norm = signal_norm - self.symmetric_norm = symmetric_norm - self.mel_fmin = mel_fmin or 0 - self.mel_fmax = mel_fmax - self.spec_gain = float(spec_gain) - self.stft_pad_mode = 'reflect' - self.max_norm = 1.0 if max_norm is None else float(max_norm) - self.clip_norm = clip_norm - self.do_trim_silence = do_trim_silence - self.trim_db = trim_db - self.do_sound_norm = do_sound_norm - self.stats_path = stats_path - # setup stft parameters - if hop_length is None: - # compute stft parameters from given time values - self.hop_length, self.win_length = self._stft_parameters() - else: - # use stft parameters from config file - self.hop_length = hop_length - self.win_length = win_length - assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" - members = vars(self) - for key, value in members.items(): - print(" | > {}:{}".format(key, value)) - # create spectrogram utils - self.mel_basis = self._build_mel_basis() - self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) - # setup scaler - if stats_path: - mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) - self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - self.signal_norm = True - self.max_norm = None - self.clip_norm = None - self.symmetric_norm = None - - ### setting up the parameters ### - def _build_mel_basis(self, ): - if self.mel_fmax is not None: - assert self.mel_fmax <= self.sample_rate // 2 - return librosa.filters.mel( - self.sample_rate, - self.fft_size, - n_mels=self.num_mels, - fmin=self.mel_fmin, - fmax=self.mel_fmax) - - def _stft_parameters(self, ): - """Compute necessary stft parameters with given time values""" - factor = self.frame_length_ms / self.frame_shift_ms - assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" - hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) - win_length = int(hop_length * factor) - return hop_length, win_length - - ### normalization ### - def _normalize(self, S): - """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" - #pylint: disable=no-else-return - S = S.copy() - if self.signal_norm: - # mean-var scaling - if hasattr(self, 'mel_scaler'): - if S.shape[0] == self.num_mels: - return self.mel_scaler.transform(S.T).T - elif S.shape[0] == self.fft_size / 2: - return self.linear_scaler.transform(S.T).T - else: - raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') - # range normalization - S -= self.ref_level_db # discard certain range of DB assuming it is air noise - S_norm = ((S - self.min_level_db) / (-self.min_level_db)) - if self.symmetric_norm: - S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm - if self.clip_norm: - S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) - return S_norm - else: - S_norm = self.max_norm * S_norm - if self.clip_norm: - S_norm = np.clip(S_norm, 0, self.max_norm) - return S_norm - else: - return S - - def _denormalize(self, S): - """denormalize values""" - #pylint: disable=no-else-return - S_denorm = S.copy() - if self.signal_norm: - # mean-var scaling - if hasattr(self, 'mel_scaler'): - if S_denorm.shape[0] == self.num_mels: - return self.mel_scaler.inverse_transform(S_denorm.T).T - elif S_denorm.shape[0] == self.fft_size / 2: - return self.linear_scaler.inverse_transform(S_denorm.T).T - else: - raise RuntimeError(' [!] 
Mean-Var stats does not match the given feature dimensions.') - if self.symmetric_norm: - if self.clip_norm: - S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) - S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db - return S_denorm + self.ref_level_db - else: - if self.clip_norm: - S_denorm = np.clip(S_denorm, 0, self.max_norm) - S_denorm = (S_denorm * -self.min_level_db / - self.max_norm) + self.min_level_db - return S_denorm + self.ref_level_db - else: - return S_denorm - - ### Mean-STD scaling ### - def load_stats(self, stats_path): - stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg - mel_mean = stats['mel_mean'] - mel_std = stats['mel_std'] - linear_mean = stats['linear_mean'] - linear_std = stats['linear_std'] - stats_config = stats['audio_config'] - # check all audio parameters used for computing stats - skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power'] - for key in stats_config.keys(): - if key in skip_parameters: - continue - assert stats_config[key] == self.__dict__[key],\ - f" [!] Audio param {key} does not match the value used for computing mean-var stats. 
{stats_config[key]} vs {self.__dict__[key]}" - return mel_mean, mel_std, linear_mean, linear_std, stats_config - - # pylint: disable=attribute-defined-outside-init - def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): - self.mel_scaler = StandardScaler() - self.mel_scaler.set_stats(mel_mean, mel_std) - self.linear_scaler = StandardScaler() - self.linear_scaler.set_stats(linear_mean, linear_std) - - ### DB and AMP conversion ### - # pylint: disable=no-self-use - def _amp_to_db(self, x): - return self.spec_gain * np.log10(np.maximum(1e-5, x)) - - # pylint: disable=no-self-use - def _db_to_amp(self, x): - return np.power(10.0, x / self.spec_gain) - - ### Preemphasis ### - def apply_preemphasis(self, x): - if self.preemphasis == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") - return scipy.signal.lfilter([1, -self.preemphasis], [1], x) - - def apply_inv_preemphasis(self, x): - if self.preemphasis == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") - return scipy.signal.lfilter([1], [1, -self.preemphasis], x) - - ### SPECTROGRAMs ### - def _linear_to_mel(self, spectrogram): - return np.dot(self.mel_basis, spectrogram) - - def _mel_to_linear(self, mel_spec): - return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) - - def spectrogram(self, y): - if self.preemphasis != 0: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) - S = self._amp_to_db(np.abs(D)) - return self._normalize(S) - - def melspectrogram(self, y): - if self.preemphasis != 0: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) - S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - return self._normalize(S) - - def inv_spectrogram(self, spectrogram): - """Converts spectrogram to waveform using librosa""" - S = self._denormalize(spectrogram) - S = self._db_to_amp(S) - # Reconstruct phase - if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - return self._griffin_lim(S**self.power) - 
- def inv_melspectrogram(self, mel_spectrogram): - '''Converts melspectrogram to waveform using librosa''' - D = self._denormalize(mel_spectrogram) - S = self._db_to_amp(D) - S = self._mel_to_linear(S) # Convert back to linear - if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - return self._griffin_lim(S**self.power) - - def out_linear_to_mel(self, linear_spec): - S = self._denormalize(linear_spec) - S = self._db_to_amp(S) - S = self._linear_to_mel(np.abs(S)) - S = self._amp_to_db(S) - mel = self._normalize(S) - return mel - - ### STFT and ISTFT ### - def _stft(self, y): - return librosa.stft( - y=y, - n_fft=self.fft_size, - hop_length=self.hop_length, - win_length=self.win_length, - pad_mode=self.stft_pad_mode, - ) - - def _istft(self, y): - return librosa.istft( - y, hop_length=self.hop_length, win_length=self.win_length) - - def _griffin_lim(self, S): - angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) - S_complex = np.abs(S).astype(np.complex) - y = self._istft(S_complex * angles) - for _ in range(self.griffin_lim_iters): - angles = np.exp(1j * np.angle(self._stft(y))) - y = self._istft(S_complex * angles) - return y - - def compute_stft_paddings(self, x, pad_sides=1): - '''compute right padding (final frame) or both sides padding (first and final frames) - ''' - assert pad_sides in (1, 2) - pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] - if pad_sides == 1: - return 0, pad - return pad // 2, pad // 2 + pad % 2 - - ### Audio Processing ### - def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): - window_length = int(self.sample_rate * min_silence_sec) - hop_length = int(window_length / 4) - threshold = self._db_to_amp(threshold_db) - for x in range(hop_length, len(wav) - window_length, hop_length): - if np.max(wav[x:x + window_length]) < threshold: - return x + hop_length - return len(wav) - - def trim_silence(self, wav): - """ Trim silent parts with a threshold and 
0.01 sec margin """ - margin = int(self.sample_rate * 0.01) - wav = wav[margin:-margin] - return librosa.effects.trim( - wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] - - @staticmethod - def sound_norm(x): - return x / abs(x).max() * 0.9 - - ### save and load ### - def load_wav(self, filename, sr=None): - if sr is None: - x, sr = sf.read(filename) - else: - x, sr = librosa.load(filename, sr=sr) - if self.do_trim_silence: - try: - x = self.trim_silence(x) - except ValueError: - print(f' [!] File cannot be trimmed for silence - {filename}') - assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) - if self.do_sound_norm: - x = self.sound_norm(x) - return x - - def save_wav(self, wav, path): - wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) - - @staticmethod - def mulaw_encode(wav, qc): - mu = 2 ** qc - 1 - # wav_abs = np.minimum(np.abs(wav), 1.0) - signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu) - # Quantize signal to the specified number of levels. - signal = (signal + 1) / 2 * mu + 0.5 - return np.floor(signal,) - - @staticmethod - def mulaw_decode(wav, qc): - """Recovers waveform from quantized values.""" - mu = 2 ** qc - 1 - x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) - return x - - - @staticmethod - def encode_16bits(x): - return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) - - @staticmethod - def quantize(x, bits): - return (x + 1.) 
* (2**bits - 1) / 2 - - @staticmethod - def dequantize(x, bits): - return 2 * x / (2**bits - 1) - 1 diff --git a/utils/console_logger.py b/utils/console_logger.py deleted file mode 100644 index 85d5b376..00000000 --- a/utils/console_logger.py +++ /dev/null @@ -1,95 +0,0 @@ -import datetime -from TTS.utils.io import AttrDict - - -tcolors = AttrDict({ - 'OKBLUE': '\033[94m', - 'HEADER': '\033[95m', - 'OKGREEN': '\033[92m', - 'WARNING': '\033[93m', - 'FAIL': '\033[91m', - 'ENDC': '\033[0m', - 'BOLD': '\033[1m', - 'UNDERLINE': '\033[4m' -}) - - -class ConsoleLogger(): - def __init__(self): - # TODO: color code for value changes - # use these to compare values between iterations - self.old_train_loss_dict = None - self.old_epoch_loss_dict = None - self.old_eval_loss_dict = None - - # pylint: disable=no-self-use - def get_time(self): - now = datetime.datetime.now() - return now.strftime("%Y-%m-%d %H:%M:%S") - - def print_epoch_start(self, epoch, max_epoch): - print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD, - epoch, max_epoch, tcolors.ENDC), - flush=True) - - def print_train_start(self): - print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}") - - def print_train_step(self, batch_steps, step, global_step, avg_spec_length, - avg_text_length, step_time, loader_time, lr, - loss_dict, avg_loss_dict): - indent = " | > " - print() - log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format( - tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) - for key, value in loss_dict.items(): - # print the avg value if given - if f'avg_{key}' in avg_loss_dict.keys(): - log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) - else: - log_text += "{}{}: {:.5f} \n".format(indent, key, value) - log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\ - f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}" - 
print(log_text, flush=True) - - # pylint: disable=unused-argument - def print_train_epoch_end(self, global_step, epoch, epoch_time, - print_dict): - indent = " | > " - log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n" - for key, value in print_dict.items(): - log_text += "{}{}: {:.5f}\n".format(indent, key, value) - print(log_text, flush=True) - - def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") - - def print_eval_step(self, step, loss_dict, avg_loss_dict): - indent = " | > " - print() - log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" - for key, value in loss_dict.items(): - # print the avg value if given - if f'avg_{key}' in avg_loss_dict.keys(): - log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) - else: - log_text += "{}{}: {:.5f} \n".format(indent, key, value) - print(log_text, flush=True) - - def print_epoch_end(self, epoch, avg_loss_dict): - indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format( - tcolors.BOLD, tcolors.ENDC) - for key, value in avg_loss_dict.items(): - # print the avg value if given - color = tcolors.FAIL - sign = '+' - diff = 0 - if self.old_eval_loss_dict is not None: - diff = value - self.old_eval_loss_dict[key] - if diff <= 0: - color = tcolors.OKGREEN - sign = '' - log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff) - self.old_eval_loss_dict = avg_loss_dict - print(log_text, flush=True) diff --git a/utils/data.py b/utils/data.py deleted file mode 100644 index a83325cb..00000000 --- a/utils/data.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np - - -def _pad_data(x, length): - _pad = 0 - assert x.ndim == 1 - return np.pad( - x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) - - -def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) - return np.stack([_pad_data(x, 
max_len) for x in inputs]) - - -def _pad_tensor(x, length): - _pad = 0. - assert x.ndim == 2 - x = np.pad( - x, [[0, 0], [0, length - x.shape[1]]], - mode='constant', - constant_values=_pad) - return x - - -def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) - remainder = max_len % out_steps - pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len - return np.stack([_pad_tensor(x, pad_len) for x in inputs]) - - -def _pad_stop_target(x, length): - _pad = 0. - assert x.ndim == 1 - return np.pad( - x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) - - -def prepare_stop_target(inputs, out_steps): - """ Pad row vectors with 1. """ - max_len = max((x.shape[0] for x in inputs)) - remainder = max_len % out_steps - pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len - return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) - - -def pad_per_step(inputs, pad_len): - return np.pad( - inputs, [[0, 0], [0, 0], [0, pad_len]], - mode='constant', - constant_values=0.0) - - -# pylint: disable=attribute-defined-outside-init -class StandardScaler(): - - def set_stats(self, mean, scale): - self.mean_ = mean - self.scale_ = scale - - def reset_stats(self): - delattr(self, 'mean_') - delattr(self, 'scale_') - - def transform(self, X): - X = np.asarray(X) - X -= self.mean_ - X /= self.scale_ - return X - - def inverse_transform(self, X): - X = np.asarray(X) - X *= self.scale_ - X += self.mean_ - return X - diff --git a/utils/generic_utils.py b/utils/generic_utils.py deleted file mode 100644 index c806bdf3..00000000 --- a/utils/generic_utils.py +++ /dev/null @@ -1,362 +0,0 @@ -import os -import glob -import torch -import shutil -import datetime -import subprocess -import importlib -import numpy as np -from collections import Counter - - -def get_git_branch(): - try: - out = subprocess.check_output(["git", "branch"]).decode("utf8") - current = next(line for line in out.split("\n") - if 
line.startswith("*")) - current.replace("* ", "") - except subprocess.CalledProcessError: - current = "inside_docker" - return current - - -def get_commit_hash(): - """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" - # try: - # subprocess.check_output(['git', 'diff-index', '--quiet', - # 'HEAD']) # Verify client is clean - # except: - # raise RuntimeError( - # " !! Commit before training to get the commit hash.") - try: - commit = subprocess.check_output( - ['git', 'rev-parse', '--short', 'HEAD']).decode().strip() - # Not copying .git folder into docker container - except subprocess.CalledProcessError: - commit = "0000000" - print(' > Git Hash: {}'.format(commit)) - return commit - - -def create_experiment_folder(root_path, model_name, debug): - """ Create a folder with the current date and time """ - date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - if debug: - commit_hash = 'debug' - else: - commit_hash = get_commit_hash() - output_folder = os.path.join( - root_path, model_name + '-' + date_str + '-' + commit_hash) - os.makedirs(output_folder, exist_ok=True) - print(" > Experiment folder: {}".format(output_folder)) - return output_folder - - -def remove_experiment_folder(experiment_path): - """Check folder if there is a checkpoint, otherwise remove the folder""" - - checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") - if not checkpoint_files: - if os.path.exists(experiment_path): - shutil.rmtree(experiment_path, ignore_errors=True) - print(" ! Run is removed from {}".format(experiment_path)) - else: - print(" ! 
Run is kept in {}".format(experiment_path)) - - -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -def split_dataset(items): - is_multi_speaker = False - speakers = [item[-1] for item in items] - is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 500 if len(items) * 0.01 > 500 else int( - len(items) * 0.01) - assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." - np.random.seed(0) - np.random.shuffle(items) - if is_multi_speaker: - items_eval = [] - # most stupid code ever -- Fix it ! - while len(items_eval) < eval_split_size: - speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) - item_idx = np.random.randint(0, len(items)) - if speaker_counter[items[item_idx][-1]] > 1: - items_eval.append(items[item_idx]) - del items[item_idx] - return items_eval, items - return items[:eval_split_size], items[eval_split_size:] - - -# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.data.max() - batch_size = sequence_length.size(0) - seq_range = torch.arange(0, max_len).long() - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - if sequence_length.is_cuda: - seq_range_expand = seq_range_expand.to(sequence_length.device) - seq_length_expand = ( - sequence_length.unsqueeze(1).expand_as(seq_range_expand)) - # B x T_max - return seq_range_expand < seq_length_expand - - -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - print(" | > Layer missing in the model definition: {}".format(k)) - # 1. 
filter out unnecessary keys - pretrained_dict = { - k: v - for k, v in checkpoint_state.items() if k in model_dict - } - # 2. filter out different size layers - pretrained_dict = { - k: v - for k, v in pretrained_dict.items() - if v.numel() == model_dict[k].numel() - } - # 3. skip reinit layers - if c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = { - k: v - for k, v in pretrained_dict.items() - if reinit_layer_name not in k - } - # 4. overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - print(" | > {} / {} layers are restored.".format(len(pretrained_dict), - len(model_dict))) - return model_dict - - -def setup_model(num_chars, num_speakers, c): - print(" > Using model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.models.' + c.model.lower()) - MyModel = getattr(MyModel, c.model) - if c.model.lower() in "tacotron": - model = MyModel(num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), - decoder_output_dim=c.audio['num_mels'], - gst=c.use_gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r) - elif c.model.lower() == "tacotron2": - model = MyModel(num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio['num_mels'], - decoder_output_dim=c.audio['num_mels'], - gst=c.use_gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - 
forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r) - return model - -class KeepAverage(): - def __init__(self): - self.avg_values = {} - self.iters = {} - - def __getitem__(self, key): - return self.avg_values[key] - - def items(self): - return self.avg_values.items() - - def add_value(self, name, init_val=0, init_iter=0): - self.avg_values[name] = init_val - self.iters[name] = init_iter - - def update_value(self, name, value, weighted_avg=False): - if name not in self.avg_values: - # add value if not exist before - self.add_value(name, init_val=value) - else: - # else update existing value - if weighted_avg: - self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value - self.iters[name] += 1 - else: - self.avg_values[name] = self.avg_values[name] * \ - self.iters[name] + value - self.iters[name] += 1 - self.avg_values[name] /= self.iters[name] - - def add_values(self, name_dict): - for key, value in name_dict.items(): - self.add_value(key, init_val=value) - - def update_values(self, value_dict): - for key, value in value_dict.items(): - self.update_value(key, value) - - -def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None): - if alternative in c.keys() and c[alternative] is not None: - return - if restricted: - assert name in c.keys(), f' [!] {name} not defined in config.json' - if name in c.keys(): - if max_val: - assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' - if min_val: - assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' - if enum_list: - assert c[name].lower() in enum_list, f' [!] 
{name} is not a valid value' - if val_type: - assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' - - -def check_config(c): - _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) - _check_argument('run_name', c, restricted=True, val_type=str) - _check_argument('run_description', c, val_type=str) - - # AUDIO - _check_argument('audio', c, restricted=True, val_type=dict) - - # audio processing parameters - _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) - _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) - _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) - _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') - _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') - _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) - _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) - _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) - _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) - _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) - - # vocabulary parameters - _check_argument('characters', c, restricted=False, val_type=dict) - _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - _check_argument('bos', c['characters'] if 
'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - - # normalization parameters - _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) - _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) - _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) - _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) - _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) - _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) - _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100) - _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) - _check_argument('trim_db', c['audio'], restricted=True, val_type=int) - - # training parameters - _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) - _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) - _check_argument('r', c, restricted=True, val_type=int, min_val=1) - _check_argument('gradual_training', c, restricted=False, val_type=list) - _check_argument('loss_masking', c, restricted=True, val_type=bool) - # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) - - # validation parameters - _check_argument('run_eval', c, restricted=True, val_type=bool) - _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) - 
_check_argument('test_sentences_file', c, restricted=False, val_type=str) - - # optimizer - _check_argument('noam_schedule', c, restricted=False, val_type=bool) - _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) - _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) - _check_argument('lr', c, restricted=True, val_type=float, min_val=0) - _check_argument('wd', c, restricted=True, val_type=float, min_val=0) - _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) - _check_argument('seq_len_norm', c, restricted=True, val_type=bool) - - # tacotron prenet - _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) - _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) - _check_argument('prenet_dropout', c, restricted=True, val_type=bool) - - # attention - _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) - _check_argument('attention_heads', c, restricted=True, val_type=int) - _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) - _check_argument('windowing', c, restricted=True, val_type=bool) - _check_argument('use_forward_attn', c, restricted=True, val_type=bool) - _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) - _check_argument('transition_agent', c, restricted=True, val_type=bool) - _check_argument('transition_agent', c, restricted=True, val_type=bool) - _check_argument('location_attn', c, restricted=True, val_type=bool) - _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) - _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool) - _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) - - # stopnet - _check_argument('stopnet', c, restricted=True, val_type=bool) - _check_argument('separate_stopnet', c, 
restricted=True, val_type=bool) - - # tensorboard - _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) - _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) - _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) - _check_argument('checkpoint', c, restricted=True, val_type=bool) - _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) - - # dataloading - # pylint: disable=import-outside-toplevel - from TTS.utils.text import cleaners - _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) - _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) - _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) - _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) - _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) - _check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) - _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) - - # paths - _check_argument('output_path', c, restricted=True, val_type=str) - - # multi-speaker gst - _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) - _check_argument('style_wav_for_test', c, restricted=True, val_type=str) - _check_argument('use_gst', c, restricted=True, val_type=bool) - - # datasets - checking only the first entry - _check_argument('datasets', c, restricted=True, val_type=list) - for dataset_entry in c['datasets']: - _check_argument('name', dataset_entry, restricted=True, val_type=str) - _check_argument('path', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) diff --git a/utils/io.py b/utils/io.py deleted file mode 100644 index faf00195..00000000 --- a/utils/io.py +++ 
/dev/null @@ -1,78 +0,0 @@ -import os -import json -import re -import torch -import datetime - - -class AttrDict(dict): - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - -def load_config(config_path): - config = AttrDict() - with open(config_path, "r") as f: - input_str = f.read() - input_str = re.sub(r'\\\n', '', input_str) - input_str = re.sub(r'//.*\n', '\n', input_str) - data = json.loads(input_str) - config.update(data) - return config - - -def copy_config_file(config_file, out_path, new_fields): - config_lines = open(config_file, "r").readlines() - # add extra information fields - for key, value in new_fields.items(): - if isinstance(value, str): - new_line = '"{}":"{}",\n'.format(key, value) - else: - new_line = '"{}":{},\n'.format(key, value) - config_lines.insert(1, new_line) - config_out_file = open(out_path, "w") - config_out_file.writelines(config_lines) - config_out_file.close() - - -def load_checkpoint(model, checkpoint_path, use_cuda=False): - state = torch.load(checkpoint_path, map_location=torch.device('cpu')) - model.load_state_dict(state['model']) - if use_cuda: - model.cuda() - # set model stepsize - if 'r' in state.keys(): - model.decoder.set_r(state['r']) - return model, state - - -def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs): - new_state_dict = model.state_dict() - state = { - 'model': new_state_dict, - 'optimizer': optimizer.state_dict() if optimizer is not None else None, - 'step': current_step, - 'epoch': epoch, - 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': r - } - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): - file_name = 'checkpoint_{}.pth.tar'.format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, 
epoch, r, checkpoint_path, **kwargs) - - -def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs): - if target_loss < best_loss: - file_name = 'best_model.pth.tar' - checkpoint_path = os.path.join(output_folder, file_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs) - best_loss = target_loss - return best_loss diff --git a/utils/measures.py b/utils/measures.py deleted file mode 100644 index 01d25695..00000000 --- a/utils/measures.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch - - -def alignment_diagonal_score(alignments, binary=False): - """ - Compute how diagonal alignment predictions are. It is useful - to measure the alignment consistency of a model - Args: - alignments (torch.Tensor): batch of alignments. - binary (bool): if True, ignore scores and consider attention - as a binary mask. - Shape: - alignments : batch x decoder_steps x encoder_steps - """ - maxs = alignments.max(dim=1)[0] - if binary: - maxs[maxs > 0] = 1 - return maxs.mean(dim=1).mean(dim=0).item() diff --git a/utils/radam.py b/utils/radam.py deleted file mode 100644 index 4724b705..00000000 --- a/utils/radam.py +++ /dev/null @@ -1,97 +0,0 @@ -# from https://github.com/LiyuanLucasLiu/RAdam - -import math -import torch -from torch.optim.optimizer import Optimizer, required - - -class RAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): - if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) - if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - - self.degenerated_to_sgd = degenerated_to_sgd - if 
isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): - for param in params: - if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) - super(RAdam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(RAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - - state['step'] += 1 - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: - N_sma, step_size = buffered[1], buffered[2] - else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - buffered[1] = N_sma - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - elif self.degenerated_to_sgd: - step_size = 1.0 / 
(1 - beta1 ** state['step']) - else: - step_size = -1 - buffered[2] = step_size - - # more conservative since it's an approximated value - if N_sma >= 5: - if group['weight_decay'] != 0: - p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) - p.data.copy_(p_data_fp32) - elif step_size > 0: - if group['weight_decay'] != 0: - p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) - p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr']) - p.data.copy_(p_data_fp32) - - return loss diff --git a/utils/speakers.py b/utils/speakers.py deleted file mode 100644 index 8aa612a8..00000000 --- a/utils/speakers.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import json - -from TTS.datasets.preprocess import get_preprocessor_by_name - - -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - try: - with open(make_speakers_json_path(out_path)) as f: - return json.load(f) - except FileNotFoundError: - return {} - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - speakers_json_path = make_speakers_json_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def get_speakers(items): - """Returns a sorted, unique list of speakers in a given dataset.""" - speakers = {e[2] for e in items} - return sorted(speakers) diff --git a/utils/synthesis.py b/utils/synthesis.py deleted file mode 100644 index ce76b0ec..00000000 --- a/utils/synthesis.py +++ /dev/null @@ -1,231 +0,0 @@ -import pkg_resources -installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable -if 'tensorflow' in installed or 'tensorflow-gpu' in installed: - 
import tensorflow as tf -import torch -import numpy as np -from .text import text_to_sequence, phoneme_to_sequence - - -def text_to_seqvec(text, CONFIG): - text_cleaner = [CONFIG.text_cleaner] - # text ot phonemes to sequence vector - if CONFIG.use_phonemes: - seq = np.asarray( - phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, - CONFIG.enable_eos_bos_chars, - tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), - dtype=np.int32) - else: - seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) - return seq - - -def numpy_to_torch(np_array, dtype, cuda=False): - if np_array is None: - return None - tensor = torch.as_tensor(np_array, dtype=dtype) - if cuda: - return tensor.cuda() - return tensor - - -def numpy_to_tf(np_array, dtype): - if np_array is None: - return None - tensor = tf.convert_to_tensor(np_array, dtype=dtype) - return tensor - - -def compute_style_mel(style_wav, ap): - style_mel = ap.melspectrogram( - ap.load_wav(style_wav)).expand_dims(0) - return style_mel - - -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): - if CONFIG.use_gst: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id) - else: - if truncated: - decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id) - else: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id) - return decoder_output, postnet_output, alignments, stop_tokens - - -def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): - if CONFIG.use_gst and style_mel is not None: - raise NotImplementedError(' [!] GST inference not implemented for TF') - if truncated: - raise NotImplementedError(' [!] 
Truncated inference not implemented for TF') - if speaker_id is not None: - raise NotImplementedError(' [!] Multi-Speaker not implemented for TF') - # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model( - inputs, training=False) - return decoder_output, postnet_output, alignments, stop_tokens - - -def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): - if CONFIG.use_gst and style_mel is not None: - raise NotImplementedError(' [!] GST inference not implemented for TfLite') - if truncated: - raise NotImplementedError(' [!] Truncated inference not implemented for TfLite') - if speaker_id is not None: - raise NotImplementedError(' [!] Multi-Speaker not implemented for TfLite') - # get input and output details - input_details = model.get_input_details() - output_details = model.get_output_details() - # reshape input tensor for the new input shape - model.resize_tensor_input(input_details[0]['index'], inputs.shape) - model.allocate_tensors() - detail = input_details[0] - # input_shape = detail['shape'] - model.set_tensor(detail['index'], inputs) - # run the model - model.invoke() - # collect outputs - decoder_output = model.get_tensor(output_details[0]['index']) - postnet_output = model.get_tensor(output_details[1]['index']) - # tflite model only returns feature frames - return decoder_output, postnet_output, None, None - - -def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): - postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = decoder_output[0].data.cpu().numpy() - alignment = alignments[0].cpu().data.numpy() - stop_tokens = stop_tokens[0].cpu().numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - -def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): - postnet_output = postnet_output[0].numpy() - decoder_output = decoder_output[0].numpy() - alignment = alignments[0].numpy() - stop_tokens = 
stop_tokens[0].numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - -def parse_outputs_tflite(postnet_output, decoder_output): - postnet_output = postnet_output[0] - decoder_output = decoder_output[0] - return postnet_output, decoder_output - - -def trim_silence(wav, ap): - return wav[:ap.find_endpoint(wav)] - - -def inv_spectrogram(postnet_output, ap, CONFIG): - if CONFIG.model.lower() in ["tacotron"]: - wav = ap.inv_spectrogram(postnet_output.T) - else: - wav = ap.inv_melspectrogram(postnet_output.T) - return wav - - -def id_to_torch(speaker_id): - if speaker_id is not None: - speaker_id = np.asarray(speaker_id) - speaker_id = torch.from_numpy(speaker_id).unsqueeze(0) - return speaker_id - - -# TODO: perform GL with pytorch for batching -def apply_griffin_lim(inputs, input_lens, CONFIG, ap): - '''Apply griffin-lim to each sample iterating throught the first dimension. - Args: - inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size. - input_lens (Tensor or np.Array): 1D array of sample lengths. - CONFIG (Dict): TTS config. - ap (AudioProcessor): TTS audio processor. - ''' - wavs = [] - for idx, spec in enumerate(inputs): - wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding - wav = inv_spectrogram(spec, ap, CONFIG) - # assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}" - wavs.append(wav[:wav_len]) - return wavs - - -def synthesis(model, - text, - CONFIG, - use_cuda, - ap, - speaker_id=None, - style_wav=None, - truncated=False, - enable_eos_bos_chars=False, #pylint: disable=unused-argument - use_griffin_lim=False, - do_trim_silence=False, - backend='torch'): - """Synthesize voice for the given text. - - Args: - model (TTS.models): model to synthesize. - text (str): target text - CONFIG (dict): config dictionary to be loaded from config.json. - use_cuda (bool): enable cuda. 
- ap (TTS.utils.audio.AudioProcessor): audio processor to process - model outputs. - speaker_id (int): id of speaker - style_wav (str): Uses for style embedding of GST. - truncated (bool): keep model states after inference. It can be used - for continuous inference at long texts. - enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. - do_trim_silence (bool): trim silence after synthesis. - backend (str): tf or torch - """ - # GST processing - style_mel = None - if CONFIG.model == "TacotronGST" and style_wav is not None: - style_mel = compute_style_mel(style_wav, ap) - # preprocess the given text - inputs = text_to_seqvec(text, CONFIG) - # pass tensors to backend - if backend == 'torch': - speaker_id = id_to_torch(speaker_id) - style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) - inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) - inputs = inputs.unsqueeze(0) - elif backend == 'tf': - # TODO: handle speaker id for tf model - style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) - elif backend == 'tflite': - style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) - # synthesize voice - if backend == 'torch': - decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens) - elif backend == 'tf': - decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, truncated, speaker_id, style_mel) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens) - elif backend == 'tflite': - decoder_output, postnet_output, alignment, stop_tokens = 
run_model_tflite( - model, inputs, CONFIG, truncated, speaker_id, style_mel) - postnet_output, decoder_output = parse_outputs_tflite( - postnet_output, decoder_output) - # convert outputs to numpy - # plot results - wav = None - if use_griffin_lim: - wav = inv_spectrogram(postnet_output, ap, CONFIG) - # trim silence - if do_trim_silence: - wav = trim_silence(wav, ap) - return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs diff --git a/utils/tensorboard_logger.py b/utils/tensorboard_logger.py deleted file mode 100644 index cbf68ad6..00000000 --- a/utils/tensorboard_logger.py +++ /dev/null @@ -1,81 +0,0 @@ -import traceback -from tensorboardX import SummaryWriter - - -class TensorboardLogger(object): - def __init__(self, log_dir, model_name): - self.model_name = model_name - self.writer = SummaryWriter(log_dir) - self.train_stats = {} - self.eval_stats = {} - - def tb_model_weights(self, model, step): - layer_num = 1 - for name, param in model.named_parameters(): - if param.numel() == 1: - self.writer.add_scalar( - "layer{}-{}/value".format(layer_num, name), - param.max(), step) - else: - self.writer.add_scalar( - "layer{}-{}/max".format(layer_num, name), - param.max(), step) - self.writer.add_scalar( - "layer{}-{}/min".format(layer_num, name), - param.min(), step) - self.writer.add_scalar( - "layer{}-{}/mean".format(layer_num, name), - param.mean(), step) - self.writer.add_scalar( - "layer{}-{}/std".format(layer_num, name), - param.std(), step) - self.writer.add_histogram( - "layer{}-{}/param".format(layer_num, name), param, step) - self.writer.add_histogram( - "layer{}-{}/grad".format(layer_num, name), param.grad, step) - layer_num += 1 - - def dict_to_tb_scalar(self, scope_name, stats, step): - for key, value in stats.items(): - self.writer.add_scalar('{}/{}'.format(scope_name, key), value, step) - - def dict_to_tb_figure(self, scope_name, figures, step): - for key, value in figures.items(): - self.writer.add_figure('{}/{}'.format(scope_name, 
key), value, step) - - def dict_to_tb_audios(self, scope_name, audios, step, sample_rate): - for key, value in audios.items(): - try: - self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate) - except: - traceback.print_exc() - - def tb_train_iter_stats(self, step, stats): - self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) - - def tb_train_epoch_stats(self, step, stats): - self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step) - - def tb_train_figures(self, step, figures): - self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step) - - def tb_train_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate) - - def tb_eval_stats(self, step, stats): - self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step) - - def tb_eval_figures(self, step, figures): - self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step) - - def tb_eval_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate) - - def tb_test_audios(self, step, audios, sample_rate): - self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate) - - def tb_test_figures(self, step, figures): - self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step) - - def tb_add_text(self, title, text, step): - self.writer.add_text(title, text, step) diff --git a/utils/text/__init__.py b/utils/text/__init__.py deleted file mode 100644 index 41aa6778..00000000 --- a/utils/text/__init__.py +++ /dev/null @@ -1,187 +0,0 @@ -# -*- coding: utf-8 -*- - -import re -from packaging import version -import phonemizer -from phonemizer.phonemize import phonemize -from TTS.utils.text import cleaners -from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ - _eos - -# Mappings from symbol to numeric ID and vice 
versa: -_symbol_to_id = {s: i for i, s in enumerate(symbols)} -_id_to_symbol = {i: s for i, s in enumerate(symbols)} - -_phonemes_to_id = {s: i for i, s in enumerate(phonemes)} -_id_to_phonemes = {i: s for i, s in enumerate(phonemes)} - -# Regular expression matching text enclosed in curly braces: -_CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') - -# Regular expression matching punctuations, ignoring empty space -PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+' - - -def text2phone(text, language): - ''' - Convert graphemes to phonemes. - ''' - seperator = phonemizer.separator.Separator(' |', '', '|') - #try: - punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) - if version.parse(phonemizer.__version__) < version.parse('2.1'): - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) - ph = ph[:-1].strip() # skip the last empty character - # phonemizer does not tackle punctuations. Here we do. - # Replace \n with matching punctuations. - if punctuations: - # if text ends with a punctuation. - if text[-1] == punctuations[-1]: - for punct in punctuations[:-1]: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) - ph = ph + punctuations[-1] - else: - for punct in punctuations: - ph = ph.replace('| |\n', '|'+punct+'| |', 1) - elif version.parse(phonemizer.__version__) >= version.parse('2.1'): - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language, preserve_punctuation=True) - # this is a simple fix for phonemizer. - # https://github.com/bootphon/phonemizer/issues/32 - if punctuations: - for punctuation in punctuations: - ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(f"| |{punctuation}", f"|{punctuation}| |") - ph = ph[:-3] - else: - raise RuntimeError(" [!] 
Use 'phonemizer' version 2.1 or older.") - - return ph - - -def pad_with_eos_bos(phoneme_sequence, tp=None): - # pylint: disable=global-statement - global _phonemes_to_id, _bos, _eos - if tp: - _bos = tp['bos'] - _eos = tp['eos'] - _, _phonemes = make_symbols(**tp) - _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} - - return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] - - -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): - # pylint: disable=global-statement - global _phonemes_to_id - if tp: - _, _phonemes = make_symbols(**tp) - _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} - - sequence = [] - clean_text = _clean_text(text, cleaner_names) - to_phonemes = text2phone(clean_text, language) - if to_phonemes is None: - print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) - # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. - for phoneme in filter(None, to_phonemes.split('|')): - sequence += _phoneme_to_sequence(phoneme) - # Append EOS char - if enable_eos_bos: - sequence = pad_with_eos_bos(sequence, tp=tp) - return sequence - - -def sequence_to_phoneme(sequence, tp=None): - # pylint: disable=global-statement - '''Converts a sequence of IDs back to a string''' - global _id_to_phonemes - result = '' - if tp: - _, _phonemes = make_symbols(**tp) - _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)} - - for symbol_id in sequence: - if symbol_id in _id_to_phonemes: - s = _id_to_phonemes[symbol_id] - result += s - return result.replace('}{', ' ') - - -def text_to_sequence(text, cleaner_names, tp=None): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
- - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - - Returns: - List of integers corresponding to the symbols in the text - ''' - # pylint: disable=global-statement - global _symbol_to_id - if tp: - _symbols, _ = make_symbols(**tp) - _symbol_to_id = {s: i for i, s in enumerate(_symbols)} - - sequence = [] - # Check for curly braces and treat their contents as ARPAbet: - while text: - m = _CURLY_RE.match(text) - if not m: - sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) - break - sequence += _symbols_to_sequence( - _clean_text(m.group(1), cleaner_names)) - sequence += _arpabet_to_sequence(m.group(2)) - text = m.group(3) - return sequence - - -def sequence_to_text(sequence, tp=None): - '''Converts a sequence of IDs back to a string''' - # pylint: disable=global-statement - global _id_to_symbol - if tp: - _symbols, _ = make_symbols(**tp) - _id_to_symbol = {i: s for i, s in enumerate(_symbols)} - - result = '' - for symbol_id in sequence: - if symbol_id in _id_to_symbol: - s = _id_to_symbol[symbol_id] - # Enclose ARPAbet back in curly braces: - if len(s) > 1 and s[0] == '@': - s = '{%s}' % s[1:] - result += s - return result.replace('}{', ' ') - - -def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text - - -def _symbols_to_sequence(syms): - return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)] - - -def _phoneme_to_sequence(phons): - return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)] - - -def _arpabet_to_sequence(text): - return _symbols_to_sequence(['@' + s for s in text.split()]) - - -def _should_keep_symbol(s): - return s in _symbol_to_id and s not in ['~', '^', '_'] - - -def _should_keep_phoneme(p): - return p in _phonemes_to_id and p not in ['~', '^', '_'] diff --git 
a/utils/text/cleaners.py b/utils/text/cleaners.py deleted file mode 100644 index f0a66f57..00000000 --- a/utils/text/cleaners.py +++ /dev/null @@ -1,123 +0,0 @@ -''' -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re -from unidecode import unidecode -from .number_norm import normalize_numbers - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) - for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), - ]] - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text).strip() - - -def convert_to_ascii(text): - return unidecode(text) - - -def remove_aux_symbols(text): - text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text) - return text - - -def replace_symbols(text): - text = text.replace(';', ',') - text = text.replace('-', ' ') - text = text.replace(':', ',') - text = text.replace('&', 'and') - return text - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -# TODO: elaborate it -def basic_turkish_cleaners(text): - '''Pipeline for Turkish text''' - text = text.replace("I", "ı") - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including number and abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = replace_symbols(text) - text = 
remove_aux_symbols(text) - text = collapse_whitespace(text) - return text - - -def phoneme_cleaners(text): - '''Pipeline for phonemes mode, including number and abbreviation expansion.''' - text = convert_to_ascii(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = replace_symbols(text) - text = remove_aux_symbols(text) - text = collapse_whitespace(text) - return text diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py deleted file mode 100644 index c0f23406..00000000 --- a/utils/text/cmudict.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- - -import re - -VALID_SYMBOLS = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', - 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', - 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', - 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', - 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', - 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', - 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', - 'Y', 'Z', 'ZH' -] - - -class CMUDict: - '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' - - def __init__(self, file_or_path, keep_ambiguous=True): - if isinstance(file_or_path, str): - with open(file_or_path, encoding='latin-1') as f: - entries = _parse_cmudict(f) - else: - entries = _parse_cmudict(file_or_path) - if not keep_ambiguous: - entries = { - word: pron - for word, pron in entries.items() if len(pron) == 1 - } - self._entries = entries - - def __len__(self): - return len(self._entries) - - def lookup(self, word): - '''Returns list of ARPAbet pronunciations of the given word.''' - return self._entries.get(word.upper()) - - @staticmethod - def get_arpabet(word, cmudict, punctuation_symbols): - first_symbol, last_symbol = '', '' - if word and word[0] in punctuation_symbols: - first_symbol = word[0] - word = word[1:] - if word and word[-1] in punctuation_symbols: - last_symbol = word[-1] - word = word[:-1] - arpabet = cmudict.lookup(word) - if arpabet is not None: - return first_symbol + '{%s}' % arpabet[0] + last_symbol - return first_symbol + word + last_symbol - - -_alt_re = re.compile(r'\([0-9]+\)') - - -def _parse_cmudict(file): - cmudict = {} - for line in file: - if line and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): - parts = line.split(' ') - word = re.sub(_alt_re, '', parts[0]) - pronunciation = _get_pronunciation(parts[1]) - if pronunciation: - if word in cmudict: - cmudict[word].append(pronunciation) - else: - cmudict[word] = [pronunciation] - return cmudict - - -def _get_pronunciation(s): - parts = s.strip().split(' ') - for part in parts: - if part not in VALID_SYMBOLS: - return None - return ' '.join(parts) diff --git a/utils/text/number_norm.py b/utils/text/number_norm.py deleted file mode 100644 index 7b539bff..00000000 --- a/utils/text/number_norm.py +++ /dev/null @@ -1,71 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -import inflect -import re - -_inflect = inflect.engine() -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = 
re.compile(r'([0-9]+\.[0-9]+)') -_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') - - -def _remove_commas(m): - return m.group(1).replace(',', '') - - -def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') - - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' - - -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) - - -def _expand_number(m): - num = int(m.group(0)) - if 1000 < num < 3000: - if num == 2000: - return 'two thousand' - if 2000 < num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - if num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - return _inflect.number_to_words(num, - andword='', - zero='oh', - group=2).replace(', ', ' ') - return _inflect.number_to_words(num, andword='') - - -def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text diff --git a/utils/text/symbols.py b/utils/text/symbols.py deleted 
file mode 100644 index 544277c5..00000000 --- a/utils/text/symbols.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Defines the set of symbols used in text input to the model. - -The default is a set of ASCII characters that works well for English or text that has been run -through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. -''' -def make_symbols(characters, phonemes, punctuations='!\'(),-.:;? ', pad='_', eos='~', bos='^'):# pylint: disable=redefined-outer-name - ''' Function to create symbols and phonemes ''' - _phonemes_sorted = sorted(list(phonemes)) - - # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): - _arpabet = ['@' + s for s in _phonemes_sorted] - - # Export all symbols: - _symbols = [pad, eos, bos] + list(characters) + _arpabet - _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) - - return _symbols, _phonemes - -_pad = '_' -_eos = '~' -_bos = '^' -_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' -_punctuations = '!\'(),-.:;? ' -_phoneme_punctuations = '.!;:,?' 
- -# Phonemes definition -_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' -_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ' -_pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ' -_suprasegmentals = 'ˈˌːˑ' -_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' -_diacrilics = 'ɚ˞ɫ' -_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics - -symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos) - -# Generate ALIEN language -# from random import shuffle -# shuffle(phonemes) - -if __name__ == '__main__': - print(" > TTS symbols {}".format(len(symbols))) - print(symbols) - print(" > TTS phonemes {}".format(len(phonemes))) - print(phonemes) diff --git a/utils/training.py b/utils/training.py deleted file mode 100644 index 9046f9e0..00000000 --- a/utils/training.py +++ /dev/null @@ -1,108 +0,0 @@ -import torch -import numpy as np - - -def setup_torch_training_env(cudnn_enable, cudnn_benchmark): - torch.backends.cudnn.enabled = cudnn_enable - torch.backends.cudnn.benchmark = cudnn_benchmark - torch.manual_seed(54321) - use_cuda = torch.cuda.is_available() - num_gpus = torch.cuda.device_count() - print(" > Using CUDA: ", use_cuda) - print(" > Number of GPUs: ", num_gpus) - return use_cuda, num_gpus - - -def check_update(model, grad_clip, ignore_stopnet=False): - r'''Check model gradient against unexpected jumps and failures''' - skip_flag = False - if ignore_stopnet: - grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) - else: - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - # compatibility with different torch versions - if isinstance(grad_norm, float): - if np.isinf(grad_norm): - print(" | > Gradient is INF !!") - skip_flag = True - else: - if torch.isinf(grad_norm): - print(" | > Gradient is INF !!") - skip_flag = True - return grad_norm, skip_flag - - -def 
lr_decay(init_lr, global_step, warmup_steps): - r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py''' - warmup_steps = float(warmup_steps) - step = global_step + 1. - lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5, - step**-0.5) - return lr - - -def adam_weight_decay(optimizer): - """ - Custom weight decay operation, not effecting grad values. - """ - for group in optimizer.param_groups: - for param in group['params']: - current_lr = group['lr'] - weight_decay = group['weight_decay'] - factor = -weight_decay * group['lr'] - param.data = param.data.add(param.data, - alpha=factor) - return optimizer, current_lr - -# pylint: disable=dangerous-default-value -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): - """ - Skip biases, BatchNorm parameters, rnns. - and attention projection layer v - """ - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - - if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]): - no_decay.append(param) - else: - decay.append(param) - return [{ - 'params': no_decay, - 'weight_decay': 0. 
- }, { - 'params': decay, - 'weight_decay': weight_decay - }] - - -# pylint: disable=protected-access -class NoamLR(torch.optim.lr_scheduler._LRScheduler): - def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): - self.warmup_steps = float(warmup_steps) - super(NoamLR, self).__init__(optimizer, last_epoch) - - def get_lr(self): - step = max(self.last_epoch, 1) - return [ - base_lr * self.warmup_steps**0.5 * - min(step * self.warmup_steps**-1.5, step**-0.5) - for base_lr in self.base_lrs - ] - - -def gradual_training_scheduler(global_step, config): - """Setup the gradual training schedule wrt number - of active GPUs""" - num_gpus = torch.cuda.device_count() - if num_gpus == 0: - num_gpus = 1 - new_values = None - # we set the scheduling wrt num_gpus - for values in config.gradual_training: - if global_step * num_gpus >= values[0]: - new_values = values - return new_values[1], new_values[2] diff --git a/utils/visual.py b/utils/visual.py deleted file mode 100644 index b4ebec9a..00000000 --- a/utils/visual.py +++ /dev/null @@ -1,93 +0,0 @@ -import torch -import librosa -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme - - -def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None): - if isinstance(alignment, torch.Tensor): - alignment_ = alignment.detach().cpu().numpy().squeeze() - else: - alignment_ = alignment - fig, ax = plt.subplots(figsize=fig_size) - im = ax.imshow( - alignment_.T, aspect='auto', origin='lower', interpolation='none') - fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' - if info is not None: - xlabel += '\n\n' + info - plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') - # plt.yticks(range(len(text)), list(text)) - plt.tight_layout() - if title is not None: - plt.title(title) - return fig - - -def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)): - if isinstance(spectrogram, torch.Tensor): - spectrogram_ = 
spectrogram.detach().cpu().numpy().squeeze().T - else: - spectrogram_ = spectrogram.T - if ap is not None: - spectrogram_ = ap._denormalize(spectrogram_) # pylint: disable=protected-access - fig = plt.figure(figsize=fig_size) - plt.imshow(spectrogram_, aspect="auto", origin="lower") - plt.colorbar() - plt.tight_layout() - return fig - - -def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)): - if decoder_output is not None: - num_plot = 4 - else: - num_plot = 3 - - label_fontsize = 16 - fig = plt.figure(figsize=figsize) - - plt.subplot(num_plot, 1, 1) - plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) - plt.xlabel("Decoder timestamp", fontsize=label_fontsize) - plt.ylabel("Encoder timestamp", fontsize=label_fontsize) - # compute phoneme representation and back - if CONFIG.use_phonemes: - seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) - text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) - print(text) - plt.yticks(range(len(text)), list(text)) - plt.colorbar() - # plot stopnet predictions - plt.subplot(num_plot, 1, 2) - plt.plot(range(len(stop_tokens)), list(stop_tokens)) - # plot postnet spectrogram - plt.subplot(num_plot, 1, 3) - librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'], - hop_length=hop_length, x_axis="time", y_axis="linear", - fmin=CONFIG.audio['mel_fmin'], - fmax=CONFIG.audio['mel_fmax']) - - plt.xlabel("Time", fontsize=label_fontsize) - plt.ylabel("Hz", fontsize=label_fontsize) - plt.tight_layout() - plt.colorbar() - - if decoder_output is not None: - plt.subplot(num_plot, 1, 4) - librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'], - hop_length=hop_length, x_axis="time", y_axis="linear", - fmin=CONFIG.audio['mel_fmin'], - 
fmax=CONFIG.audio['mel_fmax']) - plt.xlabel("Time", fontsize=label_fontsize) - plt.ylabel("Hz", fontsize=label_fontsize) - plt.tight_layout() - plt.colorbar() - - if output_path: - print(output_path) - fig.savefig(output_path) - plt.close() diff --git a/vocoder/README.md b/vocoder/README.md deleted file mode 100644 index e3baf1f9..00000000 --- a/vocoder/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Mozilla TTS Vocoders (Experimental) - -We provide here different vocoder implementations which can be combined with our TTS models to enable "FASTER THAN REAL-TIME" end-to-end TTS stack. - -Currently, there are implementations of the following models. - -- Melgan -- MultiBand-Melgan -- GAN-TTS (Discriminator Only) - -It is also very easy to adapt different vocoder models as we provide here a flexible and modular (but not too modular) framework. - -## Training a model - -You can see here an example (Soon)[Colab Notebook]() training MelGAN with LJSpeech dataset. - -In order to train a new model, you need to collecto all your wav files under a common parent folder and give this path to `data_path` field in '''config.json''' - -You need to define other relevant parameters in your ```config.json``` and then start traning with the following command from Mozilla TTS root path, where '0' is the Id of the GPU you wish to use. - -```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --config_path path/to/config.json``` - -Exampled config files can be found under `vocoder/configs/` folder. - -You can continue a previous training by the following command. - -```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --continue_path path/to/your/model/folder``` - -You can fine-tune a pre-trained model by the following command. - -```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --restore_path path/to/your/model.pth.tar``` - -Restoring a model starts a new training in a different output folder. It only restores model weights with the given checkpoint file. 
However, continuing a training starts from the same conditions the previous training run left off. - -You can also follow your training runs on Tensorboard as you do with our TTS models. - -## Acknowledgement -Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work. diff --git a/vocoder/__init__.py b/vocoder/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vocoder/configs/multiband-melgan_and_rwd_config.json b/vocoder/configs/multiband-melgan_and_rwd_config.json deleted file mode 100644 index 0b751854..00000000 --- a/vocoder/configs/multiband-melgan_and_rwd_config.json +++ /dev/null @@ -1,151 +0,0 @@ -{ - "run_name": "multiband-melgan-rwd", - "run_description": "multiband melgan with random window discriminator from https://arxiv.org/pdf/1909.11646.pdf", - - // AUDIO PARAMETERS - "audio":{ - // stft parameters - "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. 
- "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // DISTRIBUTED TRAINING - // "distributed":{ - // "backend": "nccl", - // "url": "tcp:\/\/localhost:54321" - // }, - - // MODEL PARAMETERS - "use_pqmf": true, - - // LOSS PARAMETERS - "use_stft_loss": true, - "use_subband_stft_loss": true, - "use_mse_gan_loss": true, - "use_hinge_gan_loss": false, - "use_feat_match_loss": false, // use only with melgan discriminators - - // loss weights - "stft_loss_weight": 0.5, - "subband_stft_loss_weight": 0.5, - "mse_G_loss_weight": 2.5, - "hinge_G_loss_weight": 2.5, - "feat_match_loss_weight": 25, - - // multiscale stft loss parameters - "stft_loss_params": { - "n_ffts": [1024, 2048, 512], - "hop_lengths": [120, 240, 50], - "win_lengths": [600, 1200, 240] - }, - - // subband multiscale stft loss parameters - "subband_stft_loss_params":{ - "n_ffts": [384, 683, 171], - "hop_lengths": [30, 60, 10], - "win_lengths": [150, 300, 60] - }, - - "target_loss": "avg_G_loss", // 
loss value to pick the best model to save after each epoch - - // DISCRIMINATOR - "discriminator_model": "random_window_discriminator", - "discriminator_model_params":{ - "uncond_disc_donwsample_factors": [8, 4], - "cond_disc_downsample_factors": [[8, 4, 2, 2, 2], [8, 4, 2, 2], [8, 4, 2], [8, 4], [4, 2, 2]], - "cond_disc_out_channels": [[128, 128, 256, 256], [128, 256, 256], [128, 256], [256], [128, 256]], - "window_sizes": [512, 1024, 2048, 4096, 8192] - }, - "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 - - // GENERATOR - "generator_model": "multiband_melgan_generator", - "generator_model_params": { - "upsample_factors":[8, 4, 2], - "num_res_blocks": 4 - }, - - // DATASET - "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", - "seq_len": 16384, - "pad_short": 2000, - "conv_pad": 0, - "use_noise_augment": false, - "use_cache": true, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "warmup_steps_gen": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "warmup_steps_disc": 4000, - "epochs": 10000, // total number of epochs to train. - "wd": 0.0, // Weight decay weight. - "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 - "disc_clip_grad": -1, // Discriminator gradient clipping threshold. 
- "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_gen_params": { - "gamma": 0.5, - "milestones": [100000, 200000, 300000, 400000, 500000, 600000] - }, - "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_disc_params": { - "gamma": 0.5, - "milestones": [100000, 200000, 300000, 400000, 500000, 600000] - }, - "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_disc": 1e-4, - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. 
- "eval_split_size": 10, - - // PATHS - "output_path": "/home/erogol/Models/LJSpeech/" -} - diff --git a/vocoder/datasets/__init__.py b/vocoder/datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vocoder/datasets/gan_dataset.py b/vocoder/datasets/gan_dataset.py deleted file mode 100644 index af23fbf2..00000000 --- a/vocoder/datasets/gan_dataset.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -import glob -import torch -import random -import numpy as np -from torch.utils.data import Dataset -from multiprocessing import Manager - - -class GANDataset(Dataset): - """ - GAN Dataset searchs for all the wav files under root path - and converts them to acoustic features on the fly and returns - random segments of (audio, feature) couples. - """ - def __init__(self, - ap, - items, - seq_len, - hop_len, - pad_short, - conv_pad=2, - is_training=True, - return_segments=True, - use_noise_augment=False, - use_cache=False, - verbose=False): - - self.ap = ap - self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) - self.seq_len = seq_len - self.hop_len = hop_len - self.pad_short = pad_short - self.conv_pad = conv_pad - self.is_training = is_training - self.return_segments = return_segments - self.use_cache = use_cache - self.use_noise_augment = use_noise_augment - self.verbose = verbose - - assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." 
- self.feat_frame_len = seq_len // hop_len + (2 * conv_pad) - - # map G and D instances - self.G_to_D_mappings = list(range(len(self.item_list))) - self.shuffle_mapping() - - # cache acoustic features - if use_cache: - self.create_feature_cache() - - def create_feature_cache(self): - self.manager = Manager() - self.cache = self.manager.list() - self.cache += [None for _ in range(len(self.item_list))] - - @staticmethod - def find_wav_files(path): - return glob.glob(os.path.join(path, '**', '*.wav'), recursive=True) - - def __len__(self): - return len(self.item_list) - - def __getitem__(self, idx): - """ Return different items for Generator and Discriminator and - cache acoustic features """ - if self.return_segments: - idx2 = self.G_to_D_mappings[idx] - item1 = self.load_item(idx) - item2 = self.load_item(idx2) - return item1, item2 - item1 = self.load_item(idx) - return item1 - - def shuffle_mapping(self): - random.shuffle(self.G_to_D_mappings) - - def load_item(self, idx): - """ load (audio, feat) couple """ - if self.compute_feat: - # compute features from wav - wavpath = self.item_list[idx] - # print(wavpath) - - if self.use_cache and self.cache[idx] is not None: - audio, mel = self.cache[idx] - else: - audio = self.ap.load_wav(wavpath) - - if len(audio) < self.seq_len + self.pad_short: - audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ - mode='constant', constant_values=0.0) - - mel = self.ap.melspectrogram(audio) - else: - - # load precomputed features - wavpath, feat_path = self.item_list[idx] - - if self.use_cache and self.cache[idx] is not None: - audio, mel = self.cache[idx] - else: - audio = self.ap.load_wav(wavpath) - mel = np.load(feat_path) - - # correct the audio length wrt padding applied in stft - audio = np.pad(audio, (0, self.hop_len), mode="edge") - audio = audio[:mel.shape[-1] * self.hop_len] - assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}' - - audio = torch.from_numpy(audio).float().unsqueeze(0) - mel = torch.from_numpy(mel).float().squeeze(0) - - if self.return_segments: - max_mel_start = mel.shape[1] - self.feat_frame_len - mel_start = random.randint(0, max_mel_start) - mel_end = mel_start + self.feat_frame_len - mel = mel[:, mel_start:mel_end] - - audio_start = mel_start * self.hop_len - audio = audio[:, audio_start:audio_start + - self.seq_len] - - if self.use_noise_augment and self.is_training and self.return_segments: - audio = audio + (1 / 32768) * torch.randn_like(audio) - return (mel, audio) diff --git a/vocoder/datasets/preprocess.py b/vocoder/datasets/preprocess.py deleted file mode 100644 index be60c13a..00000000 --- a/vocoder/datasets/preprocess.py +++ /dev/null @@ -1,37 +0,0 @@ -import glob -import os -from pathlib import Path - -import numpy as np - - -def find_wav_files(data_path): - wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True) - return wav_paths - - -def find_feat_files(data_path): - feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True) - return feat_paths - - -def load_wav_data(data_path, eval_split_size): - wav_paths = find_wav_files(data_path) - np.random.seed(0) - np.random.shuffle(wav_paths) - return wav_paths[:eval_split_size], wav_paths[eval_split_size:] - - -def load_wav_feat_data(data_path, feat_path, eval_split_size): - wav_paths = sorted(find_wav_files(data_path)) - feat_paths = sorted(find_feat_files(feat_path)) - assert len(wav_paths) == len(feat_paths) - for wav, feat in zip(wav_paths, feat_paths): - wav_name = Path(wav).stem - feat_name = Path(feat).stem - assert wav_name == feat_name - - items = list(zip(wav_paths, feat_paths)) - np.random.seed(0) - np.random.shuffle(items) - return items[:eval_split_size], items[eval_split_size:] diff --git a/vocoder/layers/__init__.py b/vocoder/layers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/vocoder/layers/losses.py b/vocoder/layers/losses.py deleted file mode 100644 index 431f7f45..00000000 --- a/vocoder/layers/losses.py +++ /dev/null @@ -1,309 +0,0 @@ -import torch - -from torch import nn -from torch.nn import functional as F - - -class TorchSTFT(): - def __init__(self, n_fft, hop_length, win_length, window='hann_window'): - """ Torch based STFT operation """ - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.window = getattr(torch, window)(win_length) - - def __call__(self, x): - # B x D x T x 2 - o = torch.stft(x, - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) - - -################################# -# GENERATOR LOSSES -################################# - - -class STFTLoss(nn.Module): - """ Single scale STFT Loss """ - def __init__(self, n_fft, hop_length, win_length): - super(STFTLoss, self).__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.stft = TorchSTFT(n_fft, hop_length, win_length) - - def forward(self, y_hat, y): - y_hat_M = self.stft(y_hat) - y_M = self.stft(y) - # magnitude loss - loss_mag = F.l1_loss(torch.log(y_M), torch.log(y_hat_M)) - # spectral convergence loss - loss_sc = torch.norm(y_M - y_hat_M, p="fro") / torch.norm(y_M, p="fro") - return loss_mag, loss_sc - -class MultiScaleSTFTLoss(torch.nn.Module): - """ Multi scale STFT loss """ - def __init__(self, - n_ffts=(1024, 2048, 512), - hop_lengths=(120, 240, 50), - win_lengths=(600, 1200, 240)): - super(MultiScaleSTFTLoss, self).__init__() - self.loss_funcs = torch.nn.ModuleList() - for n_fft, hop_length, win_length in zip(n_ffts, hop_lengths, win_lengths): - self.loss_funcs.append(STFTLoss(n_fft, hop_length, win_length)) - - def forward(self, y_hat, y): - N = 
len(self.loss_funcs) - loss_sc = 0 - loss_mag = 0 - for f in self.loss_funcs: - lm, lsc = f(y_hat, y) - loss_mag += lm - loss_sc += lsc - loss_sc /= N - loss_mag /= N - return loss_mag, loss_sc - - -class MultiScaleSubbandSTFTLoss(MultiScaleSTFTLoss): - """ Multiscale STFT loss for multi band model outputs """ - # pylint: disable=no-self-use - def forward(self, y_hat, y): - y_hat = y_hat.view(-1, 1, y_hat.shape[2]) - y = y.view(-1, 1, y.shape[2]) - return super().forward(y_hat.squeeze(1), y.squeeze(1)) - - -class MSEGLoss(nn.Module): - """ Mean Squared Generator Loss """ - # pylint: disable=no-self-use - def forward(self, score_real): - loss_fake = F.mse_loss(score_real, score_real.new_ones(score_real.shape)) - return loss_fake - - -class HingeGLoss(nn.Module): - """ Hinge Discriminator Loss """ - # pylint: disable=no-self-use - def forward(self, score_real): - # TODO: this might be wrong - loss_fake = torch.mean(F.relu(1. - score_real)) - return loss_fake - - -################################## -# DISCRIMINATOR LOSSES -################################## - - -class MSEDLoss(nn.Module): - """ Mean Squared Discriminator Loss """ - def __init__(self,): - super(MSEDLoss, self).__init__() - self.loss_func = nn.MSELoss() - - # pylint: disable=no-self-use - def forward(self, score_fake, score_real): - loss_real = self.loss_func(score_real, score_real.new_ones(score_real.shape)) - loss_fake = self.loss_func(score_fake, score_fake.new_zeros(score_fake.shape)) - loss_d = loss_real + loss_fake - return loss_d, loss_real, loss_fake - - -class HingeDLoss(nn.Module): - """ Hinge Discriminator Loss """ - # pylint: disable=no-self-use - def forward(self, score_fake, score_real): - loss_real = torch.mean(F.relu(1. - score_real)) - loss_fake = torch.mean(F.relu(1. 
+ score_fake)) - loss_d = loss_real + loss_fake - return loss_d, loss_real, loss_fake - - -class MelganFeatureLoss(nn.Module): - def __init__(self,): - super(MelganFeatureLoss, self).__init__() - self.loss_func = nn.L1Loss() - - # pylint: disable=no-self-use - def forward(self, fake_feats, real_feats): - loss_feats = 0 - for fake_feat, real_feat in zip(fake_feats, real_feats): - loss_feats += self.loss_func(fake_feat, real_feat) - loss_feats /= len(fake_feats) + len(real_feats) - return loss_feats - - -##################################### -# LOSS WRAPPERS -##################################### - - -def _apply_G_adv_loss(scores_fake, loss_func): - """ Compute G adversarial loss function - and normalize values """ - adv_loss = 0 - if isinstance(scores_fake, list): - for score_fake in scores_fake: - fake_loss = loss_func(score_fake) - adv_loss += fake_loss - adv_loss /= len(scores_fake) - else: - fake_loss = loss_func(scores_fake) - adv_loss = fake_loss - return adv_loss - - -def _apply_D_loss(scores_fake, scores_real, loss_func): - """ Compute D loss func and normalize loss values """ - loss = 0 - real_loss = 0 - fake_loss = 0 - if isinstance(scores_fake, list): - # multi-scale loss - for score_fake, score_real in zip(scores_fake, scores_real): - total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) - loss += total_loss - real_loss += real_loss - fake_loss += fake_loss - # normalize loss values with number of scales - loss /= len(scores_fake) - real_loss /= len(scores_real) - fake_loss /= len(scores_fake) - else: - # single scale loss - total_loss, real_loss, fake_loss = loss_func(scores_fake, scores_real) - loss = total_loss - return loss, real_loss, fake_loss - - -################################## -# MODEL LOSSES -################################## - - -class GeneratorLoss(nn.Module): - def __init__(self, C): - """ Compute Generator Loss values depending on training - configuration """ - super(GeneratorLoss, self).__init__() - 
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\ - " [!] Cannot use HingeGANLoss and MSEGANLoss together." - - self.use_stft_loss = C.use_stft_loss - self.use_subband_stft_loss = C.use_subband_stft_loss - self.use_mse_gan_loss = C.use_mse_gan_loss - self.use_hinge_gan_loss = C.use_hinge_gan_loss - self.use_feat_match_loss = C.use_feat_match_loss - - self.stft_loss_weight = C.stft_loss_weight - self.subband_stft_loss_weight = C.subband_stft_loss_weight - self.mse_gan_loss_weight = C.mse_G_loss_weight - self.hinge_gan_loss_weight = C.hinge_G_loss_weight - self.feat_match_loss_weight = C.feat_match_loss_weight - - if C.use_stft_loss: - self.stft_loss = MultiScaleSTFTLoss(**C.stft_loss_params) - if C.use_subband_stft_loss: - self.subband_stft_loss = MultiScaleSubbandSTFTLoss(**C.subband_stft_loss_params) - if C.use_mse_gan_loss: - self.mse_loss = MSEGLoss() - if C.use_hinge_gan_loss: - self.hinge_loss = HingeGLoss() - if C.use_feat_match_loss: - self.feat_match_loss = MelganFeatureLoss() - - def forward(self, y_hat=None, y=None, scores_fake=None, feats_fake=None, feats_real=None, y_hat_sub=None, y_sub=None): - gen_loss = 0 - adv_loss = 0 - return_dict = {} - - # STFT Loss - if self.use_stft_loss: - stft_loss_mg, stft_loss_sc = self.stft_loss(y_hat.squeeze(1), y.squeeze(1)) - return_dict['G_stft_loss_mg'] = stft_loss_mg - return_dict['G_stft_loss_sc'] = stft_loss_sc - gen_loss += self.stft_loss_weight * (stft_loss_mg + stft_loss_sc) - - # subband STFT Loss - if self.use_subband_stft_loss: - subband_stft_loss_mg, subband_stft_loss_sc = self.subband_stft_loss(y_hat_sub, y_sub) - return_dict['G_subband_stft_loss_mg'] = subband_stft_loss_mg - return_dict['G_subband_stft_loss_sc'] = subband_stft_loss_sc - gen_loss += self.subband_stft_loss_weight * (subband_stft_loss_mg + subband_stft_loss_sc) - - # multiscale MSE adversarial loss - if self.use_mse_gan_loss and scores_fake is not None: - mse_fake_loss = _apply_G_adv_loss(scores_fake, self.mse_loss) - 
return_dict['G_mse_fake_loss'] = mse_fake_loss - adv_loss += self.mse_gan_loss_weight * mse_fake_loss - - # multiscale Hinge adversarial loss - if self.use_hinge_gan_loss and not scores_fake is not None: - hinge_fake_loss = _apply_G_adv_loss(scores_fake, self.hinge_loss) - return_dict['G_hinge_fake_loss'] = hinge_fake_loss - adv_loss += self.hinge_gan_loss_weight * hinge_fake_loss - - # Feature Matching Loss - if self.use_feat_match_loss and not feats_fake: - feat_match_loss = self.feat_match_loss(feats_fake, feats_real) - return_dict['G_feat_match_loss'] = feat_match_loss - adv_loss += self.feat_match_loss_weight * feat_match_loss - return_dict['G_loss'] = gen_loss + adv_loss - return_dict['G_gen_loss'] = gen_loss - return_dict['G_adv_loss'] = adv_loss - return return_dict - - -class DiscriminatorLoss(nn.Module): - """ Compute Discriminator Loss values depending on training - configuration """ - def __init__(self, C): - super(DiscriminatorLoss, self).__init__() - assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\ - " [!] Cannot use HingeGANLoss and MSEGANLoss together." 
- - self.use_mse_gan_loss = C.use_mse_gan_loss - self.use_hinge_gan_loss = C.use_hinge_gan_loss - - if C.use_mse_gan_loss: - self.mse_loss = MSEDLoss() - if C.use_hinge_gan_loss: - self.hinge_loss = HingeDLoss() - - def forward(self, scores_fake, scores_real): - loss = 0 - return_dict = {} - - if self.use_mse_gan_loss: - mse_D_loss, mse_D_real_loss, mse_D_fake_loss = _apply_D_loss( - scores_fake=scores_fake, - scores_real=scores_real, - loss_func=self.mse_loss) - return_dict['D_mse_gan_loss'] = mse_D_loss - return_dict['D_mse_gan_real_loss'] = mse_D_real_loss - return_dict['D_mse_gan_fake_loss'] = mse_D_fake_loss - loss += mse_D_loss - - if self.use_hinge_gan_loss: - hinge_D_loss, hinge_D_real_loss, hinge_D_fake_loss = _apply_D_loss( - scores_fake=scores_fake, - scores_real=scores_real, - loss_func=self.hinge_loss) - return_dict['D_hinge_gan_loss'] = hinge_D_loss - return_dict['D_hinge_gan_real_loss'] = hinge_D_real_loss - return_dict['D_hinge_gan_fake_loss'] = hinge_D_fake_loss - loss += hinge_D_loss - - return_dict['D_loss'] = loss - return return_dict \ No newline at end of file diff --git a/vocoder/layers/melgan.py b/vocoder/layers/melgan.py deleted file mode 100644 index 58c12a2e..00000000 --- a/vocoder/layers/melgan.py +++ /dev/null @@ -1,45 +0,0 @@ -from torch import nn -from torch.nn.utils import weight_norm - - -class ResidualStack(nn.Module): - def __init__(self, channels, num_res_blocks, kernel_size): - super(ResidualStack, self).__init__() - - assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 
- base_padding = (kernel_size - 1) // 2 - - self.blocks = nn.ModuleList() - for idx in range(num_res_blocks): - layer_kernel_size = kernel_size - layer_dilation = layer_kernel_size**idx - layer_padding = base_padding * layer_dilation - self.blocks += [nn.Sequential( - nn.LeakyReLU(0.2), - nn.ReflectionPad1d(layer_padding), - weight_norm( - nn.Conv1d(channels, - channels, - kernel_size=kernel_size, - dilation=layer_dilation, - bias=True)), - nn.LeakyReLU(0.2), - weight_norm( - nn.Conv1d(channels, channels, kernel_size=1, bias=True)), - )] - - self.shortcuts = nn.ModuleList([ - weight_norm(nn.Conv1d(channels, channels, kernel_size=1, - bias=True)) for i in range(num_res_blocks) - ]) - - def forward(self, x): - for block, shortcut in zip(self.blocks, self.shortcuts): - x = shortcut(x) + block(x) - return x - - def remove_weight_norm(self): - for block, shortcut in zip(self.blocks, self.shortcuts): - nn.utils.remove_weight_norm(block[2]) - nn.utils.remove_weight_norm(block[4]) - nn.utils.remove_weight_norm(shortcut) diff --git a/vocoder/layers/pqmf.py b/vocoder/layers/pqmf.py deleted file mode 100644 index ef5a3507..00000000 --- a/vocoder/layers/pqmf.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F - -from scipy import signal as sig - - -# adapted from -# https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan -class PQMF(torch.nn.Module): - def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): - super(PQMF, self).__init__() - - self.N = N - self.taps = taps - self.cutoff = cutoff - self.beta = beta - - QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta)) - H = np.zeros((N, len(QMF))) - G = np.zeros((N, len(QMF))) - for k in range(N): - constant_factor = (2 * k + 1) * (np.pi / - (2 * N)) * (np.arange(taps + 1) - - ((taps - 1) / 2)) - phase = (-1)**k * np.pi / 4 - H[k] = 2 * QMF * np.cos(constant_factor + phase) - - G[k] = 2 * QMF * np.cos(constant_factor - phase) - - H = 
torch.from_numpy(H[:, None, :]).float() - G = torch.from_numpy(G[None, :, :]).float() - - self.register_buffer("H", H) - self.register_buffer("G", G) - - updown_filter = torch.zeros((N, N, N)).float() - for k in range(N): - updown_filter[k, k, 0] = 1.0 - self.register_buffer("updown_filter", updown_filter) - self.N = N - - self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) - - def forward(self, x): - return self.analysis(x) - - def analysis(self, x): - return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N) - - def synthesis(self, x): - x = F.conv_transpose1d(x, - self.updown_filter * self.N, - stride=self.N) - x = F.conv1d(x, self.G, padding=self.taps // 2) - return x diff --git a/vocoder/layers/qmf.dat b/vocoder/layers/qmf.dat deleted file mode 100644 index 17eab137..00000000 --- a/vocoder/layers/qmf.dat +++ /dev/null @@ -1,640 +0,0 @@ - 0.0000000e+000 - -5.5252865e-004 - -5.6176926e-004 - -4.9475181e-004 - -4.8752280e-004 - -4.8937912e-004 - -5.0407143e-004 - -5.2265643e-004 - -5.4665656e-004 - -5.6778026e-004 - -5.8709305e-004 - -6.1327474e-004 - -6.3124935e-004 - -6.5403334e-004 - -6.7776908e-004 - -6.9416146e-004 - -7.1577365e-004 - -7.2550431e-004 - -7.4409419e-004 - -7.4905981e-004 - -7.6813719e-004 - -7.7248486e-004 - -7.8343323e-004 - -7.7798695e-004 - -7.8036647e-004 - -7.8014496e-004 - -7.7579773e-004 - -7.6307936e-004 - -7.5300014e-004 - -7.3193572e-004 - -7.2153920e-004 - -6.9179375e-004 - -6.6504151e-004 - -6.3415949e-004 - -5.9461189e-004 - -5.5645764e-004 - -5.1455722e-004 - -4.6063255e-004 - -4.0951215e-004 - -3.5011759e-004 - -2.8969812e-004 - -2.0983373e-004 - -1.4463809e-004 - -6.1733441e-005 - 1.3494974e-005 - 1.0943831e-004 - 2.0430171e-004 - 2.9495311e-004 - 4.0265402e-004 - 5.1073885e-004 - 6.2393761e-004 - 7.4580259e-004 - 8.6084433e-004 - 9.8859883e-004 - 1.1250155e-003 - 1.2577885e-003 - 1.3902495e-003 - 1.5443220e-003 - 1.6868083e-003 - 1.8348265e-003 - 1.9841141e-003 - 2.1461584e-003 - 2.3017255e-003 - 2.4625617e-003 - 
2.6201759e-003 - 2.7870464e-003 - 2.9469448e-003 - 3.1125421e-003 - 3.2739613e-003 - 3.4418874e-003 - 3.6008268e-003 - 3.7603923e-003 - 3.9207432e-003 - 4.0819753e-003 - 4.2264269e-003 - 4.3730720e-003 - 4.5209853e-003 - 4.6606461e-003 - 4.7932561e-003 - 4.9137604e-003 - 5.0393023e-003 - 5.1407354e-003 - 5.2461166e-003 - 5.3471681e-003 - 5.4196776e-003 - 5.4876040e-003 - 5.5475715e-003 - 5.5938023e-003 - 5.6220643e-003 - 5.6455197e-003 - 5.6389200e-003 - 5.6266114e-003 - 5.5917129e-003 - 5.5404364e-003 - 5.4753783e-003 - 5.3838976e-003 - 5.2715759e-003 - 5.1382275e-003 - 4.9839688e-003 - 4.8109469e-003 - 4.6039530e-003 - 4.3801862e-003 - 4.1251642e-003 - 3.8456408e-003 - 3.5401247e-003 - 3.2091886e-003 - 2.8446758e-003 - 2.4508540e-003 - 2.0274176e-003 - 1.5784683e-003 - 1.0902329e-003 - 5.8322642e-004 - 2.7604519e-005 - -5.4642809e-004 - -1.1568136e-003 - -1.8039473e-003 - -2.4826724e-003 - -3.1933778e-003 - -3.9401124e-003 - -4.7222596e-003 - -5.5337211e-003 - -6.3792293e-003 - -7.2615817e-003 - -8.1798233e-003 - -9.1325330e-003 - -1.0115022e-002 - -1.1131555e-002 - -1.2185000e-002 - -1.3271822e-002 - -1.4390467e-002 - -1.5540555e-002 - -1.6732471e-002 - -1.7943338e-002 - -1.9187243e-002 - -2.0453179e-002 - -2.1746755e-002 - -2.3068017e-002 - -2.4416099e-002 - -2.5787585e-002 - -2.7185943e-002 - -2.8607217e-002 - -3.0050266e-002 - -3.1501761e-002 - -3.2975408e-002 - -3.4462095e-002 - -3.5969756e-002 - -3.7481285e-002 - -3.9005368e-002 - -4.0534917e-002 - -4.2064909e-002 - -4.3609754e-002 - -4.5148841e-002 - -4.6684303e-002 - -4.8216572e-002 - -4.9738576e-002 - -5.1255616e-002 - -5.2763075e-002 - -5.4245277e-002 - -5.5717365e-002 - -5.7161645e-002 - -5.8591568e-002 - -5.9983748e-002 - -6.1345517e-002 - -6.2685781e-002 - -6.3971590e-002 - -6.5224711e-002 - -6.6436751e-002 - -6.7607599e-002 - -6.8704383e-002 - -6.9763024e-002 - -7.0762871e-002 - -7.1700267e-002 - -7.2568258e-002 - -7.3362026e-002 - -7.4100364e-002 - -7.4745256e-002 - -7.5313734e-002 - 
-7.5800836e-002 - -7.6199248e-002 - -7.6499217e-002 - -7.6709349e-002 - -7.6817398e-002 - -7.6823001e-002 - -7.6720492e-002 - -7.6505072e-002 - -7.6174832e-002 - -7.5730576e-002 - -7.5157626e-002 - -7.4466439e-002 - -7.3640601e-002 - -7.2677464e-002 - -7.1582636e-002 - -7.0353307e-002 - -6.8966401e-002 - -6.7452502e-002 - -6.5769067e-002 - -6.3944481e-002 - -6.1960278e-002 - -5.9816657e-002 - -5.7515269e-002 - -5.5046003e-002 - -5.2409382e-002 - -4.9597868e-002 - -4.6630331e-002 - -4.3476878e-002 - -4.0145828e-002 - -3.6641812e-002 - -3.2958393e-002 - -2.9082401e-002 - -2.5030756e-002 - -2.0799707e-002 - -1.6370126e-002 - -1.1762383e-002 - -6.9636862e-003 - -1.9765601e-003 - 3.2086897e-003 - 8.5711749e-003 - 1.4128883e-002 - 1.9883413e-002 - 2.5822729e-002 - 3.1953127e-002 - 3.8277657e-002 - 4.4780682e-002 - 5.1480418e-002 - 5.8370533e-002 - 6.5440985e-002 - 7.2694330e-002 - 8.0137293e-002 - 8.7754754e-002 - 9.5553335e-002 - 1.0353295e-001 - 1.1168269e-001 - 1.2000780e-001 - 1.2850029e-001 - 1.3715518e-001 - 1.4597665e-001 - 1.5496071e-001 - 1.6409589e-001 - 1.7338082e-001 - 1.8281725e-001 - 1.9239667e-001 - 2.0212502e-001 - 2.1197359e-001 - 2.2196527e-001 - 2.3206909e-001 - 2.4230169e-001 - 2.5264803e-001 - 2.6310533e-001 - 2.7366340e-001 - 2.8432142e-001 - 2.9507167e-001 - 3.0590986e-001 - 3.1682789e-001 - 3.2781137e-001 - 3.3887227e-001 - 3.4999141e-001 - 3.6115899e-001 - 3.7237955e-001 - 3.8363500e-001 - 3.9492118e-001 - 4.0623177e-001 - 4.1756969e-001 - 4.2891199e-001 - 4.4025538e-001 - 4.5159965e-001 - 4.6293081e-001 - 4.7424532e-001 - 4.8552531e-001 - 4.9677083e-001 - 5.0798175e-001 - 5.1912350e-001 - 5.3022409e-001 - 5.4125534e-001 - 5.5220513e-001 - 5.6307891e-001 - 5.7385241e-001 - 5.8454032e-001 - 5.9511231e-001 - 6.0557835e-001 - 6.1591099e-001 - 6.2612427e-001 - 6.3619801e-001 - 6.4612697e-001 - 6.5590163e-001 - 6.6551399e-001 - 6.7496632e-001 - 6.8423533e-001 - 6.9332824e-001 - 7.0223887e-001 - 7.1094104e-001 - 7.1944626e-001 - 7.2774489e-001 - 
7.3582118e-001 - 7.4368279e-001 - 7.5131375e-001 - 7.5870808e-001 - 7.6586749e-001 - 7.7277809e-001 - 7.7942875e-001 - 7.8583531e-001 - 7.9197358e-001 - 7.9784664e-001 - 8.0344858e-001 - 8.0876950e-001 - 8.1381913e-001 - 8.1857760e-001 - 8.2304199e-001 - 8.2722753e-001 - 8.3110385e-001 - 8.3469374e-001 - 8.3797173e-001 - 8.4095414e-001 - 8.4362383e-001 - 8.4598185e-001 - 8.4803158e-001 - 8.4978052e-001 - 8.5119715e-001 - 8.5230470e-001 - 8.5310209e-001 - 8.5357206e-001 - 8.5373856e-001 - 8.5357206e-001 - 8.5310209e-001 - 8.5230470e-001 - 8.5119715e-001 - 8.4978052e-001 - 8.4803158e-001 - 8.4598185e-001 - 8.4362383e-001 - 8.4095414e-001 - 8.3797173e-001 - 8.3469374e-001 - 8.3110385e-001 - 8.2722753e-001 - 8.2304199e-001 - 8.1857760e-001 - 8.1381913e-001 - 8.0876950e-001 - 8.0344858e-001 - 7.9784664e-001 - 7.9197358e-001 - 7.8583531e-001 - 7.7942875e-001 - 7.7277809e-001 - 7.6586749e-001 - 7.5870808e-001 - 7.5131375e-001 - 7.4368279e-001 - 7.3582118e-001 - 7.2774489e-001 - 7.1944626e-001 - 7.1094104e-001 - 7.0223887e-001 - 6.9332824e-001 - 6.8423533e-001 - 6.7496632e-001 - 6.6551399e-001 - 6.5590163e-001 - 6.4612697e-001 - 6.3619801e-001 - 6.2612427e-001 - 6.1591099e-001 - 6.0557835e-001 - 5.9511231e-001 - 5.8454032e-001 - 5.7385241e-001 - 5.6307891e-001 - 5.5220513e-001 - 5.4125534e-001 - 5.3022409e-001 - 5.1912350e-001 - 5.0798175e-001 - 4.9677083e-001 - 4.8552531e-001 - 4.7424532e-001 - 4.6293081e-001 - 4.5159965e-001 - 4.4025538e-001 - 4.2891199e-001 - 4.1756969e-001 - 4.0623177e-001 - 3.9492118e-001 - 3.8363500e-001 - 3.7237955e-001 - 3.6115899e-001 - 3.4999141e-001 - 3.3887227e-001 - 3.2781137e-001 - 3.1682789e-001 - 3.0590986e-001 - 2.9507167e-001 - 2.8432142e-001 - 2.7366340e-001 - 2.6310533e-001 - 2.5264803e-001 - 2.4230169e-001 - 2.3206909e-001 - 2.2196527e-001 - 2.1197359e-001 - 2.0212502e-001 - 1.9239667e-001 - 1.8281725e-001 - 1.7338082e-001 - 1.6409589e-001 - 1.5496071e-001 - 1.4597665e-001 - 1.3715518e-001 - 1.2850029e-001 - 1.2000780e-001 - 
1.1168269e-001 - 1.0353295e-001 - 9.5553335e-002 - 8.7754754e-002 - 8.0137293e-002 - 7.2694330e-002 - 6.5440985e-002 - 5.8370533e-002 - 5.1480418e-002 - 4.4780682e-002 - 3.8277657e-002 - 3.1953127e-002 - 2.5822729e-002 - 1.9883413e-002 - 1.4128883e-002 - 8.5711749e-003 - 3.2086897e-003 - -1.9765601e-003 - -6.9636862e-003 - -1.1762383e-002 - -1.6370126e-002 - -2.0799707e-002 - -2.5030756e-002 - -2.9082401e-002 - -3.2958393e-002 - -3.6641812e-002 - -4.0145828e-002 - -4.3476878e-002 - -4.6630331e-002 - -4.9597868e-002 - -5.2409382e-002 - -5.5046003e-002 - -5.7515269e-002 - -5.9816657e-002 - -6.1960278e-002 - -6.3944481e-002 - -6.5769067e-002 - -6.7452502e-002 - -6.8966401e-002 - -7.0353307e-002 - -7.1582636e-002 - -7.2677464e-002 - -7.3640601e-002 - -7.4466439e-002 - -7.5157626e-002 - -7.5730576e-002 - -7.6174832e-002 - -7.6505072e-002 - -7.6720492e-002 - -7.6823001e-002 - -7.6817398e-002 - -7.6709349e-002 - -7.6499217e-002 - -7.6199248e-002 - -7.5800836e-002 - -7.5313734e-002 - -7.4745256e-002 - -7.4100364e-002 - -7.3362026e-002 - -7.2568258e-002 - -7.1700267e-002 - -7.0762871e-002 - -6.9763024e-002 - -6.8704383e-002 - -6.7607599e-002 - -6.6436751e-002 - -6.5224711e-002 - -6.3971590e-002 - -6.2685781e-002 - -6.1345517e-002 - -5.9983748e-002 - -5.8591568e-002 - -5.7161645e-002 - -5.5717365e-002 - -5.4245277e-002 - -5.2763075e-002 - -5.1255616e-002 - -4.9738576e-002 - -4.8216572e-002 - -4.6684303e-002 - -4.5148841e-002 - -4.3609754e-002 - -4.2064909e-002 - -4.0534917e-002 - -3.9005368e-002 - -3.7481285e-002 - -3.5969756e-002 - -3.4462095e-002 - -3.2975408e-002 - -3.1501761e-002 - -3.0050266e-002 - -2.8607217e-002 - -2.7185943e-002 - -2.5787585e-002 - -2.4416099e-002 - -2.3068017e-002 - -2.1746755e-002 - -2.0453179e-002 - -1.9187243e-002 - -1.7943338e-002 - -1.6732471e-002 - -1.5540555e-002 - -1.4390467e-002 - -1.3271822e-002 - -1.2185000e-002 - -1.1131555e-002 - -1.0115022e-002 - -9.1325330e-003 - -8.1798233e-003 - -7.2615817e-003 - -6.3792293e-003 - -5.5337211e-003 - 
-4.7222596e-003 - -3.9401124e-003 - -3.1933778e-003 - -2.4826724e-003 - -1.8039473e-003 - -1.1568136e-003 - -5.4642809e-004 - 2.7604519e-005 - 5.8322642e-004 - 1.0902329e-003 - 1.5784683e-003 - 2.0274176e-003 - 2.4508540e-003 - 2.8446758e-003 - 3.2091886e-003 - 3.5401247e-003 - 3.8456408e-003 - 4.1251642e-003 - 4.3801862e-003 - 4.6039530e-003 - 4.8109469e-003 - 4.9839688e-003 - 5.1382275e-003 - 5.2715759e-003 - 5.3838976e-003 - 5.4753783e-003 - 5.5404364e-003 - 5.5917129e-003 - 5.6266114e-003 - 5.6389200e-003 - 5.6455197e-003 - 5.6220643e-003 - 5.5938023e-003 - 5.5475715e-003 - 5.4876040e-003 - 5.4196776e-003 - 5.3471681e-003 - 5.2461166e-003 - 5.1407354e-003 - 5.0393023e-003 - 4.9137604e-003 - 4.7932561e-003 - 4.6606461e-003 - 4.5209853e-003 - 4.3730720e-003 - 4.2264269e-003 - 4.0819753e-003 - 3.9207432e-003 - 3.7603923e-003 - 3.6008268e-003 - 3.4418874e-003 - 3.2739613e-003 - 3.1125421e-003 - 2.9469448e-003 - 2.7870464e-003 - 2.6201759e-003 - 2.4625617e-003 - 2.3017255e-003 - 2.1461584e-003 - 1.9841141e-003 - 1.8348265e-003 - 1.6868083e-003 - 1.5443220e-003 - 1.3902495e-003 - 1.2577885e-003 - 1.1250155e-003 - 9.8859883e-004 - 8.6084433e-004 - 7.4580259e-004 - 6.2393761e-004 - 5.1073885e-004 - 4.0265402e-004 - 2.9495311e-004 - 2.0430171e-004 - 1.0943831e-004 - 1.3494974e-005 - -6.1733441e-005 - -1.4463809e-004 - -2.0983373e-004 - -2.8969812e-004 - -3.5011759e-004 - -4.0951215e-004 - -4.6063255e-004 - -5.1455722e-004 - -5.5645764e-004 - -5.9461189e-004 - -6.3415949e-004 - -6.6504151e-004 - -6.9179375e-004 - -7.2153920e-004 - -7.3193572e-004 - -7.5300014e-004 - -7.6307936e-004 - -7.7579773e-004 - -7.8014496e-004 - -7.8036647e-004 - -7.7798695e-004 - -7.8343323e-004 - -7.7248486e-004 - -7.6813719e-004 - -7.4905981e-004 - -7.4409419e-004 - -7.2550431e-004 - -7.1577365e-004 - -6.9416146e-004 - -6.7776908e-004 - -6.5403334e-004 - -6.3124935e-004 - -6.1327474e-004 - -5.8709305e-004 - -5.6778026e-004 - -5.4665656e-004 - -5.2265643e-004 - -5.0407143e-004 - -4.8937912e-004 
- -4.8752280e-004 - -4.9475181e-004 - -5.6176926e-004 - -5.5252865e-004 diff --git a/vocoder/models/__init__.py b/vocoder/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vocoder/models/melgan_discriminator.py b/vocoder/models/melgan_discriminator.py deleted file mode 100644 index 3847babb..00000000 --- a/vocoder/models/melgan_discriminator.py +++ /dev/null @@ -1,78 +0,0 @@ -import numpy as np -from torch import nn -from torch.nn.utils import weight_norm - - -class MelganDiscriminator(nn.Module): - def __init__(self, - in_channels=1, - out_channels=1, - kernel_sizes=(5, 3), - base_channels=16, - max_channels=1024, - downsample_factors=(4, 4, 4, 4)): - super(MelganDiscriminator, self).__init__() - self.layers = nn.ModuleList() - - layer_kernel_size = np.prod(kernel_sizes) - layer_padding = (layer_kernel_size - 1) // 2 - - # initial layer - self.layers += [ - nn.Sequential( - nn.ReflectionPad1d(layer_padding), - weight_norm( - nn.Conv1d(in_channels, - base_channels, - layer_kernel_size, - stride=1)), nn.LeakyReLU(0.2, inplace=True)) - ] - - # downsampling layers - layer_in_channels = base_channels - for downsample_factor in downsample_factors: - layer_out_channels = min(layer_in_channels * downsample_factor, - max_channels) - layer_kernel_size = downsample_factor * 10 + 1 - layer_padding = (layer_kernel_size - 1) // 2 - layer_groups = layer_in_channels // 4 - self.layers += [ - nn.Sequential( - weight_norm( - nn.Conv1d(layer_in_channels, - layer_out_channels, - kernel_size=layer_kernel_size, - stride=downsample_factor, - padding=layer_padding, - groups=layer_groups)), - nn.LeakyReLU(0.2, inplace=True)) - ] - layer_in_channels = layer_out_channels - - # last 2 layers - layer_padding1 = (kernel_sizes[0] - 1) // 2 - layer_padding2 = (kernel_sizes[1] - 1) // 2 - self.layers += [ - nn.Sequential( - weight_norm( - nn.Conv1d(layer_out_channels, - layer_out_channels, - kernel_size=kernel_sizes[0], - stride=1, - padding=layer_padding1)), - 
nn.LeakyReLU(0.2, inplace=True), - ), - weight_norm( - nn.Conv1d(layer_out_channels, - out_channels, - kernel_size=kernel_sizes[1], - stride=1, - padding=layer_padding2)), - ] - - def forward(self, x): - feats = [] - for layer in self.layers: - x = layer(x) - feats.append(x) - return x, feats diff --git a/vocoder/models/melgan_generator.py b/vocoder/models/melgan_generator.py deleted file mode 100644 index 01b52ea8..00000000 --- a/vocoder/models/melgan_generator.py +++ /dev/null @@ -1,98 +0,0 @@ -import torch -from torch import nn -from torch.nn.utils import weight_norm - -from TTS.vocoder.layers.melgan import ResidualStack - - -class MelganGenerator(nn.Module): - def __init__(self, - in_channels=80, - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=(8, 8, 2, 2), - res_kernel=3, - num_res_blocks=3): - super(MelganGenerator, self).__init__() - - # assert model parameters - assert (proj_kernel - - 1) % 2 == 0, " [!] proj_kernel should be an odd number." - - # setup additional model parameters - base_padding = (proj_kernel - 1) // 2 - act_slope = 0.2 - self.inference_padding = 2 - - # initial layer - layers = [] - layers += [ - nn.ReflectionPad1d(base_padding), - weight_norm( - nn.Conv1d(in_channels, - base_channels, - kernel_size=proj_kernel, - stride=1, - bias=True)) - ] - - # upsampling layers and residual stacks - for idx, upsample_factor in enumerate(upsample_factors): - layer_in_channels = base_channels // (2**idx) - layer_out_channels = base_channels // (2**(idx + 1)) - layer_filter_size = upsample_factor * 2 - layer_stride = upsample_factor - layer_output_padding = upsample_factor % 2 - layer_padding = upsample_factor // 2 + layer_output_padding - layers += [ - nn.LeakyReLU(act_slope), - weight_norm( - nn.ConvTranspose1d(layer_in_channels, - layer_out_channels, - layer_filter_size, - stride=layer_stride, - padding=layer_padding, - output_padding=layer_output_padding, - bias=True)), - ResidualStack( - channels=layer_out_channels, - 
num_res_blocks=num_res_blocks, - kernel_size=res_kernel - ) - ] - - layers += [nn.LeakyReLU(act_slope)] - - # final layer - layers += [ - nn.ReflectionPad1d(base_padding), - weight_norm( - nn.Conv1d(layer_out_channels, - out_channels, - proj_kernel, - stride=1, - bias=True)), - nn.Tanh() - ] - self.layers = nn.Sequential(*layers) - - def forward(self, c): - return self.layers(c) - - def inference(self, c): - c = c.to(self.layers[1].weight.device) - c = torch.nn.functional.pad( - c, - (self.inference_padding, self.inference_padding), - 'replicate') - return self.layers(c) - - def remove_weight_norm(self): - for _, layer in enumerate(self.layers): - if len(layer.state_dict()) != 0: - try: - nn.utils.remove_weight_norm(layer) - except ValueError: - layer.remove_weight_norm() - diff --git a/vocoder/models/melgan_multiscale_discriminator.py b/vocoder/models/melgan_multiscale_discriminator.py deleted file mode 100644 index dbcc1f30..00000000 --- a/vocoder/models/melgan_multiscale_discriminator.py +++ /dev/null @@ -1,41 +0,0 @@ -from torch import nn - -from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator - - -class MelganMultiscaleDiscriminator(nn.Module): - def __init__(self, - in_channels=1, - out_channels=1, - num_scales=3, - kernel_sizes=(5, 3), - base_channels=16, - max_channels=1024, - downsample_factors=(4, 4, 4), - pooling_kernel_size=4, - pooling_stride=2, - pooling_padding=1): - super(MelganMultiscaleDiscriminator, self).__init__() - - self.discriminators = nn.ModuleList([ - MelganDiscriminator(in_channels=in_channels, - out_channels=out_channels, - kernel_sizes=kernel_sizes, - base_channels=base_channels, - max_channels=max_channels, - downsample_factors=downsample_factors) - for _ in range(num_scales) - ]) - - self.pooling = nn.AvgPool1d(kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False) - - - def forward(self, x): - scores = list() - feats = list() - for disc in self.discriminators: - 
score, feat = disc(x) - scores.append(score) - feats.append(feat) - x = self.pooling(x) - return scores, feats \ No newline at end of file diff --git a/vocoder/models/multiband_melgan_generator.py b/vocoder/models/multiband_melgan_generator.py deleted file mode 100644 index 15e7426e..00000000 --- a/vocoder/models/multiband_melgan_generator.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch - -from TTS.vocoder.models.melgan_generator import MelganGenerator -from TTS.vocoder.layers.pqmf import PQMF - - -class MultibandMelganGenerator(MelganGenerator): - def __init__(self, - in_channels=80, - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=(2, 8, 2, 2), - res_kernel=3, - num_res_blocks=3): - super(MultibandMelganGenerator, - self).__init__(in_channels=in_channels, - out_channels=out_channels, - proj_kernel=proj_kernel, - base_channels=base_channels, - upsample_factors=upsample_factors, - res_kernel=res_kernel, - num_res_blocks=num_res_blocks) - self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) - - def pqmf_analysis(self, x): - return self.pqmf_layer.analysis(x) - - def pqmf_synthesis(self, x): - return self.pqmf_layer.synthesis(x) - - @torch.no_grad() - def inference(self, cond_features): - cond_features = cond_features.to(self.layers[1].weight.device) - cond_features = torch.nn.functional.pad( - cond_features, - (self.inference_padding, self.inference_padding), - 'replicate') - return self.pqmf_synthesis(self.layers(cond_features)) diff --git a/vocoder/models/random_window_discriminator.py b/vocoder/models/random_window_discriminator.py deleted file mode 100644 index 3efd395e..00000000 --- a/vocoder/models/random_window_discriminator.py +++ /dev/null @@ -1,225 +0,0 @@ -import numpy as np -from torch import nn - - -class GBlock(nn.Module): - def __init__(self, in_channels, cond_channels, downsample_factor): - super(GBlock, self).__init__() - - self.in_channels = in_channels - self.cond_channels = cond_channels - self.downsample_factor = 
downsample_factor - - self.start = nn.Sequential( - nn.AvgPool1d(downsample_factor, stride=downsample_factor), - nn.ReLU(), - nn.Conv1d(in_channels, in_channels * 2, kernel_size=3, padding=1)) - self.lc_conv1d = nn.Conv1d(cond_channels, - in_channels * 2, - kernel_size=1) - self.end = nn.Sequential( - nn.ReLU(), - nn.Conv1d(in_channels * 2, - in_channels * 2, - kernel_size=3, - dilation=2, - padding=2)) - self.residual = nn.Sequential( - nn.Conv1d(in_channels, in_channels * 2, kernel_size=1), - nn.AvgPool1d(downsample_factor, stride=downsample_factor)) - - def forward(self, inputs, conditions): - outputs = self.start(inputs) + self.lc_conv1d(conditions) - outputs = self.end(outputs) - residual_outputs = self.residual(inputs) - outputs = outputs + residual_outputs - - return outputs - - -class DBlock(nn.Module): - def __init__(self, in_channels, out_channels, downsample_factor): - super(DBlock, self).__init__() - - self.in_channels = in_channels - self.downsample_factor = downsample_factor - self.out_channels = out_channels - - self.donwsample_layer = nn.AvgPool1d(downsample_factor, - stride=downsample_factor) - self.layers = nn.Sequential( - nn.ReLU(), - nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1), - nn.ReLU(), - nn.Conv1d(out_channels, - out_channels, - kernel_size=3, - dilation=2, - padding=2)) - self.residual = nn.Sequential( - nn.Conv1d(in_channels, out_channels, kernel_size=1), ) - - def forward(self, inputs): - if self.downsample_factor > 1: - outputs = self.layers(self.donwsample_layer(inputs))\ - + self.donwsample_layer(self.residual(inputs)) - else: - outputs = self.layers(inputs) + self.residual(inputs) - return outputs - - -class ConditionalDiscriminator(nn.Module): - def __init__(self, - in_channels, - cond_channels, - downsample_factors=(2, 2, 2), - out_channels=(128, 256)): - super(ConditionalDiscriminator, self).__init__() - - assert len(downsample_factors) == len(out_channels) + 1 - - self.in_channels = in_channels - 
self.cond_channels = cond_channels - self.downsample_factors = downsample_factors - self.out_channels = out_channels - - self.pre_cond_layers = nn.ModuleList() - self.post_cond_layers = nn.ModuleList() - - # layers before condition features - self.pre_cond_layers += [DBlock(in_channels, 64, 1)] - in_channels = 64 - for (i, channel) in enumerate(out_channels): - self.pre_cond_layers.append( - DBlock(in_channels, channel, downsample_factors[i])) - in_channels = channel - - # condition block - self.cond_block = GBlock(in_channels, cond_channels, - downsample_factors[-1]) - - # layers after condition block - self.post_cond_layers += [ - DBlock(in_channels * 2, in_channels * 2, 1), - DBlock(in_channels * 2, in_channels * 2, 1), - nn.AdaptiveAvgPool1d(1), - nn.Conv1d(in_channels * 2, 1, kernel_size=1), - ] - - def forward(self, inputs, conditions): - batch_size = inputs.size()[0] - outputs = inputs.view(batch_size, self.in_channels, -1) - for layer in self.pre_cond_layers: - outputs = layer(outputs) - outputs = self.cond_block(outputs, conditions) - for layer in self.post_cond_layers: - outputs = layer(outputs) - - return outputs - - -class UnconditionalDiscriminator(nn.Module): - def __init__(self, - in_channels, - base_channels=64, - downsample_factors=(8, 4), - out_channels=(128, 256)): - super(UnconditionalDiscriminator, self).__init__() - - self.downsample_factors = downsample_factors - self.in_channels = in_channels - self.downsample_factors = downsample_factors - self.out_channels = out_channels - - self.layers = nn.ModuleList() - self.layers += [DBlock(self.in_channels, base_channels, 1)] - in_channels = base_channels - for (i, factor) in enumerate(downsample_factors): - self.layers.append(DBlock(in_channels, out_channels[i], factor)) - in_channels *= 2 - self.layers += [ - DBlock(in_channels, in_channels, 1), - DBlock(in_channels, in_channels, 1), - nn.AdaptiveAvgPool1d(1), - nn.Conv1d(in_channels, 1, kernel_size=1), - ] - - def forward(self, inputs): - 
batch_size = inputs.size()[0] - outputs = inputs.view(batch_size, self.in_channels, -1) - for layer in self.layers: - outputs = layer(outputs) - return outputs - - -class RandomWindowDiscriminator(nn.Module): - """Random Window Discriminator as described in - http://arxiv.org/abs/1909.11646""" - def __init__(self, - cond_channels, - hop_length, - uncond_disc_donwsample_factors=(8, 4), - cond_disc_downsample_factors=((8, 4, 2, 2, 2), (8, 4, 2, 2), - (8, 4, 2), (8, 4), (4, 2, 2)), - cond_disc_out_channels=((128, 128, 256, 256), (128, 256, 256), - (128, 256), (256, ), (128, 256)), - window_sizes=(512, 1024, 2048, 4096, 8192)): - - super(RandomWindowDiscriminator, self).__init__() - self.cond_channels = cond_channels - self.window_sizes = window_sizes - self.hop_length = hop_length - self.base_window_size = self.hop_length * 2 - self.ks = [ws // self.base_window_size for ws in window_sizes] - - # check arguments - assert len(cond_disc_downsample_factors) == len( - cond_disc_out_channels) == len(window_sizes) - for ws in window_sizes: - assert ws % hop_length == 0 - - for idx, cf in enumerate(cond_disc_downsample_factors): - assert np.prod(cf) == hop_length // self.ks[idx] - - # define layers - self.unconditional_discriminators = nn.ModuleList([]) - for k in self.ks: - layer = UnconditionalDiscriminator( - in_channels=k, - base_channels=64, - downsample_factors=uncond_disc_donwsample_factors) - self.unconditional_discriminators.append(layer) - - self.conditional_discriminators = nn.ModuleList([]) - for idx, k in enumerate(self.ks): - layer = ConditionalDiscriminator( - in_channels=k, - cond_channels=cond_channels, - downsample_factors=cond_disc_downsample_factors[idx], - out_channels=cond_disc_out_channels[idx]) - self.conditional_discriminators.append(layer) - - def forward(self, x, c): - scores = [] - feats = [] - # unconditional pass - for (window_size, layer) in zip(self.window_sizes, - self.unconditional_discriminators): - index = np.random.randint(x.shape[-1] - 
window_size) - - score = layer(x[:, :, index:index + window_size]) - scores.append(score) - - # conditional pass - for (window_size, layer) in zip(self.window_sizes, - self.conditional_discriminators): - frame_size = window_size // self.hop_length - lc_index = np.random.randint(c.shape[-1] - frame_size) - sample_index = lc_index * self.hop_length - x_sub = x[:, :, - sample_index:(lc_index + frame_size) * self.hop_length] - c_sub = c[:, :, lc_index:lc_index + frame_size] - - score = layer(x_sub, c_sub) - scores.append(score) - return scores, feats diff --git a/vocoder/notebooks/Untitled.ipynb b/vocoder/notebooks/Untitled.ipynb deleted file mode 100644 index ce49d6fa..00000000 --- a/vocoder/notebooks/Untitled.ipynb +++ /dev/null @@ -1,678 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "#function example with several unknowns (variables) for optimization\n", - "#Gerald Schuller, Nov. 2016\n", - "import numpy as np\n", - "\n", - "def functionexamp(x):\n", - " #x: array with 2 variables\n", - " \n", - " y=np.sin(x[0])+np.cos(x[1])\n", - " return y" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " fun: -1.9999999999888387\n", - " jac: array([4.7236681e-06, 0.0000000e+00])\n", - " message: 'Optimization terminated successfully.'\n", - " nfev: 12\n", - " nit: 2\n", - " njev: 3\n", - " status: 0\n", - " success: True\n", - " x: array([-1.5707916 , -3.14159265])\n" - ] - } - ], - "source": [ - "#Optimization example, see also:\n", - "#https://docs.scipy.org/doc/scipy-0.18.1/reference/optimize.html\n", - "#Gerald Schuller, Nov. 
2016\n", - "#run it with \"python optimizationExample.py\" in a termina shell\n", - "#or type \"ipython\" in a termina shell and copy lines below:\n", - "\n", - "import numpy as np\n", - "import scipy.optimize as optimize\n", - "\n", - "#Example for 2 unknowns, args: function-name, starting point, method:\n", - "xmin = optimize.minimize(functionexamp, [-1.0, -3.0], method='CG')\n", - "print(xmin)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "function [p,passedge] = opt_filter(filtorder,N)\n", - "\n", - "% opt_filter Create Lowpass Prototype Filter for the Pseudo-QMF \n", - "% Filter Bank with N Subbands\n", - "%\n", - "% Adapted from the paper by C. D. Creusere and S. K. Mitra, titled \n", - "% \"A simple method for designing high-quality prototype filters for \n", - "% M-band pseudo-QMF banks,\" IEEE Trans. Signal Processing,vol. 43, \n", - "% pp. 1005-1007, Apr. 1995 and the book by S. K. 
Mitra titled \"\n", - "% Digital Signal Processing: A Computer-Based Approach, McGraw-Hill, 2001\n", - "%\n", - "% Arguments:\n", - "% filtorder Filter order (i.e., filter length - 1)\n", - "% N Number of subbands\n", - "\n", - "stopedge = 1/N; % Stopband edge fixed at (1/N)pi\n", - "passedge = 1/(4*N); % Start value for passband edge\n", - "tol = 0.000001; % Tolerance\n", - "step = 0.1*passedge; % Step size for searching the passband edge\n", - "way = -1; % Search direction, increase or reduce the passband edge\n", - "tcost = 0; % Current error calculated with the cost function\n", - "pcost = 10; % Previous error calculated with the cost function\n", - "flag = 0; % Set to 1 to stop the search\n", - "\n", - "while flag == 0\n", - " \n", - "% Design the lowpass filter using Parks-McClellan algorithm\n", - " \n", - " p = remez(filtorder,[0,passedge,stopedge,1],[1,1,0,0],[5,1]);\n", - " \n", - "% Calculates the cost function according to Eq. (2.36)\n", - "\n", - " P = fft(p,4096);\n", - " OptRange = floor(2048/N); % 0 to pi/N\n", - " phi = zeros(OptRange,1); % Initialize to zeros\n", - "\n", - "% Compute the flatness in the range from 0 to pi/N\n", - "\n", - "\tfor k = 1:OptRange\n", - " phi(k) = abs(P(OptRange-k+2))^2 + abs(P(k))^2;\n", - "\tend\n", - "\ttcost = max(abs(phi - ones(max(size(phi)),1)));\n", - " \t\n", - "\tif tcost > pcost % If search in wrong direction\n", - "\t\tstep = step/2; % Reduce step size by half \n", - "\t\tway = -way; % Change the search direction \n", - "\tend\n", - "\t\n", - "\tif abs(pcost - tcost) < tol % If improvement is below tol \n", - "\t\tflag = 1; % Stop the search \n", - "\tend\n", - "\t\n", - "\tpcost = tcost;\n", - "\tpassedge = passedge + way*step; % Adjust the passband edge\n", - " \n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sig.remez" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": { - 
"Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.0125" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1 / 4. / 20.0" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Band edges should be less than 1/2 the sampling frequency", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremez\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m64\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m16.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m4.0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/miniconda3/lib/python3.7/site-packages/scipy/signal/fir_filter_design.py\u001b[0m in \u001b[0;36mremez\u001b[0;34m(numtaps, bands, desired, weight, Hz, type, maxiter, grid_density, fs)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[0mbands\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbands\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 855\u001b[0m return sigtools._remez(numtaps, bands, desired, weight, tnum, fs,\n\u001b[0;32m--> 856\u001b[0;31m maxiter, grid_density)\n\u001b[0m\u001b[1;32m 857\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Band edges should be less than 1/2 the sampling frequency" - ] - } - ], - "source": [ - "p = sig.remez(65, [0, 1/16.0, 1/4.0, 1], [1, 0], [5, 1])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "def create_pqmf_filter(filter_len=64, N=4):\n", - " stop_edge = 1 / N\n", - " pass_edge = 1 / (4 * N)\n", - " tol = 1e-8\n", - " cutoff = 0.1 * pass_edge\n", - " cost = 0\n", - " cost_prev = float('inf')\n", - " \n", - " p = sig.remez(filter_len, [0, pass_edge, stop_edge, 1], [1, 1, 0, 0], [5, 1])\n", - " \n", - " P = sig.freqz(p, workN=2048)\n", - " opt_range = 2048 // N\n", - " phi = np.zeros(opt_range)\n", - " \n", - " H = np.abs(P)\n", - " phi = H[opt_range + 2] \n", - " for i in range(opt_range):\n", - " phi[i] = abs(P(opt_range - i + 2)) ** 2 + abs(P[i]) ** 2" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import scipy as sp\n", - "import scipy.signal as sig\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "\n", - "def optimfuncQMF(x):\n", - " \"\"\"Optimization function for a PQMF Filterbank\n", - " x: coefficients to optimize (first half of prototype h because of symmetry)\n", - " err: resulting total error\n", - " \"\"\"\n", - " K = ntaps * N \n", - 
" h = np.append(x, np.flipud(x))\n", - " cutoff = 0.15\n", - " \n", - "# breakpoint()\n", - " f, H_im = sig.freqz(h, worN=K)\n", - " H = np.abs(H_im) #only keeping the real part\n", - " \n", - " posfreq = np.square(H[0:K//N])\n", - " \n", - " #Negative frequencies are symmetric around 0:\n", - " negfreq = np.flipud(np.square(H[0:K//N]))\n", - " \n", - " #Sum of magnitude squared frequency responses should be closed to unity (or N)\n", - " unitycond = np.sum(np.abs(posfreq + negfreq - 2*(N*N)*np.ones(K//N)))/K\n", - " \n", - " #plt.plot(posfreq+negfreq)\n", - " \n", - " #High attenuation after the next subband:\n", - " att = np.sum(np.abs(H[int(cutoff*K//N):]))/K\n", - " \n", - " #Total (weighted) error:\n", - " err = unitycond + 100*att\n", - " return err" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(32,)" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "xmin.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8.684549400499243\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import scipy.optimize as opt\n", - "import scipy.signal as sig\n", - "\n", - "ntaps = 64\n", - "N = 4\n", - "\n", - "#optimize for 16 filter coefficients:\n", - "xmin = opt.minimize(optimfuncQMF, ntaps*np.ones(ntaps), method='SLSQP', tol=1e-8)\n", - "xmin = xmin[\"x\"]\n", - "\n", - "err = optimfuncQMF(xmin)\n", - "print(err)\n", - "\n", - "#Restore symmetric upper half of window:\n", - "h = np.concatenate((xmin, np.flipud(xmin)))\n", - "plt.plot(h)\n", - "plt.title('Resulting PQMF Window Function')\n", - "plt.xlabel('Sample')\n", - "plt.ylabel('Value')\n", - "plt.show()\n", - "\n", - "f, H = sig.freqz(h)\n", - "plt.plot(f, 20*np.log10(np.abs(H)))\n", - "plt.title('Resulting PQMF Magnitude Response')\n", - "plt.xlabel('Normalized Frequency')\n", - "plt.ylabel('dB')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "N = 4\n", - "f, H_im = sig.freqz(h)\n", - "posfreq = np.square(H[0:512//N])\n", - "negfreq = np.flipud(np.square(H[0:512//N]))\n", - "plt.plot((np.abs(posfreq) + np.abs(negfreq)))\n", - "plt.xlabel('Frequency (512 is Nyquist)')\n", - "plt.ylabel('Magnitude')\n", - "plt.title('Unity Condition, Sum of Squared Magnitude of 2 Neighboring Subbands')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "b = sig.firwin(80, 0.5, window=('kaiser', 8))" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "f, H_im = sig.freqz(h)\n", - "posfreq = np.square(H[0:512//N])\n", - "negfreq = np.flipud(np.square(H[0:512//N]))\n", - "plt.plot((np.abs(posfreq) + np.abs(negfreq)))\n", - "plt.xlabel('Frequency (512 is Nyquist)')\n", - "plt.ylabel('Magnitude')\n", - "plt.title('Unity Condition, Sum of Squared Magnitude of 2 Neighboring Subbands')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(63,)" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cutoff = 0.15\n", - "beta = 9\n", - "ntaps = 63\n", - "N = 4\n", - "\n", - "b = sig.firwin(ntaps, cutoff, window=('kaiser', beta))\n", - "w, h = sig.freqz(b)\n", - "\n", - "plt.plot(b)\n", - "plt.title('Resulting PQMF Window Function')\n", - "plt.xlabel('Sample')\n", - "plt.ylabel('Value')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "fig, ax1 = plt.subplots()\n", - "ax1.set_title('Digital filter frequency response')\n", - "\n", - "ax1.plot(w / (2 * np.pi), 20 * np.log10(abs(h)), 'b')\n", - "ax1.set_ylabel('Amplitude [dB]', color='b')\n", - "ax1.set_xlabel('Frequency [rad/sample]')\n", - "\n", - "ax2 = ax1.twinx()\n", - "angles = np.unwrap(np.angle(h))\n", - "ax2.plot(w / (2 * np.pi), angles, 'g')\n", - "ax2.set_ylabel('Angle (radians)', color='g')\n", - "ax2.grid()\n", - "ax2.axis('tight')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(63,)" - ] - }, - "execution_count": 105, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "def optimfuncQMF(x):\n", - " \"\"\"Optimization function for a PQMF Filterbank\n", - " x: coefficients to optimize (first half of prototype h because of symmetry)\n", - " err: resulting total error\n", - " \"\"\"\n", - " N = 2 #4 subbands\n", - " cutoff = 1.5 #1.5\n", - " h = np.append(x, np.flipud(x))\n", - " f, H_im = sig.freqz(h)\n", - " H = np.abs(H_im) #only keeping the real part\n", - " \n", - " posfreq = np.square(H[0:512//N])\n", - " \n", - " #Negative frequencies are symmetric around 0:\n", - " negfreq = np.flipud(np.square(H[0:512//N]))\n", - " \n", - " #Sum of magnitude squared frequency responses should be closed to unity (or N)\n", - " unitycond = np.sum(np.abs(posfreq + negfreq - 2*(N*N)*np.ones(512//N)))//512\n", - " \n", - " #plt.plot(posfreq+negfreq)\n", - " \n", - " #High attenuation after the next subband:\n", - " att = np.sum(np.abs(H[int(cutoff*512//N):]))//512\n", - " \n", - " #Total (weighted) 
error:\n", - " err = unitycond + 100*att\n", - " return err" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3.0\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "err = optimfuncQMF(b)\n", - "print(err)\n", - "\n", - "#Restore symmetric upper half of window:\n", - "h = np.concatenate((xmin, np.flipud(xmin)))\n", - "plt.plot(h)\n", - "plt.title('Resulting PQMF Window Function')\n", - "plt.xlabel('Sample')\n", - "plt.ylabel('Value')\n", - "plt.show()\n", - "\n", - "f, H = sig.freqz(h)\n", - "plt.plot(f, 20*np.log10(np.abs(H)))\n", - "plt.title('Resulting PQMF Magnitude Response')\n", - "plt.xlabel('Normalized Frequency')\n", - "plt.ylabel('dB')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.7.4 64-bit ('base': conda)", - "language": "python", - "name": "python37464bitbaseconda58faf23c4b5f4fef93406f29a1005f35" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/vocoder/notebooks/Untitled1.ipynb b/vocoder/notebooks/Untitled1.ipynb deleted file mode 100644 index 7fec5150..00000000 --- a/vocoder/notebooks/Untitled1.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/vocoder/pqmf_output.wav b/vocoder/pqmf_output.wav deleted file mode 100644 index 8a77747b..00000000 Binary files a/vocoder/pqmf_output.wav and /dev/null differ diff --git a/vocoder/tests/__init__.py b/vocoder/tests/__init__.py deleted file mode 100644 index 8b137891..00000000 --- a/vocoder/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/vocoder/tf/convert_melgan_tflite.py b/vocoder/tf/convert_melgan_tflite.py deleted file 
mode 100644 index 9a652b57..00000000 --- a/vocoder/tf/convert_melgan_tflite.py +++ /dev/null @@ -1,33 +0,0 @@ -# Convert Tensorflow Tacotron2 model to TF-Lite binary - -import argparse - -from TTS.utils.io import load_config -from TTS.vocoder.tf.utils.generic_utils import setup_generator -from TTS.vocoder.tf.utils.io import load_checkpoint -from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite - - -parser = argparse.ArgumentParser() -parser.add_argument('--tf_model', - type=str, - help='Path to target torch model to be converted to TF.') -parser.add_argument('--config_path', - type=str, - help='Path to config file of torch model.') -parser.add_argument('--output_path', - type=str, - help='path to tflite output binary.') -args = parser.parse_args() - -# Set constants -CONFIG = load_config(args.config_path) - -# load the model -model = setup_generator(CONFIG) -model.build_inference() -model = load_checkpoint(model, args.tf_model) - -# create tflite model -tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path) - diff --git a/vocoder/tf/convert_melgan_torch_to_tf.py b/vocoder/tf/convert_melgan_torch_to_tf.py deleted file mode 100644 index 4c8515d9..00000000 --- a/vocoder/tf/convert_melgan_torch_to_tf.py +++ /dev/null @@ -1,117 +0,0 @@ -import argparse -import os - -import numpy as np -import tensorflow as tf -import torch -from fuzzywuzzy import fuzz - -from TTS.utils.io import load_config -from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import ( - compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf) -from TTS.vocoder.tf.utils.generic_utils import \ - setup_generator as setup_tf_generator -from TTS.vocoder.tf.utils.io import save_checkpoint -from TTS.vocoder.utils.generic_utils import setup_generator - -# prevent GPU use -os.environ['CUDA_VISIBLE_DEVICES'] = '' - -# define args -parser = argparse.ArgumentParser() -parser.add_argument('--torch_model_path', - type=str, - help='Path to target torch model to be converted to 
TF.') -parser.add_argument('--config_path', - type=str, - help='Path to config file of torch model.') -parser.add_argument( - '--output_path', - type=str, - help='path to output file including file name to save TF model.') -args = parser.parse_args() - -# load model config -config_path = args.config_path -c = load_config(config_path) -num_speakers = 0 - -# init torch model -model = setup_generator(c) -checkpoint = torch.load(args.torch_model_path, - map_location=torch.device('cpu')) -state_dict = checkpoint['model'] -model.load_state_dict(state_dict) -model.remove_weight_norm() -state_dict = model.state_dict() - -# init tf model -model_tf = setup_tf_generator(c) - -common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE' -# get tf_model graph by passing an input -# B x D x T -dummy_input = tf.random.uniform((7, 80, 64), dtype=tf.float32) -mel_pred = model_tf(dummy_input, training=False) - -# get tf variables -tf_vars = model_tf.weights - -# match variable names with fuzzy logic -torch_var_names = list(state_dict.keys()) -tf_var_names = [we.name for we in model_tf.weights] -var_map = [] -for tf_name in tf_var_names: - # skip re-mapped layer names - if tf_name in [name[0] for name in var_map]: - continue - tf_name_edited = convert_tf_name(tf_name) - ratios = [ - fuzz.ratio(torch_name, tf_name_edited) - for torch_name in torch_var_names - ] - max_idx = np.argmax(ratios) - matching_name = torch_var_names[max_idx] - del torch_var_names[max_idx] - var_map.append((tf_name, matching_name)) - -# pass weights -tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict) - -# Compare TF and TORCH models -# check embedding outputs -model.eval() -dummy_input_torch = torch.ones((1, 80, 10)) -dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy()) -dummy_input_tf = tf.transpose(dummy_input_tf, perm=[0, 2, 1]) -dummy_input_tf = tf.expand_dims(dummy_input_tf, 2) - -out_torch = model.layers[0](dummy_input_torch) -out_tf = model_tf.model_layers[0](dummy_input_tf) -out_tf_ 
= tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :] - -assert compare_torch_tf(out_torch, out_tf_) < 1e-5 - -for i in range(1, len(model.layers)): - print(f"{i} -> {model.layers[i]} vs {model_tf.model_layers[i]}") - out_torch = model.layers[i](out_torch) - out_tf = model_tf.model_layers[i](out_tf) - out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :] - diff = compare_torch_tf(out_torch, out_tf_) - assert diff < 1e-5, diff - -torch.manual_seed(0) -dummy_input_torch = torch.rand((1, 80, 100)) -dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy()) -model.inference_padding = 0 -model_tf.inference_padding = 0 -output_torch = model.inference(dummy_input_torch) -output_tf = model_tf(dummy_input_tf, training=False) -assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf( - output_torch, output_tf) - -# save tf model -save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'], - args.output_path) -print(' > Model conversion is successfully completed :).') - diff --git a/vocoder/tf/layers/melgan.py b/vocoder/tf/layers/melgan.py deleted file mode 100644 index 3fad4c2a..00000000 --- a/vocoder/tf/layers/melgan.py +++ /dev/null @@ -1,57 +0,0 @@ -import tensorflow as tf - - -class ReflectionPad1d(tf.keras.layers.Layer): - def __init__(self, padding): - super(ReflectionPad1d, self).__init__() - self.padding = padding - - def call(self, x): - return tf.pad(x, [[0, 0], [self.padding, self.padding], [0, 0], [0, 0]], "REFLECT") - - -class ResidualStack(tf.keras.layers.Layer): - def __init__(self, channels, num_res_blocks, kernel_size, name): - super(ResidualStack, self).__init__(name=name) - - assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 
- base_padding = (kernel_size - 1) // 2 - - self.blocks = [] - num_layers = 2 - for idx in range(num_res_blocks): - layer_kernel_size = kernel_size - layer_dilation = layer_kernel_size**idx - layer_padding = base_padding * layer_dilation - block = [ - tf.keras.layers.LeakyReLU(0.2), - ReflectionPad1d(layer_padding), - tf.keras.layers.Conv2D(filters=channels, - kernel_size=(kernel_size, 1), - dilation_rate=(layer_dilation, 1), - use_bias=True, - padding='valid', - name=f'blocks.{idx}.{num_layers}'), - tf.keras.layers.LeakyReLU(0.2), - tf.keras.layers.Conv2D(filters=channels, - kernel_size=(1, 1), - use_bias=True, - name=f'blocks.{idx}.{num_layers + 2}') - ] - self.blocks.append(block) - self.shortcuts = [ - tf.keras.layers.Conv2D(channels, - kernel_size=1, - use_bias=True, - name=f'shortcuts.{i}') - for i in range(num_res_blocks) - ] - - def call(self, x): - # breakpoint() - for block, shortcut in zip(self.blocks, self.shortcuts): - res = shortcut(x) - for layer in block: - x = layer(x) - x += res - return x \ No newline at end of file diff --git a/vocoder/tf/layers/pqmf.py b/vocoder/tf/layers/pqmf.py deleted file mode 100644 index c018971f..00000000 --- a/vocoder/tf/layers/pqmf.py +++ /dev/null @@ -1,66 +0,0 @@ -import numpy as np -import tensorflow as tf - -from scipy import signal as sig - - -class PQMF(tf.keras.layers.Layer): - def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): - super(PQMF, self).__init__() - # define filter coefficient - self.N = N - self.taps = taps - self.cutoff = cutoff - self.beta = beta - - QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta)) - H = np.zeros((N, len(QMF))) - G = np.zeros((N, len(QMF))) - for k in range(N): - constant_factor = (2 * k + 1) * (np.pi / - (2 * N)) * (np.arange(taps + 1) - - ((taps - 1) / 2)) - phase = (-1)**k * np.pi / 4 - H[k] = 2 * QMF * np.cos(constant_factor + phase) - - G[k] = 2 * QMF * np.cos(constant_factor - phase) - - # [N, 1, taps + 1] == [filter_width, in_channels, out_channels] - self.H 
= np.transpose(H[:, None, :], (2, 1, 0)).astype('float32') - self.G = np.transpose(G[None, :, :], (2, 1, 0)).astype('float32') - - # filter for downsampling & upsampling - updown_filter = np.zeros((N, N, N), dtype=np.float32) - for k in range(N): - updown_filter[0, k, k] = 1.0 - self.updown_filter = updown_filter.astype(np.float32) - - def analysis(self, x): - """ - x : B x 1 x T - """ - x = tf.transpose(x, perm=[0, 2, 1]) - x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) - x = tf.nn.conv1d(x, self.H, stride=1, padding='VALID') - x = tf.nn.conv1d(x, - self.updown_filter, - stride=self.N, - padding='VALID') - x = tf.transpose(x, perm=[0, 2, 1]) - return x - - def synthesis(self, x): - """ - x : B x D x T - """ - x = tf.transpose(x, perm=[0, 2, 1]) - x = tf.nn.conv1d_transpose( - x, - self.updown_filter * self.N, - strides=self.N, - output_shape=(tf.shape(x)[0], tf.shape(x)[1] * self.N, - self.N)) - x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) - x = tf.nn.conv1d(x, self.G, stride=1, padding="VALID") - x = tf.transpose(x, perm=[0, 2, 1]) - return x diff --git a/vocoder/tf/models/melgan_generator.py b/vocoder/tf/models/melgan_generator.py deleted file mode 100644 index 168fd29e..00000000 --- a/vocoder/tf/models/melgan_generator.py +++ /dev/null @@ -1,128 +0,0 @@ -import logging -import os - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # FATAL -logging.getLogger('tensorflow').setLevel(logging.FATAL) - -import tensorflow as tf -from TTS.vocoder.tf.layers.melgan import ResidualStack, ReflectionPad1d - - -#pylint: disable=too-many-ancestors -#pylint: disable=abstract-method -class MelganGenerator(tf.keras.models.Model): - """ Melgan Generator TF implementation dedicated for inference with no - weight norm """ - def __init__(self, - in_channels=80, - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=(8, 8, 2, 2), - res_kernel=3, - num_res_blocks=3): - super(MelganGenerator, 
self).__init__() - - self.in_channels = in_channels - - # assert model parameters - assert (proj_kernel - - 1) % 2 == 0, " [!] proj_kernel should be an odd number." - - # setup additional model parameters - base_padding = (proj_kernel - 1) // 2 - act_slope = 0.2 - self.inference_padding = 2 - - # initial layer - self.initial_layer = [ - ReflectionPad1d(base_padding), - tf.keras.layers.Conv2D(filters=base_channels, - kernel_size=(proj_kernel, 1), - strides=1, - padding='valid', - use_bias=True, - name="1") - ] - num_layers = 3 # count number of layers for layer naming - - # upsampling layers and residual stacks - self.upsample_layers = [] - for idx, upsample_factor in enumerate(upsample_factors): - layer_out_channels = base_channels // (2**(idx + 1)) - layer_filter_size = upsample_factor * 2 - layer_stride = upsample_factor - # layer_output_padding = upsample_factor % 2 - self.upsample_layers += [ - tf.keras.layers.LeakyReLU(act_slope), - tf.keras.layers.Conv2DTranspose( - filters=layer_out_channels, - kernel_size=(layer_filter_size, 1), - strides=(layer_stride, 1), - padding='same', - # output_padding=layer_output_padding, - use_bias=True, - name=f'{num_layers}'), - ResidualStack(channels=layer_out_channels, - num_res_blocks=num_res_blocks, - kernel_size=res_kernel, - name=f'layers.{num_layers + 1}') - ] - num_layers += num_res_blocks - 1 - - self.upsample_layers += [tf.keras.layers.LeakyReLU(act_slope)] - - # final layer - self.final_layers = [ - ReflectionPad1d(base_padding), - tf.keras.layers.Conv2D(filters=out_channels, - kernel_size=(proj_kernel, 1), - use_bias=True, - name=f'layers.{num_layers + 1}'), - tf.keras.layers.Activation("tanh") - ] - - # self.model_layers = tf.keras.models.Sequential(self.initial_layer + self.upsample_layers + self.final_layers, name="layers") - self.model_layers = self.initial_layer + self.upsample_layers + self.final_layers - - @tf.function(experimental_relax_shapes=True) - def call(self, c, training=False): - """ - c : B x C x T 
- """ - if training: - raise NotImplementedError() - return self.inference(c) - - def inference(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - # o = self.model_layers(c) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - return o[:, :, 0, :] - - def build_inference(self): - x = tf.random.uniform((1, self.in_channels, 4), dtype=tf.float32) - self(x, training=False) - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, None, None], dtype=tf.float32), - ],) - def inference_tflite(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - # o = self.model_layers(c) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - return o[:, :, 0, :] \ No newline at end of file diff --git a/vocoder/tf/models/multiband_melgan_generator.py b/vocoder/tf/models/multiband_melgan_generator.py deleted file mode 100644 index bdd333ed..00000000 --- a/vocoder/tf/models/multiband_melgan_generator.py +++ /dev/null @@ -1,60 +0,0 @@ -import tensorflow as tf - -from TTS.vocoder.tf.models.melgan_generator import MelganGenerator -from TTS.vocoder.tf.layers.pqmf import PQMF - -#pylint: disable=too-many-ancestors -#pylint: disable=abstract-method -class MultibandMelganGenerator(MelganGenerator): - def __init__(self, - in_channels=80, - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=(2, 8, 2, 2), - res_kernel=3, - num_res_blocks=3): - super(MultibandMelganGenerator, - self).__init__(in_channels=in_channels, - out_channels=out_channels, - proj_kernel=proj_kernel, - 
base_channels=base_channels, - upsample_factors=upsample_factors, - res_kernel=res_kernel, - num_res_blocks=num_res_blocks) - self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) - - def pqmf_analysis(self, x): - return self.pqmf_layer.analysis(x) - - def pqmf_synthesis(self, x): - return self.pqmf_layer.synthesis(x) - - def inference(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - o = self.pqmf_layer.synthesis(o[:, :, 0, :]) - return o - - @tf.function( - experimental_relax_shapes=True, - input_signature=[ - tf.TensorSpec([1, 80, None], dtype=tf.float32), - ],) - def inference_tflite(self, c): - c = tf.transpose(c, perm=[0, 2, 1]) - c = tf.expand_dims(c, 2) - # FIXME: TF had no replicate padding as in Torch - # c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT") - o = c - for layer in self.model_layers: - o = layer(o) - o = tf.transpose(o, perm=[0, 3, 2, 1]) - o = self.pqmf_layer.synthesis(o[:, :, 0, :]) - return o diff --git a/vocoder/tf/utils/__init__.py b/vocoder/tf/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vocoder/tf/utils/convert_torch_to_tf_utils.py b/vocoder/tf/utils/convert_torch_to_tf_utils.py deleted file mode 100644 index 25139cc3..00000000 --- a/vocoder/tf/utils/convert_torch_to_tf_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np -import tensorflow as tf - - -def compare_torch_tf(torch_tensor, tf_tensor): - """ Compute the average absolute difference b/w torch and tf tensors """ - return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean() - - -def convert_tf_name(tf_name): - """ Convert certain patterns in TF layer names to Torch patterns """ - tf_name_tmp = 
tf_name - tf_name_tmp = tf_name_tmp.replace(':0', '') - tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0') - tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1') - tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh') - tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight') - tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight') - tf_name_tmp = tf_name_tmp.replace('/beta', '/bias') - tf_name_tmp = tf_name_tmp.replace('/', '.') - return tf_name_tmp - - -def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): - """ Transfer weigths from torch state_dict to TF variables """ - print(" > Passing weights from Torch to TF ...") - for tf_var in tf_vars: - torch_var_name = var_map_dict[tf_var.name] - print(f' | > {tf_var.name} <-- {torch_var_name}') - # if tuple, it is a bias variable - if 'kernel' in tf_var.name: - torch_weight = state_dict[torch_var_name] - numpy_weight = torch_weight.permute([2, 1, 0]).numpy()[:, None, :, :] - if 'bias' in tf_var.name: - torch_weight = state_dict[torch_var_name] - numpy_weight = torch_weight - assert np.all(tf_var.shape == numpy_weight.shape), f" [!] 
weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" - tf.keras.backend.set_value(tf_var, numpy_weight) - return tf_vars - - -def load_tf_vars(model_tf, tf_vars): - for tf_var in tf_vars: - model_tf.get_layer(tf_var.name).set_weights(tf_var) - return model_tf diff --git a/vocoder/tf/utils/generic_utils.py b/vocoder/tf/utils/generic_utils.py deleted file mode 100644 index 580a3738..00000000 --- a/vocoder/tf/utils/generic_utils.py +++ /dev/null @@ -1,35 +0,0 @@ -import re -import importlib - - -def to_camel(text): - text = text.capitalize() - return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module('TTS.vocoder.tf.models.' + - c.generator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': - model = MyModel( - in_channels=c.audio['num_mels'], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], - res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': - pass - if c.generator_model in 'multiband_melgan_generator': - model = MyModel( - in_channels=c.audio['num_mels'], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params['upsample_factors'], - res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - return model \ No newline at end of file diff --git a/vocoder/tf/utils/io.py b/vocoder/tf/utils/io.py deleted file mode 100644 index d95d972c..00000000 --- a/vocoder/tf/utils/io.py +++ /dev/null @@ -1,27 +0,0 @@ -import datetime -import pickle -import tensorflow as tf - - -def save_checkpoint(model, current_step, epoch, output_path, **kwargs): - """ Save TF Vocoder model """ - state = { - 'model': model.weights, - 
'step': current_step, - 'epoch': epoch, - 'date': datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - pickle.dump(state, open(output_path, 'wb')) - - -def load_checkpoint(model, checkpoint_path): - """ Load TF Vocoder model """ - checkpoint = pickle.load(open(checkpoint_path, 'rb')) - chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']} - tf_vars = model.weights - for tf_var in tf_vars: - layer_name = tf_var.name - chkp_var_value = chkp_var_dict[layer_name] - tf.keras.backend.set_value(tf_var, chkp_var_value) - return model \ No newline at end of file diff --git a/vocoder/tf/utils/tflite.py b/vocoder/tf/utils/tflite.py deleted file mode 100644 index d0637596..00000000 --- a/vocoder/tf/utils/tflite.py +++ /dev/null @@ -1,31 +0,0 @@ -import tensorflow as tf - - -def convert_melgan_to_tflite(model, - output_path=None, - experimental_converter=True): - """Convert Tensorflow MelGAN model to TFLite. Save a binary file if output_path is - provided, else return TFLite model.""" - - concrete_function = model.inference_tflite.get_concrete_function() - converter = tf.lite.TFLiteConverter.from_concrete_functions( - [concrete_function]) - converter.experimental_new_converter = experimental_converter - converter.optimizations = [] - converter.target_spec.supported_ops = [ - tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS - ] - tflite_model = converter.convert() - print(f'Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.') - if output_path is not None: - # same model binary if outputpath is provided - with open(output_path, 'wb') as f: - f.write(tflite_model) - return None - return tflite_model - - -def load_tflite_model(tflite_path): - tflite_model = tf.lite.Interpreter(model_path=tflite_path) - tflite_model.allocate_tensors() - return tflite_model \ No newline at end of file diff --git a/vocoder/train.py b/vocoder/train.py deleted file mode 100644 index dc081a5e..00000000 --- a/vocoder/train.py +++ /dev/null 
@@ -1,657 +0,0 @@ -import argparse -import glob -import os -import sys -import time -import traceback - -import torch -from torch.utils.data import DataLoader - -from inspect import signature - -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) -from TTS.utils.io import copy_config_file, load_config -from TTS.utils.radam import RAdam -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.gan_dataset import GANDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -# from distribute import (DistributedSampler, apply_gradient_allreduce, -# init_distributed, reduce_tensor) -from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.io import save_checkpoint, save_best_model -from TTS.vocoder.utils.console_logger import ConsoleLogger -from TTS.vocoder.utils.generic_utils import (check_config, plot_results, - setup_discriminator, - setup_generator) - - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = GANDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose) - dataset.shuffle_mapping() - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) - return loader - - -def 
format_data(data): - if isinstance(data[0], list): - # setup input data - c_G, x_G = data[0] - c_D, x_D = data[1] - - # dispatch data to GPU - if use_cuda: - c_G = c_G.cuda(non_blocking=True) - x_G = x_G.cuda(non_blocking=True) - c_D = c_D.cuda(non_blocking=True) - x_D = x_D.cuda(non_blocking=True) - - return c_G, x_G, c_D, x_D - - # return a whole audio segment - co, x = data - if use_cuda: - co = co.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return co, x, None, None - - -def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, - scheduler_G, scheduler_D, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model_G.train() - model_D.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int( - len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, c_D, y_D = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G) - y_hat_sub = None - y_G_sub = None - y_hat_vis = y_hat # for visualization - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_hat_vis = y_hat - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - - # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - loss_G = loss_G_dict['G_loss'] - - # optimizer generator - optimizer_G.zero_grad() - loss_G.backward() - if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), - c.gen_clip_grad) - optimizer_G.step() - if scheduler_G is not None: - scheduler_G.step() - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - with torch.no_grad(): - y_hat = model_G(c_D) - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach(), c_D) - D_out_real = model_D(y_D, c_D) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_D) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict['D_loss'] - - # optimizer discriminator - optimizer_D.zero_grad() - loss_D.backward() - if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), - c.disc_clip_grad) - optimizer_D.step() - if scheduler_D is not None: - scheduler_D.step() - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]['lr'] - current_lr_D = list(optimizer_D.param_groups)[0]['lr'] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - step_time, loader_time, current_lr_G, - current_lr_D, loss_dict, - keep_avg.avg_values) - - # plot step stats - if global_step % 10 == 0: - iter_stats = { - "lr_G": current_lr_G, - "lr_D": current_lr_D, - "step_time": step_time - } - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step 
== 0: - if c.checkpoint: - # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) - - # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model_G.eval() - model_D.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, _, _ = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G) - y_hat_sub = None - y_G_sub = None - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_G_sub = model_G.pqmf_analysis(y_G) - - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - 
D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - feats_fake, feats_real = None, None - - # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - with torch.no_grad(): - y_hat = model_G(c_G) - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - - # run D with or without cond. features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach(), c_G) - D_out_real = model_D(y_G, c_G) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values['avg_' + key] = value - update_eval_values['avg_loader_time'] = loader_time - 
update_eval_values['avg_step_time'] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) - - # synthesize a full voice - data_loader.return_segments = False - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio) - - # DISTRUBUTED - # if num_gpus > 1: - # init_distributed(args.rank, num_gpus, args.group_id, - # c.distributed["backend"], c.distributed["url"]) - - # setup models - model_gen = setup_generator(c) - model_disc = setup_discriminator(c) - - # setup optimizers - optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), - lr=c.lr_disc, - weight_decay=0) - - # schedulers - scheduler_gen = None - scheduler_disc = None - if 'lr_scheduler_gen' in c: - scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if 'lr_scheduler_disc' in c: - scheduler_disc = getattr(torch.optim.lr_scheduler, 
c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) - - # setup criterion - criterion_gen = GeneratorLoss(c) - criterion_disc = DiscriminatorLoss(c) - - if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') - try: - print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint['model']) - print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint['optimizer']) - print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint['model_disc']) - print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) - if 'scheduler' in checkpoint: - print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint['scheduler']) - # NOTE: Not sure if necessary - scheduler_gen.optimizer = optimizer_gen - if 'scheduler_disc' in checkpoint: - print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) - scheduler_disc.optimizer = optimizer_disc - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) - model_gen.load_state_dict(model_dict) - - model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) - model_disc.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- for group in optimizer_gen.param_groups: - group['lr'] = c.lr_gen - - for group in optimizer_disc.param_groups: - group['lr'] = c.lr_disc - - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] - else: - args.restore_step = 0 - - if use_cuda: - model_gen.cuda() - criterion_gen.cuda() - model_disc.cuda() - criterion_disc.cuda() - - # DISTRUBUTED - # if num_gpus > 1: - # model = apply_gradient_allreduce(model) - - num_params = count_parameters(model_gen) - print(" > Generator has {} parameters".format(num_params), flush=True) - num_params = count_parameters(model_disc) - print(" > Discriminator has {} parameters".format(num_params), flush=True) - - if 'best_loss' not in locals(): - best_loss = float('inf') - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_gen, criterion_gen, optimizer_gen, - model_disc, criterion_disc, optimizer_disc, - scheduler_gen, scheduler_disc, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, - global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--continue_path', - type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) - parser.add_argument( - '--restore_path', - type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') - - # DISTRUBUTED - parser.add_argument( - '--rank', - type=int, - default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') - args = parser.parse_args() - - if args.continue_path != '': - args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') - list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv - latest_model_file = max(list_of_files, key=os.path.getctime) - args.restore_path = latest_model_file - print(f" > Training continues for {args.restore_path}") - - # setup output paths and read configs - c = load_config(args.config_path) - check_config(c) - _ = os.path.dirname(os.path.realpath(__file__)) - - OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) - - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') - - c_logger = ConsoleLogger() - - if args.rank == 0: - os.makedirs(AUDIO_PATH, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) - os.chmod(AUDIO_PATH, 0o775) - os.chmod(OUT_PATH, 0o775) - - LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') - - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) - - try: - main(args) - except KeyboardInterrupt: - 
remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/vocoder/utils/__init__.py b/vocoder/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vocoder/utils/console_logger.py b/vocoder/utils/console_logger.py deleted file mode 100644 index 6af0b823..00000000 --- a/vocoder/utils/console_logger.py +++ /dev/null @@ -1,97 +0,0 @@ -import datetime -from TTS.utils.io import AttrDict - - -tcolors = AttrDict({ - 'OKBLUE': '\033[94m', - 'HEADER': '\033[95m', - 'OKGREEN': '\033[92m', - 'WARNING': '\033[93m', - 'FAIL': '\033[91m', - 'ENDC': '\033[0m', - 'BOLD': '\033[1m', - 'UNDERLINE': '\033[4m' -}) - - -class ConsoleLogger(): - # TODO: merge this with TTS ConsoleLogger - def __init__(self): - # use these to compare values between iterations - self.old_train_loss_dict = None - self.old_epoch_loss_dict = None - self.old_eval_loss_dict = None - - # pylint: disable=no-self-use - def get_time(self): - now = datetime.datetime.now() - return now.strftime("%Y-%m-%d %H:%M:%S") - - def print_epoch_start(self, epoch, max_epoch): - print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD, - epoch, max_epoch, tcolors.ENDC), - flush=True) - - def print_train_start(self): - print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}") - - def print_train_step(self, batch_steps, step, global_step, - step_time, loader_time, lrG, lrD, - loss_dict, avg_loss_dict): - indent = " | > " - print() - log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format( - tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) - for key, value in loss_dict.items(): - # print the avg value if given - if f'avg_{key}' in avg_loss_dict.keys(): - log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) - else: - 
log_text += "{}{}: {:.5f} \n".format(indent, key, value) - log_text += f"{indent}step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lrG: {lrG}\n{indent}lrD: {lrD}" - print(log_text, flush=True) - - # pylint: disable=unused-argument - def print_train_epoch_end(self, global_step, epoch, epoch_time, - print_dict): - indent = " | > " - log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n" - for key, value in print_dict.items(): - log_text += "{}{}: {:.5f}\n".format(indent, key, value) - print(log_text, flush=True) - - def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") - - def print_eval_step(self, step, loss_dict, avg_loss_dict): - indent = " | > " - print() - log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" - for key, value in loss_dict.items(): - # print the avg value if given - if f'avg_{key}' in avg_loss_dict.keys(): - log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) - else: - log_text += "{}{}: {:.5f} \n".format(indent, key, value) - print(log_text, flush=True) - - def print_epoch_end(self, epoch, avg_loss_dict): - indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format( - tcolors.BOLD, tcolors.ENDC) - for key, value in avg_loss_dict.items(): - # print the avg value if given - color = '' - sign = '+' - diff = 0 - if self.old_eval_loss_dict is not None and key in self.old_eval_loss_dict: - diff = value - self.old_eval_loss_dict[key] - if diff < 0: - color = tcolors.OKGREEN - sign = '' - elif diff > 0: - color = tcolors.FAIL - sign = '+' - log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff) - self.old_eval_loss_dict = avg_loss_dict - print(log_text, flush=True) diff --git a/vocoder/utils/generic_utils.py b/vocoder/utils/generic_utils.py deleted file mode 100644 index 031d299d..00000000 --- 
a/vocoder/utils/generic_utils.py +++ /dev/null @@ -1,149 +0,0 @@ -import re -import importlib -import numpy as np -from matplotlib import pyplot as plt - -from TTS.utils.visual import plot_spectrogram - - -def plot_results(y_hat, y, ap, global_step, name_prefix): - """ Plot vocoder model results """ - - # select an instance from batch - y_hat = y_hat[0].squeeze(0).detach().cpu().numpy() - y = y[0].squeeze(0).detach().cpu().numpy() - - spec_fake = ap.melspectrogram(y_hat).T - spec_real = ap.melspectrogram(y).T - spec_diff = np.abs(spec_fake - spec_real) - - # plot figure and save it - fig_wave = plt.figure() - plt.subplot(2, 1, 1) - plt.plot(y) - plt.title("groundtruth speech") - plt.subplot(2, 1, 2) - plt.plot(y_hat) - plt.title(f"generated speech @ {global_step} steps") - plt.tight_layout() - plt.close() - - figures = { - name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake), - name_prefix + "spectrogram/real": plot_spectrogram(spec_real), - name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff), - name_prefix + "speech_comparison": fig_wave, - } - return figures - - -def to_camel(text): - text = text.capitalize() - return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module('TTS.vocoder.models.' 
+ - c.generator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': - model = MyModel( - in_channels=c.audio['num_mels'], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], - res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': - pass - if c.generator_model in 'multiband_melgan_generator': - model = MyModel( - in_channels=c.audio['num_mels'], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params['upsample_factors'], - res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'parallel_wavegan_generator': - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=c.generator_model_params['num_res_blocks'], - stacks=c.generator_model_params['stacks'], - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=c.audio['num_mels'], - aux_context_window=c['conv_pad'], - dropout=0.0, - bias=True, - use_weight_norm=True, - upsample_conditional_features=True, - upsample_factors=c.generator_model_params['upsample_factors']) - return model - - -def setup_discriminator(c): - print(" > Discriminator Model: {}".format(c.discriminator_model)) - if 'parallel_wavegan' in c.discriminator_model: - MyModel = importlib.import_module('TTS.vocoder.models.parallel_wavegan_discriminator') - else: - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.discriminator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in 'random_window_discriminator': - model = MyModel( - cond_channels=c.audio['num_mels'], - hop_length=c.audio['hop_length'], - uncond_disc_donwsample_factors=c. - discriminator_model_params['uncond_disc_donwsample_factors'], - cond_disc_downsample_factors=c. 
- discriminator_model_params['cond_disc_downsample_factors'], - cond_disc_out_channels=c. - discriminator_model_params['cond_disc_out_channels'], - window_sizes=c.discriminator_model_params['window_sizes']) - if c.discriminator_model in 'melgan_multiscale_discriminator': - model = MyModel( - in_channels=1, - out_channels=1, - kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params['base_channels'], - max_channels=c.discriminator_model_params['max_channels'], - downsample_factors=c. - discriminator_model_params['downsample_factors']) - if c.discriminator_model == 'residual_parallel_wavegan_discriminator': - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], - stacks=c.discriminator_model_params['stacks'], - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - ) - if c.discriminator_model == 'parallel_wavegan_discriminator': - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True - ) - return model - - -def check_config(c): - pass \ No newline at end of file diff --git a/vocoder/utils/io.py b/vocoder/utils/io.py deleted file mode 100644 index 9d350238..00000000 --- a/vocoder/utils/io.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import torch -import datetime - - -def save_model(model, optimizer, scheduler, model_disc, optimizer_disc, - scheduler_disc, current_step, epoch, output_path, **kwargs): - model_state = model.state_dict() - model_disc_state = model_disc.state_dict()\ - if model_disc is not None else None - optimizer_state = optimizer.state_dict()\ - if optimizer is not None else None - optimizer_disc_state = optimizer_disc.state_dict()\ - 
if optimizer_disc is not None else None - scheduler_state = scheduler.state_dict()\ - if scheduler is not None else None - scheduler_disc_state = scheduler_disc.state_dict()\ - if scheduler_disc is not None else None - state = { - 'model': model_state, - 'optimizer': optimizer_state, - 'scheduler': scheduler_state, - 'model_disc': model_disc_state, - 'optimizer_disc': optimizer_disc_state, - 'scheduler_disc': scheduler_disc_state, - 'step': current_step, - 'epoch': epoch, - 'date': datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint(model, optimizer, scheduler, model_disc, optimizer_disc, - scheduler_disc, current_step, epoch, output_folder, - **kwargs): - file_name = 'checkpoint_{}.pth.tar'.format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, scheduler, model_disc, optimizer_disc, - scheduler_disc, current_step, epoch, checkpoint_path, **kwargs) - - -def save_best_model(target_loss, best_loss, model, optimizer, scheduler, - model_disc, optimizer_disc, scheduler_disc, current_step, - epoch, output_folder, **kwargs): - if target_loss < best_loss: - file_name = 'best_model.pth.tar' - checkpoint_path = os.path.join(output_folder, file_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model(model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - model_loss=target_loss, - **kwargs) - best_loss = target_loss - return best_loss \ No newline at end of file