From 9e63cf4072fa5a0ed3a97380fbc7603e4a0da88c Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Mon, 27 Jul 2020 14:29:14 +0200
Subject: [PATCH 1/2] Load requirements from requirements.txt to avoid
 duplication and out-of-sync issues

---
 requirements.txt |  2 +-
 setup.py         | 33 ++++++---------------------------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 60706213..14c3a03d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 torch>=1.5
-tensorflow>=2.2
+tensorflow==2.3.0rc0
 numpy>=1.16.0
 scipy>=0.19.0
 numba==0.48
diff --git a/setup.py b/setup.py
index d80505af..1302dc99 100644
--- a/setup.py
+++ b/setup.py
@@ -76,34 +76,13 @@ def pip_install(package_name):
     )
 
+reqs_from_file = open('requirements.txt').readlines()
+reqs_without_tf = [r for r in reqs_from_file if not r.startswith('tensorflow')]
+tf_req = [r for r in reqs_from_file if r.startswith('tensorflow')]
+
 requirements = {
-    'install_requires':[
-        "torch>=1.5",
-        "numpy>=1.16.0",
-        "numba==0.48",
-        "scipy>=0.19.0",
-        "librosa==0.7.2",
-        "unidecode==0.4.20",
-        "attrdict",
-        "tensorboardX",
-        "matplotlib",
-        "Pillow",
-        "flask",
-        "tqdm",
-        "inflect",
-        "pysbd",
-        "bokeh==1.4.0",
-        "soundfile",
-        "phonemizer>=2.2.0",
-        "nose==1.3.7",
-        "cardboardlint==1.3.0",
-        "pylint==2.5.3",
-        'fuzzywuzzy',
-        'gdown'
-    ],
-    'pip_install':[
-        'tensorflow==2.3.0rc0',
-    ]
+    'install_requires': reqs_without_tf,
+    'pip_install': tf_req
 }
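A note on the new requirements handling in setup.py: readlines() keeps
trailing newlines and passes comment or blank lines straight through to
install_requires. A minimal, stricter sketch of the same idea (hypothetical;
the parse_requirements helper below is made up for illustration and is not
part of this patch):

    # Hypothetical stricter variant of the requirements parsing added in
    # setup.py above: strips whitespace and drops blank/comment lines before
    # splitting out the tensorflow pin, which plain readlines() does not do.
    def parse_requirements(path='requirements.txt'):
        with open(path) as f:
            reqs = [line.strip() for line in f]
        # drop blank lines and comment lines
        reqs = [r for r in reqs if r and not r.startswith('#')]
        reqs_without_tf = [r for r in reqs if not r.startswith('tensorflow')]
        tf_req = [r for r in reqs if r.startswith('tensorflow')]
        return reqs_without_tf, tf_req

Note that startswith('tensorflow') also matches tensorflow-gpu, so either pin
would be routed to the pip_install step, which appears to be the intent here.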
From b21dceb351483f6b685288bfddb7055d11047c4e Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Mon, 27 Jul 2020 14:43:04 +0200
Subject: [PATCH 2/2] Remove some duplicated or unused files

---
 TTS/tts/utils/import torch.py                 | 177 --------------
 TTS/utils/synthesis.py                        | 231 ------------------
 ..._and_MultiBand_MelGAN_TFLite_Example.ipynb |   2 +-
 3 files changed, 1 insertion(+), 409 deletions(-)
 delete mode 100644 TTS/tts/utils/import torch.py
 delete mode 100644 TTS/utils/synthesis.py

diff --git a/TTS/tts/utils/import torch.py b/TTS/tts/utils/import torch.py
deleted file mode 100644
index a8518094..00000000
--- a/TTS/tts/utils/import torch.py
+++ /dev/null
@@ -1,177 +0,0 @@
-import torch
-import librosa
-import soundfile as sf
-import numpy as np
-import scipy.io
-import scipy.signal
-
-from TTS.tts.utils.stft_torch import STFT
-
-class AudioProcessor(object):
-    def __init__(self,
-                 sample_rate=None,
-                 num_mels=None,
-                 frame_shift_ms=None,
-                 frame_length_ms=None,
-                 hop_length=None,
-                 win_length=None,
-                 num_freq=None,
-                 power=None,
-                 mel_fmin=None,
-                 mel_fmax=None,
-                 griffin_lim_iters=None,
-                 do_trim_silence=False,
-                 trim_db=60,
-                 sound_norm=False,
-                 use_cuda=False,
-                 **_):
-
-        print(" > Setting up Torch based Audio Processor...")
-        # setup class attributed
-        self.sample_rate = sample_rate
-        self.num_mels = num_mels
-        self.frame_shift_ms = frame_shift_ms
-        self.frame_length_ms = frame_length_ms
-        self.num_freq = num_freq
-        self.power = power
-        self.griffin_lim_iters = griffin_lim_iters
-        self.mel_fmin = mel_fmin or 0
-        self.mel_fmax = mel_fmax
-        self.do_trim_silence = do_trim_silence
-        self.trim_db = trim_db
-        self.sound_norm = sound_norm
-        # setup stft parameters
-        if hop_length is None:
-            self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        else:
-            self.hop_length = hop_length
-            self.win_length = win_length
-            self.n_fft = (self.num_freq - 1) * 2
-        members = vars(self)
-        # print class attributes
-        for key, value in members.items():
-            print(" | > {}:{}".format(key, value))
-        # create spectrogram utils
-        self.mel_basis = torch.from_numpy(self._build_mel_basis()).float()
-        self.inv_mel_basis = torch.from_numpy(np.linalg.pinv(self._build_mel_basis())).float()
-        self.stft = STFT(filter_length=self.n_fft, hop_length=self.hop_length, win_length=self.win_length,
-                         window='hann', padding_mode='constant', use_cuda=use_cuda)
-
-    ### setting up the parameters ###
-    def _build_mel_basis(self):
-        if self.mel_fmax is not None:
-            assert self.mel_fmax <= self.sample_rate // 2
-        return librosa.filters.mel(
-            self.sample_rate,
-            self.n_fft,
-            n_mels=self.num_mels,
-            fmin=self.mel_fmin,
-            fmax=self.mel_fmax)
-
-    def _stft_parameters(self, ):
-        """Compute necessary stft parameters with given time values"""
-        n_fft = (self.num_freq - 1) * 2
-        factor = self.frame_length_ms / self.frame_shift_ms
-        assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
-        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
-        win_length = int(hop_length * factor)
-        return n_fft, hop_length, win_length
-
-    ### DB and AMP conversion ###
-    def amp_to_db(self, x):
-        return torch.log10(torch.clamp(x, min=1e-5))
-
-    def db_to_amp(self, x):
-        return torch.pow(10.0, x)
-
-    ### SPECTROGRAM ###
-    def linear_to_mel(self, spectrogram):
-        return torch.matmul(self.mel_basis, spectrogram)
-
-    def mel_to_linear(self, mel_spec):
-        return np.maximum(1e-10, np.matmul(self.inv_mel_basis, mel_spec))
-
-    def spectrogram(self, y):
-        ''' Compute spectrograms
-        Args:
-            y (Tensor): audio signal. (B x T)
-        '''
-        M, P = self.stft.transform(y)
-        return self.amp_to_db(M)
-
-    def melspectrogram(self, y):
-        ''' Compute mel-spectrograms
-        Args:
-            y (Tensor): audio signal. (B x T)
-        '''
-        M, P = self.stft.transform(y)
-        return self.amp_to_db(self.linear_to_mel(M))
-
-    ### INV SPECTROGRAM ###
-    def inv_spectrogram(self, S):
-        """Converts spectrogram to waveform using librosa"""
-        S = self.db_to_amp(S)
-        return self.griffin_lim(S**self.power)
-
-    def inv_melspectrogram(self, S):
-        '''Converts mel spectrogram to waveform using librosa'''
-        S = self.db_to_amp(S)
-        S = self.mel_to_linear(S)  # Convert back to linear
-        return self.griffin_lim(S**self.power)
-
-    def out_linear_to_mel(self, linear_spec):
-        S = self._denormalize(linear_spec)
-        S = self._db_to_amp(S)
-        S = self._linear_to_mel(np.abs(S))
-        S = self._amp_to_db(S)
-        mel = self._normalize(S)
-        return mel
-
-    def griffin_lim(self, S):
-        """
-        PARAMS
-        ------
-        magnitudes: spectrogram magnitudes
-        """
-
-        angles = np.angle(np.exp(2j * np.pi * np.random.rand(*S.size())))
-        angles = angles.astype(np.float32)
-        angles = torch.from_numpy(angles)
-        signal = self.stft.inverse(S, angles).squeeze(1)
-
-        for _ in range(self.griffin_lim_iters):
-            _, angles = self.stft.transform(signal)
-            signal = self.stft.inverse(S, angles).squeeze(1)
-        return signal
-
-    ### Audio processing ###
-    def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
-        window_length = int(self.sample_rate * min_silence_sec)
-        hop_length = int(window_length / 4)
-        threshold = self._db_to_amp(threshold_db)
-        for x in range(hop_length, len(wav) - window_length, hop_length):
-            if np.max(wav[x:x + window_length]) < threshold:
-                return x + hop_length
-        return len(wav)
-
-    def trim_silence(self, wav):
-        """ Trim silent parts with a threshold and 0.01 sec margin """
-        margin = int(self.sample_rate * 0.01)
-        wav = wav[margin:-margin]
-        return librosa.effects.trim(
-            wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]
-
-    def sound_norm(self, x):
-        return x / abs(x).max() * 0.9
-
-    ### SAVE and LOAD ###
-    def load_wav(self, filename, sr=None):
-        if sr is None:
-            x, sr = sf.read(filename)
-        else:
-            x, sr = librosa.load(filename, sr=sr)
-        return x
-
-    def save_wav(self, wav, path):
-        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
\ No newline at end of file
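For context on what the deleted AudioProcessor.griffin_lim() above was doing,
here is a self-contained sketch of the same iterative phase-recovery loop,
written against librosa's STFT instead of the repo's torch STFT wrapper. The
parameter values are illustrative only and are not taken from any config in
this patch:

    # Standalone Griffin-Lim sketch mirroring the deleted method's loop:
    # start from random phase, then alternate between re-estimating phase
    # from the current signal and re-imposing the known magnitude.
    import numpy as np
    import librosa

    def griffin_lim(S, n_iter=60, n_fft=2048, hop_length=256, win_length=1024):
        """Recover a waveform from a magnitude spectrogram S of shape
        (1 + n_fft // 2, frames)."""
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        signal = librosa.istft(S * angles, hop_length=hop_length,
                               win_length=win_length)
        for _ in range(n_iter):
            # keep the magnitude S, update only the phase estimate
            angles = np.exp(1j * np.angle(librosa.stft(
                signal, n_fft=n_fft, hop_length=hop_length,
                win_length=win_length)))
            signal = librosa.istft(S * angles, hop_length=hop_length,
                                   win_length=win_length)
        return signal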
diff --git a/TTS/utils/synthesis.py b/TTS/utils/synthesis.py
deleted file mode 100644
index 2b5da449..00000000
--- a/TTS/utils/synthesis.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import pkg_resources
-installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
-if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
-    import tensorflow as tf
-import torch
-import numpy as np
-from .text import text_to_sequence, phoneme_to_sequence
-
-
-def text_to_seqvec(text, CONFIG):
-    text_cleaner = [CONFIG.text_cleaner]
-    # text ot phonemes to sequence vector
-    if CONFIG.use_phonemes:
-        seq = np.asarray(
-            phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
-                                CONFIG.enable_eos_bos_chars,
-                                tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
-            dtype=np.int32)
-    else:
-        seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
-    return seq
-
-
-def numpy_to_torch(np_array, dtype, cuda=False):
-    if np_array is None:
-        return None
-    tensor = torch.as_tensor(np_array, dtype=dtype)
-    if cuda:
-        return tensor.cuda()
-    return tensor
-
-
-def numpy_to_tf(np_array, dtype):
-    if np_array is None:
-        return None
-    tensor = tf.convert_to_tensor(np_array, dtype=dtype)
-    return tensor
-
-
-def compute_style_mel(style_wav, ap):
-    style_mel = ap.melspectrogram(
-        ap.load_wav(style_wav)).expand_dims(0)
-    return style_mel
-
-
-def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
-    if CONFIG.use_gst:
-        decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-            inputs, style_mel=style_mel, speaker_ids=speaker_id)
-    else:
-        if truncated:
-            decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
-                inputs, speaker_ids=speaker_id)
-        else:
-            decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-                inputs, speaker_ids=speaker_id)
-    return decoder_output, postnet_output, alignments, stop_tokens
-
-
-def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
-    if CONFIG.use_gst and style_mel is not None:
-        raise NotImplementedError(' [!] GST inference not implemented for TF')
-    if truncated:
-        raise NotImplementedError(' [!] Truncated inference not implemented for TF')
-    if speaker_id is not None:
-        raise NotImplementedError(' [!] Multi-Speaker not implemented for TF')
-    # TODO: handle multispeaker case
-    decoder_output, postnet_output, alignments, stop_tokens = model(
-        inputs, training=False)
-    return decoder_output, postnet_output, alignments, stop_tokens
-
-
-def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
-    if CONFIG.use_gst and style_mel is not None:
-        raise NotImplementedError(' [!] GST inference not implemented for TfLite')
-    if truncated:
-        raise NotImplementedError(' [!] Truncated inference not implemented for TfLite')
-    if speaker_id is not None:
-        raise NotImplementedError(' [!] Multi-Speaker not implemented for TfLite')
-    # get input and output details
-    input_details = model.get_input_details()
-    output_details = model.get_output_details()
-    # reshape input tensor for the new input shape
-    model.resize_tensor_input(input_details[0]['index'], inputs.shape)
-    model.allocate_tensors()
-    detail = input_details[0]
-    # input_shape = detail['shape']
-    model.set_tensor(detail['index'], inputs)
-    # run the model
-    model.invoke()
-    # collect outputs
-    decoder_output = model.get_tensor(output_details[0]['index'])
-    postnet_output = model.get_tensor(output_details[1]['index'])
-    # tflite model only returns feature frames
-    return decoder_output, postnet_output, None, None
-
-
-def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens):
-    postnet_output = postnet_output[0].data.cpu().numpy()
-    decoder_output = decoder_output[0].data.cpu().numpy()
-    alignment = alignments[0].cpu().data.numpy()
-    stop_tokens = stop_tokens[0].cpu().numpy()
-    return postnet_output, decoder_output, alignment, stop_tokens
-
-
-def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens):
-    postnet_output = postnet_output[0].numpy()
-    decoder_output = decoder_output[0].numpy()
-    alignment = alignments[0].numpy()
-    stop_tokens = stop_tokens[0].numpy()
-    return postnet_output, decoder_output, alignment, stop_tokens
-
-
-def parse_outputs_tflite(postnet_output, decoder_output):
-    postnet_output = postnet_output[0]
-    decoder_output = decoder_output[0]
-    return postnet_output, decoder_output
-
-
-def trim_silence(wav, ap):
-    return wav[:ap.find_endpoint(wav)]
-
-
-def inv_spectrogram(postnet_output, ap, CONFIG):
-    if CONFIG.model.lower() in ["tacotron"]:
-        wav = ap.inv_spectrogram(postnet_output.T)
-    else:
-        wav = ap.inv_melspectrogram(postnet_output.T)
-    return wav
-
-
-def id_to_torch(speaker_id):
-    if speaker_id is not None:
-        speaker_id = np.asarray(speaker_id)
-        speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
-    return speaker_id
-
-
-# TODO: perform GL with pytorch for batching
-def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
-    '''Apply griffin-lim to each sample iterating throught the first dimension.
-    Args:
-        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
-        input_lens (Tensor or np.Array): 1D array of sample lengths.
-        CONFIG (Dict): TTS config.
-        ap (AudioProcessor): TTS audio processor.
-    '''
-    wavs = []
-    for idx, spec in enumerate(inputs):
-        wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length  # inverse librosa padding
-        wav = inv_spectrogram(spec, ap, CONFIG)
-        # assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}"
-        wavs.append(wav[:wav_len])
-    return wavs
-
-
-def synthesis(model,
-              text,
-              CONFIG,
-              use_cuda,
-              ap,
-              speaker_id=None,
-              style_wav=None,
-              truncated=False,
-              enable_eos_bos_chars=False, #pylint: disable=unused-argument
-              use_griffin_lim=False,
-              do_trim_silence=False,
-              backend='torch'):
-    """Synthesize voice for the given text.
-
-    Args:
-        model (TTS.tts.models): model to synthesize.
-        text (str): target text
-        CONFIG (dict): config dictionary to be loaded from config.json.
-        use_cuda (bool): enable cuda.
-        ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
-            model outputs.
-        speaker_id (int): id of speaker
-        style_wav (str): Uses for style embedding of GST.
-        truncated (bool): keep model states after inference. It can be used
-            for continuous inference at long texts.
-        enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
-        do_trim_silence (bool): trim silence after synthesis.
-        backend (str): tf or torch
-    """
-    # GST processing
-    style_mel = None
-    if CONFIG.model == "TacotronGST" and style_wav is not None:
-        style_mel = compute_style_mel(style_wav, ap)
-    # preprocess the given text
-    inputs = text_to_seqvec(text, CONFIG)
-    # pass tensors to backend
-    if backend == 'torch':
-        speaker_id = id_to_torch(speaker_id)
-        style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
-        inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)
-        inputs = inputs.unsqueeze(0)
-    elif backend == 'tf':
-        # TODO: handle speaker id for tf model
-        style_mel = numpy_to_tf(style_mel, tf.float32)
-        inputs = numpy_to_tf(inputs, tf.int32)
-        inputs = tf.expand_dims(inputs, 0)
-    elif backend == 'tflite':
-        style_mel = numpy_to_tf(style_mel, tf.float32)
-        inputs = numpy_to_tf(inputs, tf.int32)
-        inputs = tf.expand_dims(inputs, 0)
-    # synthesize voice
-    if backend == 'torch':
-        decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
-            model, inputs, CONFIG, truncated, speaker_id, style_mel)
-        postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
-            postnet_output, decoder_output, alignments, stop_tokens)
-    elif backend == 'tf':
-        decoder_output, postnet_output, alignments, stop_tokens = run_model_tf(
-            model, inputs, CONFIG, truncated, speaker_id, style_mel)
-        postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf(
-            postnet_output, decoder_output, alignments, stop_tokens)
-    elif backend == 'tflite':
-        decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite(
-            model, inputs, CONFIG, truncated, speaker_id, style_mel)
-        postnet_output, decoder_output = parse_outputs_tflite(
-            postnet_output, decoder_output)
-    # convert outputs to numpy
-    # plot results
-    wav = None
-    if use_griffin_lim:
-        wav = inv_spectrogram(postnet_output, ap, CONFIG)
-        # trim silence
-        if do_trim_silence:
-            wav = trim_silence(wav, ap)
-    return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs
diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb
index 43a758eb..57d9261b 100644
--- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb
+++ b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb
@@ -1145,7 +1145,7 @@
     "from TTS.utils.io import load_config\n",
     "from TTS.utils.text.symbols import symbols, phonemes\n",
     "from TTS.utils.audio import AudioProcessor\n",
-    "from TTS.utils.synthesis import synthesis"
+    "from TTS.tts.utils.synthesis import synthesis"
    ],
    "execution_count": null,
    "outputs": []
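Finally, a sketch of how the notebook's corrected import would be exercised.
This assumes the surviving TTS.tts.utils.synthesis module keeps the same
synthesis() signature as the duplicate deleted above; model, CONFIG, and ap
are placeholders that the notebook builds in its earlier setup cells:

    # Hypothetical smoke test for the corrected import path; the
    # run_sentence helper name and its arguments are illustrative only.
    from TTS.tts.utils.synthesis import synthesis

    def run_sentence(model, CONFIG, ap, text="Hello world."):
        # tflite backend matches this notebook; returns the raw waveform
        wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = \
            synthesis(model, text, CONFIG, use_cuda=False, ap=ap,
                      use_griffin_lim=True, do_trim_silence=True,
                      backend='tflite')
        return wav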