mirror of https://github.com/coqui-ai/TTS.git
Bug fixes
This commit is contained in:
parent a211766a10
commit b54da3b460
.compute (2 changed lines)
@@ -1,3 +1,3 @@
 #!/bin/bash
 source ../tmp/venv/bin/activate
-python train.py --config_path config.json
+python train.py --config_path config.json --debug true
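The new --debug flag pairs with the create_experiment_folder() change further down, which stamps the run folder with 'debug' instead of a git commit hash. A minimal sketch of how train.py plausibly parses it; the actual parser is outside this diff, so the details are assumptions:

import argparse

# Hedged sketch of the flags .compute passes. Note the argparse gotcha:
# with type=bool, any non-empty string (including "false") parses as True,
# so "--debug true" works, but only via string truthiness.
parser = argparse.ArgumentParser()
parser.add_argument('--config_path', type=str, required=True,
                    help='Path to the JSON training config.')
parser.add_argument('--debug', type=bool, default=False,
                    help='Tag the run folder "debug" instead of a git hash.')
args = parser.parse_args()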
@@ -1,11 +1,13 @@
 {
-    "model_name": "best-model",
+    "model_name": "audio-update-l2-loss",
     "num_mels": 80,
     "num_freq": 1025,
     "sample_rate": 20000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
+    "min_mel_freq": 125,
+    "max_mel_freq": 7600,
     "min_level_db": -100,
     "ref_level_db": 20,
     "embedding_size": 256,
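This hunk appears to be the training config that .compute passes as config.json. The two new keys bound the mel filterbank; a hedged sketch of how they typically feed librosa, using the other values from this hunk (the repo's own audio module may wire this differently):

import librosa

# num_freq = 1025 spectrogram bins imply a (1025 - 1) * 2 = 2048-point FFT.
n_fft = (1025 - 1) * 2
# Mel filterbank limited to [min_mel_freq, max_mel_freq]; fmax 7600 Hz sits
# safely under the 10 kHz Nyquist limit of the 20 kHz sample rate.
mel_basis = librosa.filters.mel(sr=20000, n_fft=n_fft, n_mels=80,
                                fmin=125, fmax=7600)
print(mel_basis.shape)  # (80, 1025)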
@@ -25,7 +27,7 @@

     "checkpoint": true,
     "save_step": 376,
-    "print_step": 50,
+    "print_step": 10,
     "data_path": "/snakepit/shared/data/keithito/LJSpeech-1.1/",
     "min_seq_len": 0,
     "output_path": "experiments/"
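Dropping print_step from 50 to 10 just logs five times as often. A sketch of how these keys are usually consumed in the training loop; the exact conditions and the save_checkpoint helper are assumptions, not taken from this diff:

# c is the loaded config; save_checkpoint is a hypothetical helper.
if current_step % c.print_step == 0:
    print(" | step: {} | loss: {:.5f}".format(current_step, loss.item()))
if c.checkpoint and current_step % c.save_step == 0:
    save_checkpoint(model, optimizer, current_step)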
@@ -38,11 +38,11 @@ class LocationSensitiveAttention(nn.Module):
         super(LocationSensitiveAttention, self).__init__()
         self.kernel_size = kernel_size
         self.filters = filters
-        padding = int((kernel - 1) / 2)
-        self.loc_conv = nn.Conv1d(2, filters,
+        padding = int((kernel_size - 1) / 2)
+        self.loc_conv = nn.Conv1d(1, filters,
                                   kernel_size=kernel_size, stride=1,
                                   padding=padding, bias=False)
-        self.loc_linear = nn.Linear(loc_dim, hidden_dim)
+        self.loc_linear = nn.Linear(filters, hidden_dim)
         self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
         self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
         self.v = nn.Linear(hidden_dim, 1, bias=False)
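Three bugs in one constructor: `kernel` was an undefined name (a NameError at construction time), the location conv declared two input channels while the decoder supplies a single alignment map, and loc_linear sized its input from loc_dim instead of the conv's output channels. A hedged sketch of the fixed layer end to end; the forward pass is not part of this diff, so its shapes and combination are assumptions:

import torch
from torch import nn

class LocationSensitiveAttention(nn.Module):
    def __init__(self, annot_dim, query_dim, hidden_dim,
                 kernel_size=31, filters=32):
        super(LocationSensitiveAttention, self).__init__()
        padding = int((kernel_size - 1) / 2)
        # one input channel: the previous alignment, shaped [B, 1, T]
        self.loc_conv = nn.Conv1d(1, filters, kernel_size=kernel_size,
                                  stride=1, padding=padding, bias=False)
        # the conv yields `filters` channels per timestep, hence in_features=filters
        self.loc_linear = nn.Linear(filters, hidden_dim)
        self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
        self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, annots, query, loc):
        # annots: [B, T, annot_dim], query: [B, query_dim], loc: [B, 1, T]
        loc = self.loc_conv(loc).transpose(1, 2)                # [B, T, filters]
        processed_loc = self.loc_linear(loc)                    # [B, T, hidden_dim]
        processed_query = self.query_layer(query).unsqueeze(1)  # [B, 1, hidden_dim]
        processed_annots = self.annot_layer(annots)             # [B, T, hidden_dim]
        scores = self.v(torch.tanh(processed_query + processed_annots + processed_loc))
        return scores.squeeze(-1)                               # [B, T] energies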
@@ -79,7 +79,7 @@ class AttentionRNNCell(nn.Module):
         memory_dim (int): memory vector (decoder autoregression) feature dimension.
         align_model (str): 'b' for Bahdanau, 'ls' for Location Sensitive alignment.
         """
-        super(AttentionRNN, self).__init__()
+        super(AttentionRNNCell, self).__init__()
         self.align_model = align_model
         self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim)
         # pick bahdanau or location sensitive attention
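super(AttentionRNN, self) named a class that does not exist under that name, so the cell raised a NameError before training could start; passing the defining class, AttentionRNNCell, is the correct Python 2-style super call. The trailing comment suggests the constructor goes on to select an alignment module, roughly as below; BahdanauAttention is assumed to live in the same module, and the constructor arguments are guesses, not taken from this diff:

        # hedged sketch of the branch the comment above refers to
        if align_model == 'b':
            self.alignment_model = BahdanauAttention(annot_dim, out_dim, hidden_dim)
        elif align_model == 'ls':
            self.alignment_model = LocationSensitiveAttention(annot_dim, out_dim, hidden_dim)
        else:
            raise RuntimeError("Unknown alignment model: " + align_model)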
@@ -275,7 +275,7 @@ class Decoder(nn.Module):
            # dim=1)
            attention_rnn_hidden, current_context_vec, attention = self.attention_rnn(
                processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, attention, input_lens)
+                inputs, attention.unsqueeze(1), input_lens)
            # attention_cum += attention
            # Concat RNN output and attention context vector
            decoder_input = self.project_to_decoder_in(
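The unsqueeze is the caller-side half of the Conv1d(2, ...) to Conv1d(1, ...) fix above: the previous alignment arrives as a 2-D tensor, and the location conv wants an explicit channel axis.

# attention: [B, T] alignment weights from the previous decoder step
# attention.unsqueeze(1): [B, 1, T], the single channel nn.Conv1d(1, filters, ...) expects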
train.py (3 changed lines)
@@ -25,7 +25,7 @@ from utils.generic_utils import (Progbar, remove_experiment_folder,
 from utils.visual import plot_alignment, plot_spectrogram
 from datasets.LJSpeech import LJSpeechDataset
 from models.tacotron import Tacotron
-from layers.losses import L2LossMasked
+from layers.losses import L1LossMasked

 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
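The commit swaps the reconstruction loss from masked L2 to masked L1. layers/losses.py itself is not shown in this diff, so the following is a minimal sketch of what a sequence-masked L1 loss looks like, under assumed shapes ([B, T, D] predictions and targets, per-item valid lengths):

import torch
from torch import nn
from torch.nn import functional as F

def sequence_mask(lengths, max_len):
    # [B, T] boolean mask, True on valid timesteps
    steps = torch.arange(max_len, device=lengths.device).unsqueeze(0)
    return steps < lengths.unsqueeze(1)

class L1LossMasked(nn.Module):
    def forward(self, inputs, targets, lengths):
        # inputs, targets: [B, T, D]; lengths: [B]
        mask = sequence_mask(lengths, targets.size(1)).unsqueeze(2).float()
        loss = F.l1_loss(inputs * mask, targets * mask, reduction='sum')
        # average over valid elements only, so padding does not dilute the loss
        return loss / (mask.sum() * inputs.size(2))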
@@ -338,6 +338,7 @@ def evaluate(model, criterion, criterion_st, data_loader, current_step):

 def main(args):

+    # Setup the dataset
     # Setup the dataset
     train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
                                     os.path.join(c.data_path, 'wavs'),
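Only the dataset construction is visible in this hunk; wiring it into a loader happens outside the diff. A hedged sketch of the usual next step (batch size, worker count, and the collate_fn attribute are assumptions):

from torch.utils.data import DataLoader

# Hedged sketch: only train_dataset above comes from the diff.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          drop_last=False, collate_fn=train_dataset.collate_fn,
                          num_workers=4)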
@@ -37,7 +37,7 @@ def get_commit_hash():

 def create_experiment_folder(root_path, model_name, debug):
     """ Create a folder with the current date and time """
-    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I:%M%p")
+    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
     if debug:
         commit_hash = 'debug'
     else:
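The strftime change swaps ':' for '+' because this timestamp becomes part of the experiment folder name: ':' is illegal in Windows paths and needs escaping in many shell tools. A quick check of what the two formats produce (the example date is arbitrary):

import datetime

now = datetime.datetime(2018, 2, 13, 15, 45)
print(now.strftime("%B-%d-%Y_%I:%M%p"))  # February-13-2018_03:45PM  (':' breaks paths)
print(now.strftime("%B-%d-%Y_%I+%M%p"))  # February-13-2018_03+45PM  (path-safe)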