From 78c3897599d415d9312f57d61fde715feacef595 Mon Sep 17 00:00:00 2001
From: Thomas Werkmeister
Date: Tue, 23 Jul 2019 09:47:52 +0200
Subject: [PATCH 1/4] root path speaker matching

added data root path in speaker matching for mailabs; this way you don't
need to start at the very bottom of the folder hierarchy if you want to
explicitly define metafiles.
---
 datasets/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 9dd7a610..a3701c4d 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -82,14 +82,14 @@ def mailabs(root_path, meta_files=None):
         # meta_files = [f.strip() for f in meta_files.split(",")]
     items = []
     for idx, csv_file in enumerate(csv_files):
+        txt_file = os.path.join(root_path, csv_file)
         # determine speaker based on folder structure...
-        speaker_name_match = speaker_regex.search(csv_file)
+        speaker_name_match = speaker_regex.search(txt_file)
         if speaker_name_match is None:
             continue
         speaker_name = speaker_name_match.group("speaker_name")
         print(" | > {}".format(csv_file))
         folder = folders[idx]
-        txt_file = os.path.join(root_path, csv_file)
         with open(txt_file, 'r') as ttf:
             for line in ttf:
                 cols = line.split('|')

From 537879482dc31592f51fb8cae79919615091d49a Mon Sep 17 00:00:00 2001
From: Thomas Werkmeister
Date: Tue, 23 Jul 2019 13:31:10 +0200
Subject: [PATCH 2/4] fixed config comment strings for attention parameters

---
 config.json              | 12 ++++++------
 config_libritts.json     | 10 +++++-----
 config_tacotron.json     |  4 ++--
 config_tacotron2.json    | 10 +++++-----
 config_tacotron_de.json  | 12 ++++++------
 config_tacotron_gst.json |  4 ++--
 6 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/config.json b/config.json
index 807c4c60..24d26e16 100644
--- a/config.json
+++ b/config.json
@@ -40,12 +40,12 @@
     "windowing": false, // Enables attention windowing. Used only in eval mode.
     "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
-    "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false,
-    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "prenet_type": "original", // "original" or "bn".
+    "prenet_dropout": true, // enable/disable dropout at prenet.
+    "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": false, // enable_disable location sensitive attention.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.
diff --git a/config_libritts.json b/config_libritts.json
index f9a752ec..5579e565 100644
--- a/config_libritts.json
+++ b/config_libritts.json
@@ -39,13 +39,13 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
+    "prenet_type": "original", // "original" or "bn".
+    "prenet_dropout": true, // enable/disable dropout at prenet.
     "windowing": false, // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+    "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster.
     "forward_attn_mask": false,
-    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable_disable location sensitive attention.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.
diff --git a/config_tacotron.json b/config_tacotron.json
index 127a4b3d..92ee3909 100644
--- a/config_tacotron.json
+++ b/config_tacotron.json
@@ -42,10 +42,10 @@
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "prenet_type": "original", // "original" or "bn".
     "prenet_dropout": true, // enable/disable dropout at prenet.
-    "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster.
+    "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster.
     "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
     "transition_agent": true, // enable/disable transition agent of forward attention.
-    "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "location_attn": false, // enable_disable location sensitive attention.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.
diff --git a/config_tacotron2.json b/config_tacotron2.json
index fd188d20..02b4341b 100644
--- a/config_tacotron2.json
+++ b/config_tacotron2.json
@@ -39,12 +39,12 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
- "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well. - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron_de.json b/config_tacotron_de.json index 834bfed4..fc3efbec 100644 --- a/config_tacotron_de.json +++ b/config_tacotron_de.json @@ -40,12 +40,12 @@ "windowing": false, // Enables attention windowing. Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "forward_attn_mask": false, - "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster. + "transition_agent": false, // enable/disable transition agent of forward attention. + "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron_gst.json b/config_tacotron_gst.json index 98fafa54..5a0f2c09 100644 --- a/config_tacotron_gst.json +++ b/config_tacotron_gst.json @@ -42,8 +42,8 @@ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // "original" or "bn". "prenet_dropout": true, // enable/disable dropout at prenet. - "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster. 
- "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well. "transition_agent": false, // enable/disable transition agent of forward attention. "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. From 4c9fbeeaf81c2df8461fe5f35225ae9ecd0728a9 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 14:23:36 +0200 Subject: [PATCH 3/4] simplified folder variable --- datasets/preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a3701c4d..a86f8e5d 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -75,21 +75,19 @@ def mailabs(root_path, meta_files=None): speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") if meta_files is None: csv_files = glob(root_path+"/**/metadata.csv", recursive=True) - folders = [os.path.dirname(f) for f in csv_files] else: csv_files = meta_files - folders = [f.strip().split("by_book")[1][1:] for f in csv_files] # meta_files = [f.strip() for f in meta_files.split(",")] items = [] for idx, csv_file in enumerate(csv_files): txt_file = os.path.join(root_path, csv_file) + folder = os.path.dirname(txt_file) # determine speaker based on folder structure... speaker_name_match = speaker_regex.search(txt_file) if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") print(" | > {}".format(csv_file)) - folder = folders[idx] with open(txt_file, 'r') as ttf: for line in ttf: cols = line.split('|') From d4045fd47b1dc5939d6100c9f8a2faf3863fc1fc Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 14:30:06 +0200 Subject: [PATCH 4/4] unused var --- datasets/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a86f8e5d..e5f4e1a2 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -79,7 +79,7 @@ def mailabs(root_path, meta_files=None): csv_files = meta_files # meta_files = [f.strip() for f in meta_files.split(",")] items = [] - for idx, csv_file in enumerate(csv_files): + for csv_file in csv_files: txt_file = os.path.join(root_path, csv_file) folder = os.path.dirname(txt_file) # determine speaker based on folder structure...