mirror of https://github.com/coqui-ai/TTS.git
commit f24f7c1237
TTS/encoder/utils/prepare_voxceleb.py

@@ -19,13 +19,13 @@
 # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
 """ voxceleb 1 & 2 """
 
+import csv
 import hashlib
 import os
 import subprocess
 import sys
 import zipfile
 
-import pandas
 import soundfile as sf
 from absl import logging
 
@@ -185,8 +185,11 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
     # Write to CSV file which contains four columns:
     # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
     csv_file_path = os.path.join(output_dir, output_file)
-    df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
-    df.to_csv(csv_file_path, index=False, sep="\t")
+    with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f, delimiter="\t")
+        writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
+        for wav_file in files:
+            writer.writerow(wav_file)
     logging.info("Successfully generated csv file {}".format(csv_file_path))
 
 
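A quick sketch (not part of the commit) of why this replacement is safe: the stdlib csv writer with delimiter="\t" produces the same tab-separated layout, header row included, that the removed pandas.DataFrame(...).to_csv(csv_file_path, index=False, sep="\t") call did. The rows and file names below are hypothetical.

import csv

# Hypothetical entries in the same (filename, length, id, name) order
# the voxceleb script collects them.
files = [
    ["wav/id10001_000.wav", 4120, "id10001", "A.J._Buckley"],
    ["wav/id10002_000.wav", 3980, "id10002", "A.R._Rahman"],
]

with open("train.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    # Header row, matching what pandas used to emit as column names.
    writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    writer.writerows(files)  # one tab-separated line per entry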
TTS/tts/datasets/formatters.py

@@ -1,3 +1,4 @@
+import csv
 import os
 import re
 import xml.etree.ElementTree as ET
@@ -5,7 +6,6 @@ from glob import glob
 from pathlib import Path
 from typing import List
 
-import pandas as pd
 from tqdm import tqdm
 
 ########################
@@ -25,25 +25,27 @@ def cml_tts(root_path, meta_file, ignored_speakers=None):
             if len(line.split("|")) != num_cols:
                 print(f" > Missing column in line {idx + 1} -> {line.strip()}")
     # load metadata
-    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
-    assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
-    client_id = None if "client_id" in metadata.columns else "default"
-    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f, delimiter="|")
+        metadata = list(reader)
+    assert all(x in metadata[0] for x in ["wav_filename", "transcript"])
+    client_id = None if "client_id" in metadata[0] else "default"
+    emotion_name = None if "emotion_name" in metadata[0] else "neutral"
     items = []
     not_found_counter = 0
-    for row in metadata.itertuples():
-        if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
+    for row in metadata:
+        if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers:
             continue
-        audio_path = os.path.join(root_path, row.wav_filename)
+        audio_path = os.path.join(root_path, row["wav_filename"])
         if not os.path.exists(audio_path):
             not_found_counter += 1
             continue
         items.append(
             {
-                "text": row.transcript,
+                "text": row["transcript"],
                 "audio_file": audio_path,
-                "speaker_name": client_id if client_id is not None else row.client_id,
-                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+                "speaker_name": client_id if client_id is not None else row["client_id"],
+                "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
                 "root_path": root_path,
             }
         )
@@ -63,25 +65,27 @@ def coqui(root_path, meta_file, ignored_speakers=None):
             if len(line.split("|")) != num_cols:
                 print(f" > Missing column in line {idx + 1} -> {line.strip()}")
     # load metadata
-    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
-    assert all(x in metadata.columns for x in ["audio_file", "text"])
-    speaker_name = None if "speaker_name" in metadata.columns else "coqui"
-    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f, delimiter="|")
+        metadata = list(reader)
+    assert all(x in metadata[0] for x in ["audio_file", "text"])
+    speaker_name = None if "speaker_name" in metadata[0] else "coqui"
+    emotion_name = None if "emotion_name" in metadata[0] else "neutral"
     items = []
    not_found_counter = 0
-    for row in metadata.itertuples():
-        if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
+    for row in metadata:
+        if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers:
            continue
-        audio_path = os.path.join(root_path, row.audio_file)
+        audio_path = os.path.join(root_path, row["audio_file"])
         if not os.path.exists(audio_path):
             not_found_counter += 1
             continue
         items.append(
             {
-                "text": row.text,
+                "text": row["text"],
                 "audio_file": audio_path,
-                "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
-                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+                "speaker_name": speaker_name if speaker_name is not None else row["speaker_name"],
+                "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
                 "root_path": root_path,
             }
         )
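Minimal sketch (assumed, not from the commit) of the csv.DictReader pattern the formatters now use: each row comes back as a dict keyed by the header names, so the old pandas row.column attribute access becomes row["column"] lookups. One behavioral nuance: metadata[0] presumes the file has at least one data row, whereas pd.read_csv(...).columns also worked on a header-only file. The in-memory sample below is hypothetical.

import csv
from io import StringIO

# Hypothetical pipe-delimited metadata, header first.
meta = "wav_filename|transcript\naudio/0001.wav|hello world\naudio/0002.wav|good morning\n"

reader = csv.DictReader(StringIO(meta), delimiter="|")
metadata = list(reader)  # list of dicts keyed by header names

# Same column check the patched formatters perform.
assert all(x in metadata[0] for x in ["wav_filename", "transcript"])

for row in metadata:
    print(row["wav_filename"], "->", row["transcript"])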
requirements.notebooks.txt

@@ -1 +1,2 @@
 bokeh==1.4.0
+pandas>=1.4,<2.0
requirements.txt

@@ -8,7 +8,6 @@ torchaudio
 soundfile>=0.12.0
 librosa>=0.10.0
 scikit-learn>=1.3.0
-numba==0.55.1;python_version<"3.9"
 numba>=0.57.0;python_version>="3.9"
 inflect>=5.6.0
 tqdm>=4.64.1
@@ -24,7 +23,6 @@ flask>=2.0.1
 pysbd>=0.3.4
 # deps for notebooks
 umap-learn>=0.5.1
-pandas>=1.4,<2.0
 # deps for training
 matplotlib>=3.7.0
 # coqui stack
@@ -54,4 +52,4 @@ encodec>=0.1.1
 # deps for XTTS
 unidecode>=1.3.2
 num2words
 spacy[ja]>=3
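The numba pins rely on PEP 508 environment markers, which pip evaluates against the running interpreter at install time; dropping the python_version<"3.9" line leaves Python 3.8 installs without any numba pin at all. An illustrative check (not part of the commit), using the packaging library:

from packaging.markers import Marker

# pip evaluates this marker when deciding whether the pin applies.
marker = Marker('python_version >= "3.9"')
print(marker.evaluate())  # True on Python 3.9+, so numba>=0.57.0 is installed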