diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 029922d3..ce876edc 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -187,3 +187,22 @@ def libri_tts(root_path, meta_files=None):
     for item in items:
         assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}"
     return items
+
+
+def custom_turkish(root_path, meta_file):
+    '''Parse a "<wav_id>|<transcript>" metadata file; entries whose wav file is missing are skipped.'''
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "turkish-female"
+    skipped_files = []
+    with open(txt_file, 'r', encoding='utf-8') as ttf:
+        for line in ttf:
+            cols = line.split('|')
+            wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
+            if not os.path.exists(wav_file):
+                skipped_files.append(wav_file)
+                continue
+            text = cols[1].strip()
+            items.append([text, wav_file, speaker_name])
+    print(f" [!] {len(skipped_files)} files skipped. They do not exist...")
+    return items
diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py
index e6b611b4..92c2d934 100644
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@@ -91,6 +91,15 @@ def transliteration_cleaners(text):
     return text
 
 
+# TODO: elaborate it
+def basic_turkish_cleaners(text):
+    '''Pipeline for Turkish text'''
+    text = text.replace("I", "ı")  # Turkish: uppercase I must lowercase to dotless ı, not i
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
 def english_cleaners(text):
     '''Pipeline for English text, including number and abbreviation expansion.'''
     text = convert_to_ascii(text)