mirror of https://github.com/coqui-ai/TTS.git
updates to dataset analysis notebooks for compatibility with latest version of TTS (#1853)
This commit is contained in:
parent
e4db7c51b5
commit
c30b6485ea
|
@ -45,7 +45,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"NUM_PROC = 8\n",
|
"NUM_PROC = 8\n",
|
||||||
"DATASET_CONFIG = BaseDatasetConfig(\n",
|
"DATASET_CONFIG = BaseDatasetConfig(\n",
|
||||||
" name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/home/ubuntu/TTS/depot/data/male_dataset1_44k/\"\n",
|
" name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/absolute/path/to/your/dataset/\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -58,13 +58,13 @@
|
||||||
"def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n",
|
"def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n",
|
||||||
" txt_file = os.path.join(root_path, meta_file)\n",
|
" txt_file = os.path.join(root_path, meta_file)\n",
|
||||||
" items = []\n",
|
" items = []\n",
|
||||||
" speaker_name = \"maledataset1\"\n",
|
" speaker_name = \"myspeaker\"\n",
|
||||||
" with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n",
|
" with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n",
|
||||||
" for line in ttf:\n",
|
" for line in ttf:\n",
|
||||||
" cols = line.split(\"|\")\n",
|
" cols = line.split(\"|\")\n",
|
||||||
" wav_file = os.path.join(root_path, \"wavs\", cols[0])\n",
|
" wav_file = os.path.join(root_path, \"wavs\", cols[0] + \".wav\") \n",
|
||||||
" text = cols[1]\n",
|
" text = cols[1]\n",
|
||||||
" items.append([text, wav_file, speaker_name])\n",
|
" items.append({\"text\": text, \"audio_file\": wav_file, \"speaker_name\": speaker_name})\n",
|
||||||
" return items"
|
" return items"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -78,7 +78,10 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n",
|
"# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n",
|
||||||
"train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n",
|
"train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n",
|
||||||
|
"if eval_samples is not None:\n",
|
||||||
" items = train_samples + eval_samples\n",
|
" items = train_samples + eval_samples\n",
|
||||||
|
"else:\n",
|
||||||
|
" items = train_samples\n",
|
||||||
"print(\" > Number of audio files: {}\".format(len(items)))\n",
|
"print(\" > Number of audio files: {}\".format(len(items)))\n",
|
||||||
"print(items[1])"
|
"print(items[1])"
|
||||||
]
|
]
|
||||||
|
@ -94,7 +97,7 @@
|
||||||
"# check wavs if exist\n",
|
"# check wavs if exist\n",
|
||||||
"wav_files = []\n",
|
"wav_files = []\n",
|
||||||
"for item in items:\n",
|
"for item in items:\n",
|
||||||
" wav_file = item[1].strip()\n",
|
" wav_file = item[\"audio_file\"].strip()\n",
|
||||||
" wav_files.append(wav_file)\n",
|
" wav_files.append(wav_file)\n",
|
||||||
" if not os.path.exists(wav_file):\n",
|
" if not os.path.exists(wav_file):\n",
|
||||||
" print(waf_path)"
|
" print(waf_path)"
|
||||||
|
@ -131,8 +134,8 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def load_item(item):\n",
|
"def load_item(item):\n",
|
||||||
" text = item[0].strip()\n",
|
" text = item[\"text\"].strip()\n",
|
||||||
" file_name = item[1].strip()\n",
|
" file_name = item[\"audio_file\"].strip()\n",
|
||||||
" audio, sr = librosa.load(file_name, sr=None)\n",
|
" audio, sr = librosa.load(file_name, sr=None)\n",
|
||||||
" audio_len = len(audio) / sr\n",
|
" audio_len = len(audio) / sr\n",
|
||||||
" text_len = len(text)\n",
|
" text_len = len(text)\n",
|
||||||
|
@ -416,7 +419,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.5"
|
"version": "3.9.12"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
|
@ -37,7 +37,7 @@
|
||||||
"# set some vars\n",
|
"# set some vars\n",
|
||||||
"# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n",
|
"# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n",
|
||||||
"CONFIG_FILE = \"/path/to/config/config.json\"\n",
|
"CONFIG_FILE = \"/path/to/config/config.json\"\n",
|
||||||
"CHARS_TO_REMOVE = \".,:!?'\""
|
"CHARS_TO_REMOVE = \".,:!?'\"\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -59,7 +59,8 @@
|
||||||
"# extra imports that might not be included in requirements.txt\n",
|
"# extra imports that might not be included in requirements.txt\n",
|
||||||
"import collections\n",
|
"import collections\n",
|
||||||
"import operator\n",
|
"import operator\n",
|
||||||
"\n"
|
"\n",
|
||||||
|
"%matplotlib inline"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -75,7 +76,7 @@
|
||||||
"CONFIG = load_config(CONFIG_FILE)\n",
|
"CONFIG = load_config(CONFIG_FILE)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Load some properties from config.json\n",
|
"# Load some properties from config.json\n",
|
||||||
"CONFIG_METADATA = sorted(load_tts_samples(CONFIG.datasets)[0])\n",
|
"CONFIG_METADATA = load_tts_samples(CONFIG.datasets)[0]\n",
|
||||||
"CONFIG_METADATA = CONFIG_METADATA\n",
|
"CONFIG_METADATA = CONFIG_METADATA\n",
|
||||||
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
||||||
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
||||||
|
@ -84,7 +85,10 @@
|
||||||
"\n",
|
"\n",
|
||||||
"# Will be printed on generated output graph\n",
|
"# Will be printed on generated output graph\n",
|
||||||
"CONFIG_RUN_NAME = CONFIG.run_name\n",
|
"CONFIG_RUN_NAME = CONFIG.run_name\n",
|
||||||
"CONFIG_RUN_DESC = CONFIG.run_description"
|
"CONFIG_RUN_DESC = CONFIG.run_description\n",
|
||||||
|
"\n",
|
||||||
|
"# Needed to convert text to phonemes and phonemes to ids\n",
|
||||||
|
"tokenizer, config = TTSTokenizer.init_from_config(CONFIG)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -112,12 +116,13 @@
|
||||||
"source": [
|
"source": [
|
||||||
"def get_phoneme_from_sequence(text):\n",
|
"def get_phoneme_from_sequence(text):\n",
|
||||||
" temp_list = []\n",
|
" temp_list = []\n",
|
||||||
" if len(text[0]) > 0:\n",
|
" if len(text[\"text\"]) > 0:\n",
|
||||||
" temp_text = text[0].rstrip('\\n')\n",
|
" #temp_text = text[0].rstrip('\\n')\n",
|
||||||
|
" temp_text = text[\"text\"].rstrip('\\n')\n",
|
||||||
" for rm_bad_chars in CHARS_TO_REMOVE:\n",
|
" for rm_bad_chars in CHARS_TO_REMOVE:\n",
|
||||||
" temp_text = temp_text.replace(rm_bad_chars,\"\")\n",
|
" temp_text = temp_text.replace(rm_bad_chars,\"\")\n",
|
||||||
" seq = phoneme_to_sequence(temp_text, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n",
|
" seq = tokenizer.text_to_ids(temp_text)\n",
|
||||||
" text = sequence_to_phoneme(seq)\n",
|
" text = tokenizer.ids_to_text(seq)\n",
|
||||||
" text = text.replace(\" \",\"\")\n",
|
" text = text.replace(\" \",\"\")\n",
|
||||||
" temp_list.append(text)\n",
|
" temp_list.append(text)\n",
|
||||||
" return temp_list"
|
" return temp_list"
|
||||||
|
@ -229,7 +234,7 @@
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
@ -243,7 +248,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.5"
|
"version": "3.9.12"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
Loading…
Reference in New Issue