diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 51963847..7fc51a3a 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -45,7 +45,7 @@ "source": [ "NUM_PROC = 8\n", "DATASET_CONFIG = BaseDatasetConfig(\n", - " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/home/ubuntu/TTS/depot/data/male_dataset1_44k/\"\n", + " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/absolute/path/to/your/dataset/\"\n", ")" ] }, @@ -58,13 +58,13 @@ "def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n", " txt_file = os.path.join(root_path, meta_file)\n", " items = []\n", - " speaker_name = \"maledataset1\"\n", + " speaker_name = \"myspeaker\"\n", " with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n", " for line in ttf:\n", " cols = line.split(\"|\")\n", - " wav_file = os.path.join(root_path, \"wavs\", cols[0])\n", + " wav_file = os.path.join(root_path, \"wavs\", cols[0] + \".wav\") \n", " text = cols[1]\n", - " items.append([text, wav_file, speaker_name])\n", + " items.append({\"text\": text, \"audio_file\": wav_file, \"speaker_name\": speaker_name})\n", " return items" ] }, @@ -78,7 +78,10 @@ "source": [ "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n", "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n", - "items = train_samples + eval_samples\n", + "if eval_samples is not None:\n", + " items = train_samples + eval_samples\n", + "else:\n", + " items = train_samples\n", "print(\" > Number of audio files: {}\".format(len(items)))\n", "print(items[1])" ] @@ -94,7 +97,7 @@ "# check wavs if exist\n", "wav_files = []\n", "for item in items:\n", - " wav_file = item[1].strip()\n", + " wav_file = item[\"audio_file\"].strip()\n", " wav_files.append(wav_file)\n", " if not os.path.exists(wav_file):\n", " print(waf_path)" @@ -131,8 +134,8 @@ "outputs": [], "source": [ "def load_item(item):\n", - " text = item[0].strip()\n", - " file_name = item[1].strip()\n", + " text = item[\"text\"].strip()\n", + " file_name = item[\"audio_file\"].strip()\n", " audio, sr = librosa.load(file_name, sr=None)\n", " audio_len = len(audio) / sr\n", " text_len = len(text)\n", @@ -416,7 +419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb index 2b7f5d67..d481ed29 100644 --- a/notebooks/dataset_analysis/PhonemeCoverage.ipynb +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -37,7 +37,7 @@ "# set some vars\n", "# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n", "CONFIG_FILE = \"/path/to/config/config.json\"\n", - "CHARS_TO_REMOVE = \".,:!?'\"" + "CHARS_TO_REMOVE = \".,:!?'\"\n" ] }, { @@ -59,7 +59,8 @@ "# extra imports that might not be included in requirements.txt\n", "import collections\n", "import operator\n", - "\n" + "\n", + "%matplotlib inline" ] }, { @@ -75,7 +76,7 @@ "CONFIG = load_config(CONFIG_FILE)\n", "\n", "# Load some properties from config.json\n", - "CONFIG_METADATA = sorted(load_tts_samples(CONFIG.datasets)[0])\n", + "CONFIG_METADATA = load_tts_samples(CONFIG.datasets)[0]\n", "CONFIG_METADATA = CONFIG_METADATA\n", "CONFIG_DATASET = CONFIG.datasets[0]\n", "CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n", @@ -84,7 +85,10 @@ "\n", "# Will be printed on generated output graph\n", "CONFIG_RUN_NAME = CONFIG.run_name\n", - "CONFIG_RUN_DESC = CONFIG.run_description" + "CONFIG_RUN_DESC = CONFIG.run_description\n", + "\n", + "# Needed to convert text to phonemes and phonemes to ids\n", + "tokenizer, config = TTSTokenizer.init_from_config(CONFIG)" ] }, { @@ -112,12 +116,13 @@ "source": [ "def get_phoneme_from_sequence(text):\n", " temp_list = []\n", - " if len(text[0]) > 0:\n", - " temp_text = text[0].rstrip('\\n')\n", + " if len(text[\"text\"]) > 0:\n", + " #temp_text = text[0].rstrip('\\n')\n", + " temp_text = text[\"text\"].rstrip('\\n')\n", " for rm_bad_chars in CHARS_TO_REMOVE:\n", " temp_text = temp_text.replace(rm_bad_chars,\"\")\n", - " seq = phoneme_to_sequence(temp_text, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n", - " text = sequence_to_phoneme(seq)\n", + " seq = tokenizer.text_to_ids(temp_text)\n", + " text = tokenizer.ids_to_text(seq)\n", " text = text.replace(\" \",\"\")\n", " temp_list.append(text)\n", " return temp_list" @@ -229,7 +234,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -243,7 +248,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.12" } }, "nbformat": 4,