├── Dataset ├── extract_clips.py ├── SEP-28k Dataset Download.ipynb └── Augmentation of Clips.ipynb ├── README.md ├── WordRep - Deep Learning.ipynb └── SoundRep - Deep Learning.ipynb /Dataset/extract_clips.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE file. 3 | # Copyright (C) 2021 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | """ 7 | For each podcast episode: 8 | * Get all clip information for that episode 9 | * Save each clip as a new wav file. 10 | """ 11 | 12 | import os 13 | import pathlib 14 | import subprocess 15 | 16 | import numpy as np 17 | import pandas as pd 18 | from scipy.io import wavfile 19 | 20 | import argparse 21 | 22 | parser = argparse.ArgumentParser(description='Extract clips from SEP-28k or FluencyBank.') 23 | parser.add_argument('--labels', type=str, required=True, 24 | help='Path to the labels csv files (e.g., SEP-28k_labels.csv)') 25 | parser.add_argument('--wavs', type=str, default="wavs", 26 | help='Path where audio files from download_audio.py are saved') 27 | parser.add_argument('--clips', type=str, default="clips", 28 | help='Path where clips should be extracted') 29 | parser.add_argument("--progress", action="store_true", 30 | help="Show progress") 31 | 32 | args = parser.parse_args() 33 | label_file = args.labels 34 | data_dir = args.wavs 35 | output_dir = args.clips 36 | 37 | 38 | # Load label/clip file 39 | data = pd.read_csv(label_file, dtype={"EpId":str}) 40 | 41 | # Get label columns from data file 42 | shows = data.Show 43 | episodes = data.EpId 44 | clip_idxs = data.ClipId 45 | starts = data.Start 46 | stops = data.Stop 47 | labels = data.iloc[:,5:].values 48 | 49 | n_items = len(shows) 50 | 51 | loaded_wav = "" 52 | cur_iter = range(n_items) 53 | if args.progress: 54 | from tqdm import tqdm 55 | cur_iter = tqdm(cur_iter) 56 | 57 | for i in cur_iter: 58 | clip_idx = clip_idxs[i] 59 | show_abrev = shows[i] 60 | episode = 
episodes[i].strip() 61 | 62 | # Setup paths 63 | wav_path = f"{data_dir}/{shows[i]}/{episode}.wav" 64 | clip_dir = pathlib.Path(f"{output_dir}/{show_abrev}/{episode}/") 65 | clip_path = f"{clip_dir}/{shows[i]}_{episode}_{clip_idx}.wav" 66 | 67 | if not os.path.exists(wav_path): 68 | print("Missing", wav_path) 69 | continue 70 | 71 | # Verify clip directory exists 72 | os.makedirs(clip_dir, exist_ok=True) 73 | 74 | # Load audio. For efficiency reasons don't reload if we've already open the file. 75 | if wav_path != loaded_wav: 76 | sample_rate, audio = wavfile.read(wav_path) 77 | assert sample_rate == 16000, "Sample rate must be 16 khz" 78 | 79 | # Keep track of the open file 80 | loaded_wav = wav_path 81 | 82 | # Save clip to file 83 | clip = audio[starts[i]:stops[i]] 84 | wavfile.write(clip_path, sample_rate, clip) 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stutter Detection and Classification 2 | 3 | Stuttering is a neuro-developmental speech disorder that interrupts the flow of speech due to involuntary pauses and sound repetitions. It has profound psychological impacts that affect social interactions and professional advancement. 4 | 5 | Automatically detecting stuttering events in speech recordings could help speech therapists and speech pathologists track the fluency of people who stutter (PWS). It could also help improve existing speech recognition systems for PWS. 6 | 7 | In this project, the SEP-28k dataset is used to perform a comparative analysis of the performance of various machine learning models in classifying the five dysfluency types, namely Prolongation, Interjection, Word Repetition, Sound Repetition and Blocks. 
8 | 9 | ## Research Paper 10 | Read the full publication of this work here: https://www.sciencedirect.com/science/article/pii/S2215016124005016 11 | 12 | ## Contributions: 13 | 14 | 1. Developing robust machine learning models for classifying the five classes of stutter: Interjection, Prolongation, Blocks, Sound Repetitions and Word Repetitions. 15 | 2. Conducting analysis on the impact of various features extracted manually as well as using pre-trained models. 16 | 3. Performing comparative analysis on each class of stutter using various machine learning models. 17 | 18 | ## Dataset 19 | The SEP-28k dataset, published by Apple Machine Learning Research, is used for training and evaluation. It consists of audio recordings from eight podcast shows and is annotated with various dysfluency types. The dataset is multi-label and multi-class, with annotations done by at least three annotators. 20 | 21 | Annotations: The SEP-28k dataset includes annotations for audio clips, labeled by at least three annotators. These labels cover various categories such as 'Unsure', 'PoorAudioQuality', 'Music', 'DifficultToUnderstand', 'Interjection', 'Prolongation', 'Block', 'WordRep', 'SoundRep', 'NoStutteredWords', 'NoSpeech', and 'NaturalPause'. Each 3-second clip can have multiple labels, making the dataset both multi-label and multi-class. 22 | 23 |

24 | image 25 |

26 | 27 | ## Preprocessing: 28 | The following preprocessing steps were done before proceeding further: 29 | - Deleted audio clips shorter than 3 seconds. 30 | - Ensured all audio clips had a 16kHz sampling rate. 31 | - Reduced the number of classes from the original 12 by removing those that did not contribute meaningfully. 32 | - Dropped the 'Unsure' column. 33 | - Removed clips labeled as music (introductory or concluding music). 34 | - Dropped the 'Music' column. 35 | - Removed clips labeled as 'DifficultToUnderstand' by two or more annotators. 36 | - Dropped the 'DifficultToUnderstand' column. 37 | - Handled class imbalance. 38 | 39 | ## Feature Extraction: 40 | The following features were extracted: 41 | - MFCCs (Mel-Frequency Cepstral Coefficients): Capture the power spectrum of audio signals. 42 | - Zero Crossing Rate: Measures the rate at which the signal changes sign. 43 | - Jitter: Measures the frequency variation from cycle to cycle. 44 | - Shimmer: Measures the amplitude variation from cycle to cycle. 45 | 46 | ## Model Training: 47 | After the extracted features are concatenated into an array, these features are given to the models for training. Individual models were considered for each class, namely K-Nearest Neighbors (KNN), Support Vector Machine (SVM), Random Forest, Decision Tree and Naive Bayes. All the above-mentioned models are trained for every class, and the model that gives the highest accuracy is chosen for that particular class. 48 | 49 | ## Hyperparameter Tuning: 50 | Hyperparameter tuning and cross-validation were applied to each of the models, i.e., KNN, Logistic Regression, Decision Tree, Random Forest, and SVM. Naïve Bayes was evaluated with 10-fold cross-validation only, as it has no significant hyperparameters to tune. 51 | 52 | ## Evaluation Metrics: 53 | Both training and testing were done on the SEP-28k dataset where the train to test ratio is 70:30. 
The main evaluation metric considered for choosing the models for each class is accuracy, as the data for each class was balanced. Other metrics considered are F1-score, accuracy, precision, recall, confusion matrix and ROC-AUC. 54 | 55 | ## Deployment: 56 | https://github.com/Ramitha-V/Stutter-Detection-and-Classification/assets/162662008/0f9b8ac6-bcea-439d-857a-e8e0754d3081 57 | -------------------------------------------------------------------------------- /Dataset/SEP-28k Dataset Download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4e08bf4d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import requests\n", 11 | "import pandas as pd\n", 12 | "import os\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "2d9dd988", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv(\"D:\\SEM 4\\Project\\\\SEP-28k_episodes.csv\", header=None)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "id": "d5ffacca", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df.columns = ['name', 'desc', 'link', 'podcast_name', 'pod_id']" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "id": "c1129cd6", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
namedesclinkpodcast_namepod_id
0He_Stutters_Podcast_–_Make_Room_For_The_Stutte...episode-208-with-kelsey-hhttps://stutterrockstar.files.wordpress.com/2...HeStutters0
1He_Stutters_Podcast_–_Make_Room_For_The_Stutte...episode-208-with-kelsey-hhttps://stutterrockstar.files.wordpress.com/2...HeStutters1
2He_Stutters_Podcast_–_Make_Room_For_The_Stutte...episode-208-with-kelsey-hhttps://stutterrockstar.files.wordpress.com/2...HeStutters2
3He_Stutters_Podcast_–_Make_Room_For_The_Stutte...episode-208-with-kelsey-hhttps://stutterrockstar.files.wordpress.com/2...HeStutters3
4He_Stutters_Podcast_–_Make_Room_For_The_Stutte...episode-208-with-kelsey-hhttps://stutterrockstar.files.wordpress.com/2...HeStutters4
..................
380Women_Who_Stutter_Podcast_–_Make_Room_For_The_...cool148https://stutterrockstar.files.wordpress.com/2...WomenWhoStutter105
381Women_Who_Stutter_Podcast_–_Make_Room_For_The_...cool148https://stutterrockstar.files.wordpress.com/2...WomenWhoStutter106
382Women_Who_Stutter_Podcast_–_Make_Room_For_The_...cool148https://stutterrockstar.files.wordpress.com/2...WomenWhoStutter107
383Women_Who_Stutter_Podcast_–_Make_Room_For_The_...cool148https://stutterrockstar.files.wordpress.com/2...WomenWhoStutter108
384Women_Who_Stutter_Podcast_–_Make_Room_For_The_...cool148https://stutterrockstar.files.wordpress.com/2...WomenWhoStutter109
\n", 161 | "

385 rows × 5 columns

\n", 162 | "
" 163 | ], 164 | "text/plain": [ 165 | " name \\\n", 166 | "0 He_Stutters_Podcast_–_Make_Room_For_The_Stutte... \n", 167 | "1 He_Stutters_Podcast_–_Make_Room_For_The_Stutte... \n", 168 | "2 He_Stutters_Podcast_–_Make_Room_For_The_Stutte... \n", 169 | "3 He_Stutters_Podcast_–_Make_Room_For_The_Stutte... \n", 170 | "4 He_Stutters_Podcast_–_Make_Room_For_The_Stutte... \n", 171 | ".. ... \n", 172 | "380 Women_Who_Stutter_Podcast_–_Make_Room_For_The_... \n", 173 | "381 Women_Who_Stutter_Podcast_–_Make_Room_For_The_... \n", 174 | "382 Women_Who_Stutter_Podcast_–_Make_Room_For_The_... \n", 175 | "383 Women_Who_Stutter_Podcast_–_Make_Room_For_The_... \n", 176 | "384 Women_Who_Stutter_Podcast_–_Make_Room_For_The_... \n", 177 | "\n", 178 | " desc \\\n", 179 | "0 episode-208-with-kelsey-h \n", 180 | "1 episode-208-with-kelsey-h \n", 181 | "2 episode-208-with-kelsey-h \n", 182 | "3 episode-208-with-kelsey-h \n", 183 | "4 episode-208-with-kelsey-h \n", 184 | ".. ... \n", 185 | "380 cool148 \n", 186 | "381 cool148 \n", 187 | "382 cool148 \n", 188 | "383 cool148 \n", 189 | "384 cool148 \n", 190 | "\n", 191 | " link podcast_name \\\n", 192 | "0 https://stutterrockstar.files.wordpress.com/2... HeStutters \n", 193 | "1 https://stutterrockstar.files.wordpress.com/2... HeStutters \n", 194 | "2 https://stutterrockstar.files.wordpress.com/2... HeStutters \n", 195 | "3 https://stutterrockstar.files.wordpress.com/2... HeStutters \n", 196 | "4 https://stutterrockstar.files.wordpress.com/2... HeStutters \n", 197 | ".. ... ... \n", 198 | "380 https://stutterrockstar.files.wordpress.com/2... WomenWhoStutter \n", 199 | "381 https://stutterrockstar.files.wordpress.com/2... WomenWhoStutter \n", 200 | "382 https://stutterrockstar.files.wordpress.com/2... WomenWhoStutter \n", 201 | "383 https://stutterrockstar.files.wordpress.com/2... WomenWhoStutter \n", 202 | "384 https://stutterrockstar.files.wordpress.com/2... 
WomenWhoStutter \n", 203 | "\n", 204 | " pod_id \n", 205 | "0 0 \n", 206 | "1 1 \n", 207 | "2 2 \n", 208 | "3 3 \n", 209 | "4 4 \n", 210 | ".. ... \n", 211 | "380 105 \n", 212 | "381 106 \n", 213 | "382 107 \n", 214 | "383 108 \n", 215 | "384 109 \n", 216 | "\n", 217 | "[385 rows x 5 columns]" 218 | ] 219 | }, 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "df" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 5, 232 | "id": "5a93449e", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "path = r\"D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\"" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 6, 242 | "id": "96bff8d8", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stderr", 247 | "output_type": "stream", 248 | "text": [ 249 | "Downloading: 100%|██████████████████████████| 53.8M/53.8M [01:34<00:00, 570kB/s]" 250 | ] 251 | }, 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Audio file downloaded successfully: downloaded_audio.mp3\n" 257 | ] 258 | }, 259 | { 260 | "name": "stderr", 261 | "output_type": "stream", 262 | "text": [ 263 | "\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "import requests\n", 269 | "from tqdm import tqdm # Import tqdm for the progress bar\n", 270 | "import pandas as pd\n", 271 | "\n", 272 | "url = \"https://stutterrockstar.files.wordpress.com/2011/05/male-episode-1-with-alan1.mp3\"\n", 273 | "output_file = \"downloaded_audio.mp3\"\n", 274 | "\n", 275 | "response = requests.get(url, stream=True)\n", 276 | "\n", 277 | "# Check if the request was successful (status code 200)\n", 278 | "if response.status_code == 200:\n", 279 | " # Get the total file size in bytes (content length)\n", 280 | " total_size = int(response.headers.get('content-length', 0))\n", 281 | "\n", 282 | " # Create a progress bar using tqdm\n", 283 | " with 
tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading', ncols=80) as bar:\n", 284 | " with open(output_file, 'wb') as f:\n", 285 | " for data in response.iter_content(chunk_size=1024):\n", 286 | " f.write(data)\n", 287 | " bar.update(len(data))\n", 288 | "\n", 289 | " print(f\"Audio file downloaded successfully: {output_file}\")\n", 290 | "else:\n", 291 | " print(f\"Failed to download audio file. Status code: {response.status_code}\")\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 7, 297 | "id": "9427ba0e", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "def download_audio_(row):\n", 302 | " url = row['link'].lstrip()\n", 303 | " audio_name = str(row['pod_id']) + \".mp3\"\n", 304 | " output_file = os.path.join(pod_path, audio_name)\n", 305 | " \n", 306 | " response = requests.get(url)\n", 307 | "\n", 308 | " \n", 309 | " if response.status_code == 200:\n", 310 | " with open(output_file, 'wb') as f:\n", 311 | " f.write(response.content)\n", 312 | " print(f\"Audio file downloaded successfully: {output_file}\")\n", 313 | " else:\n", 314 | " print(f\"Failed to download audio file. 
Status code: {response.status_code}\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 8, 320 | "id": "da72ea0a", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "def download_audio(row):\n", 325 | " url = row['link'].lstrip()\n", 326 | " audio_name = str(row['pod_id']) + \".mp3\"\n", 327 | " output_file = os.path.join(pod_path, audio_name)\n", 328 | " \n", 329 | " response = requests.get(url)\n", 330 | "\n", 331 | " \n", 332 | " if response.status_code == 200:\n", 333 | " # Get the total file size in bytes (content length)\n", 334 | " total_size = int(response.headers.get('content-length', 0))\n", 335 | "\n", 336 | " # Create a progress bar using tqdm\n", 337 | " with tqdm(total=total_size, unit='B', unit_scale=True, desc=f'Downloading {audio_name}', ncols=80) as bar:\n", 338 | " with open(output_file, 'wb') as f:\n", 339 | " for data in response.iter_content(chunk_size=1024):\n", 340 | " f.write(data)\n", 341 | " bar.update(len(data)) \n", 342 | " print(f\"Audio file downloaded successfully: {output_file}\")\n", 343 | " \n", 344 | " else:\n", 345 | " print(f\"Failed to download audio file. 
Status code: {response.status_code}\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 10, 351 | "id": "22c25ad8", 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stderr", 356 | "output_type": "stream", 357 | "text": [ 358 | "Downloading 0.mp3: 17.0MB [00:00, 589MB/s]\n" 359 | ] 360 | }, 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\0.mp3\n" 366 | ] 367 | }, 368 | { 369 | "name": "stderr", 370 | "output_type": "stream", 371 | "text": [ 372 | "Downloading 1.mp3: 38.8MB [00:00, 638MB/s]\n" 373 | ] 374 | }, 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\1.mp3\n" 380 | ] 381 | }, 382 | { 383 | "name": "stderr", 384 | "output_type": "stream", 385 | "text": [ 386 | "Downloading 2.mp3: 37.6MB [00:00, 639MB/s]\n" 387 | ] 388 | }, 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\2.mp3\n" 394 | ] 395 | }, 396 | { 397 | "name": "stderr", 398 | "output_type": "stream", 399 | "text": [ 400 | "Downloading 3.mp3: 40.2MB [00:00, 235MB/s]\n" 401 | ] 402 | }, 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\3.mp3\n" 408 | ] 409 | }, 410 | { 411 | "name": "stderr", 412 | "output_type": "stream", 413 | "text": [ 414 | "Downloading 4.mp3: 43.2MB [00:00, 278MB/s]\n" 415 | ] 416 | }, 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\4.mp3\n" 422 | ] 423 | }, 424 | { 425 | "name": 
"stderr", 426 | "output_type": "stream", 427 | "text": [ 428 | "Downloading 5.mp3: 60.6MB [00:00, 253MB/s]\n" 429 | ] 430 | }, 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\5.mp3\n" 436 | ] 437 | }, 438 | { 439 | "name": "stderr", 440 | "output_type": "stream", 441 | "text": [ 442 | "Downloading 6.mp3: 55.0MB [00:00, 262MB/s]\n" 443 | ] 444 | }, 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\6.mp3\n" 450 | ] 451 | }, 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "Downloading 7.mp3: 74.8MB [00:00, 296MB/s]\n" 457 | ] 458 | }, 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\7.mp3\n" 464 | ] 465 | }, 466 | { 467 | "name": "stderr", 468 | "output_type": "stream", 469 | "text": [ 470 | "Downloading 8.mp3: 16.0MB [00:00, 138MB/s]\n" 471 | ] 472 | }, 473 | { 474 | "name": "stdout", 475 | "output_type": "stream", 476 | "text": [ 477 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\8.mp3\n" 478 | ] 479 | }, 480 | { 481 | "name": "stderr", 482 | "output_type": "stream", 483 | "text": [ 484 | "Downloading 9.mp3: 35.4MB [00:00, 206MB/s]\n" 485 | ] 486 | }, 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\9.mp3\n" 492 | ] 493 | }, 494 | { 495 | "name": "stderr", 496 | "output_type": "stream", 497 | "text": [ 498 | "Downloading 10.mp3: 41.3MB [00:00, 215MB/s]\n" 499 | ] 500 | }, 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 
505 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\10.mp3\n" 506 | ] 507 | }, 508 | { 509 | "name": "stderr", 510 | "output_type": "stream", 511 | "text": [ 512 | "Downloading 11.mp3: 27.1MB [00:00, 192MB/s]\n" 513 | ] 514 | }, 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\11.mp3\n" 520 | ] 521 | }, 522 | { 523 | "name": "stderr", 524 | "output_type": "stream", 525 | "text": [ 526 | "Downloading 12.mp3: 20.5MB [00:00, 174MB/s]\n" 527 | ] 528 | }, 529 | { 530 | "name": "stdout", 531 | "output_type": "stream", 532 | "text": [ 533 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\12.mp3\n" 534 | ] 535 | }, 536 | { 537 | "name": "stderr", 538 | "output_type": "stream", 539 | "text": [ 540 | "Downloading 13.mp3: 24.7MB [00:00, 190MB/s]\n" 541 | ] 542 | }, 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\13.mp3\n" 548 | ] 549 | }, 550 | { 551 | "name": "stderr", 552 | "output_type": "stream", 553 | "text": [ 554 | "Downloading 14.mp3: 22.4MB [00:00, 178MB/s]\n" 555 | ] 556 | }, 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\14.mp3\n" 562 | ] 563 | }, 564 | { 565 | "name": "stderr", 566 | "output_type": "stream", 567 | "text": [ 568 | "Downloading 15.mp3: 36.8MB [00:00, 350MB/s]\n" 569 | ] 570 | }, 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\15.mp3\n" 576 | ] 577 | }, 578 | { 579 | "name": "stderr", 580 | "output_type": "stream", 581 | 
"text": [ 582 | "Downloading 16.mp3: 41.7MB [00:00, 238MB/s]\n" 583 | ] 584 | }, 585 | { 586 | "name": "stdout", 587 | "output_type": "stream", 588 | "text": [ 589 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\16.mp3\n" 590 | ] 591 | }, 592 | { 593 | "name": "stderr", 594 | "output_type": "stream", 595 | "text": [ 596 | "Downloading 17.mp3: 226MB [00:00, 325MB/s] \n" 597 | ] 598 | }, 599 | { 600 | "name": "stdout", 601 | "output_type": "stream", 602 | "text": [ 603 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\17.mp3\n" 604 | ] 605 | }, 606 | { 607 | "name": "stderr", 608 | "output_type": "stream", 609 | "text": [ 610 | "Downloading 18.mp3: 48.3MB [00:00, 243MB/s]\n" 611 | ] 612 | }, 613 | { 614 | "name": "stdout", 615 | "output_type": "stream", 616 | "text": [ 617 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\18.mp3\n" 618 | ] 619 | }, 620 | { 621 | "name": "stderr", 622 | "output_type": "stream", 623 | "text": [ 624 | "Downloading 19.mp3: 43.0MB [00:00, 243MB/s]\n" 625 | ] 626 | }, 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "Audio file downloaded successfully: D:\\\\SEM 4\\Project\\\\StutterDataset\\\\wavs\\StutterTalk\\19.mp3\n" 632 | ] 633 | } 634 | ], 635 | "source": [ 636 | "for index, row in df.iloc[108:128].iterrows():\n", 637 | " pod_name = row['podcast_name'].lstrip()\n", 638 | " pod_path = os.path.join(path, pod_name)\n", 639 | " if os.path.exists(pod_path):\n", 640 | " download_audio(row)\n", 641 | " time.sleep(5)\n", 642 | " \n", 643 | " else:\n", 644 | " os.makedirs(pod_path)\n", 645 | " download_audio(row)\n", 646 | " time.sleep(5)\n", 647 | "\n", 648 | " " 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "id": "a4435633", 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [] 658 | }, 659 | { 660 | 
"cell_type": "code", 661 | "execution_count": null, 662 | "id": "198d9929", 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [] 666 | } 667 | ], 668 | "metadata": { 669 | "kernelspec": { 670 | "display_name": "Python 3 (ipykernel)", 671 | "language": "python", 672 | "name": "python3" 673 | }, 674 | "language_info": { 675 | "codemirror_mode": { 676 | "name": "ipython", 677 | "version": 3 678 | }, 679 | "file_extension": ".py", 680 | "mimetype": "text/x-python", 681 | "name": "python", 682 | "nbconvert_exporter": "python", 683 | "pygments_lexer": "ipython3", 684 | "version": "3.11.3" 685 | } 686 | }, 687 | "nbformat": 4, 688 | "nbformat_minor": 5 689 | } 690 | -------------------------------------------------------------------------------- /Dataset/Augmentation of Clips.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "id": "9d3fcce2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Importing the libraries\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import librosa\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import os\n", 16 | "import soundfile as sf\n", 17 | "import random" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 10, 23 | "id": "b556fa5f", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# # Load DataFrame\n", 28 | "df = pd.read_csv('D:\\SEM 4\\Project\\\\SEP-28k_labels.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 34 | "id": "160b2796", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
ShowEpIdClipIdStartStopUnsurePoorAudioQualityProlongationBlockSoundRepWordRepDifficultToUnderstandInterjectionNoStutteredWordsNaturalPauseMusicNoSpeech
0HeStutters003190032031948320000000003100
1HeStutters013197712032025120000000003100
2HeStutters023480976034857760000000003000
3HeStutters033520064035248640001000002000
4HeStutters043572192035769920000000003000
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " Show EpId ClipId Start Stop Unsure PoorAudioQuality \\\n", 184 | "0 HeStutters 0 0 31900320 31948320 0 0 \n", 185 | "1 HeStutters 0 1 31977120 32025120 0 0 \n", 186 | "2 HeStutters 0 2 34809760 34857760 0 0 \n", 187 | "3 HeStutters 0 3 35200640 35248640 0 0 \n", 188 | "4 HeStutters 0 4 35721920 35769920 0 0 \n", 189 | "\n", 190 | " Prolongation Block SoundRep WordRep DifficultToUnderstand \\\n", 191 | "0 0 0 0 0 0 \n", 192 | "1 0 0 0 0 0 \n", 193 | "2 0 0 0 0 0 \n", 194 | "3 1 0 0 0 0 \n", 195 | "4 0 0 0 0 0 \n", 196 | "\n", 197 | " Interjection NoStutteredWords NaturalPause Music NoSpeech \n", 198 | "0 0 3 1 0 0 \n", 199 | "1 0 3 1 0 0 \n", 200 | "2 0 3 0 0 0 \n", 201 | "3 0 2 0 0 0 \n", 202 | "4 0 3 0 0 0 " 203 | ] 204 | }, 205 | "execution_count": 11, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "df.head()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "id": "fc95f46b", 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "array(['HeStutters', 'HVSA', 'IStutterSoWhat', 'MyStutteringLife',\n", 224 | " 'StrongVoices', 'StutterTalk', 'StutteringIsCool',\n", 225 | " 'WomenWhoStutter'], dtype=object)" 226 | ] 227 | }, 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "df['Show'].unique()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 13, 240 | "id": "18a17fd2", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "#Reading the audio files\n", 245 | "main_folder = 'D:\\\\SEM 4\\\\Project\\\\sep28k-final'\n", 246 | "def list_files(directory):\n", 247 | " for root, _, files in os.walk(directory):\n", 248 | " for file in files:\n", 249 | " yield os.path.join(root, file)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 17, 255 | "id": "d7ffddf9", 
256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "def count_files(directory):\n", 260 | " file_count = 0\n", 261 | " for root, _, files in os.walk(directory):\n", 262 | " file_count += len(files)\n", 263 | " return file_count" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 18, 269 | "id": "a03ede16", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "Number of files in main directory: 21836\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "num_files = count_files(main_folder)\n", 282 | "print(\"Number of files in main directory:\", num_files)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 8, 288 | "id": "f570fecf", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "def remove_short_clips(directory):\n", 293 | " removed_count = 0\n", 294 | " for file_path in list_files(directory):\n", 295 | " # Load audio and get its duration\n", 296 | " audio, sr = librosa.load(file_path, sr=None)\n", 297 | " duration = librosa.get_duration(y=audio, sr=sr)\n", 298 | " # Check if duration is less than 3 seconds\n", 299 | " if duration < 3.0:\n", 300 | " # Remove the file\n", 301 | " os.remove(file_path)\n", 302 | " removed_count += 1\n", 303 | " print(f\"Removed {file_path} (duration: {duration:.2f} seconds)\")\n", 304 | " return removed_count\n" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 9, 310 | "id": "b0ff8305", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "Removed D:\\SEM 4\\Project\\sep28k-final\\HeStutters\\17\\HeStutters_17_0.wav (duration: 2.90 seconds)\n", 318 | "Removed D:\\SEM 4\\Project\\sep28k-final\\HeStutters\\2\\HeStutters_2_41.wav (duration: 2.70 seconds)\n", 319 | "Removed D:\\SEM 4\\Project\\sep28k-final\\HVSA\\0\\HVSA_0_0.wav (duration: 2.63 seconds)\n", 320 | "Removed D:\\SEM 
4\\Project\\sep28k-final\\HVSA\\3\\HVSA_3_37.wav (duration: 2.86 seconds)\n", 321 | "Removed D:\\SEM 4\\Project\\sep28k-final\\IStutterSoWhat\\2\\IStutterSoWhat_2_37.wav (duration: 2.88 seconds)\n", 322 | "Removed D:\\SEM 4\\Project\\sep28k-final\\MyStutteringLife\\0\\MyStutteringLife_0_10.wav (duration: 2.56 seconds)\n", 323 | "Removed D:\\SEM 4\\Project\\sep28k-final\\MyStutteringLife\\18\\MyStutteringLife_18_1.wav (duration: 2.69 seconds)\n", 324 | "Removed D:\\SEM 4\\Project\\sep28k-final\\MyStutteringLife\\23\\MyStutteringLife_23_3.wav (duration: 2.69 seconds)\n", 325 | "Removed D:\\SEM 4\\Project\\sep28k-final\\MyStutteringLife\\7\\MyStutteringLife_7_13.wav (duration: 2.73 seconds)\n", 326 | "Removed D:\\SEM 4\\Project\\sep28k-final\\StutterTalk\\5\\StutterTalk_5_8.wav (duration: 2.83 seconds)\n", 327 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\101\\WomenWhoStutter_101_0.wav (duration: 2.58 seconds)\n", 328 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\101\\WomenWhoStutter_101_35.wav (duration: 2.58 seconds)\n", 329 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\47\\WomenWhoStutter_47_1.wav (duration: 2.63 seconds)\n", 330 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\73\\WomenWhoStutter_73_0.wav (duration: 2.55 seconds)\n", 331 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\74\\WomenWhoStutter_74_0.wav (duration: 2.54 seconds)\n", 332 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\80\\WomenWhoStutter_80_9.wav (duration: 2.58 seconds)\n", 333 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\85\\WomenWhoStutter_85_0.wav (duration: 2.55 seconds)\n", 334 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\86\\WomenWhoStutter_86_0.wav (duration: 2.55 seconds)\n", 335 | "Removed D:\\SEM 4\\Project\\sep28k-final\\WomenWhoStutter\\87\\WomenWhoStutter_87_0.wav (duration: 2.56 seconds)\n", 336 | "Removed D:\\SEM 
4\\Project\\sep28k-final\\WomenWhoStutter\\97\\WomenWhoStutter_97_0.wav (duration: 2.59 seconds)\n", 337 | "Total files removed: 20\n" 338 | ] 339 | } 340 | ], 341 | "source": [ 342 | "removed_count = remove_short_clips(main_folder)\n", 343 | "print(\"Total files removed:\", removed_count)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 10, 349 | "id": "bed7c219", 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "Number of files after removal: 21836\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "num_files_after = count_files(main_folder)\n", 362 | "print(\"Number of files after removal:\", num_files_after)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 11, 368 | "id": "4efe6e8a", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def check_sampling_rate(directory, target_sr=16000):\n", 373 | " non_matching_files = []\n", 374 | " for file_path in list_files(directory):\n", 375 | " # Load audio and get its sampling rate\n", 376 | " audio, sr = librosa.load(file_path, sr=None)\n", 377 | " # Check if sampling rate is not the target rate\n", 378 | " if sr != target_sr:\n", 379 | " non_matching_files.append((file_path, sr))\n", 380 | " return non_matching_files" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 12, 386 | "id": "812f048e", 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "# Check sampling rate of audio files\n", 391 | "non_matching_files = check_sampling_rate(main_folder)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 13, 397 | "id": "f42d92b0", 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "All files have the desired sampling rate (16000)\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "if non_matching_files:\n", 410 | " print(\"Files with 
non-matching sampling rates:\")\n", 411 | " for file_path, sr in non_matching_files:\n", 412 | " print(f\"{file_path}: Sampling rate = {sr}\")\n", 413 | "else:\n", 414 | " print(\"All files have the desired sampling rate (16000)\")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 14, 420 | "id": "a5c2cb78", 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "Number of files after checking: 21836\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "num_files_after = count_files(main_folder)\n", 433 | "print(\"Number of files after checking:\", num_files_after)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 15, 439 | "id": "9bbd0968", 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "def augment_all_files_music(input_folder, output_folder, music_folder):\n", 444 | " # Create the output folder if it doesn't exist\n", 445 | " if not os.path.exists(output_folder):\n", 446 | " os.makedirs(output_folder)\n", 447 | " \n", 448 | " # Loop through all files in the input folder and its subdirectories\n", 449 | " for root, _, files in os.walk(input_folder):\n", 450 | " # Create corresponding output subdirectory structure\n", 451 | " output_subfolder = os.path.join(output_folder, os.path.relpath(root, input_folder))\n", 452 | " os.makedirs(output_subfolder, exist_ok=True)\n", 453 | " \n", 454 | " for file_name in files:\n", 455 | " if file_name.endswith('.wav'): # Assuming all files are in WAV format\n", 456 | " # Construct the full paths for input and output files\n", 457 | " input_file_path = os.path.join(root, file_name)\n", 458 | " output_file_path = os.path.join(output_subfolder, f\"{os.path.splitext(file_name)[0]}_aug.wav\")\n", 459 | " \n", 460 | " # Load stuttered speech sample (ensure duration is 3 seconds)\n", 461 | " audio, sr = librosa.load(input_file_path, sr=None, duration=3.0)\n", 462 | " \n", 463 | " # Randomly 
select a music file from the music folder\n", 464 | " music_files = os.listdir(music_folder)\n", 465 | " selected_music_file = np.random.choice(music_files)\n", 466 | " music_file = os.path.join(music_folder, selected_music_file)\n", 467 | " \n", 468 | " # Load selected music file\n", 469 | " music, sr_music = librosa.load(music_file, sr=None, duration=len(audio)/sr)\n", 470 | " \n", 471 | " # Perform augmentation\n", 472 | " snr = np.random.uniform(5, 15)\n", 473 | " scaling_factor = np.sqrt(np.var(audio) / (np.var(music) * 10 ** (snr / 10)))\n", 474 | " augmented_audio = audio + scaling_factor * music\n", 475 | " \n", 476 | " # Write augmented audio to the specified output path\n", 477 | " sf.write(output_file_path, augmented_audio, sr)\n" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 16, 483 | "id": "3b1f1292", 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "def augment_all_files_noise(input_folder, output_folder, noise_folder, snr_range=(0, 15)):\n", 488 | " # Create the output folder if it doesn't exist\n", 489 | " if not os.path.exists(output_folder):\n", 490 | " os.makedirs(output_folder)\n", 491 | " \n", 492 | " # Loop through all files in the input folder and its subdirectories\n", 493 | " for root, _, files in os.walk(input_folder):\n", 494 | " # Create corresponding output subdirectory structure\n", 495 | " output_subfolder = os.path.join(output_folder, os.path.relpath(root, input_folder))\n", 496 | " os.makedirs(output_subfolder, exist_ok=True)\n", 497 | " \n", 498 | " for file_name in files:\n", 499 | " if file_name.endswith('.wav'): # Assuming all files are in WAV format\n", 500 | " # Construct the full paths for input and output files\n", 501 | " input_file_path = os.path.join(root, file_name)\n", 502 | " output_file_path = os.path.join(output_subfolder, f\"{os.path.splitext(file_name)[0]}_noise_aug.wav\")\n", 503 | " \n", 504 | " # Load stuttered speech sample (ensure duration is 3 seconds)\n", 505 
| " audio, sr = librosa.load(input_file_path, sr=None, duration=3.0)\n", 506 | " \n", 507 | " # Initialize augmented audio\n", 508 | " augmented_audio = np.copy(audio)\n", 509 | " \n", 510 | " # Randomly select a noise file from the noise folder\n", 511 | " noise_files = os.listdir(noise_folder)\n", 512 | " selected_noise_file = np.random.choice(noise_files)\n", 513 | " noise_file = os.path.join(noise_folder, selected_noise_file)\n", 514 | " \n", 515 | " # Load selected noise file with duration matching the length of the stuttered speech clip\n", 516 | " noise, sr_noise = librosa.load(noise_file, sr=None, duration=3.0)\n", 517 | " \n", 518 | " # Ensure noise matches the length of the audio segment\n", 519 | " while len(noise) < len(audio):\n", 520 | " noise = np.concatenate([noise, noise[:len(audio) - len(noise)]])\n", 521 | " \n", 522 | " # Add noise at 1-second intervals\n", 523 | " for i in range(sr, len(audio), sr):\n", 524 | " # Compute SNR\n", 525 | " snr = np.random.uniform(snr_range[0], snr_range[1])\n", 526 | " \n", 527 | " # Compute scaling factor for noise\n", 528 | " scaling_factor = np.sqrt(np.var(audio[i-sr:i]) / (np.var(noise) * 10 ** (snr / 10)))\n", 529 | " \n", 530 | " # Add scaled noise to audio\n", 531 | " augmented_audio[i-sr:i] += scaling_factor * noise[i-sr:i]\n", 532 | " \n", 533 | " # Write augmented audio to the specified output path\n", 534 | " sf.write(output_file_path, augmented_audio, sr)\n" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 17, 540 | "id": "50e7bfb7", 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "def augment_all_files_babble(input_folder, output_folder, speech_folder, snr_range=(13, 20), num_speakers_range=(3, 7)):\n", 545 | " # Create the output folder if it doesn't exist\n", 546 | " if not os.path.exists(output_folder):\n", 547 | " os.makedirs(output_folder)\n", 548 | " \n", 549 | " # Loop through all files in the input folder and its subdirectories\n", 550 | " for 
root, _, files in os.walk(input_folder):\n", 551 | " # Create corresponding output subdirectory structure\n", 552 | " output_subfolder = os.path.join(output_folder, os.path.relpath(root, input_folder))\n", 553 | " os.makedirs(output_subfolder, exist_ok=True)\n", 554 | " \n", 555 | " for file_name in files:\n", 556 | " if file_name.endswith('.wav'): # Assuming all files are in WAV format\n", 557 | " # Construct the full paths for input and output files\n", 558 | " input_file_path = os.path.join(root, file_name)\n", 559 | " output_file_path = os.path.join(output_subfolder, f\"{os.path.splitext(file_name)[0]}_babble_aug.wav\")\n", 560 | " \n", 561 | " # Load stuttered speech sample (ensure duration is 3 seconds)\n", 562 | " audio, sr = librosa.load(input_file_path, sr=None, duration=3.0)\n", 563 | " \n", 564 | " # Initialize augmented audio\n", 565 | " augmented_audio = np.copy(audio)\n", 566 | " \n", 567 | " # Randomly select speech files from the speech folder to create babble\n", 568 | " babble_audio = np.zeros_like(audio)\n", 569 | " num_speakers = random.randint(num_speakers_range[0], num_speakers_range[1])\n", 570 | " for _ in range(num_speakers):\n", 571 | " speech_files = os.listdir(speech_folder)\n", 572 | " selected_speech_file = np.random.choice(speech_files)\n", 573 | " speech_file = os.path.join(speech_folder, selected_speech_file)\n", 574 | " \n", 575 | " # Load selected speech file with duration matching the length of the stuttered speech clip\n", 576 | " speech, _ = librosa.load(speech_file, sr=None, duration=3.0)\n", 577 | " \n", 578 | " # Ensure speech matches the length of the audio segment\n", 579 | " while len(speech) < len(audio):\n", 580 | " speech = np.concatenate([speech, speech[:len(audio) - len(speech)]])\n", 581 | " \n", 582 | " # Mix speech\n", 583 | " babble_audio += speech\n", 584 | " \n", 585 | " # Compute SNR\n", 586 | " snr = np.random.uniform(snr_range[0], snr_range[1])\n", 587 | " \n", 588 | " # Compute scaling factor for babble\n", 
589 | " scaling_factor = np.sqrt(np.var(audio) / (np.var(babble_audio) * 10 ** (snr / 10)))\n", 590 | " \n", 591 | " # Add scaled babble to audio\n", 592 | " augmented_audio += scaling_factor * babble_audio\n", 593 | " \n", 594 | " # Write augmented audio to the specified output path\n", 595 | " sf.write(output_file_path, augmented_audio, sr)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 14, 601 | "id": "d9e52a6f", 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "def augment_all_files_rirs(input_folder, output_folder, rirs_folder):\n", 606 | " # Create the output folder if it doesn't exist\n", 607 | " if not os.path.exists(output_folder):\n", 608 | " os.makedirs(output_folder)\n", 609 | " \n", 610 | " # Loop through all files in the input folder and its subdirectories\n", 611 | " for root, _, files in os.walk(input_folder):\n", 612 | " # Create corresponding output subdirectory structure\n", 613 | " output_subfolder = os.path.join(output_folder, os.path.relpath(root, input_folder))\n", 614 | " os.makedirs(output_subfolder, exist_ok=True)\n", 615 | " \n", 616 | " for file_name in files:\n", 617 | " if file_name.endswith('.wav'): # Assuming all files are in WAV format\n", 618 | " # Construct the full paths for input and output files\n", 619 | " input_file_path = os.path.join(root, file_name)\n", 620 | " output_file_path = os.path.join(output_subfolder, f\"{os.path.splitext(file_name)[0]}_rirs_aug.wav\")\n", 621 | " \n", 622 | " # Load stuttered speech sample (ensure duration is 3 seconds)\n", 623 | " audio, sr = librosa.load(input_file_path, sr=None, duration=3.0)\n", 624 | " \n", 625 | " # Randomly select an RIR file from the folder\n", 626 | " rirs_files = os.listdir(rirs_folder)\n", 627 | " selected_rirs_file = np.random.choice(rirs_files)\n", 628 | " rirs_file = os.path.join(rirs_folder, selected_rirs_file)\n", 629 | " \n", 630 | " # Load selected RIR\n", 631 | " rirs, sr_rirs = librosa.load(rirs_file, 
sr=None)\n", 632 | " \n", 633 | " # Ensure RIRs is mono\n", 634 | " if len(rirs.shape) > 1:\n", 635 | " rirs = rirs[:, 0]\n", 636 | " \n", 637 | " # Convolve audio with RIRs\n", 638 | " augmented_audio = np.convolve(audio, rirs, mode='same')\n", 639 | " \n", 640 | " # Write augmented audio to the specified output path\n", 641 | " sf.write(output_file_path, augmented_audio, sr)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 15, 647 | "id": "77b7f524", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "input_folder = \"D:\\\\SEM 4\\\\Project\\\\sep28k-final\"\n", 652 | "output_folder_music = \"D:\\\\SEM 4\\\\Project\\\\augmented\\\\music\"\n", 653 | "output_folder_noise = \"D:\\\\SEM 4\\\\Project\\\\augmented\\\\noise\"\n", 654 | "output_folder_babble = \"D:\\\\SEM 4\\\\Project\\\\augmented\\\\babble\"\n", 655 | "output_folder_rirs = \"D:\\\\SEM 4\\\\Project\\\\augmented\\\\rirs\"\n", 656 | "music_folder = \"D:\\\\SEM 4\\\\Project\\\\musan\\\\music\"\n", 657 | "noise_folder=\"D:\\\\SEM 4\\\\Project\\\\musan\\\\noise\"\n", 658 | "speech_folder=\"D:\\\\SEM 4\\\\Project\\\\musan\\\\speech\"\n", 659 | "rirs_folder = \"D:\\\\SEM 4\\\\Project\\\\musan\\\\rirs\"" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 20, 665 | "id": "993e06bb", 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "augment_all_files_music(input_folder, output_folder_music, music_folder)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 21, 675 | "id": "55f28917", 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "augment_all_files_noise(input_folder, output_folder_noise, noise_folder)" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 22, 685 | "id": "83a0d505", 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [ 689 | "augment_all_files_babble(input_folder,output_folder_babble,speech_folder)" 690 | ] 691 | }, 692 | { 693 | 
"cell_type": "code", 694 | "execution_count": 16, 695 | "id": "9f330998", 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "augment_all_files_rirs(input_folder,output_folder_rirs,rirs_folder)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 18, 705 | "id": "873486ba", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "def count_wav_files(folder):\n", 710 | " count = 0\n", 711 | " for root, _, files in os.walk(folder):\n", 712 | " for file_name in files:\n", 713 | " if file_name.endswith('.wav'):\n", 714 | " count += 1\n", 715 | " return count" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 42, 721 | "id": "2ec0acde", 722 | "metadata": {}, 723 | "outputs": [ 724 | { 725 | "name": "stdout", 726 | "output_type": "stream", 727 | "text": [ 728 | "Number of WAV files: 20091\n" 729 | ] 730 | } 731 | ], 732 | "source": [ 733 | "number_of_files = count_wav_files(output_folder_music)\n", 734 | "print(\"Number of WAV files:\", number_of_files)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 20, 740 | "id": "1631e19f", 741 | "metadata": {}, 742 | "outputs": [ 743 | { 744 | "name": "stdout", 745 | "output_type": "stream", 746 | "text": [ 747 | "Number of WAV files: 21836\n" 748 | ] 749 | } 750 | ], 751 | "source": [ 752 | "number_of_files = count_wav_files(output_folder_noise)\n", 753 | "print(\"Number of WAV files:\", number_of_files)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 21, 759 | "id": "b873e174", 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "name": "stdout", 764 | "output_type": "stream", 765 | "text": [ 766 | "Number of WAV files: 21836\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "number_of_files = count_wav_files(output_folder_babble)\n", 772 | "print(\"Number of WAV files:\", number_of_files)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 19, 778 | "id": 
"f0315eb9", 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "name": "stdout", 783 | "output_type": "stream", 784 | "text": [ 785 | "Number of WAV files: 21836\n" 786 | ] 787 | } 788 | ], 789 | "source": [ 790 | "number_of_files = count_wav_files(output_folder_rirs)\n", 791 | "print(\"Number of WAV files:\", number_of_files) " 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "id": "63af372c", 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [] 801 | } 802 | ], 803 | "metadata": { 804 | "kernelspec": { 805 | "display_name": "Python 3 (ipykernel)", 806 | "language": "python", 807 | "name": "python3" 808 | }, 809 | "language_info": { 810 | "codemirror_mode": { 811 | "name": "ipython", 812 | "version": 3 813 | }, 814 | "file_extension": ".py", 815 | "mimetype": "text/x-python", 816 | "name": "python", 817 | "nbconvert_exporter": "python", 818 | "pygments_lexer": "ipython3", 819 | "version": "3.11.3" 820 | } 821 | }, 822 | "nbformat": 4, 823 | "nbformat_minor": 5 824 | } 825 | -------------------------------------------------------------------------------- /WordRep - Deep Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4512109-72c3-4639-9f84-c4ded9559659", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import os\n", 12 | "import librosa\n", 13 | "import librosa.display\n", 14 | "import pandas as pd\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import sklearn.preprocessing\n", 17 | "from scipy.signal import lfilter\n", 18 | "from joblib import Parallel, delayed\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from sklearn.ensemble import RandomForestClassifier\n", 21 | "from sklearn.model_selection import GridSearchCV\n", 22 | "import parselmouth\n", 23 | "from sklearn.decomposition import PCA" 24 | ] 25 | 
}, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "e26e9fbd-fd8d-4a1b-8dc3-d9a68e8ffbb8", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv(r\"C:\\Users\\jmdgo\\Downloads\\binary_labeled_dataset.csv\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "82127c18-f3c7-4c59-91e1-96ea6e2d7acf", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "clip_path = r\"C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extracted_clips\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "id": "c8e02825-5606-4530-ae17-0e5eea811e28", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "audio_paths = []\n", 54 | "for folder in os.listdir(clip_path):\n", 55 | " folder_path = os.path.join(clip_path, folder)\n", 56 | " #episode_paths = []\n", 57 | " for episode in os.listdir(folder_path):\n", 58 | " episode_path = os.path.join(folder_path, episode)\n", 59 | " for wav in os.listdir(episode_path):\n", 60 | " wav_path = os.path.join(episode_path, wav)\n", 61 | " audio_paths.append(wav_path)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "id": "cded2107-5a65-41cc-80fd-7862a35a912a", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df['audio_path'] = ['default'] * len(df)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "id": "49178e08-ce96-4858-97fa-387a1c7106f0", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "for path in audio_paths:\n", 82 | " filename = os.path.basename(path)\n", 83 | " parts = filename.split('_')\n", 84 | " podcast_name = parts[0]\n", 85 | " episode_number = int(parts[1])\n", 86 | " clip_number = int(parts[2].split('.')[0]) \n", 87 | "\n", 88 | " match_row = df[(df['Show'] == podcast_name) & (df['EpId'] == episode_number) & (df['ClipId'] == clip_number)]\n", 89 | " if not match_row.empty:\n", 90 | " match_index = 
match_row.index[0]\n", 91 | " df.at[match_index, 'audio_path'] = path" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "id": "1a039cfe-617b-4fab-9f73-9cf73262b00a", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "0 60.677123\n", 105 | "1 39.322877\n", 106 | "Name: Stutter, dtype: float64\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "value_counts = df['Stutter'].value_counts()\n", 112 | "total_rows = df.shape[0]\n", 113 | "\n", 114 | "percentage_per_class = (value_counts / total_rows) * 100\n", 115 | "\n", 116 | "print(percentage_per_class)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 9, 122 | "id": "99d679ea-5661-4f3d-912c-f8bcec07633e", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "def extract_mfcc(audio_file, num_mfcc=40):\n", 127 | " audio, sr = librosa.load(audio_file, sr=None)\n", 128 | " mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)\n", 129 | " mfccs_flat = mfccs.T.flatten()\n", 130 | " return mfccs_flat\n", 131 | "\n", 132 | "def extract_zcr(audio_file, frame_length=2048, hop_length=512):\n", 133 | " audio, sr = librosa.load(audio_file, sr=None)\n", 134 | " zcr = librosa.feature.zero_crossing_rate(audio, frame_length=frame_length, hop_length=hop_length)\n", 135 | " return zcr.flatten()\n", 136 | "\n", 137 | "def extract_jitter_shimmer(audio_file):\n", 138 | " sound = parselmouth.Sound(audio_file)\n", 139 | " pitch = sound.to_pitch()\n", 140 | " pulses = parselmouth.praat.call([sound, pitch], \"To PointProcess (cc)\")\n", 141 | " jitter = parselmouth.praat.call(pulses, \"Get jitter (local)\", 0, 0, 0.0001, 0.02, 1.3)\n", 142 | " shimmer = parselmouth.praat.call([sound, pulses], \"Get shimmer (local)\", 0, 0, 0.0001, 0.02, 1.3, 1.6)\n", 143 | " return jitter, shimmer\n", 144 | "\n", 145 | "def extract_all_features(audio_file, num_mfcc=40, frame_length=2048, 
hop_length=512):\n", 146 | " mfcc_features = extract_mfcc(audio_file, num_mfcc)\n", 147 | " zcr_features = extract_zcr(audio_file, frame_length, hop_length)\n", 148 | " jitter, shimmer = extract_jitter_shimmer(audio_file)\n", 149 | " return np.concatenate([mfcc_features, zcr_features, [jitter, shimmer]])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "id": "1986acfd-999f-4b6d-a5ef-83c4283fa41e", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "audio_files = df['audio_path'] \n", 160 | "labels = df['WordRep']" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 11, 166 | "id": "f18f0517-edf7-48ac-beb9-455c724add88", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df_WordRep = df[['audio_path', 'WordRep']]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 12, 176 | "id": "6a98512a-db92-42d3-be8d-7faf74367b19", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "df_WordRep_1 = df_WordRep[df_WordRep['WordRep']==1]" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 13, 186 | "id": "4f4f7e7f-0148-4642-be12-1366c45aa674", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "df_WordRep_0 = df[(df['Prolongation']==0) & \n", 191 | " (df['Block']==0) & \n", 192 | " (df['Interjection']==0) & \n", 193 | " (df['WordRep']==0) & \n", 194 | " (df['SoundRep']==0) &\n", 195 | " (df['NoStutteredWords']==1)]\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 14, 201 | "id": "58215a72-ffed-45fc-a98d-c83c3d79b42a", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "(2358, 2)" 208 | ] 209 | }, 210 | "execution_count": 14, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "df_WordRep_1.shape" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 16, 222 | 
"id": "77dfe2ad-b0ed-4291-93b1-c79b1719201c", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "df_WordRep_0_sampled = df_WordRep_0.sample(2358)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 17, 232 | "id": "695a90ae-1f44-43a7-bdd2-5bdde9340ee5", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "(2358, 17)" 239 | ] 240 | }, 241 | "execution_count": 17, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "df_WordRep_0_sampled.shape" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 18, 253 | "id": "ce16530e-e3c0-4322-a556-bc0eadab17ba", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "df_WordRep_data = pd.concat([df_WordRep_0_sampled, df_WordRep_1], ignore_index=True, axis=0)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 19, 263 | "id": "e76bd344-df9c-4b5c-a4d6-7992f5e2aece", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "(4716, 17)" 270 | ] 271 | }, 272 | "execution_count": 19, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "df_WordRep_data.shape" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 20, 284 | "id": "91a4fd62-172b-416e-8c72-b2b942c8dc22", 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/html": [ 290 | "
\n", 291 | "\n", 304 | "\n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "
audio_pathWordRep
0C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra...0
1C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra...0
\n", 325 | "
" 326 | ], 327 | "text/plain": [ 328 | " audio_path WordRep\n", 329 | "0 C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra... 0\n", 330 | "1 C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra... 0" 331 | ] 332 | }, 333 | "execution_count": 20, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "df_WordRep.head(2)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 21, 345 | "id": "659f2a48-a4ec-4b22-ab53-0439f4bea6a3", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "0 50.0\n", 353 | "1 50.0\n", 354 | "Name: WordRep, dtype: float64\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "value_counts = df_WordRep_data['WordRep'].value_counts()\n", 360 | "total_rows = df_WordRep_data.shape[0]\n", 361 | "\n", 362 | "percentage_per_class = (value_counts / total_rows) * 100\n", 363 | "\n", 364 | "print(percentage_per_class)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 22, 370 | "id": "6ac7b6a0-0b0b-4eb1-92b2-6d4b834e12aa", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "X = np.array(df_WordRep_data['audio_path'])\n", 375 | "y = np.array(df_WordRep_data['WordRep'])" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 23, 381 | "id": "02ad3a9a-237b-4e62-8d65-506c7ad0472f", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40, stratify = y)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 24, 391 | "id": "30457ffd-8255-4ebc-974d-38f33db7afcb", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "mfcc_features_train = Parallel(n_jobs=-1)(delayed(extract_mfcc)(audio_file) for audio_file in X_train)\n", 396 | "mfcc_features_test = Parallel(n_jobs=-1)(delayed(extract_mfcc)(audio_file) for audio_file in 
X_test)\n", 397 | "zcr_features_train = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_train)\n", 398 | "zcr_features_test = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_test)\n", 399 | "\n", 400 | "jitter_shimmer_train = Parallel(n_jobs=-1)(delayed(extract_jitter_shimmer)(audio_file) for audio_file in X_train)\n", 401 | "jitter_shimmer_test = Parallel(n_jobs=-1)(delayed(extract_jitter_shimmer)(audio_file) for audio_file in X_test)\n", 402 | "\n", 403 | "jitter_train, shimmer_train = zip(*jitter_shimmer_train)\n", 404 | "jitter_test, shimmer_test = zip(*jitter_shimmer_test)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "6973590c-5fc5-40f4-a7fb-10d05dd9df47", 410 | "metadata": {}, 411 | "source": [ 412 | "zcr_features_train = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_train)\n", 413 | "zcr_features_test = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_test)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "id": "660938cb-e39a-450a-9061-6c24c02bd983", 419 | "metadata": {}, 420 | "source": [ 421 | "features_train = np.array([np.hstack((mfcc, zcr)) for mfcc, zcr in zip(mfcc_features_train, zcr_features_train)])\n", 422 | "features_test = np.array([np.hstack((mfcc, zcr)) for mfcc, zcr in zip(mfcc_features_test, zcr_features_test)])" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 25, 428 | "id": "ef378e5f", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "(3301, 3856)\n", 436 | "(1415, 3856)\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "merged_features_train = np.column_stack((mfcc_features_train, zcr_features_train, jitter_train, shimmer_train))\n", 442 | "merged_features_test = np.column_stack((mfcc_features_test, zcr_features_test, jitter_test, shimmer_test))\n", 443 | "\n", 444 | 
"print(merged_features_train.shape)\n", 445 | "print(merged_features_test.shape)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 26, 451 | "id": "edabd8ce", 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "((3301,), (1415,), (3301,), (1415,))" 458 | ] 459 | }, 460 | "execution_count": 26, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "X_train.shape,X_test.shape,y_train.shape,y_test.shape" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 27, 472 | "id": "b0716fea", 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "NaN values found in training data at indices: [[ 96 3854]\n", 480 | " [ 96 3855]\n", 481 | " [1100 3855]\n", 482 | " [1496 3854]\n", 483 | " [1496 3855]\n", 484 | " [2520 3854]\n", 485 | " [2520 3855]]\n", 486 | "NaN values found in testing data at indices: [[ 589 3854]\n", 487 | " [ 589 3855]\n", 488 | " [ 614 3854]\n", 489 | " [ 614 3855]]\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "nan_indices_train = np.argwhere(np.isnan(merged_features_train))\n", 495 | "if len(nan_indices_train) > 0:\n", 496 | " print(\"NaN values found in training data at indices:\", nan_indices_train)\n", 497 | "else:\n", 498 | " print(\"No NaN values found in training data\")\n", 499 | "\n", 500 | "# Check for NaN values in testing data\n", 501 | "nan_indices_test = np.argwhere(np.isnan(merged_features_test))\n", 502 | "if len(nan_indices_test) > 0:\n", 503 | " print(\"NaN values found in testing data at indices:\", nan_indices_test)\n", 504 | "else:\n", 505 | " print(\"No NaN values found in testing data\")" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 28, 511 | "id": "d47130de", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "No 
NaN values found in imputed training data\n", 519 | "No NaN values found in imputed testing data\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "from sklearn.impute import KNNImputer\n", 525 | "# KNN imputation\n", 526 | "imputer = KNNImputer(n_neighbors=5) # You can adjust the number of neighbors as needed\n", 527 | "\n", 528 | "# Fit imputer on training data and transform both training and testing data\n", 529 | "merged_features_train = imputer.fit_transform(merged_features_train)\n", 530 | "merged_features_test = imputer.transform(merged_features_test)\n", 531 | "\n", 532 | "# Check if any NaN values remain after imputation\n", 533 | "nan_indices_train = np.argwhere(np.isnan(merged_features_train))\n", 534 | "if len(nan_indices_train) > 0:\n", 535 | " print(\"NaN values still present in imputed training data at indices:\", nan_indices_train)\n", 536 | "else:\n", 537 | " print(\"No NaN values found in imputed training data\")\n", 538 | "\n", 539 | "nan_indices_test = np.argwhere(np.isnan(merged_features_test))\n", 540 | "if len(nan_indices_test) > 0:\n", 541 | " print(\"NaN values still present in imputed testing data at indices:\", nan_indices_test)\n", 542 | "else:\n", 543 | " print(\"No NaN values found in imputed testing data\")\n" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 30, 549 | "id": "ab2ae65f", 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "Explained variance ratio: [0.24778653 0.15632687 0.04659932 0.03814196 0.03333837 0.0299441\n", 557 | " 0.0278395 0.01992707 0.01829503 0.01710236 0.01471269 0.01259337\n", 558 | " 0.01109808 0.01105278 0.00942105 0.00850567 0.0083739 0.00720376\n", 559 | " 0.00638069 0.006122 0.00581438 0.0056173 0.00542596 0.00520031\n", 560 | " 0.00464309 0.00440965 0.00424653 0.00408088 0.00395583 0.00368943\n", 561 | " 0.00366224 0.00354939 0.00344957 0.00311403 0.00306417 0.00293423\n", 562 | " 0.00285715 
0.00275122 0.00269917 0.00259841 0.00254493 0.00249555\n", 563 | " 0.00235417 0.00230372 0.00229554 0.00220197 0.00213064 0.00209418\n", 564 | " 0.0020142 0.00196921]\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "from sklearn.decomposition import PCA\n", 570 | "\n", 571 | "# Assuming merged_features_train and merged_features_test are your feature arrays\n", 572 | "\n", 573 | "# Merge the training and testing data\n", 574 | "merged_data = np.concatenate((merged_features_train, merged_features_test), axis=0)\n", 575 | "\n", 576 | "# Apply PCA\n", 577 | "pca = PCA(n_components=50) # You can specify the number of principal components as needed\n", 578 | "transformed_data = pca.fit_transform(merged_data)\n", 579 | "\n", 580 | "# Print the explained variance ratio\n", 581 | "print(\"Explained variance ratio:\", pca.explained_variance_ratio_)\n" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 31, 587 | "id": "3f94b2af-6ef7-4b4c-bb18-f970a4e39fda", 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "name": "stdout", 592 | "output_type": "stream", 593 | "text": [ 594 | "Distribution of 0s and 1s in y_train:\n", 595 | "1 1651\n", 596 | "0 1650\n", 597 | "dtype: int64\n", 598 | "\n", 599 | "Distribution of 0s and 1s in y_test:\n", 600 | "0 708\n", 601 | "1 707\n", 602 | "dtype: int64\n" 603 | ] 604 | } 605 | ], 606 | "source": [ 607 | "y_train_series = pd.Series(y_train)\n", 608 | "y_test_series = pd.Series(y_test)\n", 609 | "\n", 610 | "# Count the occurrences of each unique value in y_train\n", 611 | "train_distribution = y_train_series.value_counts()\n", 612 | "\n", 613 | "# Count the occurrences of each unique value in y_test\n", 614 | "test_distribution = y_test_series.value_counts()\n", 615 | "\n", 616 | "print(\"Distribution of 0s and 1s in y_train:\")\n", 617 | "print(train_distribution)\n", 618 | "\n", 619 | "print(\"\\nDistribution of 0s and 1s in y_test:\")\n", 620 | "print(test_distribution)" 621 | ] 622 | }, 623 | { 624 | 
"cell_type": "code", 625 | "execution_count": 32, 626 | "id": "890da51e", 627 | "metadata": {}, 628 | "outputs": [ 629 | { 630 | "name": "stdout", 631 | "output_type": "stream", 632 | "text": [ 633 | "Epoch 1/10\n", 634 | "104/104 [==============================] - 10s 93ms/step - loss: 12.1015 - accuracy: 0.5338 - val_loss: 0.7845 - val_accuracy: 0.5845\n", 635 | "Epoch 2/10\n", 636 | "104/104 [==============================] - 10s 92ms/step - loss: 0.6080 - accuracy: 0.6871 - val_loss: 0.6952 - val_accuracy: 0.6092\n", 637 | "Epoch 3/10\n", 638 | "104/104 [==============================] - 9s 91ms/step - loss: 0.4433 - accuracy: 0.8207 - val_loss: 0.7501 - val_accuracy: 0.6106\n", 639 | "Epoch 4/10\n", 640 | "104/104 [==============================] - 10s 92ms/step - loss: 0.2989 - accuracy: 0.9006 - val_loss: 0.8258 - val_accuracy: 0.5951\n", 641 | "Epoch 5/10\n", 642 | "104/104 [==============================] - 10s 92ms/step - loss: 0.1583 - accuracy: 0.9621 - val_loss: 0.9792 - val_accuracy: 0.5816\n", 643 | "Epoch 6/10\n", 644 | "104/104 [==============================] - 10s 95ms/step - loss: 0.0864 - accuracy: 0.9870 - val_loss: 1.1257 - val_accuracy: 0.5837\n", 645 | "Epoch 7/10\n", 646 | "104/104 [==============================] - 10s 94ms/step - loss: 0.0430 - accuracy: 0.9973 - val_loss: 1.2844 - val_accuracy: 0.5830\n", 647 | "Epoch 8/10\n", 648 | "104/104 [==============================] - 10s 93ms/step - loss: 0.0186 - accuracy: 0.9994 - val_loss: 1.4938 - val_accuracy: 0.5682\n", 649 | "Epoch 9/10\n", 650 | "104/104 [==============================] - 10s 92ms/step - loss: 0.0094 - accuracy: 1.0000 - val_loss: 1.5207 - val_accuracy: 0.5795\n", 651 | "Epoch 10/10\n", 652 | "104/104 [==============================] - 10s 93ms/step - loss: 0.0049 - accuracy: 1.0000 - val_loss: 1.6116 - val_accuracy: 0.5788\n" 653 | ] 654 | }, 655 | { 656 | "data": { 657 | "text/plain": [ 658 | "" 659 | ] 660 | }, 661 | "execution_count": 32, 662 | "metadata": {}, 663 | 
"output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "import numpy as np\n", 668 | "from sklearn.model_selection import train_test_split\n", 669 | "from tensorflow.keras.models import Sequential\n", 670 | "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense\n", 671 | "from tensorflow.keras.utils import to_categorical\n", 672 | "\n", 673 | "X_train = np.array(merged_features_train)\n", 674 | "X_test = np.array(merged_features_test)\n", 675 | "y_train = np.array(y_train)\n", 676 | "y_test = np.array(y_test)\n", 677 | "\n", 678 | "# Convert labels to categorical\n", 679 | "y_train_categorical = to_categorical(y_train, num_classes=2)\n", 680 | "y_test_categorical = to_categorical(y_test, num_classes=2)\n", 681 | "\n", 682 | "# Reshape features for Conv1D input shape\n", 683 | "X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)\n", 684 | "X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)\n", 685 | "\n", 686 | "# Define the CNN model\n", 687 | "model = Sequential()\n", 688 | "model.add(Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)))\n", 689 | "model.add(MaxPooling1D(pool_size=2))\n", 690 | "model.add(Conv1D(64, 3, activation='relu'))\n", 691 | "model.add(MaxPooling1D(pool_size=2))\n", 692 | "model.add(Flatten())\n", 693 | "model.add(Dense(128, activation='relu'))\n", 694 | "model.add(Dense(2, activation='sigmoid')) # Output layer with sigmoid activation for binary classification\n", 695 | "\n", 696 | "# Compile the model\n", 697 | "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", 698 | "\n", 699 | "# Train the model\n", 700 | "model.fit(X_train, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test, y_test_categorical))\n" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 37, 706 | "id": "ed530069", 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "name": "stdout", 711 | "output_type": "stream", 712 
| "text": [ 713 | "Epoch 1/10\n", 714 | "104/104 [==============================] - 11s 95ms/step - loss: 16.1136 - accuracy: 0.5183 - val_loss: 4.1389 - val_accuracy: 0.4996\n", 715 | "Epoch 2/10\n", 716 | "104/104 [==============================] - 10s 97ms/step - loss: 1.1178 - accuracy: 0.6122 - val_loss: 0.7899 - val_accuracy: 0.5633\n", 717 | "Epoch 3/10\n", 718 | "104/104 [==============================] - 10s 92ms/step - loss: 0.4696 - accuracy: 0.7776 - val_loss: 0.7654 - val_accuracy: 0.5753\n", 719 | "Epoch 4/10\n", 720 | "104/104 [==============================] - 10s 92ms/step - loss: 0.3207 - accuracy: 0.8812 - val_loss: 0.8578 - val_accuracy: 0.5682\n", 721 | "Epoch 5/10\n", 722 | "104/104 [==============================] - 10s 95ms/step - loss: 0.2096 - accuracy: 0.9400 - val_loss: 1.1416 - val_accuracy: 0.5576\n", 723 | "Epoch 6/10\n", 724 | "104/104 [==============================] - 10s 94ms/step - loss: 0.1408 - accuracy: 0.9670 - val_loss: 1.0305 - val_accuracy: 0.5625\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "import numpy as np\n", 730 | "from sklearn.model_selection import train_test_split\n", 731 | "from tensorflow.keras.models import Sequential\n", 732 | "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense\n", 733 | "from tensorflow.keras.utils import to_categorical\n", 734 | "\n", 735 | "# Assuming merged_features_train contains merged features for training data\n", 736 | "# Assuming merged_features_test contains merged features for test data\n", 737 | "# Assuming y_train contains labels for training data\n", 738 | "# Assuming y_test contains labels for test data\n", 739 | "\n", 740 | "# Convert lists to numpy arrays\n", 741 | "X_train = np.array(merged_features_train)\n", 742 | "X_test = np.array(merged_features_test)\n", 743 | "y_train = np.array(y_train)\n", 744 | "y_test = np.array(y_test)\n", 745 | "\n", 746 | "# Convert labels to categorical\n", 747 | "y_train_categorical = to_categorical(y_train, 
num_classes=2)\n", 748 | "y_test_categorical = to_categorical(y_test, num_classes=2)\n", 749 | "\n", 750 | "# Reshape features for Conv1D input shape\n", 751 | "X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)\n", 752 | "X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)\n", 753 | "\n", 754 | "# Define the CNN model\n", 755 | "model = Sequential()\n", 756 | "model.add(Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)))\n", 757 | "model.add(MaxPooling1D(pool_size=2))\n", 758 | "model.add(Conv1D(64, 3, activation='relu'))\n", 759 | "model.add(MaxPooling1D(pool_size=2))\n", 760 | "model.add(Flatten())\n", 761 | "model.add(Dense(128, activation='relu'))\n", 762 | "model.add(Dense(2, activation='softmax')) # Output layer: softmax over the two one-hot encoded classes\n", 763 | "\n", 764 | "# Compile the model\n", 765 | "model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 766 | "\n", 767 | "from tensorflow.keras.callbacks import EarlyStopping\n", 768 | "\n", 769 | "# Define early stopping callback\n", 770 | "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n", 771 | "\n", 772 | "# Train the model with early stopping\n", 773 | "history = model.fit(X_train, y_train_categorical, epochs=10, batch_size=32, \n", 774 | " validation_data=(X_test, y_test_categorical), callbacks=[early_stopping])\n" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 38, 780 | "id": "0d06238d", 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])\n" 788 | ] 789 | }, 790 | { 791 | "data": { 792 | "image/png": "", 793 | "text/plain": [ 794 | "
" 795 | ] 796 | }, 797 | "metadata": {}, 798 | "output_type": "display_data" 799 | } 800 | ], 801 | "source": [ 802 | "import matplotlib.pyplot as plt\n", 803 | "# Print the keys of the history\n", 804 | "print(history.history.keys())\n", 805 | "\n", 806 | "# Summarize history for accuracy\n", 807 | "plt.plot(history.history['accuracy'])\n", 808 | "plt.plot(history.history['val_accuracy'])\n", 809 | "plt.title('Model Accuracy')\n", 810 | "plt.ylabel('Accuracy')\n", 811 | "plt.xlabel('Epoch')\n", 812 | "plt.legend(['Train', 'Validation'], loc='upper left')\n", 813 | "plt.show()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 39, 819 | "id": "387b3171", 820 | "metadata": {}, 821 | "outputs": [ 822 | { 823 | "name": "stdout", 824 | "output_type": "stream", 825 | "text": [ 826 | "Model saved successfully to: C:\\Users\\jmdgo\\Downloads\n" 827 | ] 828 | } 829 | ], 830 | "source": [ 831 | "model_file_path = r'C:\\Users\\jmdgo\\Downloads'\n", 832 | "model.save(rf\"{model_file_path}\\WordRep_mzjs_model.h5\")\n", 833 | "print(\"Model saved successfully to:\", model_file_path)" 834 | ] 835 | } 836 | ], 837 | "metadata": { 838 | "kernelspec": { 839 | "display_name": "Python 3 (ipykernel)", 840 | "language": "python", 841 | "name": "python3" 842 | }, 843 | "language_info": { 844 | "codemirror_mode": { 845 | "name": "ipython", 846 | "version": 3 847 | }, 848 | "file_extension": ".py", 849 | "mimetype": "text/x-python", 850 | "name": "python", 851 | "nbconvert_exporter": "python", 852 | "pygments_lexer": "ipython3", 853 | "version": "3.11.4" 854 | } 855 | }, 856 | "nbformat": 4, 857 | "nbformat_minor": 5 858 | } 859 | -------------------------------------------------------------------------------- /SoundRep - Deep Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4512109-72c3-4639-9f84-c4ded9559659", 7 | "metadata": {}, 8 | "outputs": 
[], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import os\n", 12 | "import librosa\n", 13 | "import librosa.display\n", 14 | "import pandas as pd\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import sklearn.preprocessing\n", 17 | "from scipy.signal import lfilter\n", 18 | "from joblib import Parallel, delayed\n", 19 | "from sklearn.model_selection import train_test_split\n", 20 | "from sklearn.ensemble import RandomForestClassifier\n", 21 | "from sklearn.model_selection import GridSearchCV\n", 22 | "import parselmouth\n", 23 | "from sklearn.decomposition import PCA" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "e26e9fbd-fd8d-4a1b-8dc3-d9a68e8ffbb8", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv(r\"C:\\Users\\jmdgo\\Downloads\\binary_labeled_dataset.csv\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "82127c18-f3c7-4c59-91e1-96ea6e2d7acf", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "clip_path = r\"C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extracted_clips\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "id": "c8e02825-5606-4530-ae17-0e5eea811e28", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "audio_paths = []\n", 54 | "for folder in os.listdir(clip_path):\n", 55 | " folder_path = os.path.join(clip_path, folder)\n", 56 | " #episode_paths = []\n", 57 | " for episode in os.listdir(folder_path):\n", 58 | " episode_path = os.path.join(folder_path, episode)\n", 59 | " for wav in os.listdir(episode_path):\n", 60 | " wav_path = os.path.join(episode_path, wav)\n", 61 | " audio_paths.append(wav_path)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "id": "cded2107-5a65-41cc-80fd-7862a35a912a", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df['audio_path'] = ['default'] * len(df)" 72 | ] 73 | }, 74 | { 75 | "cell_type": 
"code", 76 | "execution_count": 6, 77 | "id": "49178e08-ce96-4858-97fa-387a1c7106f0", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "for path in audio_paths:\n", 82 | " filename = os.path.basename(path)\n", 83 | " parts = filename.split('_')\n", 84 | " podcast_name = parts[0]\n", 85 | " episode_number = int(parts[1])\n", 86 | " clip_number = int(parts[2].split('.')[0]) \n", 87 | "\n", 88 | " match_row = df[(df['Show'] == podcast_name) & (df['EpId'] == episode_number) & (df['ClipId'] == clip_number)]\n", 89 | " if not match_row.empty:\n", 90 | " match_index = match_row.index[0]\n", 91 | " df.at[match_index, 'audio_path'] = path" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "id": "1a039cfe-617b-4fab-9f73-9cf73262b00a", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "0 60.677123\n", 105 | "1 39.322877\n", 106 | "Name: Stutter, dtype: float64\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "value_counts = df['Stutter'].value_counts()\n", 112 | "total_rows = df.shape[0]\n", 113 | "\n", 114 | "percentage_per_class = (value_counts / total_rows) * 100\n", 115 | "\n", 116 | "print(percentage_per_class)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "id": "99d679ea-5661-4f3d-912c-f8bcec07633e", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "def extract_mfcc(audio_file, num_mfcc=40):\n", 127 | " audio, sr = librosa.load(audio_file, sr=None)\n", 128 | " mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)\n", 129 | " mfccs_flat = mfccs.T.flatten()\n", 130 | " return mfccs_flat\n", 131 | "\n", 132 | "def extract_zcr(audio_file, frame_length=2048, hop_length=512):\n", 133 | " audio, sr = librosa.load(audio_file, sr=None)\n", 134 | " zcr = librosa.feature.zero_crossing_rate(audio, frame_length=frame_length, hop_length=hop_length)\n", 135 | " return zcr.flatten()\n", 136 | "\n", 
137 | "def extract_jitter_shimmer(audio_file):\n", 138 | " sound = parselmouth.Sound(audio_file)\n", 139 | " pitch = sound.to_pitch()\n", 140 | " pulses = parselmouth.praat.call([sound, pitch], \"To PointProcess (cc)\")\n", 141 | " jitter = parselmouth.praat.call(pulses, \"Get jitter (local)\", 0, 0, 0.0001, 0.02, 1.3)\n", 142 | " shimmer = parselmouth.praat.call([sound, pulses], \"Get shimmer (local)\", 0, 0, 0.0001, 0.02, 1.3, 1.6)\n", 143 | " return jitter, shimmer\n", 144 | "\n", 145 | "def extract_all_features(audio_file, num_mfcc=40, frame_length=2048, hop_length=512):\n", 146 | " mfcc_features = extract_mfcc(audio_file, num_mfcc)\n", 147 | " zcr_features = extract_zcr(audio_file, frame_length, hop_length)\n", 148 | " jitter, shimmer = extract_jitter_shimmer(audio_file)\n", 149 | " return np.concatenate([mfcc_features, zcr_features, [jitter, shimmer]])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "id": "1986acfd-999f-4b6d-a5ef-83c4283fa41e", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "audio_files = df['audio_path'] \n", 160 | "labels = df['SoundRep']" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 10, 166 | "id": "f18f0517-edf7-48ac-beb9-455c724add88", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df_SoundRep = df[['audio_path', 'SoundRep']]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 11, 176 | "id": "6a98512a-db92-42d3-be8d-7faf74367b19", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "df_SoundRep_1 = df_SoundRep[df_SoundRep['SoundRep']==1]" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 12, 186 | "id": "4f4f7e7f-0148-4642-be12-1366c45aa674", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "df_SoundRep_0 = df[(df['Prolongation']==0) & \n", 191 | " (df['Block']==0) & \n", 192 | " (df['Interjection']==0) & \n", 193 | " (df['WordRep']==0) & 
\n", 194 | " (df['SoundRep']==0) &\n", 195 | " (df['NoStutteredWords']==1)]\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 13, 201 | "id": "58215a72-ffed-45fc-a98d-c83c3d79b42a", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "(1950, 2)" 208 | ] 209 | }, 210 | "execution_count": 13, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "df_SoundRep_1.shape" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 15, 222 | "id": "77dfe2ad-b0ed-4291-93b1-c79b1719201c", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "df_SoundRep_0_sampled = df_SoundRep_0.sample(1950)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 16, 232 | "id": "695a90ae-1f44-43a7-bdd2-5bdde9340ee5", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "(1950, 17)" 239 | ] 240 | }, 241 | "execution_count": 16, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "df_SoundRep_0_sampled.shape" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 17, 253 | "id": "ce16530e-e3c0-4322-a556-bc0eadab17ba", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "df_SoundRep_data = pd.concat([df_SoundRep_0_sampled, df_SoundRep_1], ignore_index=True, axis=0)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 18, 263 | "id": "e76bd344-df9c-4b5c-a4d6-7992f5e2aece", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "(3900, 17)" 270 | ] 271 | }, 272 | "execution_count": 18, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "df_SoundRep_data.shape" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 19, 284 | "id": "91a4fd62-172b-416e-8c72-b2b942c8dc22", 
285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/html": [ 290 | "
\n", 291 | "\n", 304 | "\n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "
audio_pathSoundRep
0C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra...0
1C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra...0
\n", 325 | "
" 326 | ], 327 | "text/plain": [ 328 | " audio_path SoundRep\n", 329 | "0 C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra... 0\n", 330 | "1 C:\\Users\\jmdgo\\Downloads\\extracted_clips\\extra... 0" 331 | ] 332 | }, 333 | "execution_count": 19, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "df_SoundRep.head(2)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 20, 345 | "id": "659f2a48-a4ec-4b22-ab53-0439f4bea6a3", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "0 50.0\n", 353 | "1 50.0\n", 354 | "Name: SoundRep, dtype: float64\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "value_counts = df_SoundRep_data['SoundRep'].value_counts()\n", 360 | "total_rows = df_SoundRep_data.shape[0]\n", 361 | "\n", 362 | "percentage_per_class = (value_counts / total_rows) * 100\n", 363 | "\n", 364 | "print(percentage_per_class)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 21, 370 | "id": "6ac7b6a0-0b0b-4eb1-92b2-6d4b834e12aa", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "X = np.array(df_SoundRep_data['audio_path'])\n", 375 | "y = np.array(df_SoundRep_data['SoundRep'])" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 22, 381 | "id": "02ad3a9a-237b-4e62-8d65-506c7ad0472f", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40, stratify = y)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 23, 391 | "id": "30457ffd-8255-4ebc-974d-38f33db7afcb", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "mfcc_features_train = Parallel(n_jobs=-1)(delayed(extract_mfcc)(audio_file) for audio_file in X_train)\n", 396 | "mfcc_features_test = Parallel(n_jobs=-1)(delayed(extract_mfcc)(audio_file) for 
audio_file in X_test)\n", 397 | "zcr_features_train = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_train)\n", 398 | "zcr_features_test = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_test)\n", 399 | "\n", 400 | "jitter_shimmer_train = Parallel(n_jobs=-1)(delayed(extract_jitter_shimmer)(audio_file) for audio_file in X_train)\n", 401 | "jitter_shimmer_test = Parallel(n_jobs=-1)(delayed(extract_jitter_shimmer)(audio_file) for audio_file in X_test)\n", 402 | "\n", 403 | "jitter_train, shimmer_train = zip(*jitter_shimmer_train)\n", 404 | "jitter_test, shimmer_test = zip(*jitter_shimmer_test)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "6973590c-5fc5-40f4-a7fb-10d05dd9df47", 410 | "metadata": {}, 411 | "source": [ 412 | "zcr_features_train = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_train)\n", 413 | "zcr_features_test = Parallel(n_jobs=-1)(delayed(extract_zcr)(audio_file) for audio_file in X_test)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "id": "660938cb-e39a-450a-9061-6c24c02bd983", 419 | "metadata": {}, 420 | "source": [ 421 | "features_train = np.array([np.hstack((mfcc, zcr)) for mfcc, zcr in zip(mfcc_features_train, zcr_features_train)])\n", 422 | "features_test = np.array([np.hstack((mfcc, zcr)) for mfcc, zcr in zip(mfcc_features_test, zcr_features_test)])" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 24, 428 | "id": "ef378e5f", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "(2730, 3856)\n", 436 | "(1170, 3856)\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "merged_features_train = np.column_stack((mfcc_features_train, zcr_features_train, jitter_train, shimmer_train))\n", 442 | "merged_features_test = np.column_stack((mfcc_features_test, zcr_features_test, jitter_test, shimmer_test))\n", 443 | "\n", 444 | 
"print(merged_features_train.shape)\n", 445 | "print(merged_features_test.shape)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 25, 451 | "id": "edabd8ce", 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "((2730,), (1170,), (2730,), (1170,))" 458 | ] 459 | }, 460 | "execution_count": 25, 461 | "metadata": {}, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "X_train.shape,X_test.shape,y_train.shape,y_test.shape" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 26, 472 | "id": "b0716fea", 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "NaN values found in training data at indices: [[ 147 3854]\n", 480 | " [ 147 3855]\n", 481 | " [ 325 3855]\n", 482 | " [1870 3854]\n", 483 | " [1870 3855]\n", 484 | " [2060 3854]\n", 485 | " [2060 3855]]\n", 486 | "NaN values found in testing data at indices: [[ 138 3854]\n", 487 | " [ 138 3855]\n", 488 | " [ 140 3854]\n", 489 | " [ 140 3855]\n", 490 | " [ 161 3854]\n", 491 | " [ 161 3855]\n", 492 | " [ 605 3854]\n", 493 | " [ 605 3855]]\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "nan_indices_train = np.argwhere(np.isnan(merged_features_train))\n", 499 | "if len(nan_indices_train) > 0:\n", 500 | " print(\"NaN values found in training data at indices:\", nan_indices_train)\n", 501 | "else:\n", 502 | " print(\"No NaN values found in training data\")\n", 503 | "\n", 504 | "# Check for NaN values in testing data\n", 505 | "nan_indices_test = np.argwhere(np.isnan(merged_features_test))\n", 506 | "if len(nan_indices_test) > 0:\n", 507 | " print(\"NaN values found in testing data at indices:\", nan_indices_test)\n", 508 | "else:\n", 509 | " print(\"No NaN values found in testing data\")" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 27, 515 | "id": "d47130de", 516 | "metadata": {}, 517 | 
"outputs": [ 518 | { 519 | "name": "stdout", 520 | "output_type": "stream", 521 | "text": [ 522 | "No NaN values found in imputed training data\n", 523 | "No NaN values found in imputed testing data\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "from sklearn.impute import KNNImputer\n", 529 | "# KNN imputation\n", 530 | "imputer = KNNImputer(n_neighbors=5) # You can adjust the number of neighbors as needed\n", 531 | "\n", 532 | "# Fit imputer on training data and transform both training and testing data\n", 533 | "merged_features_train = imputer.fit_transform(merged_features_train)\n", 534 | "merged_features_test = imputer.transform(merged_features_test)\n", 535 | "\n", 536 | "# Check if any NaN values remain after imputation\n", 537 | "nan_indices_train = np.argwhere(np.isnan(merged_features_train))\n", 538 | "if len(nan_indices_train) > 0:\n", 539 | " print(\"NaN values still present in imputed training data at indices:\", nan_indices_train)\n", 540 | "else:\n", 541 | " print(\"No NaN values found in imputed training data\")\n", 542 | "\n", 543 | "nan_indices_test = np.argwhere(np.isnan(merged_features_test))\n", 544 | "if len(nan_indices_test) > 0:\n", 545 | " print(\"NaN values still present in imputed testing data at indices:\", nan_indices_test)\n", 546 | "else:\n", 547 | " print(\"No NaN values found in imputed testing data\")\n" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 29, 553 | "id": "ab2ae65f", 554 | "metadata": {}, 555 | "outputs": [ 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "Explained variance ratio: [0.23046092 0.1690381 0.05182512 0.03873228 0.03267255 0.03213632\n", 561 | " 0.03040406 0.01967926 0.01881117 0.01597497 0.01437103 0.01180179\n", 562 | " 0.01135941 0.01020829 0.00928691 0.00887731 0.00776032 0.00679666\n", 563 | " 0.00657214 0.00648516 0.00598973 0.00572877 0.00556341 0.0049202\n", 564 | " 0.00474634 0.00465325 0.00444663 0.00401209 0.00388667 
0.00380322\n", 565 | " 0.00367577 0.00361095 0.00345658 0.00316666 0.00307042 0.00297792\n", 566 | " 0.00292586 0.00288976 0.00276818 0.00265818 0.00263326 0.00248504\n", 567 | " 0.00245014 0.00236614 0.00228997 0.00225381 0.0022253 0.00220008\n", 568 | " 0.00202645 0.00197055]\n" 569 | ] 570 | } 571 | ], 572 | "source": [ 573 | "from sklearn.decomposition import PCA\n", 574 | "\n", 575 | "# Assuming merged_features_train and merged_features_test are your feature arrays\n", 576 | "\n", 577 | "# Merge the training and testing data\n", 578 | "merged_data = np.concatenate((merged_features_train, merged_features_test), axis=0)\n", 579 | "\n", 580 | "# Apply PCA\n", 581 | "pca = PCA(n_components=50) # You can specify the number of principal components as needed\n", 582 | "transformed_data = pca.fit_transform(merged_data)\n", 583 | "\n", 584 | "# Print the explained variance ratio\n", 585 | "print(\"Explained variance ratio:\", pca.explained_variance_ratio_)\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 30, 591 | "id": "3f94b2af-6ef7-4b4c-bb18-f970a4e39fda", 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "Distribution of 0s and 1s in y_train:\n", 599 | "1 1365\n", 600 | "0 1365\n", 601 | "dtype: int64\n", 602 | "\n", 603 | "Distribution of 0s and 1s in y_test:\n", 604 | "0 585\n", 605 | "1 585\n", 606 | "dtype: int64\n" 607 | ] 608 | } 609 | ], 610 | "source": [ 611 | "y_train_series = pd.Series(y_train)\n", 612 | "y_test_series = pd.Series(y_test)\n", 613 | "\n", 614 | "# Count the occurrences of each unique value in y_train\n", 615 | "train_distribution = y_train_series.value_counts()\n", 616 | "\n", 617 | "# Count the occurrences of each unique value in y_test\n", 618 | "test_distribution = y_test_series.value_counts()\n", 619 | "\n", 620 | "print(\"Distribution of 0s and 1s in y_train:\")\n", 621 | "print(train_distribution)\n", 622 | "\n", 623 | 
"print(\"\\nDistribution of 0s and 1s in y_test:\")\n", 624 | "print(test_distribution)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 31, 630 | "id": "890da51e", 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "Epoch 1/10\n", 638 | "86/86 [==============================] - 9s 99ms/step - loss: 18.9509 - accuracy: 0.5524 - val_loss: 0.7147 - val_accuracy: 0.5966\n", 639 | "Epoch 2/10\n", 640 | "86/86 [==============================] - 8s 94ms/step - loss: 0.6035 - accuracy: 0.6905 - val_loss: 0.6925 - val_accuracy: 0.6120\n", 641 | "Epoch 3/10\n", 642 | "86/86 [==============================] - 8s 94ms/step - loss: 0.4626 - accuracy: 0.7916 - val_loss: 0.7216 - val_accuracy: 0.6137\n", 643 | "Epoch 4/10\n", 644 | "86/86 [==============================] - 8s 92ms/step - loss: 0.3684 - accuracy: 0.8502 - val_loss: 0.8918 - val_accuracy: 0.5863\n", 645 | "Epoch 5/10\n", 646 | "86/86 [==============================] - 8s 94ms/step - loss: 0.2461 - accuracy: 0.9190 - val_loss: 0.8337 - val_accuracy: 0.6026\n", 647 | "Epoch 6/10\n", 648 | "86/86 [==============================] - 8s 92ms/step - loss: 0.1465 - accuracy: 0.9674 - val_loss: 0.8887 - val_accuracy: 0.6000\n", 649 | "Epoch 7/10\n", 650 | "86/86 [==============================] - 8s 93ms/step - loss: 0.0811 - accuracy: 0.9901 - val_loss: 1.0207 - val_accuracy: 0.6068\n", 651 | "Epoch 8/10\n", 652 | "86/86 [==============================] - 8s 92ms/step - loss: 0.0490 - accuracy: 0.9974 - val_loss: 1.0908 - val_accuracy: 0.5957\n", 653 | "Epoch 9/10\n", 654 | "86/86 [==============================] - 8s 92ms/step - loss: 0.0274 - accuracy: 0.9993 - val_loss: 1.2084 - val_accuracy: 0.5991\n", 655 | "Epoch 10/10\n", 656 | "86/86 [==============================] - 8s 92ms/step - loss: 0.0161 - accuracy: 1.0000 - val_loss: 1.2317 - val_accuracy: 0.5974\n" 657 | ] 658 | }, 659 | { 660 | "data": { 661 | 
"text/plain": [ 662 | "" 663 | ] 664 | }, 665 | "execution_count": 31, 666 | "metadata": {}, 667 | "output_type": "execute_result" 668 | } 669 | ], 670 | "source": [ 671 | "import numpy as np\n", 672 | "from sklearn.model_selection import train_test_split\n", 673 | "from tensorflow.keras.models import Sequential\n", 674 | "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense\n", 675 | "from tensorflow.keras.utils import to_categorical\n", 676 | "\n", 677 | "X_train = np.array(merged_features_train)\n", 678 | "X_test = np.array(merged_features_test)\n", 679 | "y_train = np.array(y_train)\n", 680 | "y_test = np.array(y_test)\n", 681 | "\n", 682 | "# Convert labels to categorical\n", 683 | "y_train_categorical = to_categorical(y_train, num_classes=2)\n", 684 | "y_test_categorical = to_categorical(y_test, num_classes=2)\n", 685 | "\n", 686 | "# Reshape features for Conv1D input shape\n", 687 | "X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)\n", 688 | "X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)\n", 689 | "\n", 690 | "# Define the CNN model\n", 691 | "model = Sequential()\n", 692 | "model.add(Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)))\n", 693 | "model.add(MaxPooling1D(pool_size=2))\n", 694 | "model.add(Conv1D(64, 3, activation='relu'))\n", 695 | "model.add(MaxPooling1D(pool_size=2))\n", 696 | "model.add(Flatten())\n", 697 | "model.add(Dense(128, activation='relu'))\n", 698 | "model.add(Dense(2, activation='sigmoid')) # Output layer with sigmoid activation for binary classification\n", 699 | "\n", 700 | "# Compile the model\n", 701 | "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", 702 | "\n", 703 | "# Train the model\n", 704 | "model.fit(X_train, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test, y_test_categorical))\n" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 35, 710 | "id": "ed530069", 711 | 
"metadata": {}, 712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "Epoch 1/10\n", 718 | "86/86 [==============================] - 9s 96ms/step - loss: 14.9529 - accuracy: 0.5513 - val_loss: 0.7588 - val_accuracy: 0.5607\n", 719 | "Epoch 2/10\n", 720 | "86/86 [==============================] - 8s 94ms/step - loss: 0.5402 - accuracy: 0.7374 - val_loss: 0.6884 - val_accuracy: 0.6077\n", 721 | "Epoch 3/10\n", 722 | "86/86 [==============================] - 8s 94ms/step - loss: 0.3890 - accuracy: 0.8421 - val_loss: 0.7183 - val_accuracy: 0.6120\n", 723 | "Epoch 4/10\n", 724 | "86/86 [==============================] - 8s 94ms/step - loss: 0.2228 - accuracy: 0.9330 - val_loss: 0.7829 - val_accuracy: 0.6197\n", 725 | "Epoch 5/10\n", 726 | "86/86 [==============================] - 8s 94ms/step - loss: 0.1110 - accuracy: 0.9839 - val_loss: 0.9572 - val_accuracy: 0.6162\n" 727 | ] 728 | } 729 | ], 730 | "source": [ 731 | "import numpy as np\n", 732 | "from sklearn.model_selection import train_test_split\n", 733 | "from tensorflow.keras.models import Sequential\n", 734 | "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense\n", 735 | "from tensorflow.keras.utils import to_categorical\n", 736 | "\n", 737 | "# Assuming merged_features_train contains merged features for training data\n", 738 | "# Assuming merged_features_test contains merged features for test data\n", 739 | "# Assuming y_train contains labels for training data\n", 740 | "# Assuming y_test contains labels for test data\n", 741 | "\n", 742 | "# Convert lists to numpy arrays\n", 743 | "X_train = np.array(merged_features_train)\n", 744 | "X_test = np.array(merged_features_test)\n", 745 | "y_train = np.array(y_train)\n", 746 | "y_test = np.array(y_test)\n", 747 | "\n", 748 | "# Convert labels to categorical\n", 749 | "y_train_categorical = to_categorical(y_train, num_classes=2)\n", 750 | "y_test_categorical = to_categorical(y_test, 
num_classes=2)\n", 751 | "\n", 752 | "# Reshape features for Conv1D input shape\n", 753 | "X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)\n", 754 | "X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)\n", 755 | "\n", 756 | "# Define the CNN model\n", 757 | "model = Sequential()\n", 758 | "model.add(Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)))\n", 759 | "model.add(MaxPooling1D(pool_size=2))\n", 760 | "model.add(Conv1D(64, 3, activation='relu'))\n", 761 | "model.add(MaxPooling1D(pool_size=2))\n", 762 | "model.add(Flatten())\n", 763 | "model.add(Dense(128, activation='relu'))\n", 764 | "model.add(Dense(2, activation='sigmoid')) # Output layer with sigmoid activation for binary classification\n", 765 | "\n", 766 | "# Compile the model\n", 767 | "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", 768 | "\n", 769 | "from tensorflow.keras.callbacks import EarlyStopping\n", 770 | "\n", 771 | "# Define early stopping callback\n", 772 | "early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n", 773 | "\n", 774 | "# Train the model with early stopping\n", 775 | "history = model.fit(X_train, y_train_categorical, epochs=10, batch_size=32, \n", 776 | " validation_data=(X_test, y_test_categorical), callbacks=[early_stopping])\n" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 36, 782 | "id": "0d06238d", 783 | "metadata": {}, 784 | "outputs": [ 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])\n" 790 | ] 791 | }, 792 | { 793 | "data": { 794 | "image/png": "", 795 | "text/plain": [ 796 | "
" 797 | ] 798 | }, 799 | "metadata": {}, 800 | "output_type": "display_data" 801 | } 802 | ], 803 | "source": [ 804 | "import matplotlib.pyplot as plt\n", 805 | "# Print the keys of the history\n", 806 | "print(history.history.keys())\n", 807 | "\n", 808 | "# Summarize history for accuracy\n", 809 | "plt.plot(history.history['accuracy'])\n", 810 | "plt.plot(history.history['val_accuracy'])\n", 811 | "plt.title('Model Accuracy')\n", 812 | "plt.ylabel('Accuracy')\n", 813 | "plt.xlabel('Epoch')\n", 814 | "plt.legend(['Train', 'Validation'], loc='upper left')\n", 815 | "plt.show()" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 37, 821 | "id": "387b3171", 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "name": "stdout", 826 | "output_type": "stream", 827 | "text": [ 828 | "Model saved successfully to: C:\\Users\\jmdgo\\Downloads\n" 829 | ] 830 | } 831 | ], 832 | "source": [ 833 | "model_file_path = r'C:\\Users\\jmdgo\\Downloads'\n", 834 | "model.save(\"SoundRep_mzjs_model.h5\")\n", 835 | "print(\"Model saved successfully to:\", model_file_path)" 836 | ] 837 | } 838 | ], 839 | "metadata": { 840 | "kernelspec": { 841 | "display_name": "Python 3 (ipykernel)", 842 | "language": "python", 843 | "name": "python3" 844 | }, 845 | "language_info": { 846 | "codemirror_mode": { 847 | "name": "ipython", 848 | "version": 3 849 | }, 850 | "file_extension": ".py", 851 | "mimetype": "text/x-python", 852 | "name": "python", 853 | "nbconvert_exporter": "python", 854 | "pygments_lexer": "ipython3", 855 | "version": "3.11.4" 856 | } 857 | }, 858 | "nbformat": 4, 859 | "nbformat_minor": 5 860 | } 861 | --------------------------------------------------------------------------------