├── Data_corpus
│   └── train_data
│       ├── sample_0.wav
│       ├── sample_1.wav
│       └── sample_6.wav
├── README.md
├── model
│   ├── cp.ckpt
│   └── model.h5
├── notebook
│   ├── additional_notebook
│   │   ├── notebook
│   │   ├── speech recognition (1).ipynb
│   │   └── voice_recognition_1.ipynb
│   ├── speech_recognition (2).ipynb
│   └── voice-recognition (2).ipynb
├── results
│   └── loss_file
│       └── Capture-2.PNG
└── utiles
    └── utility

/Data_corpus/train_data/sample_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_0.wav
--------------------------------------------------------------------------------
/Data_corpus/train_data/sample_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_1.wav
--------------------------------------------------------------------------------
/Data_corpus/train_data/sample_6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_6.wav
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Bangla-deep-speech-Recognition
Bangla deep speech recognition is a deep bidirectional-RNN-based Bangla speech-to-text transcription system.
The main focus of this project is to power industrial applications, such as searching for a product by voice command, with an end-to-end Bangla speech recognition model, through an easy-to-use, efficient, compact, and scalable implementation covering training, inference, testing, and deployment.

# Dataset
The voice data has two parts:
1) A self-collected voice corpus generated from the company's products. Only a small corpus of roughly 40-50 audio files is used here; adding more recordings would improve results and mitigate overfitting.
2) The Bengali ASR training dataset, containing ~196K utterances.
Dataset link: http://openslr.org/53/

# Annotation Tools
1) https://online-audio-converter.com/
2) https://twistedwave.com/online

# Model
1) RNN, LSTM, bidirectional-RNN, and deep-RNN models (see the sketch below)
2) An RNN-Transducer model is a work in progress
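Below is a minimal, illustrative sketch of how a bidirectional RNN acoustic model is wired to a CTC loss in TensorFlow 2.0 / Keras. The `Lambda`-wrapped `K.ctc_batch_cost` pattern and the four named inputs mirror the approach in _voice-recognition (2).ipynb_; the layer sizes and the `feat_dim`/`num_classes` values are placeholder assumptions for illustration, not the exact trained configuration.

```python
# Illustrative sketch only: mirrors the CTC wiring used in
# voice-recognition (2).ipynb, with placeholder layer sizes.
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Lambda, BatchNormalization,
                                     Bidirectional, GRU, Dense)

feat_dim = 34      # MFCC coefficients per frame (the notebook extracts n_mfcc=34)
num_classes = 83   # Bangla char_map (82 symbols) + 1 CTC blank, as in the notebook

# Acoustic model: variable-length MFCC sequences -> per-frame char probabilities.
acoustic_input = Input(name='input', shape=(None, feat_dim))
x = Bidirectional(GRU(200, return_sequences=True))(acoustic_input)  # placeholder width
x = BatchNormalization()(x)
y_pred = Dense(num_classes, activation='softmax', name='softmax')(x)

# CTC loss attached as a Lambda layer (same pattern as add_ctc_loss in the notebook).
labels = Input(name='labels', shape=(None,), dtype='float32')
input_length = Input(name='input_length', shape=(1,), dtype='int64')
label_length = Input(name='label_length', shape=(1,), dtype='int64')

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [y_pred, labels, input_length, label_length])

model = Model(inputs=[acoustic_input, labels, input_length, label_length],
              outputs=loss_out)
# The graph already emits the CTC loss, so the compiled "loss" is the identity.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')
```

Training feeds padded MFCC batches, integer-encoded transcripts (see `text_to_int_sequence` in the notebook), and the true sequence lengths, with a dummy zero target for the `ctc` output; at inference time the softmax outputs are decoded with a CTC decoder such as `K.ctc_decode` and mapped back to characters through `index_map`.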
# Dependencies
Python 3.7
tensorflow 2.0.0

# Project Structure
Run the following notebook:
_speech_recognition (2).ipynb_

# Results
![Capture-1](https://user-images.githubusercontent.com/45398575/120583800-ad5e5380-c450-11eb-84b9-85779bf71f13.PNG)

# References
--------------------------------------------------------------------------------
/model/cp.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/model/cp.ckpt
--------------------------------------------------------------------------------
/model/model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/model/model.h5
--------------------------------------------------------------------------------
/notebook/additional_notebook/notebook:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/notebook/additional_notebook/voice_recognition_1.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import matplotlib.pyplot as plt\n", 18 | "import numpy as np\n", 19 | "import librosa\n", 20 | "import librosa.display\n", 21 | "import IPython.display as ipd\n", 22 | "import os \n", 23 | "\n", 24 | " \n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from keras.utils import to_categorical\n", 27 | "from tqdm import tqdm" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root_dir=os.listdir(\"E:/speech_recognition/data/\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "E:/speech_recognition/data/audio_1.wav\n", 49 | "E:/speech_recognition/data/audio_2.wav\n", 50 | "E:/speech_recognition/data/audio_3.wav\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "mfcc=[]\n", 56 | "for i in root_dir:\n", 57 | "    audio_paths=f'E:/speech_recognition/data/{i}'\n", 58 | "    print(audio_paths)\n", 59 | "    \n", 60 | "    # load audio file and slice it to ensure length consistency among different files\n", 61 | "    signal,sample_rate = librosa.load(audio_paths)\n", 62 | "    \n", 63 | "    # extract MFCCs\n", 64 | "    MFCCs = librosa.feature.mfcc(signal, sample_rate)\n", 65 | "    mfcc.append( MFCCs) " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "(20, 171)\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "print(mfcc[2].shape)" 83 | ] 84 | }, 85 | { 86 |
"cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def plot_mfcc_feature(vis_mfcc_feature):\n", 92 | " # plot the MFCC feature\n", 93 | " fig = plt.figure(figsize=(12,5))\n", 94 | " ax = fig.add_subplot(111)\n", 95 | " im = ax.imshow(vis_mfcc_feature, cmap=plt.cm.jet, aspect='auto')\n", 96 | " plt.title('Normalized MFCC')\n", 97 | " plt.ylabel('Time')\n", 98 | " plt.xlabel('MFCC Coefficient')\n", 99 | " divider = make_axes_locatable(ax)\n", 100 | " cax = divider.append_axes(\"right\", size=\"5%\", pad=0.05)\n", 101 | " plt.colorbar(im, cax=cax)\n", 102 | " ax.set_xticks(np.arange(0, 13, 2), minor=False);\n", 103 | " plt.show()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "#plot_mfcc_feature(mfcc)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#labels=[\"আমি এসএসএল ওয়ারলেসে জব\",\n", 122 | " #\"আমি ডাটা টিমের সদস্য\",\n", 123 | " #\"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\"]\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "def get_labels():\n", 133 | " labels =[\"আমি এসএসএল ওয়ারলেসে জব\",\n", 134 | " \"আমি ডাটা টিমের সদস্য\",\n", 135 | " \"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\"] \n", 136 | " #labels=os.listdir(path)\n", 137 | " print(len(labels))\n", 138 | " label_indices = np.arange(0, len(labels))\n", 139 | " print(label_indices)\n", 140 | " return labels, label_indices, to_categorical(label_indices)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "3\n", 153 | "[0 1 2]\n", 154 | "['আমি এসএসএল ওয়ারলেসে জব', 'আমি ডাটা টিমের সদস্য', 'আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে']\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "#data_path=\"E:/speech_recognition/data/\"\n", 160 | "labels,label_indices,_=get_labels()\n", 161 | "print(labels)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 10, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# Mel-frequency cepstral coefficients\n", 171 | "def wav2mfcc(file_path,max_len, n_mfcc):\n", 172 | " wave, sr = librosa.load(file_path, mono=True, sr=None)\n", 173 | " wave = np.asfortranarray(wave[::3])\n", 174 | " mfcc = librosa.feature.mfcc(wave, sr=16000, n_mfcc=n_mfcc)\n", 175 | "\n", 176 | " # If maximum length exceeds mfcc lengths then pad the remaining ones\n", 177 | " if (max_len > mfcc.shape[1]):\n", 178 | " pad_width = max_len - mfcc.shape[1]\n", 179 | " mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')\n", 180 | "\n", 181 | " # Else cutoff the remaining parts\n", 182 | " else:\n", 183 | " mfcc = mfcc[:, :max_len]\n", 184 | " \n", 185 | " return mfcc" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "\n", 195 | "def save_data_to_array(path, max_len, n_mfcc):\n", 196 | " #labels,_, _ = get_labels()\n", 197 | "\n", 198 | " #for label in labels:\n", 199 | " # Init mfcc vectors\n", 200 | " mfcc_vectors = []\n", 201 | " \n", 202 | " for wav in path:\n", 203 | " wavfile=f'E:/speech_recognition/data/{wav}'\n", 204 | " print(wavfile)\n", 205 | " mfcc = wav2mfcc(wavfile, 
max_len, n_mfcc)\n", 206 | " mfcc_vectors.append(mfcc)\n", 207 | " np.save('E:/speech_recognition/wav', mfcc_vectors)\n", 208 | " return mfcc_vectors" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 12, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "E:/speech_recognition/data/audio_1.wav\n", 221 | "E:/speech_recognition/data/audio_2.wav\n", 222 | "E:/speech_recognition/data/audio_3.wav\n", 223 | "(20, 200)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "mfcc_vec=save_data_to_array(root_dir,200,20)\n", 229 | "print(mfcc_vec[1].shape)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 13, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "3\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "x= np.load('E:/speech_recognition/wav.npy')\n", 247 | "print(len(x))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 14, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Getting first arrays\n", 257 | "#X = np.load(labels[0] + '.npy')\n", 258 | "#y = np.zeros(X.shape[0])\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 15, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "def get_train_test(split_ratio=0.6, random_state=42):\n", 268 | " # Get available labels\n", 269 | " labels, indices, _ = get_labels()\n", 270 | "\n", 271 | " # Getting first arrays\n", 272 | " X = np.load('E:/speech_recognition/wav.npy')\n", 273 | " y = np.zeros(X.shape[0])\n", 274 | " #print(y)\n", 275 | " # Append all of the dataset into one single array, same goes for y\n", 276 | " for i, label in enumerate(labels[1:]):\n", 277 | " x = np.load('E:/speech_recognition/wav.npy')\n", 278 | " X = np.vstack((X, x))\n", 279 | " y = np.append(y, np.full(x.shape[0], fill_value= (i + 1)))\n", 280 | " print(y)\n", 281 | " assert X.shape[0] == len(y)\n", 282 | "\n", 283 | " return train_test_split(X, y, test_size=0.2,shuffle=True)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "#get_train_test(split_ratio=0.6, random_state=42)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 17, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "X_train, X_test, y_train, y_test =train_test_split(x,label_indices,test_size=0.1,\n", 302 | " random_state=42)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 18, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "(2, 20, 200)" 314 | ] 315 | }, 316 | "execution_count": 18, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "X_train.shape" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 19, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# # Feature dimension\n", 332 | "channels = 1\n", 333 | "max_len = 200\n", 334 | "buckets = 20\n", 335 | "epochs = 48\n", 336 | "batch_size = 100\n", 337 | "\n", 338 | "num_classes = 3\n", 339 | "\n", 340 | "#X_train = X_train.reshape(X_train.shape[0],buckets, max_len, channels)\n", 341 | "#X_test = X_test.reshape(X_test.shape[0],buckets,max_len, channels)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 20, 347 | "metadata": {}, 348 
| "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "(2, 20, 200) (1, 20, 200)\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "print(X_train.shape,X_test.shape)\n", 359 | "#plt.imshow(X_train[:, :, :, 0])" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 21, 365 | "metadata": { 366 | "scrolled": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "[1 2] [0]\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "print(y_train,y_test)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 22, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "(2,) (1,)\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print(y_train.shape,y_test.shape)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 23, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "[[0. 1. 0.]\n", 408 | " [0. 0. 1.]] [[1.]]\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "y_train_hot = to_categorical(y_train)\n", 414 | "y_test_hot = to_categorical(y_test)\n", 415 | "print(y_train_hot,y_test_hot)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 24, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "(2, 3)" 427 | ] 428 | }, 429 | "execution_count": 24, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "y_train_hot.shape" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 25, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "(1, 1)" 447 | ] 448 | }, 449 | "execution_count": 25, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "y_test_hot.shape" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 26, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "#from preprocess import *\n", 465 | "import keras\n", 466 | "from keras.models import Sequential\n", 467 | "from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM\n", 468 | "from keras.utils import to_categorical\n", 469 | "#import wandb\n", 470 | "#from wandb.keras import WandbCallback\n", 471 | "import matplotlib.pyplot as plt" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 27, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "num_classes=3\n", 481 | "\n", 482 | "#build a simple cnn model\n", 483 | "\n", 484 | "model = Sequential()\n", 485 | "model.add(Flatten(input_shape=(buckets,max_len)))\n", 486 | "#model.add(Flatten())\n", 487 | "model.add(Dense(3, activation='softmax'))\n", 488 | "model.compile(loss=\"sparse_categorical_crossentropy\",\n", 489 | " optimizer=\"adam\",\n", 490 | " metrics=['accuracy'])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 28, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "Model: \"sequential_1\"\n", 503 | "_________________________________________________________________\n", 504 | "Layer (type) Output Shape Param # \n", 505 | "=================================================================\n", 506 | "flatten_1 (Flatten) (None, 4000) 0 \n", 507 | 
"_________________________________________________________________\n", 508 | "dense_1 (Dense) (None, 3) 12003 \n", 509 | "=================================================================\n", 510 | "Total params: 12,003\n", 511 | "Trainable params: 12,003\n", 512 | "Non-trainable params: 0\n", 513 | "_________________________________________________________________\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "model.summary()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 36, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "2\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "print(X_train.shape[0])\n", 536 | "X_train = X_train.reshape(X_train.shape[0],buckets, max_len)\n", 537 | "X_test = X_test.reshape(X_test.shape[0],buckets,max_len)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 30, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/plain": [ 548 | "(2, 20, 200)" 549 | ] 550 | }, 551 | "execution_count": 30, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "X_train.shape" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 31, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "(1, 20, 200)" 569 | ] 570 | }, 571 | "execution_count": 31, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "X_test.shape" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 32, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "name": "stdout", 587 | "output_type": "stream", 588 | "text": [ 589 | "Train on 2 samples, validate on 1 samples\n", 590 | "Epoch 1/48\n", 591 | "2/2 [==============================] - 0s 31ms/step - loss: 122.4453 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 1.0000\n", 592 | "Epoch 2/48\n", 593 | "2/2 [==============================] - 0s 998us/step - loss: 10.0779 - accuracy: 0.0000e+00 - val_loss: 8.1836 - val_accuracy: 0.0000e+00\n", 594 | "Epoch 3/48\n", 595 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 48.7357 - val_accuracy: 0.0000e+00\n", 596 | "Epoch 4/48\n", 597 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 81.9535 - val_accuracy: 0.0000e+00\n", 598 | "Epoch 5/48\n", 599 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 110.0301 - val_accuracy: 0.0000e+00\n", 600 | "Epoch 6/48\n", 601 | "2/2 [==============================] - 0s 998us/step - loss: 2.9802e-07 - accuracy: 1.0000 - val_loss: 134.2287 - val_accuracy: 0.0000e+00\n", 602 | "Epoch 7/48\n", 603 | "2/2 [==============================] - 0s 997us/step - loss: 1.5715e-04 - accuracy: 1.0000 - val_loss: 155.3666 - val_accuracy: 0.0000e+00\n", 604 | "Epoch 8/48\n", 605 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0345 - accuracy: 1.0000 - val_loss: 175.0989 - val_accuracy: 0.0000e+00\n", 606 | "Epoch 9/48\n", 607 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0756 - accuracy: 1.0000 - val_loss: 194.8384 - val_accuracy: 0.0000e+00\n", 608 | "Epoch 10/48\n", 609 | "2/2 [==============================] - 0s 997us/step - loss: 1.2372e-04 - accuracy: 1.0000 - val_loss: 212.4595 - val_accuracy: 0.0000e+00\n", 610 | "Epoch 11/48\n", 611 | 
"2/2 [==============================] - 0s 997us/step - loss: 3.5763e-07 - accuracy: 1.0000 - val_loss: 228.2513 - val_accuracy: 0.0000e+00\n", 612 | "Epoch 12/48\n", 613 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 242.4503 - val_accuracy: 0.0000e+00\n", 614 | "Epoch 13/48\n", 615 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 255.2506 - val_accuracy: 0.0000e+00\n", 616 | "Epoch 14/48\n", 617 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 266.8147 - val_accuracy: 0.0000e+00\n", 618 | "Epoch 15/48\n", 619 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 277.2800 - val_accuracy: 0.0000e+00\n", 620 | "Epoch 16/48\n", 621 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 286.7642 - val_accuracy: 0.0000e+00\n", 622 | "Epoch 17/48\n", 623 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 295.3692 - val_accuracy: 0.0000e+00\n", 624 | "Epoch 18/48\n", 625 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 303.1837 - val_accuracy: 0.0000e+00\n", 626 | "Epoch 19/48\n", 627 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 310.2859 - val_accuracy: 0.0000e+00\n", 628 | "Epoch 20/48\n", 629 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 316.7448 - val_accuracy: 0.0000e+00\n", 630 | "Epoch 21/48\n", 631 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 322.6215 - val_accuracy: 0.0000e+00\n", 632 | "Epoch 22/48\n", 633 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 327.9705 - val_accuracy: 0.0000e+00\n", 634 | "Epoch 23/48\n", 635 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 332.8409 - val_accuracy: 0.0000e+00\n", 636 | "Epoch 24/48\n", 637 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 337.2766 - val_accuracy: 0.0000e+00\n", 638 | "Epoch 25/48\n", 639 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 341.3170 - val_accuracy: 0.0000e+00\n", 640 | "Epoch 26/48\n", 641 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 344.9977 - val_accuracy: 0.0000e+00\n", 642 | "Epoch 27/48\n", 643 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 348.3513 - val_accuracy: 0.0000e+00\n", 644 | "Epoch 28/48\n", 645 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 351.4067 - val_accuracy: 0.0000e+00\n", 646 | "Epoch 29/48\n", 647 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 354.1908 - val_accuracy: 0.0000e+00\n", 648 | "Epoch 30/48\n", 649 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 356.7273 - val_accuracy: 0.0000e+00\n", 650 | "Epoch 31/48\n", 651 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 
359.0380 - val_accuracy: 0.0000e+00\n", 652 | "Epoch 32/48\n", 653 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 361.1434 - val_accuracy: 0.0000e+00\n", 654 | "Epoch 33/48\n", 655 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 363.0612 - val_accuracy: 0.0000e+00\n", 656 | "Epoch 34/48\n", 657 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 364.8080 - val_accuracy: 0.0000e+00\n", 658 | "Epoch 35/48\n", 659 | "2/2 [==============================] - 0s 2ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 366.3990 - val_accuracy: 0.0000e+00\n", 660 | "Epoch 36/48\n", 661 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 367.8478 - val_accuracy: 0.0000e+00\n", 662 | "Epoch 37/48\n", 663 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 369.1672 - val_accuracy: 0.0000e+00\n", 664 | "Epoch 38/48\n", 665 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 370.3681 - val_accuracy: 0.0000e+00\n", 666 | "Epoch 39/48\n", 667 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 371.4615 - val_accuracy: 0.0000e+00\n", 668 | "Epoch 40/48\n", 669 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 372.4566 - val_accuracy: 0.0000e+00\n", 670 | "Epoch 41/48\n", 671 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 373.3623 - val_accuracy: 0.0000e+00\n", 672 | "Epoch 42/48\n", 673 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 374.1862 - val_accuracy: 0.0000e+00\n", 674 | "Epoch 43/48\n", 675 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 374.9359 - val_accuracy: 0.0000e+00\n", 676 | "Epoch 44/48\n", 677 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 375.6178 - val_accuracy: 0.0000e+00\n", 678 | "Epoch 45/48\n", 679 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 376.2379 - val_accuracy: 0.0000e+00\n", 680 | "Epoch 46/48\n", 681 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 376.8019 - val_accuracy: 0.0000e+00\n", 682 | "Epoch 47/48\n", 683 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 377.3149 - val_accuracy: 0.0000e+00\n", 684 | "Epoch 48/48\n", 685 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 377.7810 - val_accuracy: 0.0000e+00\n" 686 | ] 687 | }, 688 | { 689 | "data": { 690 | "text/plain": [ 691 | "" 692 | ] 693 | }, 694 | "execution_count": 32, 695 | "metadata": {}, 696 | "output_type": "execute_result" 697 | } 698 | ], 699 | "source": [ 700 | "model.fit(X_train, y_train, epochs=epochs, \n", 701 | " validation_data=(X_test,y_test),\n", 702 | " )" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 33, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "(1, 20, 200)\n", 715 | "[1]\n" 716 | ] 717 | } 718 | ], 719 | 
"source": [ 720 | "# make a prediction\n", 721 | "import cv2\n", 722 | "from numpy import zeros, newaxis\n", 723 | "#print(mfcc_vec[1].shape)\n", 724 | "\n", 725 | "\n", 726 | "#.........take a random voice..............\n", 727 | "mfcc_1=mfcc_vec[1][newaxis,:, :,]\n", 728 | "print(mfcc_1.shape)\n", 729 | "predict = model.predict_classes(mfcc_1)\n", 730 | "print(predict)" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 34, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "আমি ডাটা টিমের সদস্য\n" 743 | ] 744 | } 745 | ], 746 | "source": [ 747 | "if predict==[0]:\n", 748 | " print(\"আমি এসএসএল ওয়ারলেসে জব\")\n", 749 | "elif predict==[1]:\n", 750 | " print(\"আমি ডাটা টিমের সদস্য\")\n", 751 | "else:\n", 752 | " print(\"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\")" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 35, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "2\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "x=[ 5,40, 51, 81, 12, 46 ,12]\n", 770 | "print(len(x[:2]))" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [] 786 | } 787 | ], 788 | "metadata": { 789 | "kernelspec": { 790 | "display_name": "Python 3", 791 | "language": "python", 792 | "name": "python3" 793 | }, 794 | "language_info": { 795 | "codemirror_mode": { 796 | "name": "ipython", 797 | "version": 3 798 | }, 799 | "file_extension": ".py", 800 | "mimetype": "text/x-python", 801 | "name": "python", 802 | "nbconvert_exporter": "python", 803 | "pygments_lexer": "ipython3", 804 | "version": "3.7.4" 805 | } 806 | }, 807 | "nbformat": 4, 808 | "nbformat_minor": 2 809 | } 810 | -------------------------------------------------------------------------------- /notebook/voice-recognition (2).ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport tensorflow as tf\n\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nimport tensorflow as tf\n\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.layers import (Input, Lambda, BatchNormalization,LSTM,TimeDistributed,Activation,Dense)\nfrom tensorflow.keras.optimizers import SGD\nfrom tensorflow.keras.callbacks import ModelCheckpoint \nimport os\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport librosa\nimport librosa.display\nimport IPython.display as ipd\nimport os \n\n \nfrom sklearn.model_selection import train_test_split\nfrom keras.utils import to_categorical\nfrom tqdm import tqdm\nimport random\n\nvoice_data=[]\nfor dirname, _, filenames in os.walk('../input/voicerecognitiondata/asr_bengali/data'):\n for filename in filenames:\n voice_data.append(os.path.join(dirname, filename))\n #print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"cf7be286-df93-4c2f-997a-98912a353525","_cell_guid":"a4ebc242-53a4-4cca-a062-bd7f3b649c51","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:26.705295Z","iopub.execute_input":"2021-07-19T05:37:26.705623Z","iopub.status.idle":"2021-07-19T05:37:35.00382Z","shell.execute_reply.started":"2021-07-19T05:37:26.705548Z","shell.execute_reply":"2021-07-19T05:37:35.002868Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tf.test.is_gpu_available()","metadata":{"_uuid":"a6f94e4e-94af-4ebb-b77e-625b50b16ed2","_cell_guid":"e1b4ef81-2f80-4867-a262-37c47df78e12","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:35.006996Z","iopub.execute_input":"2021-07-19T05:37:35.00727Z","iopub.status.idle":"2021-07-19T05:37:36.75772Z","shell.execute_reply.started":"2021-07-19T05:37:35.007242Z","shell.execute_reply":"2021-07-19T05:37:36.756903Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#..........fixed the seed................\ndef reset_random_seeds():\n seed_num=9\n os.environ['PYTHONHASHSEED']=str(seed_num)\n tf.random.set_seed(seed_num)\n np.random.seed(seed_num)\n random.seed(seed_num)","metadata":{"_uuid":"34242cdc-fb17-47f6-89f2-37bf7985142c","_cell_guid":"104d8bca-8e62-42aa-b938-531c62c8517a","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:36.75955Z","iopub.execute_input":"2021-07-19T05:37:36.76007Z","iopub.status.idle":"2021-07-19T05:37:36.768183Z","shell.execute_reply.started":"2021-07-19T05:37:36.76003Z","shell.execute_reply":"2021-07-19T05:37:36.767435Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#read voice data\n#print(len(voice_data))\ntsv_data=pd.read_csv(\"../input/voicerecognitiondata/asr_bengali/utt_spk_text.tsv\",sep=\"\\t\",header=0)\n#from kaggle_datasets import KaggleDatasets\n#Datset=\"/kaggle/input/voicerecognitiondata/asr_bengali/data\"\n#GCS_PATH = KaggleDatasets().get_gcs_path(Datset )\n\n#train_filenames1 = 
tf.io.gfile.glob(GCS_PATH1 + '*/*.flac')\ntsv_data.head()","metadata":{"_uuid":"6e4bd403-25f5-4cb3-901e-49bcf8d823f0","_cell_guid":"79de69ef-09da-43c3-b292-435c66e14f7f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:36.769696Z","iopub.execute_input":"2021-07-19T05:37:36.770231Z","iopub.status.idle":"2021-07-19T05:37:37.190023Z","shell.execute_reply.started":"2021-07-19T05:37:36.770185Z","shell.execute_reply":"2021-07-19T05:37:37.188494Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#add header in tsv file\n#tsv_data_1=pd.DataFrame(tsv_data,columns = [\"wav_name\",\"id\",\"label\"])\n\n#tsv_data_1.head()\ntsv_data.columns=[\"wav_name\",\"id\",\"label\"]\ntsv_data_1=tsv_data","metadata":{"_uuid":"72a86d78-dba9-49ce-8ce3-d1cb1b90eb44","_cell_guid":"e357c0a6-02ae-4a78-ad9e-9c653d22eb79","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.191303Z","iopub.execute_input":"2021-07-19T05:37:37.191646Z","iopub.status.idle":"2021-07-19T05:37:37.196208Z","shell.execute_reply.started":"2021-07-19T05:37:37.191611Z","shell.execute_reply":"2021-07-19T05:37:37.195287Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tsv_data.shape[0]","metadata":{"_uuid":"fd7f778f-139d-423d-aeb2-91f3faf22130","_cell_guid":"636213e0-ac29-40f5-bbcd-572b6410aecf","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.197678Z","iopub.execute_input":"2021-07-19T05:37:37.19805Z","iopub.status.idle":"2021-07-19T05:37:37.206819Z","shell.execute_reply.started":"2021-07-19T05:37:37.19801Z","shell.execute_reply":"2021-07-19T05:37:37.205873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(voice_data[0])\nprint(voice_data[0].split(\"/\")[-1].replace(\".flac\",\"\"))","metadata":{"_uuid":"54546114-927b-40d9-b9df-a37894473b18","_cell_guid":"4a6eb836-c182-4a7b-bff3-dc70498a93c4","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.208286Z","iopub.execute_input":"2021-07-19T05:37:37.208619Z","iopub.status.idle":"2021-07-19T05:37:37.216147Z","shell.execute_reply.started":"2021-07-19T05:37:37.208585Z","shell.execute_reply":"2021-07-19T05:37:37.215248Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#import zipfile\n#with zipfile.ZipFile(\"/kaggle/input/voicerecognitiondata/asr_bengali/data/\",'r') as zip_ref:\n #zip_ref.extractall(\"/kaggle/input/voicerecognitiondata/\")","metadata":{"_uuid":"4c491c04-19eb-43ed-b8b4-521adeb02d58","_cell_guid":"f6298f0a-f43f-46fe-8fc7-dc369d3956e3","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.218769Z","iopub.execute_input":"2021-07-19T05:37:37.219146Z","iopub.status.idle":"2021-07-19T05:37:37.22355Z","shell.execute_reply.started":"2021-07-19T05:37:37.21911Z","shell.execute_reply":"2021-07-19T05:37:37.222577Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#convert voice corpuses from .flac to wav format\n\nfrom pydub import AudioSegment\n\n#save wav file this location\n#wav_root=\"../input/voicerecognitiondata/asr_bengali/data/wav\"\n\nos.makedirs('../outputs')\nfor root, _, files in os.walk('../input/voicerecognitiondata/asr_bengali/data'):\n for file in files:\n wav_name = file.replace(\".flac\",\".wav\")\n #print(wav_name)\n #print(root)\n 
try:\n # convert wav to mp3 \n sound = AudioSegment.from_file(\"{}/{}\".format(root,file))\n #print(sound)\n sound.export(\"{}/{}\".format('../outputs', wav_name), format=\"wav\")\n except Exception as e:\n pass","metadata":{"_uuid":"0d2fe713-b9fd-4919-b468-c9d8d8732639","_cell_guid":"04366df9-30ff-4d79-80e6-43720fbd2690","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.225557Z","iopub.execute_input":"2021-07-19T05:37:37.225935Z","iopub.status.idle":"2021-07-19T05:50:23.340872Z","shell.execute_reply.started":"2021-07-19T05:37:37.225899Z","shell.execute_reply":"2021-07-19T05:50:23.339697Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.........read data wav data..................\n#for dirname, _, filenames in os.walk(\"../outputs\"):\n #for filename in filenames:\n #voice_data.append(\n #print(os.path.join(dirname, filename))","metadata":{"_uuid":"11ada899-21c7-47b3-b576-ba5b489708ea","_cell_guid":"bd8ae802-3868-4207-89af-7b6e7e52cd1f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.342552Z","iopub.execute_input":"2021-07-19T05:50:23.342943Z","iopub.status.idle":"2021-07-19T05:50:23.348409Z","shell.execute_reply.started":"2021-07-19T05:50:23.3429Z","shell.execute_reply":"2021-07-19T05:50:23.347341Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# ............Mel-frequency cepstral coefficients..........................\n#and converting vector form that are readable by deep learning model\n\ndef wav2mfcc(file_path,max_len, n_mfcc):\n wave, sr = librosa.load(file_path, mono=True, sr=None)\n #print(wave)\n wave = np.asfortranarray(wave[::3])\n #print(wave)\n #print(sr)\n \n mfcc = librosa.feature.mfcc(wave,sr=sr,n_mfcc=n_mfcc) #sr means sampling rate=16000\n mfcc_1=mfcc\n #print(mfcc.shape[0],mfcc.shape[1])\n \n # If maximum length exceeds mfcc lengths then pad the remaining ones\n if (max_len > mfcc.shape[1]):\n pad_width = max_len - mfcc.shape[1]\n mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')\n\n # Else cutoff the remaining parts\n else:\n mfcc = mfcc[:, :max_len]\n \n return mfcc,wave,mfcc_1.shape[0],mfcc_1.shape[1]","metadata":{"_uuid":"6d166be9-ab1b-40aa-b71c-3d810004e8c9","_cell_guid":"ff14b7b8-81a0-4d03-8de3-e66cddbe46fe","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.351817Z","iopub.execute_input":"2021-07-19T05:50:23.352247Z","iopub.status.idle":"2021-07-19T05:50:23.385642Z","shell.execute_reply.started":"2021-07-19T05:50:23.352205Z","shell.execute_reply":"2021-07-19T05:50:23.384847Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#ordering the voice corpuses\nfrom tkinter import Tcl\n\ndef save_data_to_array(path, max_len, n_mfcc):\n #labels,_, _ = get_labels()\n\n #for label in labels:\n # Init mfcc vectors\n \n #read audio file\n audio=[]\n for wav in path:\n wavfile=f'../outputs/{wav}'\n #wavfile=Tcl().call('lsort', '-dict',wavfile)\n #print(wavfile)\n audio.append(wavfile)\n \n #.............sort audio file.......\n sort_audio=Tcl().call('lsort', '-dict',audio) \n print(sort_audio[0])\n #print(audio)\n audio_path=[]\n frequency_m=[]\n amplitude_m=[]\n mfcc_vectors = []\n for path in sort_audio:\n mfcc,wave,fre_sh,amp_sh= wav2mfcc(path, max_len, n_mfcc)\n audio_path.append(path.split(\"/\")[-1].replace(\".wav\",\"\"))\n mfcc_vectors.append(mfcc)\n 
frequency_m.append(fre_sh)\n amplitude_m.append(amp_sh)\n \n print(max(frequency_m)) \n print(max(amplitude_m))\n #np.save('E:/speech_recognition/wav', mfcc_vectors)\n return mfcc_vectors,wave,audio,audio_path","metadata":{"_uuid":"98255c23-deea-4265-bcdf-8d5672f28091","_cell_guid":"ded85699-8488-412a-b7bb-bfadb35a38fe","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.387156Z","iopub.execute_input":"2021-07-19T05:50:23.387525Z","iopub.status.idle":"2021-07-19T05:50:23.435896Z","shell.execute_reply.started":"2021-07-19T05:50:23.387489Z","shell.execute_reply":"2021-07-19T05:50:23.435168Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#read wav file path\nroot_dir=os.listdir(\"../outputs\")\nmfcc_vec,wave,audio,audio_path=save_data_to_array(root_dir,177,34)#34 means time frequency and 384 amplitude\n#print(mfcc_vec[0].shape[0])","metadata":{"_uuid":"c5de1ed8-537e-47c7-8197-9df291c3a668","_cell_guid":"e5d77018-df63-4ba9-b6eb-7083ab96d90e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.437111Z","iopub.execute_input":"2021-07-19T05:50:23.437489Z","iopub.status.idle":"2021-07-19T05:52:09.518913Z","shell.execute_reply.started":"2021-07-19T05:50:23.437452Z","shell.execute_reply":"2021-07-19T05:52:09.51793Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"data={\"mfcc\":mfcc_vec,\n \"wav_name\":audio_path}\ndf=pd.DataFrame(data,columns=['mfcc',\"wav_name\"])\ndf.head()","metadata":{"_uuid":"277535a4-fb35-471c-9dee-161e3d5fe34c","_cell_guid":"50d26627-e379-4c81-8356-77d90ae05ed0","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.523137Z","iopub.execute_input":"2021-07-19T05:52:09.525442Z","iopub.status.idle":"2021-07-19T05:52:09.854391Z","shell.execute_reply.started":"2021-07-19T05:52:09.525394Z","shell.execute_reply":"2021-07-19T05:52:09.853652Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"_uuid":"9ca32317-e68f-4542-9961-429e83e02502","_cell_guid":"c2996eb6-f113-40c6-94ae-27f699de514f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.855662Z","iopub.execute_input":"2021-07-19T05:52:09.856022Z","iopub.status.idle":"2021-07-19T05:52:09.868292Z","shell.execute_reply.started":"2021-07-19T05:52:09.855987Z","shell.execute_reply":"2021-07-19T05:52:09.867482Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.shape[0]","metadata":{"_uuid":"be98e034-73d9-49d6-962c-da21275f1212","_cell_guid":"e70d4771-7615-4421-ad52-504954b4ca11","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.870638Z","iopub.execute_input":"2021-07-19T05:52:09.87092Z","iopub.status.idle":"2021-07-19T05:52:09.876904Z","shell.execute_reply.started":"2021-07-19T05:52:09.870885Z","shell.execute_reply":"2021-07-19T05:52:09.875944Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tsv_data_1.head()","metadata":{"_uuid":"3adf1410-961e-452f-b307-9936d0453c35","_cell_guid":"15b01511-b552-466d-8a21-2cd0bee253a4","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.878389Z","iopub.execute_input":"2021-07-19T05:52:09.878747Z","iopub.status.idle":"2021-07-19T05:52:09.89129Z","shell.execute_reply.starte
d":"2021-07-19T05:52:09.878703Z","shell.execute_reply":"2021-07-19T05:52:09.890469Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df.shape[0])\nprint(tsv_data_1.shape[0])","metadata":{"_uuid":"346b1baf-1d5b-42bb-992c-3dbcedf8d54f","_cell_guid":"75b906df-7cd4-4a0d-b188-ad6680fba4d7","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.89251Z","iopub.execute_input":"2021-07-19T05:52:09.892891Z","iopub.status.idle":"2021-07-19T05:52:09.898238Z","shell.execute_reply.started":"2021-07-19T05:52:09.892856Z","shell.execute_reply":"2021-07-19T05:52:09.89734Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#merge the on basis of wav_name\ndf_new=pd.merge(df,tsv_data_1,on=\"wav_name\",how=\"outer\")\nprint(df_new.shape[0])","metadata":{"_uuid":"f5018410-30e5-43e7-b6bb-c39845b56438","_cell_guid":"a4d7e9f4-62a8-4761-9288-f33bddf487d1","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.899542Z","iopub.execute_input":"2021-07-19T05:52:09.900137Z","iopub.status.idle":"2021-07-19T05:52:10.03072Z","shell.execute_reply.started":"2021-07-19T05:52:09.900095Z","shell.execute_reply":"2021-07-19T05:52:10.029767Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df_new.isnull().sum())","metadata":{"_uuid":"5cddcf22-3f3c-47ea-a707-cef9deecf85f","_cell_guid":"06820be4-cc2d-4821-a4e1-98c3f07ddb44","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.035935Z","iopub.execute_input":"2021-07-19T05:52:10.038446Z","iopub.status.idle":"2021-07-19T05:52:10.109696Z","shell.execute_reply.started":"2021-07-19T05:52:10.038402Z","shell.execute_reply":"2021-07-19T05:52:10.10873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#now remove those rows which have a Nan value.\n\ndf_new.dropna(subset = [\"label\",\"mfcc\"], 
inplace=True)\ndf_new.head()","metadata":{"_uuid":"890cb3a6-6370-4829-8cab-805d9ef02021","_cell_guid":"5a361a44-6de5-440e-bd06-6a7e5616e9cd","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.113436Z","iopub.execute_input":"2021-07-19T05:52:10.115408Z","iopub.status.idle":"2021-07-19T05:52:10.575568Z","shell.execute_reply.started":"2021-07-19T05:52:10.115369Z","shell.execute_reply":"2021-07-19T05:52:10.574782Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df_new.isnull().sum())","metadata":{"_uuid":"61d8754d-3585-407c-bace-6e7d6178368e","_cell_guid":"acbc5a81-33d3-47d8-9999-c84f6937f8cb","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.576832Z","iopub.execute_input":"2021-07-19T05:52:10.577177Z","iopub.status.idle":"2021-07-19T05:52:10.589147Z","shell.execute_reply.started":"2021-07-19T05:52:10.577147Z","shell.execute_reply":"2021-07-19T05:52:10.588093Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_new.shape[0]","metadata":{"_uuid":"f7b5da99-4af8-402d-b883-ac58067a0198","_cell_guid":"9f8a4bf9-776f-46cf-b781-1bac89999eb8","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.593604Z","iopub.execute_input":"2021-07-19T05:52:10.593864Z","iopub.status.idle":"2021-07-19T05:52:10.599153Z","shell.execute_reply.started":"2021-07-19T05:52:10.59384Z","shell.execute_reply":"2021-07-19T05:52:10.598135Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#find a label by mapping\n#new_label=[]\n#for i in range(df.shape[0]):\n #for j in range(tsv_data_1.shape[0]):\n \n #if df[\"audio_path\"][i]==tsv_data_1[\"wav_name\"][j]:\n \n #new_label.append(tsv_data_1[\"label\"][j])\n #break \n #else:\n #continue","metadata":{"_uuid":"52308c78-1cf0-4fae-b73e-2c75dc507339","_cell_guid":"a52f99df-b724-449e-8097-719d85037c1e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.601515Z","iopub.execute_input":"2021-07-19T05:52:10.602234Z","iopub.status.idle":"2021-07-19T05:52:10.606497Z","shell.execute_reply.started":"2021-07-19T05:52:10.602197Z","shell.execute_reply":"2021-07-19T05:52:10.605576Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#df[\"transcript_label\"]=new_label","metadata":{"_uuid":"bffa51d7-34fd-4da5-a241-d166d68ed676","_cell_guid":"cf8ce979-d0fa-492b-ae9c-f642459cb537","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.608054Z","iopub.execute_input":"2021-07-19T05:52:10.60846Z","iopub.status.idle":"2021-07-19T05:52:10.614427Z","shell.execute_reply.started":"2021-07-19T05:52:10.608426Z","shell.execute_reply":"2021-07-19T05:52:10.613645Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#..........clean text labels like html tags,quotation etc.............\nimport re\ndef clean(text):\n \n #text=text.map(lambda i:re.sub(r'[\\\\|!|?|\\'|\"|#|,|।|-|.|)|(|\\|/|\"\"|\\n|]',r' ',str(i)))\n #text=text.map(lambda i:re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(i)))\n text=re.sub(r'[\\\\|!|?|\\'|\"|#|,|।|-|.|)|(|\\|/|\"\"|\\n|]',r' ',str(text))\n text=re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(text))\n text=re.sub(r\"[\\u200d|’|\\x93|\\x93|\\u200c|v|b|s|]\", '', str(text))\n \n text=text.replace(\"।\",\"\")\n text=text.replace('-','')\n 
text=text.replace(':','')\n text=text.replace(\"\\x94\",\"\")\n return text\n\ndf_new[\"label\"]=df_new[\"label\"].apply(clean)\n#df_new[\"label\"]=df_new[\"label\"].apply(lambda i:re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(i)))\n#df_new[\"label\"]=clean(df_new[\"label\"])\n#print(clean(df_new[\"label\"][32:35]))\ndf_new.head(3)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:10.615702Z","iopub.execute_input":"2021-07-19T05:52:10.616129Z","iopub.status.idle":"2021-07-19T05:52:10.875533Z","shell.execute_reply.started":"2021-07-19T05:52:10.616091Z","shell.execute_reply":"2021-07-19T05:52:10.874595Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def csv_file(df_new):\n return df_new\n\n#csv_file(df_new)","metadata":{"_uuid":"0a55e0f4-e406-4544-9493-805f06ba2178","_cell_guid":"290c0be7-ba61-4f1d-9313-d9fe16f98408","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.876762Z","iopub.execute_input":"2021-07-19T05:52:10.877119Z","iopub.status.idle":"2021-07-19T05:52:10.882317Z","shell.execute_reply.started":"2021-07-19T05:52:10.877084Z","shell.execute_reply":"2021-07-19T05:52:10.881532Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"final_df=df_new","metadata":{"_uuid":"b8543e14-59d9-4388-944c-4d627e2f359d","_cell_guid":"1cf0e766-15d2-4722-acfa-01ccf53733ea","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.883472Z","iopub.execute_input":"2021-07-19T05:52:10.88394Z","iopub.status.idle":"2021-07-19T05:52:10.891006Z","shell.execute_reply.started":"2021-07-19T05:52:10.883904Z","shell.execute_reply":"2021-07-19T05:52:10.890243Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"final_df.to_csv('final_df.csv',index=False)","metadata":{"_uuid":"a52f017d-9b8c-45e1-96db-7887ef77dec4","_cell_guid":"ef40f9cf-f155-4d44-b0c8-36ebf6002d90","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.892252Z","iopub.execute_input":"2021-07-19T05:52:10.892639Z","iopub.status.idle":"2021-07-19T05:52:15.724376Z","shell.execute_reply.started":"2021-07-19T05:52:10.892602Z","shell.execute_reply":"2021-07-19T05:52:15.723513Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#to reduce size the sample for memory issue\nfinal_df=final_df[0:2000]\nfinal_df.head()","metadata":{"_uuid":"5340b791-82e6-4e99-b5f6-bc8e692d3826","_cell_guid":"d931c769-34ca-4414-8385-b09db485a72f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:15.725527Z","iopub.execute_input":"2021-07-19T05:52:15.725883Z","iopub.status.idle":"2021-07-19T05:52:16.0232Z","shell.execute_reply.started":"2021-07-19T05:52:15.725846Z","shell.execute_reply":"2021-07-19T05:52:16.022189Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#...measure a label of label string.....................\nfinal_df[\"label_len\"]=[len(lab) for lab in final_df[\"label\"]]\nfinal_df.head()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.024721Z","iopub.execute_input":"2021-07-19T05:52:16.025098Z","iopub.status.idle":"2021-07-19T05:52:16.311501Z","shell.execute_reply.started":"2021-07-19T05:52:16.025062Z","shell.execute_reply":"2021-07-19T05:52:16.310707Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#....remove rows which have 
label length greater input length............\n\n#i=final_df[len(final_df.label>120)].index\n\nfinal_df.drop(final_df[final_df.label_len>30].index, inplace=True)\nfinal_df.head()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.312682Z","iopub.execute_input":"2021-07-19T05:52:16.313045Z","iopub.status.idle":"2021-07-19T05:52:16.590531Z","shell.execute_reply.started":"2021-07-19T05:52:16.313009Z","shell.execute_reply":"2021-07-19T05:52:16.589587Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(final_df.shape)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.591867Z","iopub.execute_input":"2021-07-19T05:52:16.592217Z","iopub.status.idle":"2021-07-19T05:52:16.598427Z","shell.execute_reply.started":"2021-07-19T05:52:16.592181Z","shell.execute_reply":"2021-07-19T05:52:16.597543Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#convert bangla character to number value\nchar_map_str = \"\"\"\nঀ 0\nঁ 1\nং 2\nঃ 3\nঅ 4\nআ 5\nই 6\nঈ 7\nউ 8\nঊ 9\nঋ 10\nঌ 11\nএ 12\nঐ 13\nও 14\nঔ 15\nক 16\nখ 17\nগ 18\nঘ 19\nঙ 20\nচ 21\nছ 22\nজ 23\nঝ 24\nঞ 25\nট 26\nঠ 27\nড 28\nঢ 29\nণ 30\nত 31\nথ 32\nদ 33\nধ 34\nন 35\nপ 36\nফ 37\nব 38\nভ 39\nম 40\nয 41\nর 42\nল 43\nশ 44\nষ 45\nস 46\nহ 47\n় 48\nঽ 49\nা 50\nি 51\nী 52\nু 53\nূ 54\nৃ 55\nৄ 56\nে 57\nৈ 58\nো 59\nৌ 60\n্ 61\nৎ 62\nৗ 63\nড় 64\nঢ় 65\nয় 66\nৠ 67\n০ 68\n১ 69\n২ 70\n৩ 71\n৪ 72\n৫ 73\n৬ 74\n৭ 75\n৮ 76\n৯ 77\nৱ 78\n৲ 79\n৴ 80\n 81\n\"\"\"\n# the \"blank\" character is mapped to 81\nchar_map = {}\nindex_map = {}\nfor line in char_map_str.strip().split('\\n'):\n ch, index = line.split()\n char_map[ch] = int(index)\n index_map[int(index)] = ch\nindex_map[81] = ' '\n ","metadata":{"_uuid":"f9756570-932b-4a37-8756-78603ddcc9d7","_cell_guid":"0a76de59-0135-4be1-a7f5-b04aaee81b54","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.599782Z","iopub.execute_input":"2021-07-19T05:52:16.600433Z","iopub.status.idle":"2021-07-19T05:52:16.60787Z","shell.execute_reply.started":"2021-07-19T05:52:16.600391Z","shell.execute_reply":"2021-07-19T05:52:16.607042Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(char_map)\nprint(\"****************************************************\")\nprint(index_map)","metadata":{"_uuid":"d0f8178a-d571-4522-bded-e0a29913ee8f","_cell_guid":"f0416633-1ade-4d44-a964-511cfe287413","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.609002Z","iopub.execute_input":"2021-07-19T05:52:16.60952Z","iopub.status.idle":"2021-07-19T05:52:16.619238Z","shell.execute_reply.started":"2021-07-19T05:52:16.609485Z","shell.execute_reply":"2021-07-19T05:52:16.618192Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_number_of_char_classes(char_map):\n ## TODO would be better to check with dataset (once cleaned)\n num_classes = len(char_map)+1 #need +1 for ctc null char +1 pad\n return 
num_classes\nget_number_of_char_classes(char_map)","metadata":{"_uuid":"2dae4c78-08c1-4da8-8e6f-cfaab998e8aa","_cell_guid":"24f1a361-5aad-4327-8bfd-4cf41c740b40","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.620656Z","iopub.execute_input":"2021-07-19T05:52:16.621098Z","iopub.status.idle":"2021-07-19T05:52:16.629562Z","shell.execute_reply.started":"2021-07-19T05:52:16.621064Z","shell.execute_reply":"2021-07-19T05:52:16.62874Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def text_to_int_sequence(text):\n \"\"\" Convert text to an integer sequence \"\"\"\n int_sequence = []\n for c in text:\n if c == ' ':\n ch = char_map['']\n else:\n ch = char_map[c]\n int_sequence.append(ch)\n return int_sequence\n\ndef int_sequence_to_text(int_sequence):\n \"\"\" Convert an integer sequence to text \"\"\"\n text = []\n for c in int_sequence:\n ch = index_map[c]\n text.append(ch)\n return text","metadata":{"_uuid":"dab382c1-7af4-4e90-a7b0-370bebfcf9b0","_cell_guid":"a37c8388-35e2-429e-9f4e-595ff9bb9d0e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.630879Z","iopub.execute_input":"2021-07-19T05:52:16.631271Z","iopub.status.idle":"2021-07-19T05:52:16.639186Z","shell.execute_reply.started":"2021-07-19T05:52:16.631236Z","shell.execute_reply":"2021-07-19T05:52:16.638316Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#...........normalize the feature...............\ndef calc_feat_dim(window, max_freq):\n return int(0.001 * window * max_freq) + 1\n\ndef normalize_feature(feature, eps=1e-14):\n \n #feat_dim = calc_feat_dim(34,8000)\n #feats_mean = np.zeros((feat_dim,))\n #feats_std = np.ones((feat_dim,))\n \n feats = np.vstack(feature)\n feats_mean = np.mean(feats, axis=0)\n feats_std = np.std(feats, axis=0)\n \n return (feature - feats_mean) / (feats_std + eps)","metadata":{"_uuid":"c12e2897-481d-4ad2-9acf-7b76fd7140ae","_cell_guid":"7d7d2623-1a44-40d3-9dff-06ba8a74ad4f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.640579Z","iopub.execute_input":"2021-07-19T05:52:16.641018Z","iopub.status.idle":"2021-07-19T05:52:16.651051Z","shell.execute_reply.started":"2021-07-19T05:52:16.640984Z","shell.execute_reply":"2021-07-19T05:52:16.650224Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.........data preparation for model........\nfrom past.builtins import xrange\n\ndef map_input_data(final_df):\n \n max_length = max([x.shape[0] for x in final_df['mfcc']])\n max_string_length = max([len(x) for x in final_df['label']])\n #print(max_string_length)\n X_data = np.zeros([final_df.shape[0], max_length,177])#here 177 means input shape\n #print(X_data)\n labels = np.ones([final_df.shape[0], max_string_length])*81\n input_length = np.zeros([final_df.shape[0], 1])\n label_length = np.zeros([final_df.shape[0], 1])\n #print(len(label_length))\n for i in range(0, final_df.shape[0]):\n feat = final_df.iloc[i]['mfcc']\n input_length[i] = feat.shape[0]\n #print(input_length)\n X_data[i, :feat.shape[0], :] = feat\n #print(X_data[i, :feat.shape[0], :])\n #print(len(final_df.iloc[i]['utterance']))\n # calculate labels & label_length\n #label = np.array(final_df.iloc[i]['utterance'])\n y=final_df.iloc[i]['label']\n #y=[' '.join(y[i:i+19]) for i in xrange(0,len(y),19)] \n #print(y)\n label=np.array(text_to_int_sequence(y))\n #print((label))\n 
{"cell_type":"code","source":"#...........normalize the feature...............\ndef calc_feat_dim(window, max_freq):\n    return int(0.001 * window * max_freq) + 1\n\ndef normalize_feature(feature, eps=1e-14):\n    feats = np.vstack(feature)\n    feats_mean = np.mean(feats, axis=0)\n    feats_std = np.std(feats, axis=0)\n    return (feature - feats_mean) / (feats_std + eps)","metadata":{"_uuid":"c12e2897-481d-4ad2-9acf-7b76fd7140ae","_cell_guid":"7d7d2623-1a44-40d3-9dff-06ba8a74ad4f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.640579Z","iopub.execute_input":"2021-07-19T05:52:16.641018Z","iopub.status.idle":"2021-07-19T05:52:16.651051Z","shell.execute_reply.started":"2021-07-19T05:52:16.640984Z","shell.execute_reply":"2021-07-19T05:52:16.650224Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"#.........data preparation for model........\ndef map_input_data(final_df):\n\n    max_length = max([x.shape[0] for x in final_df['mfcc']])\n    max_string_length = max([len(x) for x in final_df['label']])\n\n    X_data = np.zeros([final_df.shape[0], max_length, 177])  # 177 = feature dimension per time step\n    labels = np.ones([final_df.shape[0], max_string_length]) * 81  # pad labels with index 81\n    input_length = np.zeros([final_df.shape[0], 1])\n    label_length = np.zeros([final_df.shape[0], 1])\n\n    for i in range(0, final_df.shape[0]):\n        feat = final_df.iloc[i]['mfcc']\n        input_length[i] = feat.shape[0]\n        X_data[i, :feat.shape[0], :] = feat\n\n        # calculate labels & label_length\n        y = final_df.iloc[i]['label']\n        label = np.array(text_to_int_sequence(y))\n        labels[i, :len(label)] = label\n        label_length[i] = len(label)\n\n    X_data = normalize_feature(X_data, eps=1e-14)\n\n    # dummy target: the actual CTC loss is computed inside the model graph\n    outputs = {'ctc': np.zeros([final_df.shape[0]])}\n\n    print(max(label_length))\n    inputs = {'input': X_data,\n              'labels': labels,\n              'input_length': input_length,\n              'label_length': label_length\n              }\n\n    return (inputs, outputs)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.652279Z","iopub.execute_input":"2021-07-19T05:52:16.652656Z","iopub.status.idle":"2021-07-19T05:52:16.68111Z","shell.execute_reply.started":"2021-07-19T05:52:16.65262Z","shell.execute_reply":"2021-07-19T05:52:16.680254Z"},"trusted":true},"execution_count":null,"outputs":[]},
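{"cell_type":"markdown","source":"Added toy illustration with made-up numbers: `map_input_data` pads every label to the longest one with index 81 and records the true lengths in `label_length`, which the CTC loss needs in order to ignore the padding.","metadata":{}},
{"cell_type":"code","source":"# added toy illustration (made-up label ids) of the padding scheme above\nimport numpy as np\n\ntoy_labels = [np.array([4, 16, 50]), np.array([6, 31])]\nmax_len = max(len(l) for l in toy_labels)\npadded = np.full((len(toy_labels), max_len), 81)  # 81 = padding index\nlengths = np.zeros((len(toy_labels), 1))\nfor i, lab in enumerate(toy_labels):\n    padded[i, :len(lab)] = lab\n    lengths[i] = len(lab)\nprint(padded)   # [[ 4 16 50] [ 6 31 81]]\nprint(lengths)  # [[3.] [2.]]","metadata":{},"execution_count":null,"outputs":[]},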
model\n","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.40591Z","iopub.execute_input":"2021-07-19T05:52:17.406259Z","iopub.status.idle":"2021-07-19T05:52:17.415177Z","shell.execute_reply.started":"2021-07-19T05:52:17.406225Z","shell.execute_reply":"2021-07-19T05:52:17.414369Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def cnn_output_length(input_length, filter_size, border_mode, stride,\n dilation=1):\n \"\"\" Compute the length of the output sequence after 1D convolution along\n time. Note that this function is in line with the function used in\n Convolution1D class from Keras.\n Params:\n input_length (int): Length of the input sequence.\n filter_size (int): Width of the convolution kernel.\n border_mode (str): Only support `same` or `valid`.\n stride (int): Stride size used in 1D convolution.\n dilation (int)\n \"\"\"\n if input_length is None:\n return None\n assert border_mode in {'same', 'valid'}\n dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)\n if border_mode == 'same':\n output_length = input_length\n elif border_mode == 'valid':\n output_length = input_length - dilated_filter_size + 1\n #print(output_length )\n return (output_length + stride - 1) // stride\n #return output_length","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.416343Z","iopub.execute_input":"2021-07-19T05:52:17.416737Z","iopub.status.idle":"2021-07-19T05:52:17.425376Z","shell.execute_reply.started":"2021-07-19T05:52:17.416703Z","shell.execute_reply":"2021-07-19T05:52:17.424571Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.............................model-2...........................................\n#...........................bi-directional deep rnn.........\n\nfrom keras.layers import Dense, Activation, Bidirectional, Reshape, Flatten, Lambda, Input,\\\n Masking, Convolution1D, BatchNormalization, GRU, Conv1D, RepeatVector, Conv2D,Dropout\nfrom keras.optimizers import SGD, Adam\n\ndef final_model_1(input_dim, filters,kernel_size,conv_stride,\n conv_border_mode,output_dim,number_of_layers,units):\n \"\"\" Build a deep network for speech \n \"\"\"\n dropout_rate=0.5\n cell=GRU \n activation='tanh'\n \n # Main acoustic input\n input_data = Input(name='input', shape=(None, input_dim))\n # TODO: Specify the layers in your network\n conv_1d = Conv1D(filters, kernel_size,\n strides=conv_stride,\n padding=conv_border_mode,\n activation='relu',\n name='layer_1_conv',\n dilation_rate=1)(input_data)\n conv_bn = BatchNormalization(name='conv_batch_norm')(conv_1d)\n \n conv_bn=Dropout(0.25)(conv_bn)\n if number_of_layers == 1:\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n layer = BatchNormalization(name='bt_rnn_1')(layer)\n layer=Dropout(0.25)(layer)\n else:\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n \n layer = BatchNormalization(name='bt_rnn_1')(layer)\n layer=Dropout(0.25)(layer)\n\n for i in range(number_of_layers - 2):\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_{}'.format(i + 2), dropout=dropout_rate)(layer)\n \n layer = BatchNormalization(name='bt_rnn_{}'.format(i + 2))(layer)\n\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='final_layer_of_rnn')(layer)\n layer=Dropout(0.25)(layer)\n layer = 
{"cell_type":"code","source":"#.............................model-2...........................................\n#...........................deep rnn: 1D-conv front-end + stacked GRU layers.........\n\nfrom keras.layers import Dense, Activation, Bidirectional, Reshape, Flatten, Lambda, Input,\\\n    Masking, Convolution1D, BatchNormalization, GRU, Conv1D, RepeatVector, Conv2D, Dropout, TimeDistributed\nfrom keras.optimizers import SGD, Adam\n\ndef final_model_1(input_dim, filters, kernel_size, conv_stride,\n                  conv_border_mode, output_dim, number_of_layers, units):\n    \"\"\" Build a deep network for speech \"\"\"\n    dropout_rate = 0.5\n    cell = GRU\n    activation = 'tanh'\n\n    # Main acoustic input\n    input_data = Input(name='input', shape=(None, input_dim))\n    conv_1d = Conv1D(filters, kernel_size,\n                     strides=conv_stride,\n                     padding=conv_border_mode,\n                     activation='relu',\n                     name='layer_1_conv',\n                     dilation_rate=1)(input_data)\n    conv_bn = BatchNormalization(name='conv_batch_norm')(conv_1d)\n    conv_bn = Dropout(0.25)(conv_bn)\n\n    if number_of_layers == 1:\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n        layer = BatchNormalization(name='bt_rnn_1')(layer)\n        layer = Dropout(0.25)(layer)\n    else:\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n        layer = BatchNormalization(name='bt_rnn_1')(layer)\n        layer = Dropout(0.25)(layer)\n\n        for i in range(number_of_layers - 2):\n            layer = cell(units, activation=activation,\n                         return_sequences=True, implementation=2, name='rnn_{}'.format(i + 2), dropout=dropout_rate)(layer)\n            layer = BatchNormalization(name='bt_rnn_{}'.format(i + 2))(layer)\n\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='final_layer_of_rnn')(layer)\n        layer = Dropout(0.25)(layer)\n        layer = BatchNormalization(name='bt_rnn_final')(layer)\n\n    time_dense = TimeDistributed(Dense(output_dim))(layer)\n    time_dense = Dropout(0.5)(time_dense)\n    y_pred = Activation('softmax', name='softmax')(time_dense)\n    model = Model(inputs=input_data, outputs=y_pred)\n    # with kernel_size=1, stride=1 and 'valid' padding the conv keeps the time\n    # dimension, so the output length equals the input length\n    model.output_length = lambda x: x\n    print(model.summary())\n    return model","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.427604Z","iopub.execute_input":"2021-07-19T05:52:17.427909Z","iopub.status.idle":"2021-07-19T05:52:17.444522Z","shell.execute_reply.started":"2021-07-19T05:52:17.427875Z","shell.execute_reply":"2021-07-19T05:52:17.443758Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"#.................run the model.......................\n\ndef train_model(X, Y, optimizer=SGD(lr=0.002, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5), epochs=500, verbose=1):\n    import tensorflow as tf\n\n    reset_random_seeds()\n    model_1 = final_model_1(input_dim=177, filters=200, kernel_size=1, conv_stride=1,\n                            conv_border_mode='valid',\n                            output_dim=83,\n                            number_of_layers=3,\n                            units=200)\n    model = add_ctc_loss(model_1)\n    # the graph already emits the CTC loss, so compile with a pass-through loss\n    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)\n\n    # create a callback that saves the model's weights\n    checkpoint_path = \"../outputs1/cp.ckpt\"  # fixed: save to the checkpoint file itself, not its directory\n    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,\n                                                     save_weights_only=True,\n                                                     verbose=1)\n\n    # fixed: honor the epochs argument (previously hardcoded to 500 inside fit)\n    hist = model.fit(x=X, y=Y, batch_size=10, epochs=epochs, validation_split=0.35,\n                     callbacks=[cp_callback])\n    return (hist, model, model_1)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.445686Z","iopub.execute_input":"2021-07-19T05:52:17.446045Z","iopub.status.idle":"2021-07-19T05:52:17.461015Z","shell.execute_reply.started":"2021-07-19T05:52:17.446009Z","shell.execute_reply":"2021-07-19T05:52:17.460022Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"hist, model, model_1 = train_model(input_dict, outputs)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.462167Z","iopub.execute_input":"2021-07-19T05:52:17.46249Z","iopub.status.idle":"2021-07-19T06:15:26.271404Z","shell.execute_reply.started":"2021-07-19T05:52:17.462458Z","shell.execute_reply":"2021-07-19T06:15:26.270398Z"},"trusted":true},"execution_count":null,"outputs":[]},
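{"cell_type":"markdown","source":"The notebook stops at training; below is an added, minimal greedy CTC decoding sketch, not the project's decoding module. It assumes `model_1` and `input_dict` from the cells above, and that `K.ctc_batch_cost` reserves the last softmax class (index 82, since `output_dim=83`) as the blank.","metadata":{}},
{"cell_type":"code","source":"# added, illustrative only: greedy (best-path) CTC decoding for one utterance\nimport numpy as np\n\ndef greedy_ctc_decode(probs, blank=82):\n    # probs: (time, num_classes) softmax output; Keras's ctc_batch_cost\n    # treats the last class (82 here) as the blank\n    best = np.argmax(probs, axis=1)\n    collapsed = [int(k) for j, k in enumerate(best)\n                 if k != blank and (j == 0 or k != best[j - 1])]\n    return ''.join(int_sequence_to_text(collapsed))\n\n# decode the first training utterance with the trained acoustic model\npred = model_1.predict(input_dict['input'][:1])[0]\nprint(greedy_ctc_decode(pred))","metadata":{},"execution_count":null,"outputs":[]},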
{"cell_type":"code","source":"#plot the loss\nhistory_dict = hist.history\nloss = history_dict['loss']\nval_loss = history_dict['val_loss']\n\nplt.figure(figsize=(10, 5), dpi=80, facecolor='w', edgecolor='k')\nepochs = range(1, len(loss) + 1)\nplt.plot(epochs, loss, label='Training Loss')\nplt.plot(epochs, val_loss, label='Validation Loss')\nplt.title('Loss vs Epochs', fontsize=25)\nplt.xlabel('Epochs', fontsize=15)\nplt.ylabel('Loss', fontsize=15)\nplt.legend()\nplt.grid(True)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T06:15:26.273131Z","iopub.execute_input":"2021-07-19T06:15:26.273483Z","iopub.status.idle":"2021-07-19T06:15:26.473292Z","shell.execute_reply.started":"2021-07-19T06:15:26.273444Z","shell.execute_reply":"2021-07-19T06:15:26.472519Z"},"trusted":true},"execution_count":null,"outputs":[]}]}
--------------------------------------------------------------------------------
/results/loss_file/Capture-2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/results/loss_file/Capture-2.PNG
--------------------------------------------------------------------------------
/utiles/utility:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------