├── Data_corpus
│   └── train_data
│       ├── sample_0.wav
│       ├── sample_1.wav
│       └── sample_6.wav
├── README.md
├── model
│   ├── cp.ckpt
│   └── model.h5
├── notebook
│   ├── additional_notebook
│   │   ├── notebook
│   │   ├── speech recognition (1).ipynb
│   │   └── voice_recognition_1.ipynb
│   ├── speech_recognition (2).ipynb
│   └── voice-recognition (2).ipynb
├── results
│   └── loss_file
│       └── Capture-2.PNG
└── utiles
    └── utility

/Data_corpus/train_data/sample_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_0.wav
--------------------------------------------------------------------------------
/Data_corpus/train_data/sample_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_1.wav
--------------------------------------------------------------------------------
/Data_corpus/train_data/sample_6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/Data_corpus/train_data/sample_6.wav
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Bangla-deep-speech-Recognition
Bangla deep speech recognition is a deep bidirectional-RNN-based Bangla speech-to-text transcription system.
The main focus of this project is to power industrial applications, such as searching for a product by voice command, with an end-to-end Bangla speech recognition model, through an easy-to-use, efficient, compact, and scalable implementation covering training, inference, testing, and deployment.

# Dataset
The voice data has two parts:
1) A self-collected voice corpus generated from the company's products. Only a small corpus of roughly 40-50 audio files is used here; adding more recordings would improve results and mitigate overfitting.
2) The Bengali ASR training dataset, containing ~196K utterances.
Dataset link: http://openslr.org/53/

# Annotation Tools
1) https://online-audio-converter.com/
2) https://twistedwave.com/online

# Model
1) RNN, LSTM, bidirectional-RNN, and deep-RNN models (see the sketch below)
2) An RNN-Transducer model is a work in progress
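Below is a minimal, illustrative sketch of how a bidirectional RNN acoustic model is wired to a CTC loss in TensorFlow 2.0 / Keras. The `Lambda`-wrapped `K.ctc_batch_cost` pattern and the four named inputs mirror the approach in _voice-recognition (2).ipynb_; the layer sizes and the `feat_dim`/`num_classes` values are placeholder assumptions for illustration, not the exact trained configuration.

```python
# Illustrative sketch only: mirrors the CTC wiring used in
# voice-recognition (2).ipynb, with placeholder layer sizes.
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Lambda, BatchNormalization,
                                     Bidirectional, GRU, Dense)

feat_dim = 34      # MFCC coefficients per frame (the notebook extracts n_mfcc=34)
num_classes = 83   # Bangla char_map (82 symbols) + 1 CTC blank, as in the notebook

# Acoustic model: variable-length MFCC sequences -> per-frame char probabilities.
acoustic_input = Input(name='input', shape=(None, feat_dim))
x = Bidirectional(GRU(200, return_sequences=True))(acoustic_input)  # placeholder width
x = BatchNormalization()(x)
y_pred = Dense(num_classes, activation='softmax', name='softmax')(x)

# CTC loss attached as a Lambda layer (same pattern as add_ctc_loss in the notebook).
labels = Input(name='labels', shape=(None,), dtype='float32')
input_length = Input(name='input_length', shape=(1,), dtype='int64')
label_length = Input(name='label_length', shape=(1,), dtype='int64')

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [y_pred, labels, input_length, label_length])

model = Model(inputs=[acoustic_input, labels, input_length, label_length],
              outputs=loss_out)
# The graph already emits the CTC loss, so the compiled "loss" is the identity.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')
```

Training feeds padded MFCC batches, integer-encoded transcripts (see `text_to_int_sequence` in the notebook), and the true sequence lengths, with a dummy zero target for the `ctc` output; at inference time the softmax outputs are decoded with a CTC decoder such as `K.ctc_decode` and mapped back to characters through `index_map`.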
# Dependencies
Python 3.7
tensorflow 2.0.0

# Project Structure
Run the following notebook:
_speech_recognition (2).ipynb_

# Results
![Capture-1](https://user-images.githubusercontent.com/45398575/120583800-ad5e5380-c450-11eb-84b9-85779bf71f13.PNG)

# References
--------------------------------------------------------------------------------
/model/cp.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/model/cp.ckpt
--------------------------------------------------------------------------------
/model/model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/model/model.h5
--------------------------------------------------------------------------------
/notebook/additional_notebook/notebook:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/notebook/additional_notebook/voice_recognition_1.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import matplotlib.pyplot as plt\n", 18 | "import numpy as np\n", 19 | "import librosa\n", 20 | "import librosa.display\n", 21 | "import IPython.display as ipd\n", 22 | "import os \n", 23 | "\n", 24 | " \n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from keras.utils import to_categorical\n", 27 | "from tqdm import tqdm" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root_dir=os.listdir(\"E:/speech_recognition/data/\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "E:/speech_recognition/data/audio_1.wav\n", 49 | "E:/speech_recognition/data/audio_2.wav\n", 50 | "E:/speech_recognition/data/audio_3.wav\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "mfcc=[]\n", 56 | "for i in root_dir:\n", 57 | "    audio_paths=f'E:/speech_recognition/data/{i}'\n", 58 | "    print(audio_paths)\n", 59 | "    \n", 60 | "    # load audio file and slice it to ensure length consistency among different files\n", 61 | "    signal,sample_rate = librosa.load(audio_paths)\n", 62 | "    \n", 63 | "    # extract MFCCs\n", 64 | "    MFCCs = librosa.feature.mfcc(signal, sample_rate)\n", 65 | "    mfcc.append( MFCCs) " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "(20, 171)\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "print(mfcc[2].shape)" 83 | ] 84 | }, 85 | { 86 |
"cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def plot_mfcc_feature(vis_mfcc_feature):\n", 92 | " # plot the MFCC feature\n", 93 | " fig = plt.figure(figsize=(12,5))\n", 94 | " ax = fig.add_subplot(111)\n", 95 | " im = ax.imshow(vis_mfcc_feature, cmap=plt.cm.jet, aspect='auto')\n", 96 | " plt.title('Normalized MFCC')\n", 97 | " plt.ylabel('Time')\n", 98 | " plt.xlabel('MFCC Coefficient')\n", 99 | " divider = make_axes_locatable(ax)\n", 100 | " cax = divider.append_axes(\"right\", size=\"5%\", pad=0.05)\n", 101 | " plt.colorbar(im, cax=cax)\n", 102 | " ax.set_xticks(np.arange(0, 13, 2), minor=False);\n", 103 | " plt.show()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "#plot_mfcc_feature(mfcc)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#labels=[\"আমি এসএসএল ওয়ারলেসে জব\",\n", 122 | " #\"আমি ডাটা টিমের সদস্য\",\n", 123 | " #\"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\"]\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "def get_labels():\n", 133 | " labels =[\"আমি এসএসএল ওয়ারলেসে জব\",\n", 134 | " \"আমি ডাটা টিমের সদস্য\",\n", 135 | " \"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\"] \n", 136 | " #labels=os.listdir(path)\n", 137 | " print(len(labels))\n", 138 | " label_indices = np.arange(0, len(labels))\n", 139 | " print(label_indices)\n", 140 | " return labels, label_indices, to_categorical(label_indices)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "3\n", 153 | "[0 1 2]\n", 154 | "['আমি এসএসএল ওয়ারলেসে জব', 'আমি ডাটা টিমের সদস্য', 'আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে']\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "#data_path=\"E:/speech_recognition/data/\"\n", 160 | "labels,label_indices,_=get_labels()\n", 161 | "print(labels)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 10, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# Mel-frequency cepstral coefficients\n", 171 | "def wav2mfcc(file_path,max_len, n_mfcc):\n", 172 | " wave, sr = librosa.load(file_path, mono=True, sr=None)\n", 173 | " wave = np.asfortranarray(wave[::3])\n", 174 | " mfcc = librosa.feature.mfcc(wave, sr=16000, n_mfcc=n_mfcc)\n", 175 | "\n", 176 | " # If maximum length exceeds mfcc lengths then pad the remaining ones\n", 177 | " if (max_len > mfcc.shape[1]):\n", 178 | " pad_width = max_len - mfcc.shape[1]\n", 179 | " mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')\n", 180 | "\n", 181 | " # Else cutoff the remaining parts\n", 182 | " else:\n", 183 | " mfcc = mfcc[:, :max_len]\n", 184 | " \n", 185 | " return mfcc" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "\n", 195 | "def save_data_to_array(path, max_len, n_mfcc):\n", 196 | " #labels,_, _ = get_labels()\n", 197 | "\n", 198 | " #for label in labels:\n", 199 | " # Init mfcc vectors\n", 200 | " mfcc_vectors = []\n", 201 | " \n", 202 | " for wav in path:\n", 203 | " wavfile=f'E:/speech_recognition/data/{wav}'\n", 204 | " print(wavfile)\n", 205 | " mfcc = wav2mfcc(wavfile, 
max_len, n_mfcc)\n", 206 | " mfcc_vectors.append(mfcc)\n", 207 | " np.save('E:/speech_recognition/wav', mfcc_vectors)\n", 208 | " return mfcc_vectors" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 12, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "E:/speech_recognition/data/audio_1.wav\n", 221 | "E:/speech_recognition/data/audio_2.wav\n", 222 | "E:/speech_recognition/data/audio_3.wav\n", 223 | "(20, 200)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "mfcc_vec=save_data_to_array(root_dir,200,20)\n", 229 | "print(mfcc_vec[1].shape)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 13, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "3\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "x= np.load('E:/speech_recognition/wav.npy')\n", 247 | "print(len(x))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 14, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Getting first arrays\n", 257 | "#X = np.load(labels[0] + '.npy')\n", 258 | "#y = np.zeros(X.shape[0])\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 15, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "def get_train_test(split_ratio=0.6, random_state=42):\n", 268 | " # Get available labels\n", 269 | " labels, indices, _ = get_labels()\n", 270 | "\n", 271 | " # Getting first arrays\n", 272 | " X = np.load('E:/speech_recognition/wav.npy')\n", 273 | " y = np.zeros(X.shape[0])\n", 274 | " #print(y)\n", 275 | " # Append all of the dataset into one single array, same goes for y\n", 276 | " for i, label in enumerate(labels[1:]):\n", 277 | " x = np.load('E:/speech_recognition/wav.npy')\n", 278 | " X = np.vstack((X, x))\n", 279 | " y = np.append(y, np.full(x.shape[0], fill_value= (i + 1)))\n", 280 | " print(y)\n", 281 | " assert X.shape[0] == len(y)\n", 282 | "\n", 283 | " return train_test_split(X, y, test_size=0.2,shuffle=True)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "#get_train_test(split_ratio=0.6, random_state=42)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 17, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "X_train, X_test, y_train, y_test =train_test_split(x,label_indices,test_size=0.1,\n", 302 | " random_state=42)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 18, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "(2, 20, 200)" 314 | ] 315 | }, 316 | "execution_count": 18, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "X_train.shape" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 19, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# # Feature dimension\n", 332 | "channels = 1\n", 333 | "max_len = 200\n", 334 | "buckets = 20\n", 335 | "epochs = 48\n", 336 | "batch_size = 100\n", 337 | "\n", 338 | "num_classes = 3\n", 339 | "\n", 340 | "#X_train = X_train.reshape(X_train.shape[0],buckets, max_len, channels)\n", 341 | "#X_test = X_test.reshape(X_test.shape[0],buckets,max_len, channels)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 20, 347 | "metadata": {}, 348 
| "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "(2, 20, 200) (1, 20, 200)\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "print(X_train.shape,X_test.shape)\n", 359 | "#plt.imshow(X_train[:, :, :, 0])" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 21, 365 | "metadata": { 366 | "scrolled": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "[1 2] [0]\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "print(y_train,y_test)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 22, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "(2,) (1,)\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print(y_train.shape,y_test.shape)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 23, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "[[0. 1. 0.]\n", 408 | " [0. 0. 1.]] [[1.]]\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "y_train_hot = to_categorical(y_train)\n", 414 | "y_test_hot = to_categorical(y_test)\n", 415 | "print(y_train_hot,y_test_hot)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 24, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "(2, 3)" 427 | ] 428 | }, 429 | "execution_count": 24, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "y_train_hot.shape" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 25, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "(1, 1)" 447 | ] 448 | }, 449 | "execution_count": 25, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "y_test_hot.shape" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 26, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "#from preprocess import *\n", 465 | "import keras\n", 466 | "from keras.models import Sequential\n", 467 | "from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM\n", 468 | "from keras.utils import to_categorical\n", 469 | "#import wandb\n", 470 | "#from wandb.keras import WandbCallback\n", 471 | "import matplotlib.pyplot as plt" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 27, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "num_classes=3\n", 481 | "\n", 482 | "#build a simple cnn model\n", 483 | "\n", 484 | "model = Sequential()\n", 485 | "model.add(Flatten(input_shape=(buckets,max_len)))\n", 486 | "#model.add(Flatten())\n", 487 | "model.add(Dense(3, activation='softmax'))\n", 488 | "model.compile(loss=\"sparse_categorical_crossentropy\",\n", 489 | " optimizer=\"adam\",\n", 490 | " metrics=['accuracy'])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 28, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "Model: \"sequential_1\"\n", 503 | "_________________________________________________________________\n", 504 | "Layer (type) Output Shape Param # \n", 505 | "=================================================================\n", 506 | "flatten_1 (Flatten) (None, 4000) 0 \n", 507 | 
"_________________________________________________________________\n", 508 | "dense_1 (Dense) (None, 3) 12003 \n", 509 | "=================================================================\n", 510 | "Total params: 12,003\n", 511 | "Trainable params: 12,003\n", 512 | "Non-trainable params: 0\n", 513 | "_________________________________________________________________\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "model.summary()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 36, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "2\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "print(X_train.shape[0])\n", 536 | "X_train = X_train.reshape(X_train.shape[0],buckets, max_len)\n", 537 | "X_test = X_test.reshape(X_test.shape[0],buckets,max_len)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 30, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/plain": [ 548 | "(2, 20, 200)" 549 | ] 550 | }, 551 | "execution_count": 30, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "X_train.shape" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 31, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "(1, 20, 200)" 569 | ] 570 | }, 571 | "execution_count": 31, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "X_test.shape" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 32, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "name": "stdout", 587 | "output_type": "stream", 588 | "text": [ 589 | "Train on 2 samples, validate on 1 samples\n", 590 | "Epoch 1/48\n", 591 | "2/2 [==============================] - 0s 31ms/step - loss: 122.4453 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 1.0000\n", 592 | "Epoch 2/48\n", 593 | "2/2 [==============================] - 0s 998us/step - loss: 10.0779 - accuracy: 0.0000e+00 - val_loss: 8.1836 - val_accuracy: 0.0000e+00\n", 594 | "Epoch 3/48\n", 595 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 48.7357 - val_accuracy: 0.0000e+00\n", 596 | "Epoch 4/48\n", 597 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 81.9535 - val_accuracy: 0.0000e+00\n", 598 | "Epoch 5/48\n", 599 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 110.0301 - val_accuracy: 0.0000e+00\n", 600 | "Epoch 6/48\n", 601 | "2/2 [==============================] - 0s 998us/step - loss: 2.9802e-07 - accuracy: 1.0000 - val_loss: 134.2287 - val_accuracy: 0.0000e+00\n", 602 | "Epoch 7/48\n", 603 | "2/2 [==============================] - 0s 997us/step - loss: 1.5715e-04 - accuracy: 1.0000 - val_loss: 155.3666 - val_accuracy: 0.0000e+00\n", 604 | "Epoch 8/48\n", 605 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0345 - accuracy: 1.0000 - val_loss: 175.0989 - val_accuracy: 0.0000e+00\n", 606 | "Epoch 9/48\n", 607 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0756 - accuracy: 1.0000 - val_loss: 194.8384 - val_accuracy: 0.0000e+00\n", 608 | "Epoch 10/48\n", 609 | "2/2 [==============================] - 0s 997us/step - loss: 1.2372e-04 - accuracy: 1.0000 - val_loss: 212.4595 - val_accuracy: 0.0000e+00\n", 610 | "Epoch 11/48\n", 611 | 
"2/2 [==============================] - 0s 997us/step - loss: 3.5763e-07 - accuracy: 1.0000 - val_loss: 228.2513 - val_accuracy: 0.0000e+00\n", 612 | "Epoch 12/48\n", 613 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 242.4503 - val_accuracy: 0.0000e+00\n", 614 | "Epoch 13/48\n", 615 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 255.2506 - val_accuracy: 0.0000e+00\n", 616 | "Epoch 14/48\n", 617 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 266.8147 - val_accuracy: 0.0000e+00\n", 618 | "Epoch 15/48\n", 619 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 277.2800 - val_accuracy: 0.0000e+00\n", 620 | "Epoch 16/48\n", 621 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 286.7642 - val_accuracy: 0.0000e+00\n", 622 | "Epoch 17/48\n", 623 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 295.3692 - val_accuracy: 0.0000e+00\n", 624 | "Epoch 18/48\n", 625 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 303.1837 - val_accuracy: 0.0000e+00\n", 626 | "Epoch 19/48\n", 627 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 310.2859 - val_accuracy: 0.0000e+00\n", 628 | "Epoch 20/48\n", 629 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 316.7448 - val_accuracy: 0.0000e+00\n", 630 | "Epoch 21/48\n", 631 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 322.6215 - val_accuracy: 0.0000e+00\n", 632 | "Epoch 22/48\n", 633 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 327.9705 - val_accuracy: 0.0000e+00\n", 634 | "Epoch 23/48\n", 635 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 332.8409 - val_accuracy: 0.0000e+00\n", 636 | "Epoch 24/48\n", 637 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 337.2766 - val_accuracy: 0.0000e+00\n", 638 | "Epoch 25/48\n", 639 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 341.3170 - val_accuracy: 0.0000e+00\n", 640 | "Epoch 26/48\n", 641 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 344.9977 - val_accuracy: 0.0000e+00\n", 642 | "Epoch 27/48\n", 643 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 348.3513 - val_accuracy: 0.0000e+00\n", 644 | "Epoch 28/48\n", 645 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 351.4067 - val_accuracy: 0.0000e+00\n", 646 | "Epoch 29/48\n", 647 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 354.1908 - val_accuracy: 0.0000e+00\n", 648 | "Epoch 30/48\n", 649 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 356.7273 - val_accuracy: 0.0000e+00\n", 650 | "Epoch 31/48\n", 651 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 
359.0380 - val_accuracy: 0.0000e+00\n", 652 | "Epoch 32/48\n", 653 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 361.1434 - val_accuracy: 0.0000e+00\n", 654 | "Epoch 33/48\n", 655 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 363.0612 - val_accuracy: 0.0000e+00\n", 656 | "Epoch 34/48\n", 657 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 364.8080 - val_accuracy: 0.0000e+00\n", 658 | "Epoch 35/48\n", 659 | "2/2 [==============================] - 0s 2ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 366.3990 - val_accuracy: 0.0000e+00\n", 660 | "Epoch 36/48\n", 661 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 367.8478 - val_accuracy: 0.0000e+00\n", 662 | "Epoch 37/48\n", 663 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 369.1672 - val_accuracy: 0.0000e+00\n", 664 | "Epoch 38/48\n", 665 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 370.3681 - val_accuracy: 0.0000e+00\n", 666 | "Epoch 39/48\n", 667 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 371.4615 - val_accuracy: 0.0000e+00\n", 668 | "Epoch 40/48\n", 669 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 372.4566 - val_accuracy: 0.0000e+00\n", 670 | "Epoch 41/48\n", 671 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 373.3623 - val_accuracy: 0.0000e+00\n", 672 | "Epoch 42/48\n", 673 | "2/2 [==============================] - 0s 997us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 374.1862 - val_accuracy: 0.0000e+00\n", 674 | "Epoch 43/48\n", 675 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 374.9359 - val_accuracy: 0.0000e+00\n", 676 | "Epoch 44/48\n", 677 | "2/2 [==============================] - 0s 998us/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 375.6178 - val_accuracy: 0.0000e+00\n", 678 | "Epoch 45/48\n", 679 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 376.2379 - val_accuracy: 0.0000e+00\n", 680 | "Epoch 46/48\n", 681 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 376.8019 - val_accuracy: 0.0000e+00\n", 682 | "Epoch 47/48\n", 683 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 377.3149 - val_accuracy: 0.0000e+00\n", 684 | "Epoch 48/48\n", 685 | "2/2 [==============================] - 0s 1ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 377.7810 - val_accuracy: 0.0000e+00\n" 686 | ] 687 | }, 688 | { 689 | "data": { 690 | "text/plain": [ 691 | "" 692 | ] 693 | }, 694 | "execution_count": 32, 695 | "metadata": {}, 696 | "output_type": "execute_result" 697 | } 698 | ], 699 | "source": [ 700 | "model.fit(X_train, y_train, epochs=epochs, \n", 701 | " validation_data=(X_test,y_test),\n", 702 | " )" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 33, 708 | "metadata": {}, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "(1, 20, 200)\n", 715 | "[1]\n" 716 | ] 717 | } 718 | ], 719 | 
"source": [ 720 | "# make a prediction\n", 721 | "import cv2\n", 722 | "from numpy import zeros, newaxis\n", 723 | "#print(mfcc_vec[1].shape)\n", 724 | "\n", 725 | "\n", 726 | "#.........take a random voice..............\n", 727 | "mfcc_1=mfcc_vec[1][newaxis,:, :,]\n", 728 | "print(mfcc_1.shape)\n", 729 | "predict = model.predict_classes(mfcc_1)\n", 730 | "print(predict)" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 34, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "আমি ডাটা টিমের সদস্য\n" 743 | ] 744 | } 745 | ], 746 | "source": [ 747 | "if predict==[0]:\n", 748 | " print(\"আমি এসএসএল ওয়ারলেসে জব\")\n", 749 | "elif predict==[1]:\n", 750 | " print(\"আমি ডাটা টিমের সদস্য\")\n", 751 | "else:\n", 752 | " print(\"আমাদের ডেটা টিমে দুইজন জমজ ভাই আছে\")" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 35, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "2\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "x=[ 5,40, 51, 81, 12, 46 ,12]\n", 770 | "print(len(x[:2]))" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": null, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [] 786 | } 787 | ], 788 | "metadata": { 789 | "kernelspec": { 790 | "display_name": "Python 3", 791 | "language": "python", 792 | "name": "python3" 793 | }, 794 | "language_info": { 795 | "codemirror_mode": { 796 | "name": "ipython", 797 | "version": 3 798 | }, 799 | "file_extension": ".py", 800 | "mimetype": "text/x-python", 801 | "name": "python", 802 | "nbconvert_exporter": "python", 803 | "pygments_lexer": "ipython3", 804 | "version": "3.7.4" 805 | } 806 | }, 807 | "nbformat": 4, 808 | "nbformat_minor": 2 809 | } 810 | -------------------------------------------------------------------------------- /notebook/voice-recognition (2).ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport tensorflow as tf\n\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nimport tensorflow as tf\n\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.layers import (Input, Lambda, BatchNormalization,LSTM,TimeDistributed,Activation,Dense)\nfrom tensorflow.keras.optimizers import SGD\nfrom tensorflow.keras.callbacks import ModelCheckpoint \nimport os\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport librosa\nimport librosa.display\nimport IPython.display as ipd\nimport os \n\n \nfrom sklearn.model_selection import train_test_split\nfrom keras.utils import to_categorical\nfrom tqdm import tqdm\nimport random\n\nvoice_data=[]\nfor dirname, _, filenames in os.walk('../input/voicerecognitiondata/asr_bengali/data'):\n for filename in filenames:\n voice_data.append(os.path.join(dirname, filename))\n #print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"cf7be286-df93-4c2f-997a-98912a353525","_cell_guid":"a4ebc242-53a4-4cca-a062-bd7f3b649c51","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:26.705295Z","iopub.execute_input":"2021-07-19T05:37:26.705623Z","iopub.status.idle":"2021-07-19T05:37:35.00382Z","shell.execute_reply.started":"2021-07-19T05:37:26.705548Z","shell.execute_reply":"2021-07-19T05:37:35.002868Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tf.test.is_gpu_available()","metadata":{"_uuid":"a6f94e4e-94af-4ebb-b77e-625b50b16ed2","_cell_guid":"e1b4ef81-2f80-4867-a262-37c47df78e12","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:35.006996Z","iopub.execute_input":"2021-07-19T05:37:35.00727Z","iopub.status.idle":"2021-07-19T05:37:36.75772Z","shell.execute_reply.started":"2021-07-19T05:37:35.007242Z","shell.execute_reply":"2021-07-19T05:37:36.756903Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#..........fixed the seed................\ndef reset_random_seeds():\n seed_num=9\n os.environ['PYTHONHASHSEED']=str(seed_num)\n tf.random.set_seed(seed_num)\n np.random.seed(seed_num)\n random.seed(seed_num)","metadata":{"_uuid":"34242cdc-fb17-47f6-89f2-37bf7985142c","_cell_guid":"104d8bca-8e62-42aa-b938-531c62c8517a","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:36.75955Z","iopub.execute_input":"2021-07-19T05:37:36.76007Z","iopub.status.idle":"2021-07-19T05:37:36.768183Z","shell.execute_reply.started":"2021-07-19T05:37:36.76003Z","shell.execute_reply":"2021-07-19T05:37:36.767435Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#read voice data\n#print(len(voice_data))\ntsv_data=pd.read_csv(\"../input/voicerecognitiondata/asr_bengali/utt_spk_text.tsv\",sep=\"\\t\",header=0)\n#from kaggle_datasets import KaggleDatasets\n#Datset=\"/kaggle/input/voicerecognitiondata/asr_bengali/data\"\n#GCS_PATH = KaggleDatasets().get_gcs_path(Datset )\n\n#train_filenames1 = 
tf.io.gfile.glob(GCS_PATH1 + '*/*.flac')\ntsv_data.head()","metadata":{"_uuid":"6e4bd403-25f5-4cb3-901e-49bcf8d823f0","_cell_guid":"79de69ef-09da-43c3-b292-435c66e14f7f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:36.769696Z","iopub.execute_input":"2021-07-19T05:37:36.770231Z","iopub.status.idle":"2021-07-19T05:37:37.190023Z","shell.execute_reply.started":"2021-07-19T05:37:36.770185Z","shell.execute_reply":"2021-07-19T05:37:37.188494Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#add header in tsv file\n#tsv_data_1=pd.DataFrame(tsv_data,columns = [\"wav_name\",\"id\",\"label\"])\n\n#tsv_data_1.head()\ntsv_data.columns=[\"wav_name\",\"id\",\"label\"]\ntsv_data_1=tsv_data","metadata":{"_uuid":"72a86d78-dba9-49ce-8ce3-d1cb1b90eb44","_cell_guid":"e357c0a6-02ae-4a78-ad9e-9c653d22eb79","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.191303Z","iopub.execute_input":"2021-07-19T05:37:37.191646Z","iopub.status.idle":"2021-07-19T05:37:37.196208Z","shell.execute_reply.started":"2021-07-19T05:37:37.191611Z","shell.execute_reply":"2021-07-19T05:37:37.195287Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tsv_data.shape[0]","metadata":{"_uuid":"fd7f778f-139d-423d-aeb2-91f3faf22130","_cell_guid":"636213e0-ac29-40f5-bbcd-572b6410aecf","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.197678Z","iopub.execute_input":"2021-07-19T05:37:37.19805Z","iopub.status.idle":"2021-07-19T05:37:37.206819Z","shell.execute_reply.started":"2021-07-19T05:37:37.19801Z","shell.execute_reply":"2021-07-19T05:37:37.205873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(voice_data[0])\nprint(voice_data[0].split(\"/\")[-1].replace(\".flac\",\"\"))","metadata":{"_uuid":"54546114-927b-40d9-b9df-a37894473b18","_cell_guid":"4a6eb836-c182-4a7b-bff3-dc70498a93c4","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.208286Z","iopub.execute_input":"2021-07-19T05:37:37.208619Z","iopub.status.idle":"2021-07-19T05:37:37.216147Z","shell.execute_reply.started":"2021-07-19T05:37:37.208585Z","shell.execute_reply":"2021-07-19T05:37:37.215248Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#import zipfile\n#with zipfile.ZipFile(\"/kaggle/input/voicerecognitiondata/asr_bengali/data/\",'r') as zip_ref:\n #zip_ref.extractall(\"/kaggle/input/voicerecognitiondata/\")","metadata":{"_uuid":"4c491c04-19eb-43ed-b8b4-521adeb02d58","_cell_guid":"f6298f0a-f43f-46fe-8fc7-dc369d3956e3","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.218769Z","iopub.execute_input":"2021-07-19T05:37:37.219146Z","iopub.status.idle":"2021-07-19T05:37:37.22355Z","shell.execute_reply.started":"2021-07-19T05:37:37.21911Z","shell.execute_reply":"2021-07-19T05:37:37.222577Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#convert voice corpuses from .flac to wav format\n\nfrom pydub import AudioSegment\n\n#save wav file this location\n#wav_root=\"../input/voicerecognitiondata/asr_bengali/data/wav\"\n\nos.makedirs('../outputs')\nfor root, _, files in os.walk('../input/voicerecognitiondata/asr_bengali/data'):\n for file in files:\n wav_name = file.replace(\".flac\",\".wav\")\n #print(wav_name)\n #print(root)\n 
try:\n # convert wav to mp3 \n sound = AudioSegment.from_file(\"{}/{}\".format(root,file))\n #print(sound)\n sound.export(\"{}/{}\".format('../outputs', wav_name), format=\"wav\")\n except Exception as e:\n pass","metadata":{"_uuid":"0d2fe713-b9fd-4919-b468-c9d8d8732639","_cell_guid":"04366df9-30ff-4d79-80e6-43720fbd2690","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:37:37.225557Z","iopub.execute_input":"2021-07-19T05:37:37.225935Z","iopub.status.idle":"2021-07-19T05:50:23.340872Z","shell.execute_reply.started":"2021-07-19T05:37:37.225899Z","shell.execute_reply":"2021-07-19T05:50:23.339697Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.........read data wav data..................\n#for dirname, _, filenames in os.walk(\"../outputs\"):\n #for filename in filenames:\n #voice_data.append(\n #print(os.path.join(dirname, filename))","metadata":{"_uuid":"11ada899-21c7-47b3-b576-ba5b489708ea","_cell_guid":"bd8ae802-3868-4207-89af-7b6e7e52cd1f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.342552Z","iopub.execute_input":"2021-07-19T05:50:23.342943Z","iopub.status.idle":"2021-07-19T05:50:23.348409Z","shell.execute_reply.started":"2021-07-19T05:50:23.3429Z","shell.execute_reply":"2021-07-19T05:50:23.347341Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# ............Mel-frequency cepstral coefficients..........................\n#and converting vector form that are readable by deep learning model\n\ndef wav2mfcc(file_path,max_len, n_mfcc):\n wave, sr = librosa.load(file_path, mono=True, sr=None)\n #print(wave)\n wave = np.asfortranarray(wave[::3])\n #print(wave)\n #print(sr)\n \n mfcc = librosa.feature.mfcc(wave,sr=sr,n_mfcc=n_mfcc) #sr means sampling rate=16000\n mfcc_1=mfcc\n #print(mfcc.shape[0],mfcc.shape[1])\n \n # If maximum length exceeds mfcc lengths then pad the remaining ones\n if (max_len > mfcc.shape[1]):\n pad_width = max_len - mfcc.shape[1]\n mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')\n\n # Else cutoff the remaining parts\n else:\n mfcc = mfcc[:, :max_len]\n \n return mfcc,wave,mfcc_1.shape[0],mfcc_1.shape[1]","metadata":{"_uuid":"6d166be9-ab1b-40aa-b71c-3d810004e8c9","_cell_guid":"ff14b7b8-81a0-4d03-8de3-e66cddbe46fe","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.351817Z","iopub.execute_input":"2021-07-19T05:50:23.352247Z","iopub.status.idle":"2021-07-19T05:50:23.385642Z","shell.execute_reply.started":"2021-07-19T05:50:23.352205Z","shell.execute_reply":"2021-07-19T05:50:23.384847Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#ordering the voice corpuses\nfrom tkinter import Tcl\n\ndef save_data_to_array(path, max_len, n_mfcc):\n #labels,_, _ = get_labels()\n\n #for label in labels:\n # Init mfcc vectors\n \n #read audio file\n audio=[]\n for wav in path:\n wavfile=f'../outputs/{wav}'\n #wavfile=Tcl().call('lsort', '-dict',wavfile)\n #print(wavfile)\n audio.append(wavfile)\n \n #.............sort audio file.......\n sort_audio=Tcl().call('lsort', '-dict',audio) \n print(sort_audio[0])\n #print(audio)\n audio_path=[]\n frequency_m=[]\n amplitude_m=[]\n mfcc_vectors = []\n for path in sort_audio:\n mfcc,wave,fre_sh,amp_sh= wav2mfcc(path, max_len, n_mfcc)\n audio_path.append(path.split(\"/\")[-1].replace(\".wav\",\"\"))\n mfcc_vectors.append(mfcc)\n 
frequency_m.append(fre_sh)\n amplitude_m.append(amp_sh)\n \n print(max(frequency_m)) \n print(max(amplitude_m))\n #np.save('E:/speech_recognition/wav', mfcc_vectors)\n return mfcc_vectors,wave,audio,audio_path","metadata":{"_uuid":"98255c23-deea-4265-bcdf-8d5672f28091","_cell_guid":"ded85699-8488-412a-b7bb-bfadb35a38fe","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.387156Z","iopub.execute_input":"2021-07-19T05:50:23.387525Z","iopub.status.idle":"2021-07-19T05:50:23.435896Z","shell.execute_reply.started":"2021-07-19T05:50:23.387489Z","shell.execute_reply":"2021-07-19T05:50:23.435168Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#read wav file path\nroot_dir=os.listdir(\"../outputs\")\nmfcc_vec,wave,audio,audio_path=save_data_to_array(root_dir,177,34)#34 means time frequency and 384 amplitude\n#print(mfcc_vec[0].shape[0])","metadata":{"_uuid":"c5de1ed8-537e-47c7-8197-9df291c3a668","_cell_guid":"e5d77018-df63-4ba9-b6eb-7083ab96d90e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:50:23.437111Z","iopub.execute_input":"2021-07-19T05:50:23.437489Z","iopub.status.idle":"2021-07-19T05:52:09.518913Z","shell.execute_reply.started":"2021-07-19T05:50:23.437452Z","shell.execute_reply":"2021-07-19T05:52:09.51793Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"data={\"mfcc\":mfcc_vec,\n \"wav_name\":audio_path}\ndf=pd.DataFrame(data,columns=['mfcc',\"wav_name\"])\ndf.head()","metadata":{"_uuid":"277535a4-fb35-471c-9dee-161e3d5fe34c","_cell_guid":"50d26627-e379-4c81-8356-77d90ae05ed0","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.523137Z","iopub.execute_input":"2021-07-19T05:52:09.525442Z","iopub.status.idle":"2021-07-19T05:52:09.854391Z","shell.execute_reply.started":"2021-07-19T05:52:09.525394Z","shell.execute_reply":"2021-07-19T05:52:09.853652Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.isnull().sum()","metadata":{"_uuid":"9ca32317-e68f-4542-9961-429e83e02502","_cell_guid":"c2996eb6-f113-40c6-94ae-27f699de514f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.855662Z","iopub.execute_input":"2021-07-19T05:52:09.856022Z","iopub.status.idle":"2021-07-19T05:52:09.868292Z","shell.execute_reply.started":"2021-07-19T05:52:09.855987Z","shell.execute_reply":"2021-07-19T05:52:09.867482Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df.shape[0]","metadata":{"_uuid":"be98e034-73d9-49d6-962c-da21275f1212","_cell_guid":"e70d4771-7615-4421-ad52-504954b4ca11","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.870638Z","iopub.execute_input":"2021-07-19T05:52:09.87092Z","iopub.status.idle":"2021-07-19T05:52:09.876904Z","shell.execute_reply.started":"2021-07-19T05:52:09.870885Z","shell.execute_reply":"2021-07-19T05:52:09.875944Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tsv_data_1.head()","metadata":{"_uuid":"3adf1410-961e-452f-b307-9936d0453c35","_cell_guid":"15b01511-b552-466d-8a21-2cd0bee253a4","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.878389Z","iopub.execute_input":"2021-07-19T05:52:09.878747Z","iopub.status.idle":"2021-07-19T05:52:09.89129Z","shell.execute_reply.starte
d":"2021-07-19T05:52:09.878703Z","shell.execute_reply":"2021-07-19T05:52:09.890469Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df.shape[0])\nprint(tsv_data_1.shape[0])","metadata":{"_uuid":"346b1baf-1d5b-42bb-992c-3dbcedf8d54f","_cell_guid":"75b906df-7cd4-4a0d-b188-ad6680fba4d7","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.89251Z","iopub.execute_input":"2021-07-19T05:52:09.892891Z","iopub.status.idle":"2021-07-19T05:52:09.898238Z","shell.execute_reply.started":"2021-07-19T05:52:09.892856Z","shell.execute_reply":"2021-07-19T05:52:09.89734Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#merge the on basis of wav_name\ndf_new=pd.merge(df,tsv_data_1,on=\"wav_name\",how=\"outer\")\nprint(df_new.shape[0])","metadata":{"_uuid":"f5018410-30e5-43e7-b6bb-c39845b56438","_cell_guid":"a4d7e9f4-62a8-4761-9288-f33bddf487d1","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:09.899542Z","iopub.execute_input":"2021-07-19T05:52:09.900137Z","iopub.status.idle":"2021-07-19T05:52:10.03072Z","shell.execute_reply.started":"2021-07-19T05:52:09.900095Z","shell.execute_reply":"2021-07-19T05:52:10.029767Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df_new.isnull().sum())","metadata":{"_uuid":"5cddcf22-3f3c-47ea-a707-cef9deecf85f","_cell_guid":"06820be4-cc2d-4821-a4e1-98c3f07ddb44","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.035935Z","iopub.execute_input":"2021-07-19T05:52:10.038446Z","iopub.status.idle":"2021-07-19T05:52:10.109696Z","shell.execute_reply.started":"2021-07-19T05:52:10.038402Z","shell.execute_reply":"2021-07-19T05:52:10.10873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#now remove those rows which have a Nan value.\n\ndf_new.dropna(subset = [\"label\",\"mfcc\"], 
inplace=True)\ndf_new.head()","metadata":{"_uuid":"890cb3a6-6370-4829-8cab-805d9ef02021","_cell_guid":"5a361a44-6de5-440e-bd06-6a7e5616e9cd","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.113436Z","iopub.execute_input":"2021-07-19T05:52:10.115408Z","iopub.status.idle":"2021-07-19T05:52:10.575568Z","shell.execute_reply.started":"2021-07-19T05:52:10.115369Z","shell.execute_reply":"2021-07-19T05:52:10.574782Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df_new.isnull().sum())","metadata":{"_uuid":"61d8754d-3585-407c-bace-6e7d6178368e","_cell_guid":"acbc5a81-33d3-47d8-9999-c84f6937f8cb","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.576832Z","iopub.execute_input":"2021-07-19T05:52:10.577177Z","iopub.status.idle":"2021-07-19T05:52:10.589147Z","shell.execute_reply.started":"2021-07-19T05:52:10.577147Z","shell.execute_reply":"2021-07-19T05:52:10.588093Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_new.shape[0]","metadata":{"_uuid":"f7b5da99-4af8-402d-b883-ac58067a0198","_cell_guid":"9f8a4bf9-776f-46cf-b781-1bac89999eb8","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.593604Z","iopub.execute_input":"2021-07-19T05:52:10.593864Z","iopub.status.idle":"2021-07-19T05:52:10.599153Z","shell.execute_reply.started":"2021-07-19T05:52:10.59384Z","shell.execute_reply":"2021-07-19T05:52:10.598135Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#find a label by mapping\n#new_label=[]\n#for i in range(df.shape[0]):\n #for j in range(tsv_data_1.shape[0]):\n \n #if df[\"audio_path\"][i]==tsv_data_1[\"wav_name\"][j]:\n \n #new_label.append(tsv_data_1[\"label\"][j])\n #break \n #else:\n #continue","metadata":{"_uuid":"52308c78-1cf0-4fae-b73e-2c75dc507339","_cell_guid":"a52f99df-b724-449e-8097-719d85037c1e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.601515Z","iopub.execute_input":"2021-07-19T05:52:10.602234Z","iopub.status.idle":"2021-07-19T05:52:10.606497Z","shell.execute_reply.started":"2021-07-19T05:52:10.602197Z","shell.execute_reply":"2021-07-19T05:52:10.605576Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#df[\"transcript_label\"]=new_label","metadata":{"_uuid":"bffa51d7-34fd-4da5-a241-d166d68ed676","_cell_guid":"cf8ce979-d0fa-492b-ae9c-f642459cb537","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.608054Z","iopub.execute_input":"2021-07-19T05:52:10.60846Z","iopub.status.idle":"2021-07-19T05:52:10.614427Z","shell.execute_reply.started":"2021-07-19T05:52:10.608426Z","shell.execute_reply":"2021-07-19T05:52:10.613645Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#..........clean text labels like html tags,quotation etc.............\nimport re\ndef clean(text):\n \n #text=text.map(lambda i:re.sub(r'[\\\\|!|?|\\'|\"|#|,|।|-|.|)|(|\\|/|\"\"|\\n|]',r' ',str(i)))\n #text=text.map(lambda i:re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(i)))\n text=re.sub(r'[\\\\|!|?|\\'|\"|#|,|।|-|.|)|(|\\|/|\"\"|\\n|]',r' ',str(text))\n text=re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(text))\n text=re.sub(r\"[\\u200d|’|\\x93|\\x93|\\u200c|v|b|s|]\", '', str(text))\n \n text=text.replace(\"।\",\"\")\n text=text.replace('-','')\n 
text=text.replace(':','')\n text=text.replace(\"\\x94\",\"\")\n return text\n\ndf_new[\"label\"]=df_new[\"label\"].apply(clean)\n#df_new[\"label\"]=df_new[\"label\"].apply(lambda i:re.sub(r\"\\s+[a-zA-Z]\\s+\", ' ', str(i)))\n#df_new[\"label\"]=clean(df_new[\"label\"])\n#print(clean(df_new[\"label\"][32:35]))\ndf_new.head(3)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:10.615702Z","iopub.execute_input":"2021-07-19T05:52:10.616129Z","iopub.status.idle":"2021-07-19T05:52:10.875533Z","shell.execute_reply.started":"2021-07-19T05:52:10.616091Z","shell.execute_reply":"2021-07-19T05:52:10.874595Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def csv_file(df_new):\n return df_new\n\n#csv_file(df_new)","metadata":{"_uuid":"0a55e0f4-e406-4544-9493-805f06ba2178","_cell_guid":"290c0be7-ba61-4f1d-9313-d9fe16f98408","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.876762Z","iopub.execute_input":"2021-07-19T05:52:10.877119Z","iopub.status.idle":"2021-07-19T05:52:10.882317Z","shell.execute_reply.started":"2021-07-19T05:52:10.877084Z","shell.execute_reply":"2021-07-19T05:52:10.881532Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"final_df=df_new","metadata":{"_uuid":"b8543e14-59d9-4388-944c-4d627e2f359d","_cell_guid":"1cf0e766-15d2-4722-acfa-01ccf53733ea","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.883472Z","iopub.execute_input":"2021-07-19T05:52:10.88394Z","iopub.status.idle":"2021-07-19T05:52:10.891006Z","shell.execute_reply.started":"2021-07-19T05:52:10.883904Z","shell.execute_reply":"2021-07-19T05:52:10.890243Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"final_df.to_csv('final_df.csv',index=False)","metadata":{"_uuid":"a52f017d-9b8c-45e1-96db-7887ef77dec4","_cell_guid":"ef40f9cf-f155-4d44-b0c8-36ebf6002d90","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:10.892252Z","iopub.execute_input":"2021-07-19T05:52:10.892639Z","iopub.status.idle":"2021-07-19T05:52:15.724376Z","shell.execute_reply.started":"2021-07-19T05:52:10.892602Z","shell.execute_reply":"2021-07-19T05:52:15.723513Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#to reduce size the sample for memory issue\nfinal_df=final_df[0:2000]\nfinal_df.head()","metadata":{"_uuid":"5340b791-82e6-4e99-b5f6-bc8e692d3826","_cell_guid":"d931c769-34ca-4414-8385-b09db485a72f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:15.725527Z","iopub.execute_input":"2021-07-19T05:52:15.725883Z","iopub.status.idle":"2021-07-19T05:52:16.0232Z","shell.execute_reply.started":"2021-07-19T05:52:15.725846Z","shell.execute_reply":"2021-07-19T05:52:16.022189Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#...measure a label of label string.....................\nfinal_df[\"label_len\"]=[len(lab) for lab in final_df[\"label\"]]\nfinal_df.head()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.024721Z","iopub.execute_input":"2021-07-19T05:52:16.025098Z","iopub.status.idle":"2021-07-19T05:52:16.311501Z","shell.execute_reply.started":"2021-07-19T05:52:16.025062Z","shell.execute_reply":"2021-07-19T05:52:16.310707Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#....remove rows which have 
label length greater input length............\n\n#i=final_df[len(final_df.label>120)].index\n\nfinal_df.drop(final_df[final_df.label_len>30].index, inplace=True)\nfinal_df.head()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.312682Z","iopub.execute_input":"2021-07-19T05:52:16.313045Z","iopub.status.idle":"2021-07-19T05:52:16.590531Z","shell.execute_reply.started":"2021-07-19T05:52:16.313009Z","shell.execute_reply":"2021-07-19T05:52:16.589587Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(final_df.shape)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.591867Z","iopub.execute_input":"2021-07-19T05:52:16.592217Z","iopub.status.idle":"2021-07-19T05:52:16.598427Z","shell.execute_reply.started":"2021-07-19T05:52:16.592181Z","shell.execute_reply":"2021-07-19T05:52:16.597543Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#convert bangla character to number value\nchar_map_str = \"\"\"\nঀ 0\nঁ 1\nং 2\nঃ 3\nঅ 4\nআ 5\nই 6\nঈ 7\nউ 8\nঊ 9\nঋ 10\nঌ 11\nএ 12\nঐ 13\nও 14\nঔ 15\nক 16\nখ 17\nগ 18\nঘ 19\nঙ 20\nচ 21\nছ 22\nজ 23\nঝ 24\nঞ 25\nট 26\nঠ 27\nড 28\nঢ 29\nণ 30\nত 31\nথ 32\nদ 33\nধ 34\nন 35\nপ 36\nফ 37\nব 38\nভ 39\nম 40\nয 41\nর 42\nল 43\nশ 44\nষ 45\nস 46\nহ 47\n় 48\nঽ 49\nা 50\nি 51\nী 52\nু 53\nূ 54\nৃ 55\nৄ 56\nে 57\nৈ 58\nো 59\nৌ 60\n্ 61\nৎ 62\nৗ 63\nড় 64\nঢ় 65\nয় 66\nৠ 67\n০ 68\n১ 69\n২ 70\n৩ 71\n৪ 72\n৫ 73\n৬ 74\n৭ 75\n৮ 76\n৯ 77\nৱ 78\n৲ 79\n৴ 80\n 81\n\"\"\"\n# the \"blank\" character is mapped to 81\nchar_map = {}\nindex_map = {}\nfor line in char_map_str.strip().split('\\n'):\n ch, index = line.split()\n char_map[ch] = int(index)\n index_map[int(index)] = ch\nindex_map[81] = ' '\n ","metadata":{"_uuid":"f9756570-932b-4a37-8756-78603ddcc9d7","_cell_guid":"0a76de59-0135-4be1-a7f5-b04aaee81b54","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.599782Z","iopub.execute_input":"2021-07-19T05:52:16.600433Z","iopub.status.idle":"2021-07-19T05:52:16.60787Z","shell.execute_reply.started":"2021-07-19T05:52:16.600391Z","shell.execute_reply":"2021-07-19T05:52:16.607042Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(char_map)\nprint(\"****************************************************\")\nprint(index_map)","metadata":{"_uuid":"d0f8178a-d571-4522-bded-e0a29913ee8f","_cell_guid":"f0416633-1ade-4d44-a964-511cfe287413","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.609002Z","iopub.execute_input":"2021-07-19T05:52:16.60952Z","iopub.status.idle":"2021-07-19T05:52:16.619238Z","shell.execute_reply.started":"2021-07-19T05:52:16.609485Z","shell.execute_reply":"2021-07-19T05:52:16.618192Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_number_of_char_classes(char_map):\n ## TODO would be better to check with dataset (once cleaned)\n num_classes = len(char_map)+1 #need +1 for ctc null char +1 pad\n return 
num_classes\nget_number_of_char_classes(char_map)","metadata":{"_uuid":"2dae4c78-08c1-4da8-8e6f-cfaab998e8aa","_cell_guid":"24f1a361-5aad-4327-8bfd-4cf41c740b40","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.620656Z","iopub.execute_input":"2021-07-19T05:52:16.621098Z","iopub.status.idle":"2021-07-19T05:52:16.629562Z","shell.execute_reply.started":"2021-07-19T05:52:16.621064Z","shell.execute_reply":"2021-07-19T05:52:16.62874Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def text_to_int_sequence(text):\n \"\"\" Convert text to an integer sequence \"\"\"\n int_sequence = []\n for c in text:\n if c == ' ':\n ch = char_map['']\n else:\n ch = char_map[c]\n int_sequence.append(ch)\n return int_sequence\n\ndef int_sequence_to_text(int_sequence):\n \"\"\" Convert an integer sequence to text \"\"\"\n text = []\n for c in int_sequence:\n ch = index_map[c]\n text.append(ch)\n return text","metadata":{"_uuid":"dab382c1-7af4-4e90-a7b0-370bebfcf9b0","_cell_guid":"a37c8388-35e2-429e-9f4e-595ff9bb9d0e","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.630879Z","iopub.execute_input":"2021-07-19T05:52:16.631271Z","iopub.status.idle":"2021-07-19T05:52:16.639186Z","shell.execute_reply.started":"2021-07-19T05:52:16.631236Z","shell.execute_reply":"2021-07-19T05:52:16.638316Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#...........normalize the feature...............\ndef calc_feat_dim(window, max_freq):\n return int(0.001 * window * max_freq) + 1\n\ndef normalize_feature(feature, eps=1e-14):\n \n #feat_dim = calc_feat_dim(34,8000)\n #feats_mean = np.zeros((feat_dim,))\n #feats_std = np.ones((feat_dim,))\n \n feats = np.vstack(feature)\n feats_mean = np.mean(feats, axis=0)\n feats_std = np.std(feats, axis=0)\n \n return (feature - feats_mean) / (feats_std + eps)","metadata":{"_uuid":"c12e2897-481d-4ad2-9acf-7b76fd7140ae","_cell_guid":"7d7d2623-1a44-40d3-9dff-06ba8a74ad4f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.640579Z","iopub.execute_input":"2021-07-19T05:52:16.641018Z","iopub.status.idle":"2021-07-19T05:52:16.651051Z","shell.execute_reply.started":"2021-07-19T05:52:16.640984Z","shell.execute_reply":"2021-07-19T05:52:16.650224Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.........data preparation for model........\nfrom past.builtins import xrange\n\ndef map_input_data(final_df):\n \n max_length = max([x.shape[0] for x in final_df['mfcc']])\n max_string_length = max([len(x) for x in final_df['label']])\n #print(max_string_length)\n X_data = np.zeros([final_df.shape[0], max_length,177])#here 177 means input shape\n #print(X_data)\n labels = np.ones([final_df.shape[0], max_string_length])*81\n input_length = np.zeros([final_df.shape[0], 1])\n label_length = np.zeros([final_df.shape[0], 1])\n #print(len(label_length))\n for i in range(0, final_df.shape[0]):\n feat = final_df.iloc[i]['mfcc']\n input_length[i] = feat.shape[0]\n #print(input_length)\n X_data[i, :feat.shape[0], :] = feat\n #print(X_data[i, :feat.shape[0], :])\n #print(len(final_df.iloc[i]['utterance']))\n # calculate labels & label_length\n #label = np.array(final_df.iloc[i]['utterance'])\n y=final_df.iloc[i]['label']\n #y=[' '.join(y[i:i+19]) for i in xrange(0,len(y),19)] \n #print(y)\n label=np.array(text_to_int_sequence(y))\n #print((label))\n 
{"cell_type":"code","source":"#...........normalize the feature...............\ndef calc_feat_dim(window, max_freq):\n    return int(0.001 * window * max_freq) + 1\n\ndef normalize_feature(feature, eps=1e-14):\n    feats = np.vstack(feature)\n    feats_mean = np.mean(feats, axis=0)\n    feats_std = np.std(feats, axis=0)\n    return (feature - feats_mean) / (feats_std + eps)","metadata":{"_uuid":"c12e2897-481d-4ad2-9acf-7b76fd7140ae","_cell_guid":"7d7d2623-1a44-40d3-9dff-06ba8a74ad4f","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2021-07-19T05:52:16.640579Z","iopub.execute_input":"2021-07-19T05:52:16.641018Z","iopub.status.idle":"2021-07-19T05:52:16.651051Z","shell.execute_reply.started":"2021-07-19T05:52:16.640984Z","shell.execute_reply":"2021-07-19T05:52:16.650224Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"#.........data preparation for model........\ndef map_input_data(final_df):\n\n    max_length = max([x.shape[0] for x in final_df['mfcc']])\n    max_string_length = max([len(x) for x in final_df['label']])\n\n    X_data = np.zeros([final_df.shape[0], max_length, 177])  # 177 = feature dimension per time step\n    labels = np.ones([final_df.shape[0], max_string_length]) * 81  # pad labels with index 81\n    input_length = np.zeros([final_df.shape[0], 1])\n    label_length = np.zeros([final_df.shape[0], 1])\n\n    for i in range(0, final_df.shape[0]):\n        feat = final_df.iloc[i]['mfcc']\n        input_length[i] = feat.shape[0]\n        X_data[i, :feat.shape[0], :] = feat\n\n        # calculate labels & label_length\n        y = final_df.iloc[i]['label']\n        label = np.array(text_to_int_sequence(y))\n        labels[i, :len(label)] = label\n        label_length[i] = len(label)\n\n    X_data = normalize_feature(X_data, eps=1e-14)\n\n    # dummy target: the actual CTC loss is computed inside the model graph\n    outputs = {'ctc': np.zeros([final_df.shape[0]])}\n\n    print(max(label_length))\n    inputs = {'input': X_data,\n              'labels': labels,\n              'input_length': input_length,\n              'label_length': label_length\n              }\n\n    return (inputs, outputs)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:16.652279Z","iopub.execute_input":"2021-07-19T05:52:16.652656Z","iopub.status.idle":"2021-07-19T05:52:16.68111Z","shell.execute_reply.started":"2021-07-19T05:52:16.65262Z","shell.execute_reply":"2021-07-19T05:52:16.680254Z"},"trusted":true},"execution_count":null,"outputs":[]},
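{"cell_type":"markdown","source":"Added toy illustration with made-up numbers: `map_input_data` pads every label to the longest one with index 81 and records the true lengths in `label_length`, which the CTC loss needs in order to ignore the padding.","metadata":{}},
{"cell_type":"code","source":"# added toy illustration (made-up label ids) of the padding scheme above\nimport numpy as np\n\ntoy_labels = [np.array([4, 16, 50]), np.array([6, 31])]\nmax_len = max(len(l) for l in toy_labels)\npadded = np.full((len(toy_labels), max_len), 81)  # 81 = padding index\nlengths = np.zeros((len(toy_labels), 1))\nfor i, lab in enumerate(toy_labels):\n    padded[i, :len(lab)] = lab\n    lengths[i] = len(lab)\nprint(padded)   # [[ 4 16 50] [ 6 31 81]]\nprint(lengths)  # [[3.] [2.]]","metadata":{},"execution_count":null,"outputs":[]},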
model\n","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.40591Z","iopub.execute_input":"2021-07-19T05:52:17.406259Z","iopub.status.idle":"2021-07-19T05:52:17.415177Z","shell.execute_reply.started":"2021-07-19T05:52:17.406225Z","shell.execute_reply":"2021-07-19T05:52:17.414369Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def cnn_output_length(input_length, filter_size, border_mode, stride,\n dilation=1):\n \"\"\" Compute the length of the output sequence after 1D convolution along\n time. Note that this function is in line with the function used in\n Convolution1D class from Keras.\n Params:\n input_length (int): Length of the input sequence.\n filter_size (int): Width of the convolution kernel.\n border_mode (str): Only support `same` or `valid`.\n stride (int): Stride size used in 1D convolution.\n dilation (int)\n \"\"\"\n if input_length is None:\n return None\n assert border_mode in {'same', 'valid'}\n dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)\n if border_mode == 'same':\n output_length = input_length\n elif border_mode == 'valid':\n output_length = input_length - dilated_filter_size + 1\n #print(output_length )\n return (output_length + stride - 1) // stride\n #return output_length","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.416343Z","iopub.execute_input":"2021-07-19T05:52:17.416737Z","iopub.status.idle":"2021-07-19T05:52:17.425376Z","shell.execute_reply.started":"2021-07-19T05:52:17.416703Z","shell.execute_reply":"2021-07-19T05:52:17.424571Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#.............................model-2...........................................\n#...........................bi-directional deep rnn.........\n\nfrom keras.layers import Dense, Activation, Bidirectional, Reshape, Flatten, Lambda, Input,\\\n Masking, Convolution1D, BatchNormalization, GRU, Conv1D, RepeatVector, Conv2D,Dropout\nfrom keras.optimizers import SGD, Adam\n\ndef final_model_1(input_dim, filters,kernel_size,conv_stride,\n conv_border_mode,output_dim,number_of_layers,units):\n \"\"\" Build a deep network for speech \n \"\"\"\n dropout_rate=0.5\n cell=GRU \n activation='tanh'\n \n # Main acoustic input\n input_data = Input(name='input', shape=(None, input_dim))\n # TODO: Specify the layers in your network\n conv_1d = Conv1D(filters, kernel_size,\n strides=conv_stride,\n padding=conv_border_mode,\n activation='relu',\n name='layer_1_conv',\n dilation_rate=1)(input_data)\n conv_bn = BatchNormalization(name='conv_batch_norm')(conv_1d)\n \n conv_bn=Dropout(0.25)(conv_bn)\n if number_of_layers == 1:\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n layer = BatchNormalization(name='bt_rnn_1')(layer)\n layer=Dropout(0.25)(layer)\n else:\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n \n layer = BatchNormalization(name='bt_rnn_1')(layer)\n layer=Dropout(0.25)(layer)\n\n for i in range(number_of_layers - 2):\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='rnn_{}'.format(i + 2), dropout=dropout_rate)(layer)\n \n layer = BatchNormalization(name='bt_rnn_{}'.format(i + 2))(layer)\n\n layer = cell(units, activation=activation,\n return_sequences=True, implementation=2, name='final_layer_of_rnn')(layer)\n layer=Dropout(0.25)(layer)\n layer = 
{"cell_type":"code","source":"#.............................model-2...........................................\n#...........................deep rnn: 1D-conv front-end + stacked GRU layers.........\n\nfrom keras.layers import Dense, Activation, Bidirectional, Reshape, Flatten, Lambda, Input,\\\n    Masking, Convolution1D, BatchNormalization, GRU, Conv1D, RepeatVector, Conv2D, Dropout, TimeDistributed\nfrom keras.optimizers import SGD, Adam\n\ndef final_model_1(input_dim, filters, kernel_size, conv_stride,\n                  conv_border_mode, output_dim, number_of_layers, units):\n    \"\"\" Build a deep network for speech \"\"\"\n    dropout_rate = 0.5\n    cell = GRU\n    activation = 'tanh'\n\n    # Main acoustic input\n    input_data = Input(name='input', shape=(None, input_dim))\n    conv_1d = Conv1D(filters, kernel_size,\n                     strides=conv_stride,\n                     padding=conv_border_mode,\n                     activation='relu',\n                     name='layer_1_conv',\n                     dilation_rate=1)(input_data)\n    conv_bn = BatchNormalization(name='conv_batch_norm')(conv_1d)\n    conv_bn = Dropout(0.25)(conv_bn)\n\n    if number_of_layers == 1:\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n        layer = BatchNormalization(name='bt_rnn_1')(layer)\n        layer = Dropout(0.25)(layer)\n    else:\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='rnn_1', dropout=dropout_rate)(conv_bn)\n        layer = BatchNormalization(name='bt_rnn_1')(layer)\n        layer = Dropout(0.25)(layer)\n\n        for i in range(number_of_layers - 2):\n            layer = cell(units, activation=activation,\n                         return_sequences=True, implementation=2, name='rnn_{}'.format(i + 2), dropout=dropout_rate)(layer)\n            layer = BatchNormalization(name='bt_rnn_{}'.format(i + 2))(layer)\n\n        layer = cell(units, activation=activation,\n                     return_sequences=True, implementation=2, name='final_layer_of_rnn')(layer)\n        layer = Dropout(0.25)(layer)\n        layer = BatchNormalization(name='bt_rnn_final')(layer)\n\n    time_dense = TimeDistributed(Dense(output_dim))(layer)\n    time_dense = Dropout(0.5)(time_dense)\n    y_pred = Activation('softmax', name='softmax')(time_dense)\n    model = Model(inputs=input_data, outputs=y_pred)\n    # with kernel_size=1, stride=1 and 'valid' padding the conv keeps the time\n    # dimension, so the output length equals the input length\n    model.output_length = lambda x: x\n    print(model.summary())\n    return model","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.427604Z","iopub.execute_input":"2021-07-19T05:52:17.427909Z","iopub.status.idle":"2021-07-19T05:52:17.444522Z","shell.execute_reply.started":"2021-07-19T05:52:17.427875Z","shell.execute_reply":"2021-07-19T05:52:17.443758Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"#.................run the model.......................\n\ndef train_model(X, Y, optimizer=SGD(lr=0.002, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5), epochs=500, verbose=1):\n    import tensorflow as tf\n\n    reset_random_seeds()\n    model_1 = final_model_1(input_dim=177, filters=200, kernel_size=1, conv_stride=1,\n                            conv_border_mode='valid',\n                            output_dim=83,\n                            number_of_layers=3,\n                            units=200)\n    model = add_ctc_loss(model_1)\n    # the graph already emits the CTC loss, so compile with a pass-through loss\n    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)\n\n    # create a callback that saves the model's weights\n    checkpoint_path = \"../outputs1/cp.ckpt\"  # fixed: save to the checkpoint file itself, not its directory\n    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,\n                                                     save_weights_only=True,\n                                                     verbose=1)\n\n    # fixed: honor the epochs argument (previously hardcoded to 500 inside fit)\n    hist = model.fit(x=X, y=Y, batch_size=10, epochs=epochs, validation_split=0.35,\n                     callbacks=[cp_callback])\n    return (hist, model, model_1)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.445686Z","iopub.execute_input":"2021-07-19T05:52:17.446045Z","iopub.status.idle":"2021-07-19T05:52:17.461015Z","shell.execute_reply.started":"2021-07-19T05:52:17.446009Z","shell.execute_reply":"2021-07-19T05:52:17.460022Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"hist, model, model_1 = train_model(input_dict, outputs)","metadata":{"execution":{"iopub.status.busy":"2021-07-19T05:52:17.462167Z","iopub.execute_input":"2021-07-19T05:52:17.46249Z","iopub.status.idle":"2021-07-19T06:15:26.271404Z","shell.execute_reply.started":"2021-07-19T05:52:17.462458Z","shell.execute_reply":"2021-07-19T06:15:26.270398Z"},"trusted":true},"execution_count":null,"outputs":[]},
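{"cell_type":"markdown","source":"The notebook stops at training; below is an added, minimal greedy CTC decoding sketch, not the project's decoding module. It assumes `model_1` and `input_dict` from the cells above, and that `K.ctc_batch_cost` reserves the last softmax class (index 82, since `output_dim=83`) as the blank.","metadata":{}},
{"cell_type":"code","source":"# added, illustrative only: greedy (best-path) CTC decoding for one utterance\nimport numpy as np\n\ndef greedy_ctc_decode(probs, blank=82):\n    # probs: (time, num_classes) softmax output; Keras's ctc_batch_cost\n    # treats the last class (82 here) as the blank\n    best = np.argmax(probs, axis=1)\n    collapsed = [int(k) for j, k in enumerate(best)\n                 if k != blank and (j == 0 or k != best[j - 1])]\n    return ''.join(int_sequence_to_text(collapsed))\n\n# decode the first training utterance with the trained acoustic model\npred = model_1.predict(input_dict['input'][:1])[0]\nprint(greedy_ctc_decode(pred))","metadata":{},"execution_count":null,"outputs":[]},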
{"cell_type":"code","source":"#plot the loss\nhistory_dict = hist.history\nloss = history_dict['loss']\nval_loss = history_dict['val_loss']\n\nplt.figure(figsize=(10, 5), dpi=80, facecolor='w', edgecolor='k')\nepochs = range(1, len(loss) + 1)\nplt.plot(epochs, loss, label='Training Loss')\nplt.plot(epochs, val_loss, label='Validation Loss')\nplt.title('Loss vs Epochs', fontsize=25)\nplt.xlabel('Epochs', fontsize=15)\nplt.ylabel('Loss', fontsize=15)\nplt.legend()\nplt.grid(True)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2021-07-19T06:15:26.273131Z","iopub.execute_input":"2021-07-19T06:15:26.273483Z","iopub.status.idle":"2021-07-19T06:15:26.473292Z","shell.execute_reply.started":"2021-07-19T06:15:26.273444Z","shell.execute_reply":"2021-07-19T06:15:26.472519Z"},"trusted":true},"execution_count":null,"outputs":[]}]}
--------------------------------------------------------------------------------
/results/loss_file/Capture-2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyum/Bangla-deep-speech-Recognition/04c06b9de990acc099309ec6fe48460a468247f2/results/loss_file/Capture-2.PNG
--------------------------------------------------------------------------------
/utiles/utility:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------