Language Model Using Tensorflow & keras

├── 1.1-language-model
    ├── data.txt
    ├── language_model_keras.ipynb
    ├── language_model_pytorch.ipynb
    └── language_model_torch.ipynb
├── 1.10-question-answering
    └── question_answering_keras.ipynb
├── 1.12-text-summarization
    └── text-summarizer.ipynb
├── 1.2-sentiment-analysis
    ├── sentiment_classfication_bert_keras.ipynb
    └── sentiment_classfication_pytorch.ipynb
├── 1.3-semantic-similarity
    ├── README.md
    ├── semantic-similarity-BERT.ipynb
    └── try_cf.ipynb
├── 1.4-machine-translation
    ├── neural_machine_translation.ipynb
    └── seq2seq.ipynb
├── 1.5-named-entity-recognition
    ├── data_making.py
    ├── ner_bert.ipynb
    ├── ner_keras.ipynb
    ├── simple_ner-2.0.ipynb
    └── simple_ner.ipynb
├── 1.6-intent-classification
    ├── README.md
    ├── intent_classfication_bert.ipynb
    ├── intent_classfication_bert_keras.ipynb
    ├── intent_classfication_keras.ipynb
    └── text-classification-with-bert-pytorch.ipynb
├── 1.7-entity-recognition
    ├── entity_recognition_keras.ipynb
    └── resume-entities-for-ner.zip
├── 1.8-next-word-prediction
    ├── cab_booking.txt
    └── next_word_prediction_keras.ipynb
├── 1.9-smart-compose
    ├── README.md
    ├── data
    │   └── dataset.txt
    └── smart_compose_keras.ipynb
├── README.md
└── simple-efficient-summarizer.ipynb


/1.1-language-model/data.txt:
--------------------------------------------------------------------------------
1 | Jack and Jill went up the hill
2 | To fetch a pail of water
3 | Jack fell down and broke his crown
4 | And Jill came tumbling after
5 | 


--------------------------------------------------------------------------------
/1.1-language-model/language_model_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<h1>Language Model Using Tensorflow & keras<h1> "
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "<h5>Importing Libraries<h5>"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from numpy import array\n",
 24 |     "import numpy as np\n",
 25 |     "import tensorflow as tf\n",
 26 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 27 |     "from tensorflow.keras.utils import to_categorical\n",
 28 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 29 |     "from tensorflow.keras.models import Sequential\n",
 30 |     "from tensorflow.keras.layers import Dense\n",
 31 |     "from tensorflow.keras.layers import LSTM\n",
 32 |     "from tensorflow.keras.layers import Dropout\n",
 33 |     "from tensorflow.keras.layers import Embedding\n",
 34 |     "from tensorflow.keras.models import load_model\n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "<h5>Preprocessing Data<h5>"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 7,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "class Preprocessing():\n",
 51 |     "    \n",
 52 |     "    def __init__(self,input_file):\n",
 53 |     "        self.input_data_file = input_file\n",
 54 |     "        self.data = None\n",
 55 |     "        self.vocab_size = None\n",
 56 |     "        self.encoded_data = None\n",
 57 |     "        self.max_length = None\n",
 58 |     "        self.sequences = None\n",
 59 |     "        self.x = None\n",
 60 |     "        self.y = None\n",
 61 |     "        self.tokenizer = None\n",
 62 |     "    \n",
 63 |     "    def load_data(self):\n",
 64 |     "        fp = open(self.input_data_file,'r')\n",
 65 |     "        self.data = fp.read().splitlines()        \n",
 66 |     "        fp.close()\n",
 67 |     "        \n",
 68 |     "    def encode_data(self):\n",
 69 |     "        self.tokenizer = Tokenizer()\n",
 70 |     "        self.tokenizer.fit_on_texts(self.data)\n",
 71 |     "        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
 72 |     "        print(self.encoded_data)\n",
 73 |     "        self.vocab_size = len(self.tokenizer.word_counts)+1\n",
 74 |     "        \n",
 75 |     "    def generate_sequence(self):\n",
 76 |     "        seq_list = list()\n",
 77 |     "        for item in self.encoded_data:\n",
 78 |     "            l = len(item)\n",
 79 |     "            for id in range(1,l):\n",
 80 |     "                seq_list.append(item[:id+1])\n",
 81 |     "        self.max_length = max([len(seq) for seq in seq_list])\n",
 82 |     "        self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n",
 83 |     "        print(self.sequences)\n",
 84 |     "        self.sequences = array(self.sequences)\n",
 85 |     "            \n",
 86 |     "    def get_data(self):\n",
 87 |     "        self.x = self.sequences[:,:-1]\n",
 88 |     "        self.y = self.sequences[:,-1]\n",
 89 |     "        print(\"y before:\",self.y)\n",
 90 |     "        self.y = to_categorical(self.y,num_classes=self.vocab_size)\n",
 91 |     "        print(\"y After:\",self.y)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 8,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "[[2, 1, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13], [2, 14, 15, 1, 16, 17, 18], [1, 3, 19, 20, 21]]\n",
104 |       "[[ 0  0  0  0  0  2  1]\n",
105 |       " [ 0  0  0  0  2  1  3]\n",
106 |       " [ 0  0  0  2  1  3  4]\n",
107 |       " [ 0  0  2  1  3  4  5]\n",
108 |       " [ 0  2  1  3  4  5  6]\n",
109 |       " [ 2  1  3  4  5  6  7]\n",
110 |       " [ 0  0  0  0  0  8  9]\n",
111 |       " [ 0  0  0  0  8  9 10]\n",
112 |       " [ 0  0  0  8  9 10 11]\n",
113 |       " [ 0  0  8  9 10 11 12]\n",
114 |       " [ 0  8  9 10 11 12 13]\n",
115 |       " [ 0  0  0  0  0  2 14]\n",
116 |       " [ 0  0  0  0  2 14 15]\n",
117 |       " [ 0  0  0  2 14 15  1]\n",
118 |       " [ 0  0  2 14 15  1 16]\n",
119 |       " [ 0  2 14 15  1 16 17]\n",
120 |       " [ 2 14 15  1 16 17 18]\n",
121 |       " [ 0  0  0  0  0  1  3]\n",
122 |       " [ 0  0  0  0  1  3 19]\n",
123 |       " [ 0  0  0  1  3 19 20]\n",
124 |       " [ 0  0  1  3 19 20 21]]\n",
125 |       "y before: [ 1  3  4  5  6  7  9 10 11 12 13 14 15  1 16 17 18  3 19 20 21]\n",
126 |       "y After: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
127 |       " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
128 |       " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
129 |       " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
130 |       " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
131 |       " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
132 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
133 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
134 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
135 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
136 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
137 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
138 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
139 |       " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
140 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
141 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
142 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
143 |       " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
144 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n",
145 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
146 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]\n"
147 |      ]
148 |     }
149 |    ],
150 |    "source": [
151 |     "pr = Preprocessing('data.txt')\n",
152 |     "pr.load_data()\n",
153 |     "pr.encode_data()\n",
154 |     "pr.generate_sequence()\n",
155 |     "pr.get_data()"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "<h3>Model"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "class Model():\n",
172 |     "    def __init__(self,params):\n",
173 |     "        self.model = None\n",
174 |     "        self.history = None\n",
175 |     "        self.x = None\n",
176 |     "        self.y = None\n",
177 |     "        self.vocab_size = params['vocab_size']\n",
178 |     "        self.max_len = params['max_len']\n",
179 |     "        self.activation = params['activation']\n",
180 |     "        self.optimizer = params['optimizer']\n",
181 |     "        self.epochs = params['epochs']\n",
182 |     "        self.metrics = params['metrics']\n",
183 |     "        \n",
184 |     "        \n",
185 |     "    def create_model(self):\n",
186 |     "        self.model = Sequential()\n",
187 |     "        self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n",
188 |     "        self.model.add(LSTM(50))\n",
189 |     "        self.model.add(Dropout(0.1))\n",
190 |     "        self.model.add(Dense(self.vocab_size,activation=self.activation))\n",
191 |     "        self.model.compile(loss='categorical_crossentropy',optimizer=self.optimizer,metrics=self.metrics)\n",
192 |     "        print(self.model.summary())\n",
193 |     "    def run(self):\n",
194 |     "        self.history = self.model.fit(self.x,self.y,epochs=self.epochs)\n",
195 |     "        \n",
196 |     "    def save(self):\n",
197 |     "        self.model.save(\"lang_model.h5\")\n",
198 |     "        "
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "params = {\"activation\":\"softmax\",\"epochs\":500,\"verbose\":2,\"loss\":\"categorical_crossentropy\",\n",
208 |     "          \"optimizer\":\"adam\",\"metrics\":['accuracy'],\"vocab_size\":pr.vocab_size,\"max_len\":pr.max_length}\n",
209 |     "model_obj = Model(params)\n",
210 |     "model_obj.x = pr.x\n",
211 |     "model_obj.y = pr.y\n",
212 |     "model_obj.create_model()"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "model_obj.run()\n",
222 |     "model_obj.save()"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "<h4>Prediction"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {},
236 |    "outputs": [],
237 |    "source": [
238 |     "class Prediction():\n",
239 |     "    def __init__(self,tokenizer,max_len):\n",
240 |     "        self.model = None\n",
241 |     "        self.tokenizer = tokenizer\n",
242 |     "        self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n",
243 |     "        self.max_length = max_len\n",
244 |     "    \n",
245 |     "    def load_model(self):\n",
246 |     "        self.model = load_model(\"lang_model.h5\")\n",
247 |     "        \n",
248 |     "    def predict_sequnce(self,text,num_words):\n",
249 |     "        for id in range(num_words):\n",
250 |     "            encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n",
251 |     "            padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n",
252 |     "            y_pred = self.model.predict(padded_data)\n",
253 |     "            y_pred = np.argmax(y_pred)\n",
254 |     "            predict_word = self.idx2word[y_pred]\n",
255 |     "            text += ' ' + predict_word\n",
256 |     "        return text"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "pred = Prediction(pr.tokenizer,pr.max_length)    \n",
266 |     "pred.load_model()\n",
267 |     "print(pred.predict_sequnce(\"Jack and\",5))\n",
268 |     "print(pred.predict_sequnce('And Jill', 4))\n",
269 |     "print(pred.predict_sequnce('fell down', 5))\n",
270 |     "print(pred.predict_sequnce('pail of', 3))"
271 |    ]
272 |   }
273 |  ],
274 |  "metadata": {
275 |   "kernelspec": {
276 |    "display_name": "Python 3",
277 |    "language": "python",
278 |    "name": "python3"
279 |   },
280 |   "language_info": {
281 |    "codemirror_mode": {
282 |     "name": "ipython",
283 |     "version": 3
284 |    },
285 |    "file_extension": ".py",
286 |    "mimetype": "text/x-python",
287 |    "name": "python",
288 |    "nbconvert_exporter": "python",
289 |    "pygments_lexer": "ipython3",
290 |    "version": "3.6.9"
291 |   }
292 |  },
293 |  "nbformat": 4,
294 |  "nbformat_minor": 2
295 | }
296 | 


--------------------------------------------------------------------------------
/1.1-language-model/language_model_pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 15,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import torch\n",
 10 |     "import torch.nn as nn\n",
 11 |     "import torch.nn.functional as F\n",
 12 |     "\n",
 13 |     "import numpy as np\n",
 14 |     "from collections import Counter\n",
 15 |     "import os"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 16,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "batch_size = 1\n",
 25 |     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 17,
 31 |    "metadata": {},
 32 |    "outputs": [
 33 |     {
 34 |      "name": "stdout",
 35 |      "output_type": "stream",
 36 |      "text": [
 37 |       "Vocabulary size 22\n"
 38 |      ]
 39 |     }
 40 |    ],
 41 |    "source": [
 42 |     "class PreProcessing():\n",
 43 |     "    \n",
 44 |     "    def get_data_from_file(self,train_file, batch_size, seq_size):\n",
 45 |     "        with open(train_file, 'r', encoding='utf-8') as f:\n",
 46 |     "            text = f.read()\n",
 47 |     "        text = text.split()\n",
 48 |     "\n",
 49 |     "        word_counts = Counter(text)\n",
 50 |     "        sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)\n",
 51 |     "        int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}\n",
 52 |     "        vocab_to_int = {w: k for k, w in int_to_vocab.items()}\n",
 53 |     "        n_vocab = len(int_to_vocab)\n",
 54 |     "\n",
 55 |     "        print('Vocabulary size', n_vocab)\n",
 56 |     "\n",
 57 |     "        int_text = [vocab_to_int[w] for w in text]\n",
 58 |     "        num_batches = int(len(int_text) / (seq_size * batch_size))\n",
 59 |     "        in_text = int_text[:num_batches * batch_size * seq_size]\n",
 60 |     "        out_text = np.zeros_like(in_text)\n",
 61 |     "        out_text[:-1] = in_text[1:]\n",
 62 |     "        out_text[-1] = in_text[0]\n",
 63 |     "        in_text = np.reshape(in_text, (batch_size, -1))\n",
 64 |     "        out_text = np.reshape(out_text, (batch_size, -1))\n",
 65 |     "        return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text\n",
 66 |     "\n",
 67 |     "\n",
 68 |     "    def get_batches(self,in_text, out_text, batch_size, seq_size):\n",
 69 |     "        num_batches = np.prod(in_text.shape) // (seq_size * batch_size)\n",
 70 |     "        for i in range(0, num_batches * seq_size, seq_size):\n",
 71 |     "            yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]\n",
 72 |     "            \n",
 73 |     "            \n",
 74 |     "preprocess_obj = PreProcessing()\n",
 75 |     "int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = preprocess_obj.get_data_from_file(\"data.txt\",4,4)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 18,
 81 |    "metadata": {},
 82 |    "outputs": [
 83 |     {
 84 |      "data": {
 85 |       "text/plain": [
 86 |        "array([[ 0,  1,  2,  3],\n",
 87 |        "       [ 4,  5,  6,  7],\n",
 88 |        "       [ 8,  9, 10, 11],\n",
 89 |        "       [12,  0, 13, 14]])"
 90 |       ]
 91 |      },
 92 |      "execution_count": 18,
 93 |      "metadata": {},
 94 |      "output_type": "execute_result"
 95 |     }
 96 |    ],
 97 |    "source": [
 98 |     "in_text"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 19,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "data": {
108 |       "text/plain": [
109 |        "array([[ 1,  2,  3,  4],\n",
110 |        "       [ 5,  6,  7,  8],\n",
111 |        "       [ 9, 10, 11, 12],\n",
112 |        "       [ 0, 13, 14,  0]])"
113 |       ]
114 |      },
115 |      "execution_count": 19,
116 |      "metadata": {},
117 |      "output_type": "execute_result"
118 |     }
119 |    ],
120 |    "source": [
121 |     "out_text"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 26,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "class RNNModule(nn.Module):\n",
131 |     "    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):\n",
132 |     "        super(RNNModule, self).__init__()\n",
133 |     "        self.seq_size = seq_size\n",
134 |     "        self.lstm_size = lstm_size\n",
135 |     "        self.embedding = nn.Embedding(n_vocab, embedding_size)\n",
136 |     "        self.lstm = nn.LSTM(embedding_size,\n",
137 |     "                            lstm_size,\n",
138 |     "                            batch_first=True)\n",
139 |     "        self.dense = nn.Linear(lstm_size, n_vocab)\n",
140 |     "\n",
141 |     "    def forward(self, x, prev_state):\n",
142 |     "        embed = self.embedding(x)\n",
143 |     "        output, state = self.lstm(embed, prev_state)\n",
144 |     "        logits = self.dense(output)\n",
145 |     "\n",
146 |     "        return logits, state\n",
147 |     "\n",
148 |     "    def zero_state(self, batch_size):\n",
149 |     "        return (torch.zeros(1, batch_size, self.lstm_size),\n",
150 |     "                torch.zeros(1, batch_size, self.lstm_size))\n",
151 |     "    \n",
152 |     "    def get_loss_and_train_op(self, net, lr=0.001):\n",
153 |     "        criterion = nn.CrossEntropyLoss()\n",
154 |     "        optimizer = torch.optim.Adam(net.parameters(), lr=lr)\n",
155 |     "\n",
156 |     "        return criterion, optimizer\n",
157 |     "    \n",
158 |     "    def train(self):\n",
159 |     "        iteration = 0\n",
160 |     "        gradients_norm=5\n",
161 |     "        for e in range(200):\n",
162 |     "            batches = preprocess_obj.get_batches(in_text, out_text, batch_size, seq_size)\n",
163 |     "            state_h, state_c = net.zero_state(batch_size)\n",
164 |     "            state_h = state_h.to(device)\n",
165 |     "            state_c = state_c.to(device)\n",
166 |     "            for x, y in batches:\n",
167 |     "                iteration += 1\n",
168 |     "                net.train()\n",
169 |     "\n",
170 |     "                optimizer.zero_grad()\n",
171 |     "\n",
172 |     "                x = torch.tensor(x).to(device)\n",
173 |     "                y = torch.tensor(y).to(device)\n",
174 |     "\n",
175 |     "                logits, (state_h, state_c) = net(x, (state_h, state_c))\n",
176 |     "                loss = criterion(logits.transpose(1, 2), y)\n",
177 |     "\n",
178 |     "                loss_value = loss.item()\n",
179 |     "\n",
180 |     "                loss.backward()\n",
181 |     "\n",
182 |     "                state_h = state_h.detach()\n",
183 |     "                state_c = state_c.detach()\n",
184 |     "\n",
185 |     "                _ = torch.nn.utils.clip_grad_norm_(\n",
186 |     "                    net.parameters(), gradients_norm)\n",
187 |     "\n",
188 |     "                optimizer.step()\n",
189 |     "\n",
190 |     "                if iteration % 100 == 0:\n",
191 |     "                    print('Epoch: {}/{}'.format(e, 200),\n",
192 |     "                          'Iteration: {}'.format(iteration),\n",
193 |     "                          'Loss: {}'.format(loss_value))\n",
194 |     "\n",
195 |     "                if iteration % 1000 == 0:\n",
196 |     "                    torch.save(net.state_dict(),\n",
197 |     "                               'checkpoint_pt/model-{}.pth'.format(iteration))\n",
198 |     "seq_size = 4\n",
199 |     "embedding_size = 22\n",
200 |     "lstm_size = 64\n"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 27,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "net = RNNModule(n_vocab, seq_size,embedding_size, lstm_size)\n",
210 |     "net = net.to(device)\n",
211 |     "criterion, optimizer = net.get_loss_and_train_op(net, 0.01)"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 28,
217 |    "metadata": {},
218 |    "outputs": [
219 |     {
220 |      "ename": "RecursionError",
221 |      "evalue": "maximum recursion depth exceeded",
222 |      "output_type": "error",
223 |      "traceback": [
224 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
225 |       "\u001b[0;31mRecursionError\u001b[0m                            Traceback (most recent call last)",
226 |       "\u001b[0;32m<ipython-input-28-8eea702cf98c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
227 |       "\u001b[0;32m<ipython-input-26-72bfa1280c44>\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     37\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     38\u001b[0m                 \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m                 \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     41\u001b[0m                 \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
228 |       "... last 1 frames repeated, from the frame below ...\n",
229 |       "\u001b[0;32m<ipython-input-26-72bfa1280c44>\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     37\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     38\u001b[0m                 \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m                 \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     41\u001b[0m                 \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
230 |       "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded"
231 |      ]
232 |     }
233 |    ],
234 |    "source": [
235 |     "net.train()"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": []
244 |   }
245 |  ],
246 |  "metadata": {
247 |   "kernelspec": {
248 |    "display_name": "Python 3",
249 |    "language": "python",
250 |    "name": "python3"
251 |   },
252 |   "language_info": {
253 |    "codemirror_mode": {
254 |     "name": "ipython",
255 |     "version": 3
256 |    },
257 |    "file_extension": ".py",
258 |    "mimetype": "text/x-python",
259 |    "name": "python",
260 |    "nbconvert_exporter": "python",
261 |    "pygments_lexer": "ipython3",
262 |    "version": "3.6.9"
263 |   }
264 |  },
265 |  "nbformat": 4,
266 |  "nbformat_minor": 2
267 | }
268 | 


--------------------------------------------------------------------------------
/1.1-language-model/language_model_torch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<h4>Importing Libraries"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 191,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd\n",
 18 |     "import torch\n",
 19 |     "import torch.nn as nn\n",
 20 |     "import torch.optim as optim\n",
 21 |     "from torch.autograd import Variable"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 192,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "class Tokenizer():\n",
 31 |     "    def fit_on_texts(self,list_data):\n",
 32 |     "        word_list = \" \".join(list_data).split()\n",
 33 |     "        self.word_counts = list(set(word_list))\n",
 34 |     "        self.word_dict = {w: i for i, w in enumerate(self.word_counts)}\n",
 35 |     "        self.number_dict = {i: w for i, w in enumerate(self.word_counts)}\n",
 36 |     "        \n",
 37 |     "    def texts_to_sequences(self,data):\n",
 38 |     "        encoded_sequence = list()\n",
 39 |     "        for item in data:\n",
 40 |     "            encoded_sequence.append([self.word_dict[word] for word in item.split()])\n",
 41 |     "        return encoded_sequence\n",
 42 |     "    \n",
 43 |     "def pad_sequences(data,padding='pre',padding_value=0):\n",
 44 |     "    sequence = None\n",
 45 |     "    if isinstance(data,list):\n",
 46 |     "        maxlen = max(len(item) for item in data)\n",
 47 |     "        \n",
 48 |     "    if padding == 'pre':\n",
 49 |     "        for idx in range(len(data)):\n",
 50 |     "            data[idx] = [padding_value]*(maxlen-len(data[idx])) + data[idx]\n",
 51 |     "    else:\n",
 52 |     "        for idx in range(len(data)):\n",
 53 |     "            data[idx] = data[idx]+ [padding_value]*(maxlen-len(data[idx]))\n",
 54 |     "                                                    \n",
 55 |     "    return data\n",
 56 |     "def to_categorical(data, nb_classes):\n",
 57 |     "    targets = np.array(data).reshape(-1)\n",
 58 |     "    return np.eye(nb_classes)[targets]"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 195,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "class Preprocessing():\n",
 68 |     "    \n",
 69 |     "    def __init__(self,input_file):\n",
 70 |     "        self.input_data_file = input_file\n",
 71 |     "        self.data = None\n",
 72 |     "        self.vocab_size = None\n",
 73 |     "        self.encoded_data = None\n",
 74 |     "        self.max_length = None\n",
 75 |     "        self.sequences = None\n",
 76 |     "        self.x = None\n",
 77 |     "        self.y = None\n",
 78 |     "        self.tokenizer = None\n",
 79 |     "    \n",
 80 |     "    def load_data(self):\n",
 81 |     "        fp = open(self.input_data_file,'r')\n",
 82 |     "        self.data = fp.read().splitlines()        \n",
 83 |     "        fp.close()\n",
 84 |     "        \n",
 85 |     "    def encode_data(self):\n",
 86 |     "        self.tokenizer = Tokenizer()\n",
 87 |     "        self.tokenizer.fit_on_texts(self.data)\n",
 88 |     "        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
 89 |     "        print(self.encoded_data)\n",
 90 |     "        self.vocab_size = len(self.tokenizer.word_counts)+1\n",
 91 |     "        \n",
 92 |     "    def generate_sequence(self):\n",
 93 |     "        seq_list = list()\n",
 94 |     "        for item in self.encoded_data:\n",
 95 |     "            l = len(item)\n",
 96 |     "            for id in range(1,l):\n",
 97 |     "                seq_list.append(item[:id+1])\n",
 98 |     "        #print(seq_list[0])\n",
 99 |     "        print(seq_list)\n",
100 |     "        self.sequences = pad_sequences(seq_list,padding='pre', padding_value=0)\n",
101 |     "        print(self.sequences)\n",
102 |     "        self.sequences = array(self.sequences)\n",
103 |     "            \n",
104 |     "    def get_data(self):\n",
105 |     "        self.x = self.sequences[:,:-1]\n",
106 |     "        self.y = self.sequences[:,-1]\n",
107 |     "        print(self.y)\n",
108 |     "        self.y = to_categorical(self.y,nb_classes=self.vocab_size)\n",
109 |     "        print(\"Y:{}\".format(self.y))\n",
110 |     "        print(\"X:{}\".format(self.x))\n",
111 |     "        return self.x,self.y"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 196,
117 |    "metadata": {},
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "[[2, 12, 16, 4, 3, 14, 9], [17, 0, 18, 6, 19, 21], [2, 10, 8, 12, 11, 7, 15], [1, 16, 5, 20, 13]]\n",
124 |       "[[2, 12], [2, 12, 16], [2, 12, 16, 4], [2, 12, 16, 4, 3], [2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [17, 0], [17, 0, 18], [17, 0, 18, 6], [17, 0, 18, 6, 19], [17, 0, 18, 6, 19, 21], [2, 10], [2, 10, 8], [2, 10, 8, 12], [2, 10, 8, 12, 11], [2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [1, 16], [1, 16, 5], [1, 16, 5, 20], [1, 16, 5, 20, 13]]\n",
125 |       "[[0, 0, 0, 0, 0, 2, 12], [0, 0, 0, 0, 2, 12, 16], [0, 0, 0, 2, 12, 16, 4], [0, 0, 2, 12, 16, 4, 3], [0, 2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [0, 0, 0, 0, 0, 17, 0], [0, 0, 0, 0, 17, 0, 18], [0, 0, 0, 17, 0, 18, 6], [0, 0, 17, 0, 18, 6, 19], [0, 17, 0, 18, 6, 19, 21], [0, 0, 0, 0, 0, 2, 10], [0, 0, 0, 0, 2, 10, 8], [0, 0, 0, 2, 10, 8, 12], [0, 0, 2, 10, 8, 12, 11], [0, 2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [0, 0, 0, 0, 0, 1, 16], [0, 0, 0, 0, 1, 16, 5], [0, 0, 0, 1, 16, 5, 20], [0, 0, 1, 16, 5, 20, 13]]\n",
126 |       "[12 16  4  3 14  9  0 18  6 19 21 10  8 12 11  7 15 16  5 20 13]\n",
127 |       "Y:[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
128 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
129 |       " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
130 |       " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
131 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
132 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
133 |       " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
134 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
135 |       " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
136 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
137 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
138 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
139 |       " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
140 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
141 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
142 |       " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
143 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
144 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
145 |       " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
146 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n",
147 |       " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
148 |       "X:[[ 0  0  0  0  0  2]\n",
149 |       " [ 0  0  0  0  2 12]\n",
150 |       " [ 0  0  0  2 12 16]\n",
151 |       " [ 0  0  2 12 16  4]\n",
152 |       " [ 0  2 12 16  4  3]\n",
153 |       " [ 2 12 16  4  3 14]\n",
154 |       " [ 0  0  0  0  0 17]\n",
155 |       " [ 0  0  0  0 17  0]\n",
156 |       " [ 0  0  0 17  0 18]\n",
157 |       " [ 0  0 17  0 18  6]\n",
158 |       " [ 0 17  0 18  6 19]\n",
159 |       " [ 0  0  0  0  0  2]\n",
160 |       " [ 0  0  0  0  2 10]\n",
161 |       " [ 0  0  0  2 10  8]\n",
162 |       " [ 0  0  2 10  8 12]\n",
163 |       " [ 0  2 10  8 12 11]\n",
164 |       " [ 2 10  8 12 11  7]\n",
165 |       " [ 0  0  0  0  0  1]\n",
166 |       " [ 0  0  0  0  1 16]\n",
167 |       " [ 0  0  0  1 16  5]\n",
168 |       " [ 0  0  1 16  5 20]]\n"
169 |      ]
170 |     }
171 |    ],
172 |    "source": [
173 |     "pr = Preprocessing('data.txt')\n",
174 |     "pr.load_data()\n",
175 |     "pr.encode_data()\n",
176 |     "pr.generate_sequence()\n",
177 |     "x,y = pr.get_data()"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 184,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "n_step = 1 # n-1 in paper\n",
187 |     "n_hidden = 1 # h in paper\n",
188 |     "m = 1 # m in paper\n",
189 |     "n_class = pr.vocab_size\n",
190 |     "dtype = torch.FloatTensor\n"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 178,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "class NNLM(nn.Module):\n",
200 |     "    def __init__(self):\n",
201 |     "        super(NNLM, self).__init__()\n",
202 |     "        self.C = nn.Embedding(n_class, m)\n",
203 |     "        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))\n",
204 |     "        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))\n",
205 |     "        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))\n",
206 |     "        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))\n",
207 |     "        self.b = nn.Parameter(torch.randn(n_class).type(dtype))\n",
208 |     "\n",
209 |     "    def forward(self, X):\n",
210 |     "        X = self.C(X)\n",
211 |     "        X = X.view(-1, n_step * m) # [batch_size, n_step * n_class]\n",
212 |     "        tanh = torch.tanh(self.d + torch.mm(X, self.H)) # [batch_size, n_hidden]\n",
213 |     "        output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U) # [batch_size, n_class]\n",
214 |     "        return output\n",
215 |     "    \n",
216 |     "def train(x,y):\n",
217 |     "    model = NNLM()\n",
218 |     "    criterion = nn.CrossEntropyLoss()\n",
219 |     "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
220 |     "    # Training\n",
221 |     "    for epoch in range(100):\n",
222 |     "\n",
223 |     "        optimizer.zero_grad()\n",
224 |     "        output = model(x)\n",
225 |     "\n",
226 |     "        # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\n",
227 |     "        loss = criterion(output, y)\n",
228 |     "        if (epoch + 1)%1000 == 0:\n",
229 |     "            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
230 |     "\n",
231 |     "        loss.backward()\n",
232 |     "        optimizer.step()"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": 179,
238 |    "metadata": {},
239 |    "outputs": [
240 |     {
241 |      "ename": "ValueError",
242 |      "evalue": "Expected input batch_size (126) to match target batch_size (21).",
243 |      "output_type": "error",
244 |      "traceback": [
245 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
246 |       "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
247 |       "\u001b[0;32m<ipython-input-179-812f92838505>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
248 |       "\u001b[0;32m<ipython-input-178-bf067ba688d9>\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     28\u001b[0m         \u001b[0;31m# output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m         \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     30\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m1000\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Epoch:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'%04d'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'cost ='\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'{:.6f}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
249 |       "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    548\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    549\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 550\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    551\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    552\u001b[0m             \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
250 |       "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m    930\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    931\u001b[0m         return F.cross_entropy(input, target, weight=self.weight,\n\u001b[0;32m--> 932\u001b[0;31m                                ignore_index=self.ignore_index, reduction=self.reduction)\n\u001b[0m\u001b[1;32m    933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    934\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
251 |       "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m   2315\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0msize_average\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2316\u001b[0m         \u001b[0mreduction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlegacy_get_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize_average\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2317\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlog_softmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2319\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
252 |       "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m   2111\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2112\u001b[0m         raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'\n\u001b[0;32m-> 2113\u001b[0;31m                          .format(input.size(0), target.size(0)))\n\u001b[0m\u001b[1;32m   2114\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mdim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2115\u001b[0m         \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_enum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
253 |       "\u001b[0;31mValueError\u001b[0m: Expected input batch_size (126) to match target batch_size (21)."
254 |      ]
255 |     }
256 |    ],
257 |    "source": [
258 |     "x = Variable(torch.LongTensor(x))\n",
259 |     "y = Variable(torch.LongTensor(y))\n",
260 |     "train(x,y)"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": []
269 |   }
270 |  ],
271 |  "metadata": {
272 |   "kernelspec": {
273 |    "display_name": "Python 3",
274 |    "language": "python",
275 |    "name": "python3"
276 |   },
277 |   "language_info": {
278 |    "codemirror_mode": {
279 |     "name": "ipython",
280 |     "version": 3
281 |    },
282 |    "file_extension": ".py",
283 |    "mimetype": "text/x-python",
284 |    "name": "python",
285 |    "nbconvert_exporter": "python",
286 |    "pygments_lexer": "ipython3",
287 |    "version": "3.6.9"
288 |   }
289 |  },
290 |  "nbformat": 4,
291 |  "nbformat_minor": 2
292 | }
293 | 


--------------------------------------------------------------------------------
/1.12-text-summarization/text-summarizer.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport os\nimport tensorflow as tf\nfrom sklearn.model_selection import train_test_split\n\nfrom tensorflow.keras.preprocessing.text import Tokenizer \nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\n\nfrom tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint","execution_count":39,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Adding Attention Layer as its not a part of the keras\nhttps://www.kaggle.com/kweku20/attention"},{"metadata":{"trusted":true},"cell_type":"code","source":"from shutil import copyfile\ncopyfile(src = \"/kaggle/input/attention/attention.py\", dst = \"/kaggle/working/attention.py\")\nfrom attention import AttentionLayer","execution_count":40,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n    def __init__(self):\n        data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n        print(data.head())\n        self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n        ","execution_count":41,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling load data object"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":42,"outputs":[{"output_type":"stream","text":"   Id   ProductId          UserId                      ProfileName  \\\n0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   \n1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   \n2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres \"Natalia Corres\"   \n3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   \n4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham \"M. Wassir\"   \n\n   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \\\n0                     1                       1      5  1303862400   \n1                     0                       0      1  1346976000   \n2                     1                       1      4  1219017600   \n3                     3                       3      2  1307923200   \n4                     0                       0      5  1350777600   \n\n                 Summary                                               Text  \n0  Good Quality Dog Food  I have bought several of the Vitality canned d...  \n1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  \n2  \"Delight\" says it all  This is a confection that has been around a fe...  \n3         Cough Medicine  If you are looking for the secret ingredient i...  \n4            Great taffy  Great taffy at a great price.  There was a wid...  \n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"class PreprocessingData():\n    def __init__(self):\n        self.max_in_len = 100\n        self.max_tar_len = 10\n        \n    def preprocess_data(self,data):\n        data.dropna(axis=0,inplace=True)\n        data['Summary'] = data['Summary'].apply(lambda x : 'start '+ x + ' end')\n        return data\n    \n    def get_data(self,data):\n        x_train,x_val,y_train,y_val = train_test_split(np.array(data['Text']),np.array(data['Summary']),test_size=0.1,random_state=0,shuffle=True)\n        return x_train,x_val,y_train,y_val\n    \n    def encode_data(self,data,x_train,x_val,y_train,y_val):\n        \n        # Input Encoding\n        in_tokenizer = Tokenizer() \n        in_tokenizer.fit_on_texts(data[\"Text\"].tolist())\n\n        x_train_seq = in_tokenizer.texts_to_sequences(x_train) \n        x_val_seq = in_tokenizer.texts_to_sequences(x_val)\n\n        x_train = pad_sequences(x_train_seq, maxlen = self.max_in_len, padding='post')\n        x_val = pad_sequences(x_val_seq, maxlen = self.max_in_len, padding='post')\n\n        self.in_voc = len(in_tokenizer.word_counts) + 1\n        \n        # Target Encoding\n        tar_tokenizer = Tokenizer() \n        tar_tokenizer.fit_on_texts(data[\"Summary\"].tolist())\n\n        y_train_seq = tar_tokenizer.texts_to_sequences(y_train) \n        y_val_seq = tar_tokenizer.texts_to_sequences(y_val)\n\n        y_train = pad_sequences(y_train_seq,  maxlen = self.max_tar_len, padding='post')\n        y_val = pad_sequences(y_val_seq, maxlen = self.max_tar_len, padding='post')\n\n        self.tar_voc = len(tar_tokenizer.word_counts) + 1\n        return x_train,x_val,y_train,y_val","execution_count":43,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling preprocessing module on loading data"},{"metadata":{"trusted":true},"cell_type":"code","source":"preprocessing_data = PreprocessingData()\ndata = preprocessing_data.preprocess_data(data)\nx_train,x_val,y_train,y_val = preprocessing_data.get_data(data)\nx_train,x_val,y_train,y_val = preprocessing_data.encode_data(data,x_train,x_val,y_train,y_val)","execution_count":44,"outputs":[{"output_type":"error","ename":"TypeError","evalue":"unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)","\u001b[0;32m<ipython-input-44-29d3347b8de3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocess_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m<ipython-input-43-dbc313ffb414>\u001b[0m in \u001b[0;36mencode_data\u001b[0;34m(self, data, x_train, x_val, y_train, y_val)\u001b[0m\n\u001b[1;32m     25\u001b[0m         \u001b[0mx_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpad_sequences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val_seq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_in_len\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_voc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0min_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_counts\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     29\u001b[0m         \u001b[0;31m# Target Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'"]}]},{"metadata":{},"cell_type":"markdown","source":"Model Creation"},{"metadata":{"trusted":true},"cell_type":"code","source":"class Model():\n    def __init__(self):\n        self.model = None\n        \n    def define_model(self):\n        raise NotImplementedError","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4}


--------------------------------------------------------------------------------
/1.2-sentiment-analysis/sentiment_classfication_bert_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import tensorflow as tf\n",
 10 |     "import pandas as pd\n",
 11 |     "import tensorflow_hub as hub\n",
 12 |     "import bert\n",
 13 |     "import os\n",
 14 |     "import re\n",
 15 |     "import numpy as np\n",
 16 |     "from tqdm import tqdm\n",
 17 |     "from tqdm import tqdm_notebook\n",
 18 |     "from tensorflow.keras import backend as K\n",
 19 |     "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n",
 20 |     "from tensorflow.keras.models import Sequential, Model"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "class LoadData():\n",
 30 |     "    def __init__(self,csv_file):\n",
 31 |     "        self.df = pd.read_csv(os.path.join(os.getcwd(),csv_file))\n",
 32 |     "        self.train_df = None\n",
 33 |     "        self.test_df = None\n",
 34 |     "    def load_data(self):\n",
 35 |     "        self.df.columns = ['sentence','sentiment']\n",
 36 |     "        self.train_df = self.df[self.df['sentiment']=='positive']\n",
 37 |     "        self.test_df = self.df[self.df['sentiment']=='negative']\n",
 38 |     "        self.train_df.loc[self.train_df['sentiment']=='positive','polarity'] = 1\n",
 39 |     "        self.test_df.loc[self.test_df['sentiment']=='negative','polarity'] = 0\n"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "loaddata_obj = LoadData(\"imdb_dataset_small.csv\") \n",
 49 |     "loaddata_obj.load_data()"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "loaddata_obj.train_df.head()"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "loaddata_obj.test_df.head()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "class BertModel(object):\n",
 77 |     "    \n",
 78 |     "    def __init__(self):\n",
 79 |     "        \n",
 80 |     "        self.max_len = 128\n",
 81 |     "        bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
 82 |     "        FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
 83 |     "        \n",
 84 |     "        self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
 85 |     "\n",
 86 |     "        self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
 87 |     "\n",
 88 |     "        self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
 89 |     "\n",
 90 |     "        self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
 91 |     "        \n",
 92 |     "    def get_masks(self,tokens, max_seq_length):\n",
 93 |     "        return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
 94 |     "\n",
 95 |     "    def get_segments(self,tokens, max_seq_length):\n",
 96 |     "        \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n",
 97 |     "        segments = []\n",
 98 |     "        current_segment_id = 0\n",
 99 |     "        for token in tokens:\n",
100 |     "            segments.append(current_segment_id)\n",
101 |     "            if token == \"[SEP]\":\n",
102 |     "                current_segment_id = 1\n",
103 |     "        return segments + [0] * (max_seq_length - len(tokens))\n",
104 |     "    \n",
105 |     "    def get_ids(self,tokens, tokenizer, max_seq_length):\n",
106 |     "        \"\"\"Token ids from Tokenizer vocab\"\"\"\n",
107 |     "        token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
108 |     "        input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n",
109 |     "        return input_ids\n",
110 |     "    def create_single_input(self,sentence,maxlen):\n",
111 |     "\n",
112 |     "        stokens = self.tokenizer.tokenize(sentence)\n",
113 |     "\n",
114 |     "        stokens = stokens[:maxlen]\n",
115 |     "\n",
116 |     "        stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n",
117 |     "\n",
118 |     "        ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n",
119 |     "        masks = self.get_masks(stokens, self.max_len)\n",
120 |     "        segments = self.get_segments(stokens, self.max_len)\n",
121 |     "\n",
122 |     "        return ids,masks,segments\n",
123 |     "\n",
124 |     "    def create_input_array(self,sentences):\n",
125 |     "        \n",
126 |     "        input_ids, input_masks, input_segments = [], [], []\n",
127 |     "\n",
128 |     "        for sentence in tqdm(sentences,position=0, leave=True):\n",
129 |     "            ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n",
130 |     "\n",
131 |     "            input_ids.append(ids)\n",
132 |     "            input_masks.append(masks)\n",
133 |     "            input_segments.append(segments)\n",
134 |     "            \n",
135 |     "        tensor = [np.asarray(input_ids, dtype=np.int32), \n",
136 |     "                np.asarray(input_masks, dtype=np.int32), \n",
137 |     "                np.asarray(input_segments, dtype=np.int32)]\n",
138 |     "        return tensor"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "class PreprocessingBertData():\n",
148 |     "    \n",
149 |     "    def prepare_data_x(self,train_sentences):\n",
150 |     "        x = bert_model_obj.get_input_array(train_sentences)\n",
151 |     "        return x\n",
152 |     "    \n",
153 |     "    def prepare_data_y(self,train_labels):\n",
154 |     "        y = list()\n",
155 |     "        for item in train_labels:\n",
156 |     "            label = item\n",
157 |     "            y.append(label)\n",
158 |     "        y = np.array(y)\n",
159 |     "        return y\n",
160 |     "        "
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "class BertModel(object):\n",
170 |     "    \n",
171 |     "    def __init__(self):\n",
172 |     "        \n",
173 |     "        self.max_len = 128\n",
174 |     "        bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
175 |     "        FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
176 |     "        \n",
177 |     "        self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
178 |     "\n",
179 |     "        self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
180 |     "\n",
181 |     "        self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
182 |     "\n",
183 |     "        self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
184 |     "        \n",
185 |     "    def get_masks(self,tokens, max_seq_length):\n",
186 |     "        mask_data =  [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
187 |     "        return mask_data\n",
188 |     "\n",
189 |     "    def get_segments(self,tokens, max_seq_length):\n",
190 |     "        '''\n",
191 |     "        Segments: 0 for the first sequence, \n",
192 |     "        1 for the second\n",
193 |     "        '''\n",
194 |     "        segments = []\n",
195 |     "        segment_id = 0\n",
196 |     "        for token in tokens:\n",
197 |     "            segments.append(current_segment_id)\n",
198 |     "            if token == \"[SEP]\":\n",
199 |     "                segment_id = 1\n",
200 |     "        '''Remaining are padded with 0'''\n",
201 |     "        remaining_segment = [0] * (max_seq_length - len(tokens))\n",
202 |     "        segment_data = segments + remaining_segment\n",
203 |     "        return segment_data\n",
204 |     "    \n",
205 |     "    def get_ids(self,tokens, tokenizer, max_seq_length):\n",
206 |     "        token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
207 |     "        remaining_ids = [0] * (max_seq_length-len(token_ids))\n",
208 |     "        input_ids = token_ids + remaining_ids\n",
209 |     "        return input_ids\n",
210 |     "    \n",
211 |     "    def get_input_data(self,sentence,maxlen):\n",
212 |     "\n",
213 |     "        sent_token = self.tokenizer.tokenize(sentence)\n",
214 |     "\n",
215 |     "        sent_token = sent_token[:maxlen]\n",
216 |     "\n",
217 |     "        sent_token = [\"[CLS]\"] + sent_token + [\"[SEP]\"]\n",
218 |     "\n",
219 |     "        id = self.get_ids(sent_token, self.tokenizer, self.max_len)\n",
220 |     "        mask = self.get_masks(sent_token, self.max_len)\n",
221 |     "        segment = self.get_segments(sent_token, self.max_len)\n",
222 |     "        input_data = [id,mask,segment]\n",
223 |     "        return input_data\n",
224 |     "\n",
225 |     "    def get_input_array(self,sentences):\n",
226 |     "        \n",
227 |     "        input_ids, input_masks, input_segments = [], [], []\n",
228 |     "\n",
229 |     "        for sentence in tqdm(sentences,position=0, leave=True):\n",
230 |     "            ids,masks,segments=self.get_input_data(sentence,self.max_len-2)\n",
231 |     "\n",
232 |     "            input_ids.append(ids)\n",
233 |     "            input_masks.append(masks)\n",
234 |     "            input_segments.append(segments)\n",
235 |     "            \n",
236 |     "        input_array = [np.asarray(input_ids, dtype=np.int32),np.asarray(input_masks, dtype=np.int32), np.asarray(input_segments, dtype=np.int32)]\n",
237 |     "        return input_array"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {},
244 |    "outputs": [],
245 |    "source": [
246 |     "bert_model_obj = BertModel()\n",
247 |     "preprocess_bert_data_obj = PreprocessingBertData()"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "train_sentences = loaddata_obj.train_df[\"sentence\"].tolist()\n",
257 |     "train_labels = loaddata_obj.train_df[\"polarity\"].tolist()"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n",
267 |     "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n",
268 |     "\n",
269 |     "train_input_ids, train_input_masks, train_segment_ids = x\n",
270 |     "train_labels = y"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "class DesignModel():\n",
280 |     "    def __init__(self):\n",
281 |     "        self.model = None        \n",
282 |     "        self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n",
283 |     "        self.train_labels = train_labels\n",
284 |     "        \n",
285 |     "    def bert_model(self,max_seq_length): \n",
286 |     "        in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n",
287 |     "        in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n",
288 |     "        in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n",
289 |     "        \n",
290 |     "        bert_inputs = [in_id, in_mask, in_segment]\n",
291 |     "        bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n",
292 |     "        \n",
293 |     "        bert_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)\n",
294 |     "        bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n",
295 |     "        bert_outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\", name=\"dense_output\")(bert_sequence_output)\n",
296 |     "        self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n",
297 |     "        \n",
298 |     "        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
299 |     "        \n",
300 |     "        self.model.summary()\n",
301 |     "    \n",
302 |     "    def model_train(self,batch_size,num_epoch):\n",
303 |     "        print(\"Fitting to model\")\n",
304 |     "        \n",
305 |     "        self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n",
306 |     "        \n",
307 |     "        print(\"Model Training complete.\")\n",
308 |     "\n",
309 |     "    def save_model(self,model,model_name):    \n",
310 |     "        self.model.save(model_name+\".h5\")\n",
311 |     "        print(\"Model saved to Model folder.\")"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "model_obj = DesignModel()\n",
321 |     "model_obj.bert_model(bert_model_obj.max_len)\n",
322 |     "model_obj.model_train(32,1)"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "model_obj.save_model(model_obj.model,\"bert\")"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "class Evaluation():\n",
341 |     "    def get_accuracy(self,actuals, predictions):\n",
342 |     "        acc = accuracy_score(actuals, predictions)\n",
343 |     "        return acc"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "metadata": {},
350 |    "outputs": [],
351 |    "source": [
352 |     "class Prediction():\n",
353 |     "    def __init__(self):\n",
354 |     "        self.model = model_obj.model\n",
355 |     "        \n",
356 |     "    def predict_validation(self):\n",
357 |     "        valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n",
358 |     "        valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n",
359 |     "\n",
360 |     "        preprocess_bert_data_obj = PreprocessingBertData()\n",
361 |     "        val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n",
362 |     "        prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n",
363 |     "        return valid_labels,prediction_labels\n",
364 |     "        \n",
365 |     "    \n",
366 |     "    def predict(self,query):\n",
367 |     "        query_seq = bert_model_obj.get_input_array([query])\n",
368 |     "        pred = self.model.predict(query_seq)\n",
369 |     "        pred = np.argmax(pred)\n",
370 |     "        result = load_data_obj.cat_to_intent[pred]\n",
371 |     "        return result"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "pred_obj = Prediction()"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": null,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "eval_obj = Evaluation()\n",
390 |     "ytest,ypred = pred_obj.predict_validation()\n",
391 |     "acc = eval_obj.get_accuracy(ytest,ypred)\n",
392 |     "print(\"Auc: {:.2%}\".format(acc))"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": null,
398 |    "metadata": {},
399 |    "outputs": [],
400 |    "source": []
401 |   }
402 |  ],
403 |  "metadata": {
404 |   "kernelspec": {
405 |    "display_name": "Python 3",
406 |    "language": "python",
407 |    "name": "python3"
408 |   },
409 |   "language_info": {
410 |    "codemirror_mode": {
411 |     "name": "ipython",
412 |     "version": 3
413 |    },
414 |    "file_extension": ".py",
415 |    "mimetype": "text/x-python",
416 |    "name": "python",
417 |    "nbconvert_exporter": "python",
418 |    "pygments_lexer": "ipython3",
419 |    "version": "3.6.9"
420 |   }
421 |  },
422 |  "nbformat": 4,
423 |  "nbformat_minor": 2
424 | }
425 | 


--------------------------------------------------------------------------------
/1.3-semantic-similarity/README.md:
--------------------------------------------------------------------------------
1 | pip install -U sentence-transformers scipy
2 | 


--------------------------------------------------------------------------------
/1.3-semantic-similarity/semantic-similarity-BERT.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 7,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from sentence_transformers import SentenceTransformer\n",
10 |     "import scipy\n",
11 |     "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n",
12 |     "# Corpus with example sentences\n",
13 |     "\n",
14 |     "corpus = [\n",
15 |     "   'A man is eating a food.',\n",
16 |     "   'A man is eating a piece of bread.',\n",
17 |     "   'The girl is carrying a baby.',\n",
18 |     "   'A man is riding a horse.',\n",
19 |     "   'A woman is playing violin.',\n",
20 |     "   'Two men pushed carts through the woods.',\n",
21 |     "   'A man is riding a white horse on an enclosed ground.',\n",
22 |     "   'A monkey is playing drums.',\n",
23 |     "   'A cheetah is running behind its prey.']\n",
24 |     "queries = ['A man is eating pasta.', \n",
25 |     "           'Someone in a gorilla costume is playing a set of drums.', \n",
26 |     "           'A cheetah chases prey on across a field.']"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": 8,
32 |    "metadata": {},
33 |    "outputs": [],
34 |    "source": [
35 |     "corpus_embeddings = embedder.encode(corpus)\n",
36 |     "query_embeddings = embedder.encode(queries)"
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "code",
41 |    "execution_count": 22,
42 |    "metadata": {},
43 |    "outputs": [
44 |     {
45 |      "name": "stdout",
46 |      "output_type": "stream",
47 |      "text": [
48 |       "[0.21805174 0.15202328 1.04767431 0.89392366 0.96026727 0.79048636\n",
49 |       " 0.8414415  0.80550679 0.90363039] A man is eating pasta.\n",
50 |       "The girl is carrying a baby.\n",
51 |       "[0.80833937 0.8089816  0.76493318 0.79766881 0.92636551 0.84321454\n",
52 |       " 0.80365482 0.20152853 0.71403489] Someone in a gorilla costume is playing a set of drums.\n",
53 |       "A woman is playing violin.\n",
54 |       "[0.97539473 0.95483563 0.87328057 0.7070155  0.94015294 0.63376342\n",
55 |       " 0.72819954 0.6939273  0.09933373] A cheetah chases prey on across a field.\n",
56 |       "A man is eating a food.\n"
57 |      ]
58 |     }
59 |    ],
60 |    "source": [
61 |     "import numpy as np\n",
62 |     "for query, query_embedding in zip(queries, query_embeddings):\n",
63 |     "    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n",
64 |     "    print(distances,query)\n",
65 |     "    print(corpus[np.argmax(distances)])"
66 |    ]
67 |   },
68 |   {
69 |    "cell_type": "code",
70 |    "execution_count": null,
71 |    "metadata": {},
72 |    "outputs": [],
73 |    "source": []
74 |   }
75 |  ],
76 |  "metadata": {
77 |   "kernelspec": {
78 |    "display_name": "Python 3",
79 |    "language": "python",
80 |    "name": "python3"
81 |   },
82 |   "language_info": {
83 |    "codemirror_mode": {
84 |     "name": "ipython",
85 |     "version": 3
86 |    },
87 |    "file_extension": ".py",
88 |    "mimetype": "text/x-python",
89 |    "name": "python",
90 |    "nbconvert_exporter": "python",
91 |    "pygments_lexer": "ipython3",
92 |    "version": "3.6.9"
93 |   }
94 |  },
95 |  "nbformat": 4,
96 |  "nbformat_minor": 2
97 | }
98 | 


--------------------------------------------------------------------------------
/1.3-semantic-similarity/try_cf.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 38,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from sentence_transformers import SentenceTransformer\n",
10 |     "import scipy\n",
11 |     "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n",
12 |     "# Corpus with example sentences\n",
13 |     "\n",
14 |     "corpus = [\n",
15 |     "       'i would like to clean my XYZ',\n",
16 |     "    'book an appointment for XYZ cleaning',\n",
17 |     "    'schedule a XYZ cleaning services',\n",
18 |     "    'looking for XYZ cleaninng services',\n",
19 |     "    'want an appointment for XYZ cleaning'\n",
20 |     "]\n",
21 |     "queries = ['i would like to clean my XYZ','book a slot for XYZ cleaning','looking for XYZ cleaninng services']"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 39,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "corpus_embeddings = embedder.encode(corpus)\n",
31 |     "query_embeddings = embedder.encode(queries)"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 40,
37 |    "metadata": {},
38 |    "outputs": [
39 |     {
40 |      "name": "stdout",
41 |      "output_type": "stream",
42 |      "text": [
43 |       "[0.         0.31504001 0.23922292 0.20180429 0.21156411]\n",
44 |       "matched sent:  book an appointment for XYZ cleaning , id:  1\n",
45 |       "query:  i would like to clean my XYZ\n",
46 |       "[0.249604   0.09753854 0.14501476 0.18097553 0.14041007]\n",
47 |       "matched sent:  i would like to clean my XYZ , id:  0\n",
48 |       "query:  book a slot for XYZ cleaning\n",
49 |       "[2.01804328e-01 2.37920637e-01 1.28938306e-01 1.11133325e-13\n",
50 |       " 1.53037771e-01]\n",
51 |       "matched sent:  book an appointment for XYZ cleaning , id:  1\n",
52 |       "query:  looking for XYZ cleaninng services\n"
53 |      ]
54 |     }
55 |    ],
56 |    "source": [
57 |     "import numpy as np\n",
58 |     "for query, query_embedding in zip(queries, query_embeddings):\n",
59 |     "    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n",
60 |     "    print(distances)\n",
61 |     "    print(\"matched sent: \",corpus[np.argmax(distances)],\", id: \",np.argmax(distances))\n",
62 |     "    print(\"query: \",query)"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": null,
68 |    "metadata": {},
69 |    "outputs": [],
70 |    "source": []
71 |   }
72 |  ],
73 |  "metadata": {
74 |   "kernelspec": {
75 |    "display_name": "Python 3",
76 |    "language": "python",
77 |    "name": "python3"
78 |   },
79 |   "language_info": {
80 |    "codemirror_mode": {
81 |     "name": "ipython",
82 |     "version": 3
83 |    },
84 |    "file_extension": ".py",
85 |    "mimetype": "text/x-python",
86 |    "name": "python",
87 |    "nbconvert_exporter": "python",
88 |    "pygments_lexer": "ipython3",
89 |    "version": "3.6.9"
90 |   }
91 |  },
92 |  "nbformat": 4,
93 |  "nbformat_minor": 2
94 | }
95 | 


--------------------------------------------------------------------------------
/1.5-named-entity-recognition/data_making.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #!/usr/bin/env python3
  3 | 
  4 | """
  5 | Created on Mon Aug  6 19:40:26 2018
  6 | 
  7 | @author: joy
  8 | """
  9 | import json
 10 | abs_path1 = "benchmarking_data/Train/"
 11 | 
 12 | abs_path2 = "benchmarking_data/Validate/"
 13 | import re
 14 | reg = re.compile('[A-z]*\_([A-z]*)\_[A-z]*')
 15 | reg2 = re.compile('[A-z]*\_([A-z]*)')
 16 | 
 17 | 
 18 | def make_data_for_intent_from_json(json_file,txt_file):
 19 |     
 20 |     json_d = json.load(open(abs_path1+json_file))
 21 |     json_dict = json_d[reg.match(json_file).group(1)]
 22 |     
 23 |     wr = open("Intent_Data/"+txt_file,'w')
 24 |     
 25 |     for i in json_dict:
 26 |         each_list = i['data']
 27 |         sent =""
 28 |         for i in each_list:
 29 |             sent = sent + i['text']+ " "
 30 |         sent =sent[:-1]
 31 |         for i in range(3):
 32 |             sent = sent.replace("  "," ")
 33 |         wr.write(sent)
 34 |         wr.write('\n')
 35 |         print(sent)        
 36 |                     
 37 | 
 38 | def make_data_from_json(json_file,txt_file):
 39 |     
 40 |     json_d = json.load(open(abs_path2+json_file))
 41 |     json_dict = json_d[reg2.match(json_file).group(1)]
 42 |     
 43 |     wr = open(abs_path2+txt_file,'w')
 44 |     
 45 |     for i in json_dict:
 46 |         each_list = i['data']
 47 |         for i in each_list:
 48 |             try:
 49 |                 words = i['text'].split()
 50 |                 print(words[0]+' '+'B-'+i['entity'])
 51 |                 wr.write(words[0]+' '+'B-'+i['entity'])
 52 |                 wr.write('\n')
 53 |                 for word in words[1:]:
 54 |                     print(word+' '+'I-'+i['entity'])
 55 |                     wr.write(word+' '+'I-'+i['entity'])
 56 |                     wr.write('\n')
 57 |                 #print(i['text']+'\t'+i['entity'])
 58 | 
 59 |             except:
 60 |                 words = i['text'].split()
 61 |                 for word in words:
 62 |                     print(word+' '+'O')
 63 |                     wr.write(word+' '+'O')
 64 |                     wr.write('\n')
 65 |         print('\n')
 66 |         wr.write('\n')
 67 |         
 68 | 
 69 | def make_data_from_json_train(json_file,txt_file):
 70 |     
 71 |     json_d = json.load(open(abs_path1+json_file))
 72 |     json_dict = json_d[reg.match(json_file).group(1)]
 73 |     
 74 |     wr = open(abs_path1+txt_file,'w')
 75 |     
 76 |     for i in json_dict:
 77 |         each_list = i['data']
 78 |         for i in each_list:
 79 |             try:
 80 |                 words = i['text'].split()
 81 |                 print(words[0]+' '+'B-'+i['entity'])
 82 |                 wr.write(words[0]+' '+'B-'+i['entity'])
 83 |                 wr.write('\n')
 84 |                 for word in words[1:]:
 85 |                     print(word+' '+'I-'+i['entity'])
 86 |                     wr.write(word+' '+'I-'+i['entity'])
 87 |                     wr.write('\n')
 88 |                 #print(i['text']+'\t'+i['entity'])
 89 | 
 90 |             except:
 91 |                 words = i['text'].split()
 92 |                 for word in words:
 93 |                     print(word+' '+'O')
 94 |                     wr.write(word+' '+'O')
 95 |                     wr.write('\n')
 96 |         print('\n')
 97 |         wr.write('\n')
 98 | 
 99 | import nltk
100 | def make_data_from_json_train_pos(json_file,txt_file):
101 |     
102 |     json_d = json.load(open(abs_path2+json_file))
103 |     json_dict = json_d[reg2.match(json_file).group(1)]
104 |     
105 |     wr = open(abs_path2+txt_file,'w')
106 |     
107 |     for i in json_dict:
108 |         each_list = i['data']
109 |         sent = ""
110 |         for i in each_list:
111 |             sent = sent+i['text']+" "
112 |             sent = sent.replace("  "," ")
113 |         if sent[-1]==" ":
114 |             sent = sent[:-1]
115 |         words = []
116 |         pos_tags = nltk.pos_tag(sent.split())
117 |         print(pos_tags,sent)
118 |         pos_tag_dict = {j:k for j,k in pos_tags}
119 |         for i in each_list:
120 |             try:
121 |                 
122 |                 words = i['text'].split()
123 |                 print(words[0]+' '+pos_tag_dict[words[0]]+" "+'B-'+i['entity'])
124 |                 wr.write(words[0]+" "+pos_tag_dict[words[0]]+" "+'B-'+i['entity'])
125 |                 wr.write('\n')
126 |                 for word in words[1:]:
127 |                     print(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity'])
128 |                     wr.write(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity'])
129 |                     wr.write('\n')
130 |                 #print(i['text']+'\t'+i['entity'])
131 | 
132 |             except:
133 |                 words = i['text'].split()
134 |                 for word in words:
135 |                     print(word+' '+pos_tag_dict[word]+" "+'O')
136 |                     wr.write(word+' '+pos_tag_dict[word]+" "+'O')
137 |                     wr.write('\n')
138 |         print('\n')
139 |         wr.write('\n')
140 | 
141 |         
142 | import re   
143 | import json
144 | import os     
145 | def make_data_from_snips(input_path):
146 |     
147 |     for r,d,f in os.walk(input_path):
148 |  
149 |         for filename  in f:
150 |             label = os.path.basename(r)
151 |             source = os.path.join(r,filename)
152 |             
153 | 
154 |             
155 |             if os.path.splitext(filename)[-1] != '.txt':
156 |                 continue
157 |             
158 |             
159 |             
160 |             
161 |             read_file = open(source)
162 |     
163 | 
164 |             pattern = re.compile(r'(?:[[])(?P<value>.*?)(?:[]])(?:[(])(?P<name>.*?)(?:[)])')
165 |     
166 |             corpus = dict()
167 |             corpus[label] = list()
168 |             for i in read_file:
169 |                 data = list()
170 |                 
171 |                 it = pattern.finditer(i)
172 |                 
173 |                 sent_len = len(i.strip())
174 |                 
175 |                 if sent_len == 0:
176 |                     continue
177 | 
178 |                 last_span = 0
179 |                 for m in it:
180 |                     
181 |                     head = i[last_span:m.span()[0]]
182 |                     obj = dict()
183 |                     if head.strip():
184 |                         obj['text'] = head
185 |                     
186 |                         data.append(obj)
187 |                     
188 |                     obj = dict()
189 |                     obj['text'] = m.group('value')
190 |                     obj['entity'] = m.group('name')
191 |                     
192 |                     data.append(obj)
193 |                     
194 |                     last_span = m.span()[1]
195 |                 if last_span:
196 |                     obj = dict()
197 |                     if i[last_span :].strip():
198 |                         obj['text'] = i[last_span :]
199 |                         data.append(obj)
200 |                 
201 |                 if data:
202 |                     
203 |                     corpus[label].append({'data': data})
204 |             
205 |             with open(os.path.join(r,filename.split()[0] + '.json'),'w',encoding='utf-8') as fp:
206 |                 json.dump(corpus,fp)
207 |                 
208 |     
209 |     
210 | 
211 |     
212 | #make_data("book_restaurant_train.csv","book_restaurant_train.txt")
213 | '''
214 | make_data_from_json_train_pos("train_AddToPlaylist_full.json","train_AddToPlaylist_full.txt")
215 | make_data_from_json_train_pos("train_BookRestaurant_full.json","train_BookRestaurant_full.txt")
216 | make_data_from_json_train_pos("train_GetWeather_full.json","train_GetWeather_full.txt")
217 | make_data_from_json_train_pos("train_PlayMusic_full.json","train_PlayMusic_full.txt")
218 | make_data_from_json_train_pos("train_RateBook_full.json","train_RateBook_full.txt")
219 | make_data_from_json_train_pos("train_SearchCreativeWork_full.json","train_SearchCreativeWork_full.txt")
220 | make_data_from_json_train_pos("train_SearchScreeningEvent_full.json","train_SearchScreeningEvent_full.txt")
221 | '''
222 | 
223 | make_data_from_json_train_pos("validate_AddToPlaylist.json","validate_AddToPlaylist.txt")
224 | make_data_from_json_train_pos("validate_BookRestaurant.json","validate_BookRestaurant.txt")
225 | make_data_from_json_train_pos("validate_GetWeather.json","validate_GetWeather.txt")
226 | make_data_from_json_train_pos("validate_PlayMusic.json","validate_PlayMusic.txt")
227 | make_data_from_json_train_pos("validate_RateBook.json","validate_RateBook.txt")
228 | make_data_from_json_train_pos("validate_SearchCreativeWork.json","validate_SearchCreativeWork.txt")
229 | make_data_from_json_train_pos("validate_SearchScreeningEvent.json","validate_SearchScreeningEvent.txt")
230 | 
231 | 
232 | 
233 | #make_data_from_snips("flight_data")
234 | 


--------------------------------------------------------------------------------
/1.5-named-entity-recognition/ner_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "For data i used  https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines\n",
  8 |     "Then you can run data_making.py"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "from numpy import array\n",
 18 |     "import tensorflow as tf\n",
 19 |     "import glob\n",
 20 |     "import numpy as np\n",
 21 |     "import pickle\n",
 22 |     "\n",
 23 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 24 |     "from tensorflow.keras.utils import to_categorical\n",
 25 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 26 |     "from tensorflow.keras.models import Sequential,Model\n",
 27 |     "from tensorflow.keras.layers import Dense\n",
 28 |     "from tensorflow.keras.layers import LSTM\n",
 29 |     "from tensorflow.keras.layers import Input\n",
 30 |     "from tensorflow.keras.layers import Dropout\n",
 31 |     "from tensorflow.keras.layers import Embedding\n",
 32 |     "from tensorflow.keras.layers import TimeDistributed\n",
 33 |     "from tensorflow.keras.layers import Conv1D\n",
 34 |     "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
 35 |     "from tensorflow.keras.utils import Progbar\n",
 36 |     "from tensorflow.keras.models import load_model\n",
 37 |     "\n",
 38 |     "from tensorflow.keras.initializers import RandomUniform\n"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": []
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "class LoadData():\n",
 55 |     "    def __init__(self):\n",
 56 |     "        self.train_files = None\n",
 57 |     "        self.validation_files = None\n",
 58 |     "        \n",
 59 |     "    def get_data(self):\n",
 60 |     "        self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
 61 |     "        self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "load_data_obj = LoadData()\n",
 71 |     "load_data_obj.get_data()"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "class Preprocessing():\n",
 81 |     "    def __init__(self):\n",
 82 |     "        self.word_embediings_model = open(\"embeddings/glove.6B.100d.txt\", encoding=\"utf-8\")\n",
 83 |     "         \n",
 84 |     "    \n",
 85 |     "    def sentence_from_file(self,filename):\n",
 86 |     "        f = open(filename)\n",
 87 |     "        single_file_sentences = []\n",
 88 |     "        sentence_list = []\n",
 89 |     "        for line in f:\n",
 90 |     "            if len(line)==0 or line[0]==\"\\n\":\n",
 91 |     "                if len(sentence_list) > 0:\n",
 92 |     "                    single_file_sentences.append(sentence_list)\n",
 93 |     "                    sentence_list = []\n",
 94 |     "                continue\n",
 95 |     "            splits = line.split(' ')\n",
 96 |     "            sentence_list.append([splits[0],splits[1],splits[-1]])\n",
 97 |     "\n",
 98 |     "        if len(sentence_list) >0:\n",
 99 |     "            single_file_sentences.append(sentence_list)\n",
100 |     "            sentence_list = []\n",
101 |     "        return single_file_sentences\n",
102 |     "\n",
103 |     "    def get_case_value(self,word, case_dict):   \n",
104 |     "        case_value = 'other'\n",
105 |     "\n",
106 |     "        count_digits = 0\n",
107 |     "        for char in word:\n",
108 |     "            if char.isdigit():\n",
109 |     "                count_digits += 1\n",
110 |     "\n",
111 |     "        if word.isdigit():\n",
112 |     "            case_value = 'number'\n",
113 |     "        elif count_digits / float(len(word)) > 0.5:\n",
114 |     "            case_value = 'fraction'\n",
115 |     "        elif word.islower():\n",
116 |     "            case_value = 'lower'\n",
117 |     "        elif word.isupper():\n",
118 |     "            case_value = 'upper'\n",
119 |     "        elif word[0].isupper():\n",
120 |     "            case_value = 'title'\n",
121 |     "        elif count_digits > 0:\n",
122 |     "            case_value = 'leters_digit'\n",
123 |     "\n",
124 |     "        return case_dict[case_value]\n",
125 |     "\n",
126 |     "\n",
127 |     "    def createBatches(self,data):\n",
128 |     "        l = []\n",
129 |     "        for i in data:\n",
130 |     "            l.append(len(i[0]))\n",
131 |     "        l = set(l)\n",
132 |     "        batches = []\n",
133 |     "        batch_len = []\n",
134 |     "        z = 0\n",
135 |     "        for i in l:\n",
136 |     "            for batch in data:\n",
137 |     "                if len(batch[0]) == i:\n",
138 |     "                    batches.append(batch)\n",
139 |     "                    z += 1\n",
140 |     "            batch_len.append(z)\n",
141 |     "        return batches,batch_len\n",
142 |     "\n",
143 |     "    def create_tensors(self,sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id):\n",
144 |     "        #paddingIdx = word2Idx['PAD_TKN']\n",
145 |     "        unknownIdx = word_to_id['UNK_TKN']\n",
146 |     "\n",
147 |     "        dataset = []\n",
148 |     "\n",
149 |     "        word_count = 0\n",
150 |     "        unknownword_count = 0\n",
151 |     "\n",
152 |     "        for sentence in sentences:\n",
153 |     "            word_indices = []    \n",
154 |     "            char_indices = []\n",
155 |     "            case_indices = []\n",
156 |     "            label_indices = []\n",
157 |     "            pos_indices = []\n",
158 |     "\n",
159 |     "            for word,char,pos,label in sentence:  \n",
160 |     "\n",
161 |     "                word_count += 1\n",
162 |     "                if word in word_to_id:\n",
163 |     "                    word_index = word_to_id[word]\n",
164 |     "                elif word.lower() in word_to_id:\n",
165 |     "                    word_index = word_to_id[word.lower()]                 \n",
166 |     "                else:\n",
167 |     "                    word_index = unknownIdx\n",
168 |     "                    unknownword_count += 1\n",
169 |     "                    \n",
170 |     "                char_index = []\n",
171 |     "                for x in char:\n",
172 |     "                    char_index.append(char_to_id[x])\n",
173 |     "                    \n",
174 |     "                word_indices.append(word_index)\n",
175 |     "                case_indices.append(self.get_case_value(word, case_to_id))\n",
176 |     "                pos_indices.append(pos_to_id[pos.replace('\\n','')])\n",
177 |     "                char_indices.append(char_index)\n",
178 |     "                label_indices.append(label_to_id[label])\n",
179 |     "            print([word_indices, case_indices, char_indices, pos_indices, label_indices])\n",
180 |     "            dataset.append([word_indices, case_indices, char_indices, pos_indices, label_indices]) \n",
181 |     "        return dataset\n",
182 |     "\n",
183 |     "\n",
184 |     "    def addCharInformatioin(self,Sentences):\n",
185 |     "        for i,sentence in enumerate(Sentences):\n",
186 |     "            for j,data in enumerate(sentence):\n",
187 |     "                chars = [c for c in data[0]]\n",
188 |     "                Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
189 |     "        return Sentences\n",
190 |     "\n",
191 |     "    def padding(self,Sentences):\n",
192 |     "        maxlen = 52\n",
193 |     "        for sentence in Sentences:\n",
194 |     "            char = sentence[2]\n",
195 |     "            for x in char:\n",
196 |     "                maxlen = max(maxlen,len(x))\n",
197 |     "        for i,sentence in enumerate(Sentences):\n",
198 |     "            Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')\n",
199 |     "        return Sentences\n",
200 |     "    \n",
201 |     "    def get_word_embeddings(self,list_sentences):\n",
202 |     "        wd_to_id = {}\n",
203 |     "        wd_em = []\n",
204 |     "        \n",
205 |     "        words = {}\n",
206 |     "        for sentence in list_sentences:\n",
207 |     "            for token,char,pos,label in sentence:\n",
208 |     "                words[token.lower()] = True\n",
209 |     "                \n",
210 |     "        for line in self.word_embediings_model:\n",
211 |     "            split = line.strip().split(\" \")\n",
212 |     "\n",
213 |     "            if len(wd_to_id) == 0:\n",
214 |     "                wd_to_id[\"PAD_TKN\"] = len(wd_to_id)\n",
215 |     "                vector = np.zeros(len(split)-1) \n",
216 |     "                wd_em.append(vector)\n",
217 |     "\n",
218 |     "                wd_to_id[\"UNK_TKN\"] = len(wd_to_id)\n",
219 |     "                vector = np.random.uniform(-0.25, 0.25, len(split)-1)\n",
220 |     "                wd_em.append(vector)\n",
221 |     "            if split[0].lower() in words:\n",
222 |     "                vector = np.array([float(num) for num in split[1:]])\n",
223 |     "                wd_em.append(vector)\n",
224 |     "                wd_to_id[split[0]] = len(wd_to_id)\n",
225 |     "\n",
226 |     "        wd_em = np.array(wd_em)\n",
227 |     "        return wd_em,wd_to_id\n",
228 |     "    \n",
229 |     "    def get_feature_dict(self,sentences):\n",
230 |     "\n",
231 |     "        labelSet = set()\n",
232 |     "        lb_to_id = {}\n",
233 |     "        for sentence in sentences:\n",
234 |     "            for token,char,pos,label in sentence:\n",
235 |     "                labelSet.add(label)\n",
236 |     "\n",
237 |     "        for label in labelSet:\n",
238 |     "            lb_to_id[label] = len(lb_to_id)\n",
239 |     "\n",
240 |     "        id_to_lb = {v: k for k, v in lb_to_id.items()}\n",
241 |     "\n",
242 |     "        ch_to_id = {\"PADDING\":0, \"UNKNOWN\":1}\n",
243 |     "        for c in \" 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\\\"/\\\\%$`&=*+@^~|øæðş\":\n",
244 |     "            ch_to_id[c] = len(ch_to_id)\n",
245 |     "\n",
246 |     "        cs_to_id = {\n",
247 |     "                'number': 0, 'lower':1, 'upper':2, 'title':3, \n",
248 |     "                'other':4, 'fraction':5, 'leters_digit': 6, \n",
249 |     "                'PAD_TKN':7\n",
250 |     "                }\n",
251 |     "\n",
252 |     "        pos_to_id = {\"$\":0, \"''\":1, \"(\":2, \")\":3, \",\":4, \"--\":5, \".\":6, \":\":7, \"CC\":8, \"CD\":9, \"DT\":10,\n",
253 |     "                     \"EX\":11, \"FW\":12, \"IN\":13, \"JJ\":14, \"JJR\":15, \"JJS\":16, \"LS\":17, \"MD\":18, \"NN\":19,\n",
254 |     "                     \"NNP\":20, \"NNPS\":21, \"NNS\":22, \"PDT\":23, \"POS\":24, \"PRP\":25, \"PRP$\":26, \"RB\":27, \n",
255 |     "                     \"RBR\":28, \"RBS\":29, \"RP\":30, \"SYM\":31, \"TO\":32, \"UH\":33, \"VB\":34, \"VBD\":35, \"VBG\":36, \n",
256 |     "                     \"VBN\":37, \"VBP\":38, \"VBZ\":39, \"WDT\":40, \"WP\":41, \"WP$\":42, \"WRB\":43, \"``\":44}\n",
257 |     "        \n",
258 |     "        return cs_to_id,pos_to_id,ch_to_id,lb_to_id,id_to_lb\n",
259 |     "    \n",
260 |     "    def make_batch(self,dataset):\n",
261 |     "        self.batch,self.batch_len = self.createBatches(dataset)\n",
262 |     "        return self.batch,self.batch_len\n",
263 |     "        \n",
264 |     "    def make_dataset(self,file_name):\n",
265 |     "        sentences = self.sentence_from_file(file_name)\n",
266 |     "        sentences = self.addCharInformatioin(sentences)\n",
267 |     "        return sentences\n",
268 |     "    \n",
269 |     "    def get_sentences(self,file_list):\n",
270 |     "        list_sentences = []\n",
271 |     "        for i in file_list:\n",
272 |     "            list_sentences+= self.make_dataset(i)\n",
273 |     "        return list_sentences\n",
274 |     "        "
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "preprocess_obj = Preprocessing()\n",
284 |     "train_sentences = preprocess_obj.get_sentences(load_data_obj.train_files)\n",
285 |     "word_emb,word_to_id = preprocess_obj.get_word_embeddings(train_sentences)\n",
286 |     "\n",
287 |     "'''the below function is not requred for validation data, we will load the dictionaries for validation'''\n",
288 |     "case_to_id,pos_to_id,char_to_id,label_to_id,id_to_label = preprocess_obj.get_feature_dict(train_sentences)\n",
289 |     "train_data_set = preprocess_obj.padding(preprocess_obj.create_tensors(train_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n",
290 |     "train_batch,train_batch_len = preprocess_obj.make_batch(train_data_set)"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": null,
296 |    "metadata": {},
297 |    "outputs": [],
298 |    "source": [
299 |     "class DesignModel():\n",
300 |     "    def __init__(self,params):\n",
301 |     "        self.model = None\n",
302 |     "        self.wd_em = word_emb\n",
303 |     "        self.caseEmbeddings = np.identity(len(case_to_id), dtype='float32')\n",
304 |     "        self.posEmbeddings = np.identity(len(pos_to_id), dtype='float32') \n",
305 |     "        self.ch_to_id = char_to_id\n",
306 |     "        self.lb_to_id = label_to_id\n",
307 |     "        self.params = params\n",
308 |     "        self.train_batch = train_batch\n",
309 |     "        self.train_batch_len = train_batch_len\n",
310 |     "\n",
311 |     "        \n",
312 |     "    def iterate_minibatches(self,dataset,batch_len): \n",
313 |     "        start = 0\n",
314 |     "        for i in batch_len:\n",
315 |     "            tokens = []\n",
316 |     "            char = []\n",
317 |     "            labels = []\n",
318 |     "            casing = []\n",
319 |     "            pos_tags = []\n",
320 |     "            data = dataset[start:i]\n",
321 |     "            start = i\n",
322 |     "            for dt in data:\n",
323 |     "                t,c,ch,pos,l = dt\n",
324 |     "                l = np.expand_dims(l,-1)\n",
325 |     "                tokens.append(t)\n",
326 |     "                char.append(ch)\n",
327 |     "                labels.append(l)\n",
328 |     "                casing.append(c)\n",
329 |     "                pos_tags.append(pos)\n",
330 |     "            yield np.asarray(labels),np.asarray(tokens),np.asarray(casing), np.asarray(char), np.asarray(pos_tags)\n",
331 |     "    \n",
332 |     "    def BiRNN_model(self):\n",
333 |     "    \n",
334 |     "        input = Input(shape=(None,),dtype='int32')\n",
335 |     "\n",
336 |     "        words = Embedding(input_dim=self.wd_em.shape[0], output_dim=self.wd_em.shape[1],  weights=[self.wd_em], trainable=False)(input)\n",
337 |     "\n",
338 |     "        csng_input = Input(shape=(None,), dtype='int32')\n",
339 |     "        csng = Embedding(output_dim = self.caseEmbeddings.shape[1], input_dim = self.caseEmbeddings.shape[0], weights = [self.caseEmbeddings], trainable=False)(csng_input)\n",
340 |     "\n",
341 |     "\n",
342 |     "        char_input=Input(shape=(None,52,))\n",
343 |     "        embed_char_out=TimeDistributed(Embedding(len(self.ch_to_id),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input)\n",
344 |     "        dropout= Dropout(self.params['dropout_rate'])(embed_char_out)\n",
345 |     "        conv1d_out = TimeDistributed(Conv1D(kernel_size=self.params['kernel_sizes_cnn'], filters=30, padding='same',activation=params['rnn_activation'], strides=1))(dropout)\n",
346 |     "        maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)\n",
347 |     "        char = TimeDistributed(Flatten())(maxpool_out)\n",
348 |     "        char = Dropout(self.params['dropout_rate'])(char)\n",
349 |     "\n",
350 |     "        pos_input = Input(shape=(None,), dtype='int32')\n",
351 |     "        pos = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n",
352 |     "\n",
353 |     "\n",
354 |     "        output = concatenate([words, csng, char, pos])\n",
355 |     "        output = Bidirectional(LSTM(self.params['units_lstm'], return_sequences=True, dropout=self.params['dropout_rate'], recurrent_dropout=0.25))(output)\n",
356 |     "        output = TimeDistributed(Dense(len(self.lb_to_id), activation=self.params['rnn_activation']))(output)\n",
357 |     "        self.model = Model(inputs=[input, csng_input, char_input, pos_input], outputs=[output])\n",
358 |     "        self.model.compile(loss=self.params['loss'], optimizer=self.params['optimizer'],metrics=[\"accuracy\"])\n",
359 |     "\n",
360 |     "    def train_model(self):\n",
361 |     "    \n",
362 |     "        for epoch in range(self.params['epochs']):\n",
363 |     "\n",
364 |     "            print(\"Epoch %d/%d\"%(epoch+1, self.params['epochs']))\n",
365 |     "            a = Progbar(len(preprocess_obj.batch_len))\n",
366 |     "            res = None\n",
367 |     "            for i,batch in enumerate(self.iterate_minibatches(self.train_batch,self.train_batch_len)):\n",
368 |     "                labels, tkns, csng, char, pos = batch       \n",
369 |     "                res = self.model.train_on_batch([tkns, csng, char, pos], labels)\n",
370 |     "                a.update(i)\n",
371 |     "            print(\"\\n\")\n",
372 |     "            print(self.model.metrics_names[0],\":\",res[0],self.model.metrics_names[1],\":\",res[1])\n",
373 |     "            print(' ')"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {},
380 |    "outputs": [],
381 |    "source": [
382 |     "params = {\n",
383 |     "            \"kernel_sizes_cnn\": 3,\n",
384 |     "            \"optimizer\": \"nadam\",\n",
385 |     "            \"cnn_activation\":\"tanh\",\n",
386 |     "            \"rnn_activation\":\"softmax\",\n",
387 |     "            \"units_lstm\" : 100,\n",
388 |     "            \"loss\": \"sparse_categorical_crossentropy\",\n",
389 |     "            \"text_size\": 50,\n",
390 |     "            \"dropout_rate\": 0.5,\n",
391 |     "            \"epochs\": 100,\n",
392 |     "            \"model_name\": \"cnn_model\",\n",
393 |     "            \"batch_size\": 32,\n",
394 |     "            \"verbose\": True,\n",
395 |     "            \"metrics\":[\"accuracy\"]\n",
396 |     "        }\n",
397 |     "model_obj = DesignModel(params)\n",
398 |     "model_obj.BiRNN_model()\n",
399 |     "model_obj.train_model()"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": null,
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": [
408 |     "class LoadAndSaveModels():\n",
409 |     "        \n",
410 |     "    def save_model(self,model,model_name):\n",
411 |     "        model.save(\"Model_Data/entity_models/\"+model_name+\".h5\")\n",
412 |     "        print(\"Model saved to Model folder.\")\n",
413 |     "        \n",
414 |     "    def save_dict(self, save_path,dictionaries):  \n",
415 |     "        \n",
416 |     "        for item in dictionaries:\n",
417 |     "            \n",
418 |     "            with open(save_path+\"/\"+item[1]+\".txt\", \"wb\") as myFile:\n",
419 |     "                pickle.dump(item[0], myFile)\n",
420 |     "\n",
421 |     "        print(\"Files saved.\")\n",
422 |     "        \n",
423 |     "    def load_dict(self,file):\n",
424 |     "        with open(file,\"rb\") as fp:\n",
425 |     "            dict = pickle.load(fp)\n",
426 |     "        return dict\n",
427 |     "    \n",
428 |     "    def load_model(self,model_name):\n",
429 |     "        model = load_model(model_name)\n",
430 |     "        return model\n"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": null,
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": [
439 |     "load_save = LoadAndSaveModels()\n",
440 |     "load_save.save_model(model_obj.model,\"birnn\")\n",
441 |     "dict = [(word_to_id,\"word_to_id\"),(label_to_id,\"label_to_id\"),(char_to_id,\"char_to_id\"),\n",
442 |     "        (id_to_label,\"id_to_label\"),(case_to_id,\"case_to_id\"),(pos_to_id,\"pos_to_id\")]\n",
443 |     "load_save.save_dict(\"Model_Data/dict\",dict)"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": null,
449 |    "metadata": {},
450 |    "outputs": [],
451 |    "source": [
452 |     "load_save = LoadAndSaveModels()\n",
453 |     "model = load_save.load_model(\"Model_Data/entity_models/birnn.h5\")\n",
454 |     "word_to_id = load_save.load_dict(\"Model_Data/dict/word_to_id.txt\")\n",
455 |     "case_to_id = load_save.load_dict(\"Model_Data/dict/case_to_id.txt\")\n",
456 |     "pos_to_id = load_save.load_dict(\"Model_Data/dict/pos_to_id.txt\")\n",
457 |     "char_to_id = load_save.load_dict(\"Model_Data/dict/char_to_id.txt\")\n",
458 |     "label_to_id = load_save.load_dict(\"Model_Data/dict/label_to_id.txt\")\n",
459 |     "id_to_label = load_save.load_dict(\"Model_Data/dict/id_to_label.txt\")"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": null,
465 |    "metadata": {},
466 |    "outputs": [],
467 |    "source": [
468 |     "validation_sentences = preprocess_obj.get_sentences(load_data_obj.validation_files)\n",
469 |     "validation_set = preprocess_obj.padding(preprocess_obj.create_tensors(validation_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n",
470 |     "validation_batch,validation_batch_len = preprocess_obj.make_batch(validation_set)\n"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": null,
476 |    "metadata": {},
477 |    "outputs": [],
478 |    "source": [
479 |     "class Prediction():\n",
480 |     "    def __init__(self):\n",
481 |     "        self.case_to_id = case_to_id\n",
482 |     "        self.pos_to_id = pos_to_id\n",
483 |     "        self.char_to_id = char_to_id\n",
484 |     "        self.label_to_id = label_to_id\n",
485 |     "        self.id_to_label = id_to_label\n",
486 |     "        self.word_to_id = word_to_id\n",
487 |     "    def prediction(self,dataset,model):\n",
488 |     "        correct_labels = []\n",
489 |     "        predict_labels = []\n",
490 |     "        b = Progbar(len(dataset))\n",
491 |     "        for i,data in enumerate(dataset):    \n",
492 |     "            tkns, csng, char,pos, labels = data\n",
493 |     "            tkns = np.asarray([tkns])     \n",
494 |     "            char = np.asarray([char])\n",
495 |     "            csng = np.asarray([csng])\n",
496 |     "            pos = np.asarray([pos])\n",
497 |     "            predict = model.predict([tkns, csng, char,pos], verbose=False)[0] \n",
498 |     "            predict = predict.argmax(axis=-1)        \n",
499 |     "            correct_labels.append(labels)\n",
500 |     "            predict_labels.append(predict)\n",
501 |     "            b.update(i)\n",
502 |     "        return predict_labels, correct_labels\n",
503 |     "    \n",
504 |     "    def predict(self,sentence,model):\n",
505 |     "        sen_list = [[[i,'POS','O\\n'] for i in sentence.split()]]\n",
506 |     "        test_sent = preprocess_obj.addCharInformatioin(sen_list)\n",
507 |     "\n",
508 |     "        predLabels = []\n",
509 |     "\n",
510 |     "        test_set = preprocess_obj.padding(preprocess_obj.create_tensors(test_sent,self.word_to_id,\n",
511 |     "                                                                        self.case_to_id,self.pos_to_id,\n",
512 |     "                                                                        self.char_to_id,self.label_to_id))\n",
513 |     "        test_batch,test_batch_len = preprocess_obj.createBatches(test_set)\n",
514 |     "        for i,data in enumerate(test_batch):\n",
515 |     "            tokens, csng, char, pos, labels = data\n",
516 |     "            tokens = np.asarray([tokens])     \n",
517 |     "            char = np.asarray([char])\n",
518 |     "            csng = np.asarray([csng])\n",
519 |     "            pos = np.asarray([pos])\n",
520 |     "            pred = model.predict([tokens,csng, char,pos], verbose=False)[0] \n",
521 |     "            pred = pred.argmax(axis=-1) #Predict the classes            \n",
522 |     "            predLabels.append(pred)\n",
523 |     "        entity_labels = []\n",
524 |     "        j = 0\n",
525 |     "        words_list = sentence.split()\n",
526 |     "        for i in predLabels[-1]:\n",
527 |     "            entity_labels.append((words_list[j],self.id_to_label[int(i)].replace(\"\\n\",\"\")))\n",
528 |     "            j+=1\n",
529 |     "\n",
530 |     "        return entity_labels"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "pred_obj = Prediction()"
540 |    ]
541 |   },
542 |   {
543 |    "cell_type": "code",
544 |    "execution_count": null,
545 |    "metadata": {},
546 |    "outputs": [],
547 |    "source": [
548 |     "sent = \"Add Richard McNamara newest song to the Just Smile playlist\"\n",
549 |     "entity_label = pred_obj.predict(sent,model)"
550 |    ]
551 |   },
552 |   {
553 |    "cell_type": "code",
554 |    "execution_count": null,
555 |    "metadata": {},
556 |    "outputs": [],
557 |    "source": [
558 |     "entity_label"
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": null,
564 |    "metadata": {},
565 |    "outputs": [],
566 |    "source": [
567 |     "class Evaluate():\n",
568 |     "    def compute_precision(self,guessed_sentences, correct_sentences):\n",
569 |     "        assert(len(guessed_sentences) == len(correct_sentences))\n",
570 |     "        correctCount = 0\n",
571 |     "        count = 0\n",
572 |     "\n",
573 |     "\n",
574 |     "        for sentenceIdx in range(len(guessed_sentences)):\n",
575 |     "            guessed = guessed_sentences[sentenceIdx]\n",
576 |     "            correct = correct_sentences[sentenceIdx]\n",
577 |     "            assert(len(guessed) == len(correct))\n",
578 |     "            idx = 0\n",
579 |     "            while idx < len(guessed):\n",
580 |     "                if guessed[idx][0] == 'B': #A new chunk starts\n",
581 |     "                    count += 1\n",
582 |     "\n",
583 |     "                    if guessed[idx] == correct[idx]:\n",
584 |     "                        idx += 1\n",
585 |     "                        correctlyFound = True\n",
586 |     "\n",
587 |     "                        while idx < len(guessed) and guessed[idx][0] == 'I': #Scan until it no longer starts with I\n",
588 |     "                            if guessed[idx] != correct[idx]:\n",
589 |     "                                correctlyFound = False\n",
590 |     "\n",
591 |     "                            idx += 1\n",
592 |     "\n",
593 |     "                        if idx < len(guessed):\n",
594 |     "                            if correct[idx][0] == 'I': #The chunk in correct was longer\n",
595 |     "                                correctlyFound = False\n",
596 |     "\n",
597 |     "\n",
598 |     "                        if correctlyFound:\n",
599 |     "                            correctCount += 1\n",
600 |     "                    else:\n",
601 |     "                        idx += 1\n",
602 |     "                else:  \n",
603 |     "                    idx += 1\n",
604 |     "\n",
605 |     "        precision = 0\n",
606 |     "        if count > 0:    \n",
607 |     "            precision = float(correctCount) / count\n",
608 |     "\n",
609 |     "        return precision\n",
610 |     "    def get_metrics(self,predictions, correct, idx2Label): \n",
611 |     "        label_pred = []    \n",
612 |     "        for sentence in predictions:\n",
613 |     "            label_pred.append([idx2Label[element] for element in sentence])\n",
614 |     "\n",
615 |     "        label_correct = []    \n",
616 |     "        for sentence in correct:\n",
617 |     "            label_correct.append([idx2Label[element] for element in sentence])\n",
618 |     "\n",
619 |     "\n",
620 |     "        #print label_pred\n",
621 |     "        #print label_correct\n",
622 |     "\n",
623 |     "        prec = self.compute_precision(label_pred, label_correct)\n",
624 |     "        rec = self.compute_precision(label_correct, label_pred)\n",
625 |     "\n",
626 |     "        f1 = 0\n",
627 |     "        if (rec+prec) > 0:\n",
628 |     "            f1 = 2.0 * prec * rec / (prec + rec);\n",
629 |     "\n",
630 |     "        return prec, rec, f1"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": null,
636 |    "metadata": {},
637 |    "outputs": [],
638 |    "source": [
639 |     "eval_obj = Evaluate()\n",
640 |     "\n",
641 |     "train_predict_labels, train_correct_labels = pred_obj.prediction(train_data_set,model)\n",
642 |     "pre_train, rec_train, f1_train= eval_obj.get_metrics(train_predict_labels, train_correct_labels, id_to_label)\n",
643 |     "print(\"Train-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_train, rec_train, f1_train))\n",
644 |     "     \n",
645 |     "validation_predict_labels, validation_correct_labels = pred_obj.prediction(validation_set,model)\n",
646 |     "pre_test, rec_test, f1_test= eval_obj.get_metrics(validation_predict_labels, validation_correct_labels, id_to_label)\n",
647 |     "print(\"Validation-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_test, rec_test, f1_test))\n"
648 |    ]
649 |   },
650 |   {
651 |    "cell_type": "code",
652 |    "execution_count": null,
653 |    "metadata": {},
654 |    "outputs": [],
655 |    "source": []
656 |   }
657 |  ],
658 |  "metadata": {
659 |   "kernelspec": {
660 |    "display_name": "Python 3",
661 |    "language": "python",
662 |    "name": "python3"
663 |   },
664 |   "language_info": {
665 |    "codemirror_mode": {
666 |     "name": "ipython",
667 |     "version": 3
668 |    },
669 |    "file_extension": ".py",
670 |    "mimetype": "text/x-python",
671 |    "name": "python",
672 |    "nbconvert_exporter": "python",
673 |    "pygments_lexer": "ipython3",
674 |    "version": "3.8.2"
675 |   }
676 |  },
677 |  "nbformat": 4,
678 |  "nbformat_minor": 4
679 | }
680 | 


--------------------------------------------------------------------------------
/1.5-named-entity-recognition/simple_ner-2.0.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from numpy import array\n",
 10 |     "import tensorflow as tf\n",
 11 |     "import glob\n",
 12 |     "import numpy as np\n",
 13 |     "import pickle\n",
 14 |     "from datetime import datetime\n",
 15 |     "import nltk \n",
 16 |     "\n",
 17 |     "from sklearn.metrics import accuracy_score\n",
 18 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 19 |     "from tensorflow.keras.utils import to_categorical\n",
 20 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 21 |     "from tensorflow.keras.models import Sequential,Model\n",
 22 |     "from tensorflow.keras.layers import Dense\n",
 23 |     "from tensorflow.keras.layers import LSTM\n",
 24 |     "from tensorflow.keras.layers import Input\n",
 25 |     "from tensorflow.keras.layers import Dropout\n",
 26 |     "from tensorflow.keras.layers import Embedding\n",
 27 |     "from tensorflow.keras.layers import TimeDistributed\n",
 28 |     "from tensorflow.keras.layers import Conv1D\n",
 29 |     "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
 30 |     "from tensorflow.keras.utils import Progbar\n",
 31 |     "from tensorflow.keras.models import load_model\n",
 32 |     "\n",
 33 |     "from tensorflow.keras.initializers import RandomUniform\n"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "class LoadData():\n",
 43 |     "    def __init__(self):\n",
 44 |     "        self.train_files = None\n",
 45 |     "        self.validation_files = None\n",
 46 |     "        \n",
 47 |     "    def get_data(self):\n",
 48 |     "        self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
 49 |     "        self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n",
 50 |     "        \n",
 51 |     "    def sentence_from_file(self,filename):\n",
 52 |     "        single_data_list = list()\n",
 53 |     "        with open(filename) as fp:\n",
 54 |     "            sentence_list = []\n",
 55 |     "            lines = fp.readlines()\n",
 56 |     "            for line in lines:\n",
 57 |     "                splits = line.split(' ')\n",
 58 |     "                if splits[0]=='\\n':\n",
 59 |     "                    #sent = \" \".join([word[0] for word in sentence_list])\n",
 60 |     "                    #single_data_list.append((sentence_list,sent))\n",
 61 |     "                    single_data_list.append(sentence_list)\n",
 62 |     "                    sentence_list = list()\n",
 63 |     "                else:\n",
 64 |     "                    sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n",
 65 |     "                \n",
 66 |     "        return single_data_list\n",
 67 |     "    \n",
 68 |     "    def addCharInformatioin(self,Sentences):\n",
 69 |     "        for i,sentence in enumerate(Sentences):\n",
 70 |     "            for j,data in enumerate(sentence):\n",
 71 |     "                chars = [c for c in data[0]]\n",
 72 |     "                Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
 73 |     "        return Sentences\n",
 74 |     "    \n",
 75 |     "    def prepared_data(self,files):\n",
 76 |     "        list_sentences = list()\n",
 77 |     "        for each_file in files:\n",
 78 |     "            sentences = self.sentence_from_file(each_file)\n",
 79 |     "            #sentences = self.addCharInformatioin(sentences)\n",
 80 |     "            list_sentences+= sentences\n",
 81 |     "        return list_sentences\n",
 82 |     "    "
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "load_data_obj = LoadData()\n",
 92 |     "load_data_obj.get_data()\n",
 93 |     "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n",
 94 |     "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n",
 95 |     "print(trained_sen_list[:5])"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "class Preprocessing():\n",
105 |     "    def __init__(self):\n",
106 |     "        self.max_len = len(max(trained_sen_list))\n",
107 |     "        \n",
108 |     "    def make_data(self,data_list):\n",
109 |     "        \n",
110 |     "        \n",
111 |     "        words = list()\n",
112 |     "        for each_sent in data_list:\n",
113 |     "            for each_item in each_sent:\n",
114 |     "                words.append(each_item[0])\n",
115 |     "        words = list(set(words))\n",
116 |     "\n",
117 |     "        \n",
118 |     "        pos_tags = list()\n",
119 |     "        for each_sent in data_list:\n",
120 |     "            for each_item in each_sent:\n",
121 |     "                pos_tags.append(each_item[1])\n",
122 |     "        pos_tags = list(set(pos_tags))\n",
123 |     "        \n",
124 |     "        labels = list()\n",
125 |     "        for each_sent in data_list:\n",
126 |     "            for each_item in each_sent:\n",
127 |     "                labels.append(each_item[2])\n",
128 |     "        labels = list(set(labels))\n",
129 |     "        \n",
130 |     "        \n",
131 |     "        self.word2idx = {w: i for i, w in enumerate(words)}\n",
132 |     "        self.word2idx.update({\"PAD\": len(self.word2idx), \"UNK\": len(self.word2idx)+1})\n",
133 |     "        self.num_words = len(self.word2idx)\n",
134 |     "        \n",
135 |     "        self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n",
136 |     "        self.pos_tag2idx.update({\"PAD\": len(self.pos_tag2idx), \"UNK\": len(self.pos_tag2idx)+1})\n",
137 |     "        self.num_pos_tags = len(self.pos_tag2idx)\n",
138 |     "        \n",
139 |     "        self.label2idx = {t: i for i, t in enumerate(labels)}\n",
140 |     "        self.num_lables = len(self.label2idx)\n",
141 |     "        \n",
142 |     "    def word2features(self,data, word_dict):\n",
143 |     "        word = data[0]\n",
144 |     "        postag = data[1]\n",
145 |     "        binary_map = {True:0,False:1,None:2}\n",
146 |     "        features = [word_dict[word],binary_map[word.islower()], \n",
147 |     "                    binary_map[word.isupper()], binary_map[word.istitle()], \n",
148 |     "                    binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n",
149 |     "        return features\n",
150 |     "\n",
151 |     "\n",
152 |     "    def sent2features(self,sent,word_dict):\n",
153 |     "        sentence_features = list()\n",
154 |     "        for index in range(len(sent)):\n",
155 |     "            sentence_features.append(self.word2features(sent[index],word_dict))\n",
156 |     "                       \n",
157 |     "        return sentence_features\n",
158 |     "\n",
159 |     "    def sent2labels(self,sent):\n",
160 |     "        return [label for token, postag, label in sent]\n",
161 |     "\n",
162 |     "    def sent2tokens(self,sent):\n",
163 |     "        return [token for token, postag, label in sent]\n",
164 |     "    \n",
165 |     "    def create_data(self,data_list):\n",
166 |     "        self.sentences = data_list\n",
167 |     "        maxlen = max([len(item) for item in data_list])\n",
168 |     "        self.max_len = maxlen\n",
169 |     "        wd = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n",
170 |     "        \n",
171 |     "        wd = pad_sequences(maxlen=maxlen, sequences=wd, padding=\"post\",value=self.word2idx[\"PAD\"])\n",
172 |     "        \n",
173 |     "        pos = [[self.pos_tag2idx[w[1]] for w in s] for s in self.sentences]\n",
174 |     "        pos = pad_sequences(maxlen=maxlen, sequences=pos, padding=\"post\",value=self.pos_tag2idx[\"PAD\"])\n",
175 |     "\n",
176 |     "        y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n",
177 |     "        y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n",
178 |     "        return (wd,pos),y"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {},
185 |    "outputs": [],
186 |    "source": [
187 |     "preprocess_obj = Preprocessing()\n",
188 |     "preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n",
189 |     "x_train,y_train = preprocess_obj.create_data(trained_sen_list)"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "class MyCallback(tf.keras.callbacks.Callback):\n",
199 |     "    def __init__(self, monitor='acc', baseline=0.95):\n",
200 |     "        self.monitor = monitor\n",
201 |     "        self.baseline = baseline\n",
202 |     "        self.training_stop = False\n",
203 |     "\n",
204 |     "    def on_train_begin(self, logs={}):\n",
205 |     "        self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n",
206 |     "\n",
207 |     "    def on_epoch_end(self, epoch, logs={}):\n",
208 |     "        if logs and logs.get(self.monitor) >= self.baseline:\n",
209 |     "            print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n",
210 |     "            self.training_stop = True\n",
211 |     "        \n",
212 |     "        if self.training_stop:    \n",
213 |     "            self.model.stop_training = True\n",
214 |     "\n",
215 |     "\n",
216 |     "class CreateModel():\n",
217 |     "    def __init__(self):\n",
218 |     "        self.model = None\n",
219 |     "        self.history = None\n",
220 |     "        self.x_train = x_train\n",
221 |     "        self.y_train = y_train\n",
222 |     "        self.max_len = preprocess_obj.max_len\n",
223 |     "        self.num_words = preprocess_obj.num_words\n",
224 |     "        self.num_labels = preprocess_obj.num_lables\n",
225 |     "        self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n",
226 |     "        \n",
227 |     "    def train(self):\n",
228 |     "        word_input = Input(shape=(self.max_len,))\n",
229 |     "        word_model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n",
230 |     "        \n",
231 |     "        pos_input = Input(shape=(None,), dtype='int32')\n",
232 |     "        pos_model = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n",
233 |     "\n",
234 |     "        output = concatenate([word_model, pos_model])\n",
235 |     "        \n",
236 |     "        output = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(output)\n",
237 |     "        output = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(output)\n",
238 |     "        \n",
239 |     "        self.model = Model(inputs=[word_input, pos_input], outputs=[output])\n",
240 |     "        self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n",
241 |     "        \n",
242 |     "    def run(self,batch_size=32,epoch=5):\n",
243 |     "        logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
244 |     "        logdir = \"logs_tensorboard\"\n",
245 |     "        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n",
246 |     "        \n",
247 |     "        val_acc = 0.99\n",
248 |     "        monitor_param = 'val_acc'\n",
249 |     "        \n",
250 |     "        checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n",
251 |     "        \n",
252 |     "        #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n",
253 |     "        self.history = self.model.fit(self.x_train, self.y_train,\n",
254 |     "                                     batch_size=batch_size, epochs=epoch,\n",
255 |     "                                     validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n",
256 |     "                                     verbose=1)\n",
257 |     "    def save_model(self,model_file):\n",
258 |     "        self.model.save(model_file)"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "model_obj = CreateModel()\n",
268 |     "model_obj.train()\n",
269 |     "model_obj.run(batch_size=32,epoch=100)\n",
270 |     "model_obj.save_model(\"models/simple_ner_model_v2.h5\")"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "class Prediction():\n",
280 |     "    def __init__(self):\n",
281 |     "        self.word2idx = preprocess_obj.word2idx\n",
282 |     "        self.pos_tag2idx = preprocess_obj.pos_tag2idx\n",
283 |     "        self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n",
284 |     "        self.model = model_obj.model\n",
285 |     "        self.max_len = preprocess_obj.max_len\n",
286 |     "    def predict(self,texts):\n",
287 |     "        label_lists = list()\n",
288 |     "        for text in texts:\n",
289 |     "            words = text.split()\n",
290 |     "            tagged = nltk.pos_tag(words) \n",
291 |     "            \n",
292 |     "            wd = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n",
293 |     "            wd = pad_sequences(maxlen=self.max_len, sequences=wd,\n",
294 |     "                          padding=\"post\", value=self.word2idx[\"PAD\"])\n",
295 |     "            \n",
296 |     "            pos = [[self.pos_tag2idx.get(item, self.pos_tag2idx[\"UNK\"]) for item in tagged]]\n",
297 |     "            pos = pad_sequences(maxlen=self.max_len, sequences=pos,\n",
298 |     "                          padding=\"post\", value=self.pos_tag2idx[\"PAD\"])\n",
299 |     "            \n",
300 |     "            y_pred = self.model.predict([wd,pos])\n",
301 |     "            pred_index = np.argmax(y_pred, axis=-1)\n",
302 |     "            preds = pred_index.flatten().tolist()\n",
303 |     "            labels = [self.idx2label[ind] for ind in preds]\n",
304 |     "            label_lists.append(labels)\n",
305 |     "            \n",
306 |     "            print([(words[idx],labels[idx]) for idx in range(len(words))])\n",
307 |     "        return label_lists\n",
308 |     "    "
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {},
315 |    "outputs": [],
316 |    "source": [
317 |     "#print(preprocess_obj.word2idx)\n",
318 |     "pred_obj = Prediction()\n",
319 |     "text = \"Play the last track from Beyonce off Spotify\"\n",
320 |     "y_pred = pred_obj.predict([text])"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": null,
326 |    "metadata": {},
327 |    "outputs": [],
328 |    "source": []
329 |   }
330 |  ],
331 |  "metadata": {
332 |   "kernelspec": {
333 |    "display_name": "Python 3",
334 |    "language": "python",
335 |    "name": "python3"
336 |   },
337 |   "language_info": {
338 |    "codemirror_mode": {
339 |     "name": "ipython",
340 |     "version": 3
341 |    },
342 |    "file_extension": ".py",
343 |    "mimetype": "text/x-python",
344 |    "name": "python",
345 |    "nbconvert_exporter": "python",
346 |    "pygments_lexer": "ipython3",
347 |    "version": "3.6.9"
348 |   }
349 |  },
350 |  "nbformat": 4,
351 |  "nbformat_minor": 2
352 | }
353 | 


--------------------------------------------------------------------------------
/1.5-named-entity-recognition/simple_ner.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from numpy import array\n",
 10 |     "import tensorflow as tf\n",
 11 |     "import glob\n",
 12 |     "import numpy as np\n",
 13 |     "import pickle\n",
 14 |     "from datetime import datetime\n",
 15 |     "\n",
 16 |     "from sklearn.metrics import accuracy_score\n",
 17 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 18 |     "from tensorflow.keras.utils import to_categorical\n",
 19 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 20 |     "from tensorflow.keras.models import Sequential,Model\n",
 21 |     "from tensorflow.keras.layers import Dense\n",
 22 |     "from tensorflow.keras.layers import LSTM\n",
 23 |     "from tensorflow.keras.layers import Input\n",
 24 |     "from tensorflow.keras.layers import Dropout\n",
 25 |     "from tensorflow.keras.layers import Embedding\n",
 26 |     "from tensorflow.keras.layers import TimeDistributed\n",
 27 |     "from tensorflow.keras.layers import Conv1D\n",
 28 |     "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
 29 |     "from tensorflow.keras.utils import Progbar\n",
 30 |     "from tensorflow.keras.models import load_model\n",
 31 |     "\n",
 32 |     "from tensorflow.keras.initializers import RandomUniform\n"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "class LoadData():\n",
 42 |     "    def __init__(self):\n",
 43 |     "        self.train_files = None\n",
 44 |     "        self.validation_files = None\n",
 45 |     "        \n",
 46 |     "    def get_data(self):\n",
 47 |     "        self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
 48 |     "        self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n",
 49 |     "        \n",
 50 |     "    def sentence_from_file(self,filename):\n",
 51 |     "        single_data_list = list()\n",
 52 |     "        with open(filename) as fp:\n",
 53 |     "            sentence_list = []\n",
 54 |     "            lines = fp.readlines()\n",
 55 |     "            for line in lines:\n",
 56 |     "                splits = line.split(' ')\n",
 57 |     "                if splits[0]=='\\n':\n",
 58 |     "                    #sent = \" \".join([word[0] for word in sentence_list])\n",
 59 |     "                    #single_data_list.append((sentence_list,sent))\n",
 60 |     "                    single_data_list.append(sentence_list)\n",
 61 |     "                    sentence_list = list()\n",
 62 |     "                else:\n",
 63 |     "                    sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n",
 64 |     "                \n",
 65 |     "        return single_data_list\n",
 66 |     "    \n",
 67 |     "    def addCharInformatioin(self,Sentences):\n",
 68 |     "        for i,sentence in enumerate(Sentences):\n",
 69 |     "            for j,data in enumerate(sentence):\n",
 70 |     "                chars = [c for c in data[0]]\n",
 71 |     "                Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
 72 |     "        return Sentences\n",
 73 |     "    \n",
 74 |     "    def prepared_data(self,files):\n",
 75 |     "        list_sentences = list()\n",
 76 |     "        for each_file in files:\n",
 77 |     "            sentences = self.sentence_from_file(each_file)\n",
 78 |     "            #sentences = self.addCharInformatioin(sentences)\n",
 79 |     "            list_sentences+= sentences\n",
 80 |     "        return list_sentences\n",
 81 |     "    "
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "load_data_obj = LoadData()\n",
 91 |     "load_data_obj.get_data()\n",
 92 |     "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n",
 93 |     "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n",
 94 |     "print(trained_sen_list[:5])"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "class Preprocessing():\n",
104 |     "    def __init__(self):\n",
105 |     "        self.max_len = len(max(trained_sen_list))\n",
106 |     "        \n",
107 |     "    def make_data(self,data_list):\n",
108 |     "        \n",
109 |     "        \n",
110 |     "        words = list()\n",
111 |     "        for each_sent in data_list:\n",
112 |     "            for each_item in each_sent:\n",
113 |     "                words.append(each_item[0])\n",
114 |     "        words = list(set(words))\n",
115 |     "\n",
116 |     "        \n",
117 |     "        pos_tags = list()\n",
118 |     "        for each_sent in data_list:\n",
119 |     "            for each_item in each_sent:\n",
120 |     "                pos_tags.append(each_item[1])\n",
121 |     "        pos_tags = list(set(pos_tags))\n",
122 |     "        \n",
123 |     "        labels = list()\n",
124 |     "        for each_sent in data_list:\n",
125 |     "            for each_item in each_sent:\n",
126 |     "                labels.append(each_item[2])\n",
127 |     "        labels = list(set(labels))\n",
128 |     "        \n",
129 |     "        self.word2idx = {\"PAD\": 0, \"UNK\": 1}\n",
130 |     "        self.word2idx.update({w: i for i, w in enumerate(words)})\n",
131 |     "        self.num_words = len(self.word2idx)\n",
132 |     "        \n",
133 |     "        self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n",
134 |     "        self.num_pos_tags = len(self.pos_tag2idx)\n",
135 |     "        \n",
136 |     "        self.label2idx = {t: i for i, t in enumerate(labels)}\n",
137 |     "        self.num_lables = len(self.label2idx)\n",
138 |     "        \n",
139 |     "    def word2features(self,data, word_dict):\n",
140 |     "        word = data[0]\n",
141 |     "        postag = data[1]\n",
142 |     "        binary_map = {True:0,False:1,None:2}\n",
143 |     "        features = [word_dict[word],binary_map[word.islower()], \n",
144 |     "                    binary_map[word.isupper()], binary_map[word.istitle()], \n",
145 |     "                    binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n",
146 |     "        return features\n",
147 |     "\n",
148 |     "\n",
149 |     "    def sent2features(self,sent,word_dict):\n",
150 |     "        sentence_features = list()\n",
151 |     "        for index in range(len(sent)):\n",
152 |     "            sentence_features.append(self.word2features(sent[index],word_dict))\n",
153 |     "                       \n",
154 |     "        return sentence_features\n",
155 |     "\n",
156 |     "    def sent2labels(self,sent):\n",
157 |     "        return [label for token, postag, label in sent]\n",
158 |     "\n",
159 |     "    def sent2tokens(self,sent):\n",
160 |     "        return [token for token, postag, label in sent]\n",
161 |     "    \n",
162 |     "    def create_data(self,data_list):\n",
163 |     "        self.sentences = data_list\n",
164 |     "        maxlen = max([len(item) for item in data_list])\n",
165 |     "        self.max_len = maxlen\n",
166 |     "        x = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n",
167 |     "        #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.num_words - 1)\n",
168 |     "        x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.word2idx[\"PAD\"])\n",
169 |     "        #x = [self.sent2features(s,self.word2idx) for s in self.sentences]\n",
170 |     "        #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=[0,2,2,2,2,len(self.pos_tag2idx)])\n",
171 |     "        print(x[2])\n",
172 |     "        y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n",
173 |     "        y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n",
174 |     "        return x,y"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "preprocess_obj = Preprocessing()\n",
184 |     "preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n",
185 |     "x_train,y_train = preprocess_obj.create_data(trained_sen_list)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "class MyCallback(tf.keras.callbacks.Callback):\n",
195 |     "    def __init__(self, monitor='acc', baseline=0.95):\n",
196 |     "        self.monitor = monitor\n",
197 |     "        self.baseline = baseline\n",
198 |     "        self.training_stop = False\n",
199 |     "\n",
200 |     "    def on_train_begin(self, logs={}):\n",
201 |     "        self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n",
202 |     "\n",
203 |     "    def on_epoch_end(self, epoch, logs={}):\n",
204 |     "        if logs and logs.get(self.monitor) >= self.baseline:\n",
205 |     "            print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n",
206 |     "            self.training_stop = True\n",
207 |     "        \n",
208 |     "        if self.training_stop:    \n",
209 |     "            self.model.stop_training = True\n",
210 |     "\n",
211 |     "\n",
212 |     "class CreateModel():\n",
213 |     "    def __init__(self):\n",
214 |     "        self.model = None\n",
215 |     "        self.history = None\n",
216 |     "        self.x_train = x_train\n",
217 |     "        self.y_train = y_train\n",
218 |     "        self.max_len = preprocess_obj.max_len\n",
219 |     "        self.num_words = preprocess_obj.num_words\n",
220 |     "        self.num_labels = preprocess_obj.num_lables\n",
221 |     "        self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n",
222 |     "        \n",
223 |     "    def train(self):\n",
224 |     "        word_input = Input(shape=(self.max_len,))\n",
225 |     "        model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n",
226 |     "        model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)\n",
227 |     "        out = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(model)\n",
228 |     "        \n",
229 |     "        self.model = Model(word_input,out)\n",
230 |     "        self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n",
231 |     "        \n",
232 |     "    def run(self,batch_size=32,epoch=5):\n",
233 |     "        logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
234 |     "        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n",
235 |     "        \n",
236 |     "        val_acc = 0.99\n",
237 |     "        monitor_param = 'val_acc'\n",
238 |     "        \n",
239 |     "        checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n",
240 |     "        \n",
241 |     "        #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n",
242 |     "        self.history = self.model.fit(self.x_train, self.y_train,\n",
243 |     "                                     batch_size=batch_size, epochs=epoch,\n",
244 |     "                                     validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n",
245 |     "                                     verbose=1)\n",
246 |     "    def save_model(self,model_file):\n",
247 |     "        self.model.save(model_file)"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "model_obj = CreateModel()\n",
257 |     "model_obj.train()\n",
258 |     "model_obj.run(batch_size=32,epoch=100)\n",
259 |     "model_obj.save_model(\"models/simple_ner_model.h5\")"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "class Prediction():\n",
269 |     "    def __init__(self):\n",
270 |     "        self.word2idx = preprocess_obj.word2idx\n",
271 |     "        self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n",
272 |     "        self.model = model_obj.model\n",
273 |     "        self.max_len = preprocess_obj.max_len\n",
274 |     "    def predict(self,texts):\n",
275 |     "        label_lists = list()\n",
276 |     "        for text in texts:\n",
277 |     "            words = text.split()\n",
278 |     "            x = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n",
279 |     "            x = pad_sequences(maxlen=self.max_len, sequences=x,\n",
280 |     "                          padding=\"post\", value=self.word2idx[\"PAD\"])\n",
281 |     "            y_pred = self.model.predict(x)\n",
282 |     "            print(\"Predicted Probabilities on Test Set:\\n\",y_pred.shape)\n",
283 |     "            # taking tag class with maximum probability\n",
284 |     "            pred_index = np.argmax(y_pred, axis=-1)\n",
285 |     "            print(\"Predicted tag indices: \\n\",pred_index.shape)\n",
286 |     "            preds = pred_index.flatten().tolist()\n",
287 |     "            labels = [self.idx2label[ind] for ind in preds]\n",
288 |     "            label_lists.append(labels)\n",
289 |     "            \n",
290 |     "            print([(words[idx],labels[idx]) for idx in range(len(words))])\n",
291 |     "            #print(labels)\n",
292 |     "        return label_lists\n",
293 |     "    "
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "#print(preprocess_obj.word2idx)\n",
303 |     "pred_obj = Prediction()\n",
304 |     "'''\n",
305 |     "for item in validation_sen_list:\n",
306 |     "    sent = \" \".join([self.word2idx[w[0]] for w in s] for item in self.sentences])\n",
307 |     "    \n",
308 |     "'''\n",
309 |     "text = \"Play the last track from Beyoncé off Spotify\"\n",
310 |     "y_pred = pred_obj.predict([text,text])"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": null,
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": []
319 |   }
320 |  ],
321 |  "metadata": {
322 |   "kernelspec": {
323 |    "display_name": "Python 3",
324 |    "language": "python",
325 |    "name": "python3"
326 |   },
327 |   "language_info": {
328 |    "codemirror_mode": {
329 |     "name": "ipython",
330 |     "version": 3
331 |    },
332 |    "file_extension": ".py",
333 |    "mimetype": "text/x-python",
334 |    "name": "python",
335 |    "nbconvert_exporter": "python",
336 |    "pygments_lexer": "ipython3",
337 |    "version": "3.6.9"
338 |   }
339 |  },
340 |  "nbformat": 4,
341 |  "nbformat_minor": 2
342 | }
343 | 


--------------------------------------------------------------------------------
/1.6-intent-classification/README.md:
--------------------------------------------------------------------------------
1 | Use the below link to get the data.
2 | https://www.kaggle.com/joydeb28/nlp-benchmarking-data-for-intent-and-entity
3 | 


--------------------------------------------------------------------------------
/1.6-intent-classification/intent_classfication_bert_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
  8 |     "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import numpy as np # linear algebra\n",
 13 |     "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
 14 |     "import json\n",
 15 |     "import os\n",
 16 |     "from sklearn.metrics import roc_curve\n",
 17 |     "from sklearn.metrics import accuracy_score\n",
 18 |     "from sklearn.model_selection import train_test_split\n",
 19 |     "from tensorflow.keras.utils import to_categorical\n",
 20 |     "from tensorflow.keras.models import Sequential, Model\n",
 21 |     "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n",
 22 |     "from tensorflow.keras.optimizers import Adam\n",
 23 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 24 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 25 |     "import bert\n",
 26 |     "from tqdm import tqdm\n",
 27 |     "from tensorflow.keras import backend as K\n",
 28 |     "import tensorflow as tf\n",
 29 |     "import tensorflow_hub as hub\n",
 30 |     "print(\"TensorFlow Version:\",tf.__version__)\n",
 31 |     "print(\"Hub version: \",hub.__version__)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {
 38 |     "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
 39 |     "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "class LoadingData():\n",
 44 |     "            \n",
 45 |     "    def __init__(self):\n",
 46 |     "        train_file_path = os.path.join(\"benchmarking_data\",\"Train\")\n",
 47 |     "        validation_file_path = os.path.join(\"benchmarking_data\",\"Validate\")\n",
 48 |     "        category_id = 0\n",
 49 |     "        self.cat_to_intent = {}\n",
 50 |     "        self.intent_to_cat = {}\n",
 51 |     "        \n",
 52 |     "        for dirname, _, filenames in os.walk(train_file_path):\n",
 53 |     "            for filename in filenames:\n",
 54 |     "                file_path = os.path.join(dirname, filename)\n",
 55 |     "                intent_id = filename.replace(\".json\",\"\")\n",
 56 |     "                self.cat_to_intent[category_id] = intent_id\n",
 57 |     "                self.intent_to_cat[intent_id] = category_id\n",
 58 |     "                category_id+=1\n",
 59 |     "        print(self.cat_to_intent)\n",
 60 |     "        print(self.intent_to_cat)\n",
 61 |     "        '''Training data'''\n",
 62 |     "        training_data = list() \n",
 63 |     "        for dirname, _, filenames in os.walk(train_file_path):\n",
 64 |     "            for filename in filenames:\n",
 65 |     "                file_path = os.path.join(dirname, filename)\n",
 66 |     "                intent_id = filename.replace(\".json\",\"\")\n",
 67 |     "                training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n",
 68 |     "        self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category'])   \n",
 69 |     "        \n",
 70 |     "        self.train_data_frame = self.train_data_frame.sample(frac = 1)\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "        \n",
 74 |     "        '''Validation data'''\n",
 75 |     "        validation_data = list()    \n",
 76 |     "        for dirname, _, filenames in os.walk(validation_file_path):\n",
 77 |     "            for filename in filenames:\n",
 78 |     "                file_path = os.path.join(dirname, filename)\n",
 79 |     "                intent_id = filename.replace(\".json\",\"\")\n",
 80 |     "                validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])                \n",
 81 |     "        self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n",
 82 |     "\n",
 83 |     "        self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n",
 84 |     "        \n",
 85 |     "        \n",
 86 |     "    def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n",
 87 |     "        json_d = json.load(open(json_file))         \n",
 88 |     "        \n",
 89 |     "        json_dict = json_d[intent_id]\n",
 90 |     "\n",
 91 |     "        sent_list = list()\n",
 92 |     "        for i in json_dict:\n",
 93 |     "            each_list = i['data']\n",
 94 |     "            sent =\"\"\n",
 95 |     "            for i in each_list:\n",
 96 |     "                sent = sent + i['text']+ \" \"\n",
 97 |     "            sent =sent[:-1]\n",
 98 |     "            for i in range(3):\n",
 99 |     "                sent = sent.replace(\"  \",\" \")\n",
100 |     "            sent_list.append((sent,intent_id,cat))\n",
101 |     "        return sent_list\n",
102 |     "            "
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "load_data_obj = LoadingData()"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "load_data_obj.train_data_frame.head()"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "load_data_obj.validation_data_frame.head().values"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "class BertModel(object):\n",
139 |     "    \n",
140 |     "    def __init__(self):\n",
141 |     "        \n",
142 |     "        self.max_len = 128\n",
143 |     "        bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
144 |     "        FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
145 |     "        \n",
146 |     "        self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
147 |     "\n",
148 |     "        self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
149 |     "\n",
150 |     "        self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
151 |     "\n",
152 |     "        self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
153 |     "        \n",
154 |     "    def get_masks(self,tokens, max_seq_length):\n",
155 |     "        return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
156 |     "\n",
157 |     "    def get_segments(self,tokens, max_seq_length):\n",
158 |     "        \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n",
159 |     "        segments = []\n",
160 |     "        current_segment_id = 0\n",
161 |     "        for token in tokens:\n",
162 |     "            segments.append(current_segment_id)\n",
163 |     "            if token == \"[SEP]\":\n",
164 |     "                current_segment_id = 1\n",
165 |     "        return segments + [0] * (max_seq_length - len(tokens))\n",
166 |     "    \n",
167 |     "    def get_ids(self,tokens, tokenizer, max_seq_length):\n",
168 |     "        \"\"\"Token ids from Tokenizer vocab\"\"\"\n",
169 |     "        token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
170 |     "        input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n",
171 |     "        return input_ids\n",
172 |     "    def create_single_input(self,sentence,maxlen):\n",
173 |     "\n",
174 |     "        stokens = self.tokenizer.tokenize(sentence)\n",
175 |     "\n",
176 |     "        stokens = stokens[:maxlen]\n",
177 |     "\n",
178 |     "        stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n",
179 |     "\n",
180 |     "        ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n",
181 |     "        masks = self.get_masks(stokens, self.max_len)\n",
182 |     "        segments = self.get_segments(stokens, self.max_len)\n",
183 |     "\n",
184 |     "        return ids,masks,segments\n",
185 |     "\n",
186 |     "    def create_input_array(self,sentences):\n",
187 |     "        \n",
188 |     "        input_ids, input_masks, input_segments = [], [], []\n",
189 |     "\n",
190 |     "        for sentence in tqdm(sentences,position=0, leave=True):\n",
191 |     "            ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n",
192 |     "\n",
193 |     "            input_ids.append(ids)\n",
194 |     "            input_masks.append(masks)\n",
195 |     "            input_segments.append(segments)\n",
196 |     "            \n",
197 |     "        tensor = [np.asarray(input_ids, dtype=np.int32), \n",
198 |     "                np.asarray(input_masks, dtype=np.int32), \n",
199 |     "                np.asarray(input_segments, dtype=np.int32)]\n",
200 |     "        return tensor"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "class PreprocessingBertData():\n",
210 |     "    \n",
211 |     "    def prepare_data_x(self,train_sentences):\n",
212 |     "        x = bert_model_obj.create_input_array(train_sentences)\n",
213 |     "        return x\n",
214 |     "    \n",
215 |     "    def prepare_data_y(self,train_labels):\n",
216 |     "        y = list()\n",
217 |     "        for item in train_labels:\n",
218 |     "            label = item\n",
219 |     "            y.append(label)\n",
220 |     "        y = np.array(y)\n",
221 |     "        return y\n",
222 |     "        \n"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "bert_model_obj = BertModel()"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "train_sentences = load_data_obj.train_data_frame[\"query\"].tolist()\n",
241 |     "train_labels = load_data_obj.train_data_frame[\"category\"].tolist()\n",
242 |     "\n",
243 |     "preprocess_bert_data_obj = PreprocessingBertData()\n",
244 |     "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n",
245 |     "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n",
246 |     "\n",
247 |     "train_input_ids, train_input_masks, train_segment_ids = x\n",
248 |     "train_labels = y\n"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "class DesignModel():\n",
258 |     "    def __init__(self):\n",
259 |     "        self.model = None        \n",
260 |     "        self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n",
261 |     "        self.train_labels = train_labels\n",
262 |     "        \n",
263 |     "    def bert_model(self,max_seq_length): \n",
264 |     "        in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n",
265 |     "        in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n",
266 |     "        in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n",
267 |     "        \n",
268 |     "        bert_inputs = [in_id, in_mask, in_segment]\n",
269 |     "        bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n",
270 |     "        \n",
271 |     "        bert_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)\n",
272 |     "        bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n",
273 |     "        bert_outputs = tf.keras.layers.Dense(len(load_data_obj.cat_to_intent), activation=\"softmax\", name=\"dense_output\")(x)\n",
274 |     "        self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n",
275 |     "        \n",
276 |     "        self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),\n",
277 |     "                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
278 |     "                           metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name=\"acc\")])\n",
279 |     "        \n",
280 |     "        self.model.summary()\n",
281 |     "    \n",
282 |     "    def model_train(self,batch_size,num_epoch):\n",
283 |     "        print(\"Fitting to model\")\n",
284 |     "        self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n",
285 |     "        print(\"Model Training complete.\")\n",
286 |     "\n",
287 |     "    def save_model(self,model,model_name):    \n",
288 |     "        self.model.save(model_name+\".h5\")\n",
289 |     "        print(\"Model saved to Model folder.\")"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": null,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "model_obj = DesignModel()\n",
299 |     "model_obj.bert_model(bert_model_obj.max_len)\n",
300 |     "model_obj.model_train(32,1)"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": null,
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": [
309 |     "model_obj.save_model(model_obj.model,\"bert\")"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {},
316 |    "outputs": [],
317 |    "source": [
318 |     "class Evaluation():\n",
319 |     "    def get_accuracy(self,actuals, predictions):\n",
320 |     "        acc = accuracy_score(actuals, predictions)\n",
321 |     "        return acc"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": null,
327 |    "metadata": {},
328 |    "outputs": [],
329 |    "source": [
330 |     "class Prediction():\n",
331 |     "    def __init__(self):\n",
332 |     "        self.model = model_obj.model\n",
333 |     "        \n",
334 |     "    def predict_validation(self):\n",
335 |     "        valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n",
336 |     "        valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n",
337 |     "\n",
338 |     "        preprocess_bert_data_obj = PreprocessingBertData()\n",
339 |     "        val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n",
340 |     "        prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n",
341 |     "        return valid_labels,prediction_labels\n",
342 |     "        \n",
343 |     "    \n",
344 |     "    def predict(self,query):\n",
345 |     "        query_seq = bert_model_obj.create_input_array([query])\n",
346 |     "        pred = self.model.predict(query_seq)\n",
347 |     "        pred = np.argmax(pred)\n",
348 |     "        result = load_data_obj.cat_to_intent[pred]\n",
349 |     "        return result"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": null,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "pred_obj = Prediction()\n",
359 |     "#pred_obj.predict_validation()"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "querylist = [['I want to see Medal for the General', 'SearchScreeningEvent', 1],\n",
369 |     "       ['Book a reservation for 5 people at the top-rated brasserie restaurant',\n",
370 |     "        'BookRestaurant', 5],\n",
371 |     "       ['Can I put this tune onto my sin estres playlist?',\n",
372 |     "        'AddToPlaylist', 6],\n",
373 |     "       ['add the artist Pete Murray to my relaxing playlist',\n",
374 |     "        'AddToPlaylist', 6],\n",
375 |     "       ['Book me a reservation for a party of 3 at a pub in Northern Mariana Islands',\n",
376 |     "        'BookRestaurant', 5]]\n",
377 |     "for query in querylist:\n",
378 |     "    result = pred_obj.predict(query[0])\n",
379 |     "    print(\"Predicted Intent: \"+str(result)+\"\\tActual Intent: \"+(load_data_obj.cat_to_intent[query[2]])+\"\\tQuery: \"+str(query[0]))\n"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "eval_obj = Evaluation()\n",
389 |     "ytest,ypred = pred_obj.predict_validation()\n",
390 |     "acc = eval_obj.get_accuracy(ytest,ypred)\n",
391 |     "print(\"Auc: {:.2%}\".format(acc))\n"
392 |    ]
393 |   }
394 |  ],
395 |  "metadata": {
396 |   "kernelspec": {
397 |    "display_name": "Python 3",
398 |    "language": "python",
399 |    "name": "python3"
400 |   },
401 |   "language_info": {
402 |    "codemirror_mode": {
403 |     "name": "ipython",
404 |     "version": 3
405 |    },
406 |    "file_extension": ".py",
407 |    "mimetype": "text/x-python",
408 |    "name": "python",
409 |    "nbconvert_exporter": "python",
410 |    "pygments_lexer": "ipython3",
411 |    "version": "3.6.9"
412 |   }
413 |  },
414 |  "nbformat": 4,
415 |  "nbformat_minor": 4
416 | }
417 | 


--------------------------------------------------------------------------------
/1.6-intent-classification/intent_classfication_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "kernelspec": {
  6 |       "display_name": "Python 3",
  7 |       "language": "python",
  8 |       "name": "python3"
  9 |     },
 10 |     "language_info": {
 11 |       "codemirror_mode": {
 12 |         "name": "ipython",
 13 |         "version": 3
 14 |       },
 15 |       "file_extension": ".py",
 16 |       "mimetype": "text/x-python",
 17 |       "name": "python",
 18 |       "nbconvert_exporter": "python",
 19 |       "pygments_lexer": "ipython3",
 20 |       "version": "3.6.9"
 21 |     },
 22 |     "colab": {
 23 |       "name": "intent_classfication_keras.ipynb",
 24 |       "provenance": [],
 25 |       "collapsed_sections": []
 26 |     },
 27 |     "accelerator": "GPU"
 28 |   },
 29 |   "cells": [
 30 |     {
 31 |       "cell_type": "code",
 32 |       "metadata": {
 33 |         "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
 34 |         "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
 35 |         "id": "H9qTF3ffa7Pc",
 36 |         "colab_type": "code",
 37 |         "colab": {}
 38 |       },
 39 |       "source": [
 40 |         "# Data\n",
 41 |         "# "
 42 |       ],
 43 |       "execution_count": 1,
 44 |       "outputs": []
 45 |     },
 46 |     {
 47 |       "cell_type": "code",
 48 |       "metadata": {
 49 |         "id": "dVcaGQx5bw1n",
 50 |         "colab_type": "code",
 51 |         "colab": {}
 52 |       },
 53 |       "source": [
 54 |         "import numpy as np\n",
 55 |         "import pandas as pd\n",
 56 |         "import json\n",
 57 |         "import os\n",
 58 |         "import en_core_web_sm\n",
 59 |         "from sklearn.metrics import roc_curve\n",
 60 |         "from sklearn.metrics import accuracy_score\n",
 61 |         "from sklearn.model_selection import train_test_split\n",
 62 |         "from tensorflow.keras.utils import to_categorical\n",
 63 |         "from tensorflow.keras.models import Sequential, Model, load_model\n",
 64 |         "from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Bidirectional, Activation\n",
 65 |         "from tensorflow.keras.optimizers import Adam\n",
 66 |         "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 67 |         "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 68 |         "from tensorflow.keras.layers import LSTM\n",
 69 |         "from tensorflow.keras.layers import SimpleRNN\n",
 70 |         "from tensorflow.keras.layers import Conv1D\n",
 71 |         "from tensorflow.keras.layers import Dropout\n",
 72 |         "from tensorflow.keras.layers import BatchNormalization\n",
 73 |         "from tensorflow.keras.layers import GlobalMaxPooling1D\n",
 74 |         "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 75 |         "from tensorflow.keras.preprocessing.sequence import pad_sequences"
 76 |       ],
 77 |       "execution_count": 18,
 78 |       "outputs": []
 79 |     },
 80 |     {
 81 |       "cell_type": "code",
 82 |       "metadata": {
 83 |         "id": "lystXd0mbNXk",
 84 |         "colab_type": "code",
 85 |         "colab": {
 86 |           "base_uri": "https://localhost:8080/",
 87 |           "height": 121
 88 |         },
 89 |         "outputId": "a9dcaebf-a417-408f-8fa8-11ed934a3efb"
 90 |       },
 91 |       "source": [
 92 |         "from google.colab import drive\n",
 93 |         "drive.mount(\"/content/drive\")"
 94 |       ],
 95 |       "execution_count": 3,
 96 |       "outputs": [
 97 |         {
 98 |           "output_type": "stream",
 99 |           "text": [
100 |             "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code\n",
101 |             "\n",
102 |             "Enter your authorization code:\n",
103 |             "··········\n",
104 |             "Mounted at /content/drive\n"
105 |           ],
106 |           "name": "stdout"
107 |         }
108 |       ]
109 |     },
110 |     {
111 |       "cell_type": "code",
112 |       "metadata": {
113 |         "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
114 |         "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
115 |         "id": "jIf3M0bca7Pg",
116 |         "colab_type": "code",
117 |         "colab": {}
118 |       },
119 |       "source": [
120 |         "class LoadingData():\n",
121 |         "            \n",
122 |         "    def __init__(self):\n",
123 |         "        data_dir = \"/content/drive/My Drive/Projects/Data\"\n",
124 |         "        train_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Train\")\n",
125 |         "        validation_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Validate\")\n",
126 |         "        category_id = 0\n",
127 |         "        self.cat_to_intent = {}\n",
128 |         "        self.intent_to_cat = {}\n",
129 |         "        \n",
130 |         "        for dirname, _, filenames in os.walk(train_file_path):\n",
131 |         "            for filename in filenames:\n",
132 |         "                file_path = os.path.join(dirname, filename)\n",
133 |         "                intent_id = filename.replace(\".json\",\"\")\n",
134 |         "                self.cat_to_intent[category_id] = intent_id\n",
135 |         "                self.intent_to_cat[intent_id] = category_id\n",
136 |         "                category_id+=1\n",
137 |         "        '''Training data'''\n",
138 |         "        training_data = list() \n",
139 |         "        for dirname, _, filenames in os.walk(train_file_path):\n",
140 |         "            for filename in filenames:\n",
141 |         "                file_path = os.path.join(dirname, filename)\n",
142 |         "                intent_id = filename.replace(\".json\",\"\")\n",
143 |         "                training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n",
144 |         "        self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category'])   \n",
145 |         "        \n",
146 |         "        self.train_data_frame = self.train_data_frame.sample(frac = 1)\n",
147 |         "\n",
148 |         "\n",
149 |         "        \n",
150 |         "        '''Validation data'''\n",
151 |         "        validation_data = list()    \n",
152 |         "        for dirname, _, filenames in os.walk(validation_file_path):\n",
153 |         "            for filename in filenames:\n",
154 |         "                file_path = os.path.join(dirname, filename)\n",
155 |         "                intent_id = filename.replace(\".json\",\"\")\n",
156 |         "                validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])                \n",
157 |         "        self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n",
158 |         "\n",
159 |         "        self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n",
160 |         "        \n",
161 |         "        \n",
162 |         "    def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n",
163 |         "        json_d = json.load(open(json_file))         \n",
164 |         "        \n",
165 |         "        json_dict = json_d[intent_id]\n",
166 |         "\n",
167 |         "        sent_list = list()\n",
168 |         "        for i in json_dict:\n",
169 |         "            each_list = i['data']\n",
170 |         "            sent =\"\"\n",
171 |         "            for i in each_list:\n",
172 |         "                sent = sent + i['text']+ \" \"\n",
173 |         "            sent =sent[:-1]\n",
174 |         "            for i in range(3):\n",
175 |         "                sent = sent.replace(\"  \",\" \")\n",
176 |         "            sent_list.append((sent,intent_id,cat))\n",
177 |         "        return sent_list\n",
178 |         "            "
179 |       ],
180 |       "execution_count": 7,
181 |       "outputs": []
182 |     },
183 |     {
184 |       "cell_type": "code",
185 |       "metadata": {
186 |         "id": "CpWQixmea7Pi",
187 |         "colab_type": "code",
188 |         "colab": {}
189 |       },
190 |       "source": [
191 |         "load_data_obj = LoadingData()"
192 |       ],
193 |       "execution_count": 8,
194 |       "outputs": []
195 |     },
196 |     {
197 |       "cell_type": "code",
198 |       "metadata": {
199 |         "id": "Hy352jKEa7Pl",
200 |         "colab_type": "code",
201 |         "colab": {
202 |           "base_uri": "https://localhost:8080/",
203 |           "height": 195
204 |         },
205 |         "outputId": "70dc188d-ad57-4a29-c184-aac5abb806ae"
206 |       },
207 |       "source": [
208 |         "load_data_obj.train_data_frame.head()"
209 |       ],
210 |       "execution_count": 10,
211 |       "outputs": [
212 |         {
213 |           "output_type": "execute_result",
214 |           "data": {
215 |             "text/html": [
216 |               "<div>\n",
217 |               "<style scoped>\n",
218 |               "    .dataframe tbody tr th:only-of-type {\n",
219 |               "        vertical-align: middle;\n",
220 |               "    }\n",
221 |               "\n",
222 |               "    .dataframe tbody tr th {\n",
223 |               "        vertical-align: top;\n",
224 |               "    }\n",
225 |               "\n",
226 |               "    .dataframe thead th {\n",
227 |               "        text-align: right;\n",
228 |               "    }\n",
229 |               "</style>\n",
230 |               "<table border=\"1\" class=\"dataframe\">\n",
231 |               "  <thead>\n",
232 |               "    <tr style=\"text-align: right;\">\n",
233 |               "      <th></th>\n",
234 |               "      <th>query</th>\n",
235 |               "      <th>intent</th>\n",
236 |               "      <th>category</th>\n",
237 |               "    </tr>\n",
238 |               "  </thead>\n",
239 |               "  <tbody>\n",
240 |               "    <tr>\n",
241 |               "      <th>8770</th>\n",
242 |               "      <td>rate the current novel 5 stars</td>\n",
243 |               "      <td>RateBook</td>\n",
244 |               "      <td>4</td>\n",
245 |               "    </tr>\n",
246 |               "    <tr>\n",
247 |               "      <th>6557</th>\n",
248 |               "      <td>Find the schedule for Kingsman: The Secret Ser...</td>\n",
249 |               "      <td>SearchScreeningEvent</td>\n",
250 |               "      <td>3</td>\n",
251 |               "    </tr>\n",
252 |               "    <tr>\n",
253 |               "      <th>721</th>\n",
254 |               "      <td>find Bells Break Their Towers , a video game</td>\n",
255 |               "      <td>SearchCreativeWork</td>\n",
256 |               "      <td>0</td>\n",
257 |               "    </tr>\n",
258 |               "    <tr>\n",
259 |               "      <th>229</th>\n",
260 |               "      <td>show creativity of A Catholic Education</td>\n",
261 |               "      <td>SearchCreativeWork</td>\n",
262 |               "      <td>0</td>\n",
263 |               "    </tr>\n",
264 |               "    <tr>\n",
265 |               "      <th>3680</th>\n",
266 |               "      <td>Will it be warm in Powersville Guam 23 hours f...</td>\n",
267 |               "      <td>GetWeather</td>\n",
268 |               "      <td>1</td>\n",
269 |               "    </tr>\n",
270 |               "  </tbody>\n",
271 |               "</table>\n",
272 |               "</div>"
273 |             ],
274 |             "text/plain": [
275 |               "                                                  query  ... category\n",
276 |               "8770                     rate the current novel 5 stars  ...        4\n",
277 |               "6557  Find the schedule for Kingsman: The Secret Ser...  ...        3\n",
278 |               "721        find Bells Break Their Towers , a video game  ...        0\n",
279 |               "229             show creativity of A Catholic Education  ...        0\n",
280 |               "3680  Will it be warm in Powersville Guam 23 hours f...  ...        1\n",
281 |               "\n",
282 |               "[5 rows x 3 columns]"
283 |             ]
284 |           },
285 |           "metadata": {
286 |             "tags": []
287 |           },
288 |           "execution_count": 10
289 |         }
290 |       ]
291 |     },
292 |     {
293 |       "cell_type": "code",
294 |       "metadata": {
295 |         "id": "GSDQwGEBa7Pn",
296 |         "colab_type": "code",
297 |         "colab": {
298 |           "base_uri": "https://localhost:8080/",
299 |           "height": 195
300 |         },
301 |         "outputId": "c7ec5bc6-af6d-4899-d9f6-244d00b7369e"
302 |       },
303 |       "source": [
304 |         "load_data_obj.validation_data_frame.head()"
305 |       ],
306 |       "execution_count": 11,
307 |       "outputs": [
308 |         {
309 |           "output_type": "execute_result",
310 |           "data": {
311 |             "text/html": [
312 |               "<div>\n",
313 |               "<style scoped>\n",
314 |               "    .dataframe tbody tr th:only-of-type {\n",
315 |               "        vertical-align: middle;\n",
316 |               "    }\n",
317 |               "\n",
318 |               "    .dataframe tbody tr th {\n",
319 |               "        vertical-align: top;\n",
320 |               "    }\n",
321 |               "\n",
322 |               "    .dataframe thead th {\n",
323 |               "        text-align: right;\n",
324 |               "    }\n",
325 |               "</style>\n",
326 |               "<table border=\"1\" class=\"dataframe\">\n",
327 |               "  <thead>\n",
328 |               "    <tr style=\"text-align: right;\">\n",
329 |               "      <th></th>\n",
330 |               "      <th>query</th>\n",
331 |               "      <th>intent</th>\n",
332 |               "      <th>category</th>\n",
333 |               "    </tr>\n",
334 |               "  </thead>\n",
335 |               "  <tbody>\n",
336 |               "    <tr>\n",
337 |               "      <th>699</th>\n",
338 |               "      <td>I want to see Married to the Enemy 2 at a cine...</td>\n",
339 |               "      <td>SearchScreeningEvent</td>\n",
340 |               "      <td>3</td>\n",
341 |               "    </tr>\n",
342 |               "    <tr>\n",
343 |               "      <th>22</th>\n",
344 |               "      <td>Please look up the song The Mad Magician .</td>\n",
345 |               "      <td>SearchCreativeWork</td>\n",
346 |               "      <td>0</td>\n",
347 |               "    </tr>\n",
348 |               "    <tr>\n",
349 |               "      <th>139</th>\n",
350 |               "      <td>rate the current essay zero out of 6 stars</td>\n",
351 |               "      <td>RateBook</td>\n",
352 |               "      <td>4</td>\n",
353 |               "    </tr>\n",
354 |               "    <tr>\n",
355 |               "      <th>599</th>\n",
356 |               "      <td>Add the album to my Club Hits playlist.</td>\n",
357 |               "      <td>AddToPlaylist</td>\n",
358 |               "      <td>6</td>\n",
359 |               "    </tr>\n",
360 |               "    <tr>\n",
361 |               "      <th>16</th>\n",
362 |               "      <td>Please help me find the Late Night Heartbroken...</td>\n",
363 |               "      <td>SearchCreativeWork</td>\n",
364 |               "      <td>0</td>\n",
365 |               "    </tr>\n",
366 |               "  </tbody>\n",
367 |               "</table>\n",
368 |               "</div>"
369 |             ],
370 |             "text/plain": [
371 |               "                                                 query  ... category\n",
372 |               "699  I want to see Married to the Enemy 2 at a cine...  ...        3\n",
373 |               "22          Please look up the song The Mad Magician .  ...        0\n",
374 |               "139         rate the current essay zero out of 6 stars  ...        4\n",
375 |               "599            Add the album to my Club Hits playlist.  ...        6\n",
376 |               "16   Please help me find the Late Night Heartbroken...  ...        0\n",
377 |               "\n",
378 |               "[5 rows x 3 columns]"
379 |             ]
380 |           },
381 |           "metadata": {
382 |             "tags": []
383 |           },
384 |           "execution_count": 11
385 |         }
386 |       ]
387 |     },
388 |     {
389 |       "cell_type": "code",
390 |       "metadata": {
391 |         "id": "tRmVRiTCa7Pp",
392 |         "colab_type": "code",
393 |         "colab": {}
394 |       },
395 |       "source": [
396 |         "class Preprocessing():\n",
397 |         "    def __init__(self):\n",
398 |         "        self.x_train = None\n",
399 |         "        self.y_train = None\n",
400 |         "        self.x_valid = None\n",
401 |         "        self.y_valid = None\n",
402 |         "        self.spacy_model = en_core_web_sm.load()\n",
403 |         "        self.tokenizer = None\n",
404 |         "\n",
405 |         "    def createData(self):\n",
406 |         "        self.tokenizer = Tokenizer(num_words=None)\n",
407 |         "        self.max_len = 50\n",
408 |         "        self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(load_data_obj.train_data_frame['query'].tolist(),load_data_obj.train_data_frame['category'].tolist(),test_size=0.1)\n",
409 |         "        self.tokenizer.fit_on_texts(list(self.x_train) + list(self.x_valid))\n",
410 |         "        self.x_train = self.tokenizer.texts_to_sequences(self.x_train)\n",
411 |         "        self.x_valid = self.tokenizer.texts_to_sequences(self.x_valid)\n",
412 |         "\n",
413 |         "        #zero pad the sequences\n",
414 |         "        self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)\n",
415 |         "        self.x_valid = pad_sequences(self.x_valid, maxlen=self.max_len)\n",
416 |         "        self.y_train = to_categorical(self.y_train)\n",
417 |         "        self.y_valid = to_categorical(self.y_valid)\n",
418 |         "        self.word_index = self.tokenizer.word_index\n",
419 |         "        \n",
420 |         "    def getSpacyEmbeddings(self,sentneces):\n",
421 |         "        sentences_vectors = list()\n",
422 |         "        for item in sentneces:\n",
423 |         "            query_vec = self.spacy_model(item) \n",
424 |         "            sentences_vectors.append(query_vec.vector)\n",
425 |         "        return sentences_vectors\n",
426 |         "    \n",
427 |         "    \n",
428 |         "    \n",
429 |         "    \n",
430 |         "        "
431 |       ],
432 |       "execution_count": 12,
433 |       "outputs": []
434 |     },
435 |     {
436 |       "cell_type": "code",
437 |       "metadata": {
438 |         "id": "LoyTb5Gza7Pr",
439 |         "colab_type": "code",
440 |         "colab": {}
441 |       },
442 |       "source": [
443 |         "preprocess_obj = Preprocessing()\n",
444 |         "preprocess_obj.createData()"
445 |       ],
446 |       "execution_count": 13,
447 |       "outputs": []
448 |     },
449 |     {
450 |       "cell_type": "code",
451 |       "metadata": {
452 |         "id": "rYI77Z4za7Pt",
453 |         "colab_type": "code",
454 |         "colab": {
455 |           "base_uri": "https://localhost:8080/",
456 |           "height": 34
457 |         },
458 |         "outputId": "5bccac12-dacc-497c-8413-5734ca64df0b"
459 |       },
460 |       "source": [
461 |         "preprocess_obj.y_train.shape"
462 |       ],
463 |       "execution_count": 14,
464 |       "outputs": [
465 |         {
466 |           "output_type": "execute_result",
467 |           "data": {
468 |             "text/plain": [
469 |               "(12405, 7)"
470 |             ]
471 |           },
472 |           "metadata": {
473 |             "tags": []
474 |           },
475 |           "execution_count": 14
476 |         }
477 |       ]
478 |     },
479 |     {
480 |       "cell_type": "code",
481 |       "metadata": {
482 |         "id": "ZHI2TUvNa7Pv",
483 |         "colab_type": "code",
484 |         "colab": {
485 |           "base_uri": "https://localhost:8080/",
486 |           "height": 34
487 |         },
488 |         "outputId": "4b1f3c97-2873-4c48-8d51-492e70e70828"
489 |       },
490 |       "source": [
491 |         "preprocess_obj.y_valid.shape"
492 |       ],
493 |       "execution_count": 15,
494 |       "outputs": [
495 |         {
496 |           "output_type": "execute_result",
497 |           "data": {
498 |             "text/plain": [
499 |               "(1379, 7)"
500 |             ]
501 |           },
502 |           "metadata": {
503 |             "tags": []
504 |           },
505 |           "execution_count": 15
506 |         }
507 |       ]
508 |     },
509 |     {
510 |       "cell_type": "code",
511 |       "metadata": {
512 |         "id": "T3WXq62ha7Px",
513 |         "colab_type": "code",
514 |         "colab": {}
515 |       },
516 |       "source": [
517 |         "class DesignModel():\n",
518 |         "    def __init__(self):\n",
519 |         "        self.model = None\n",
520 |         "        self.x_train = preprocess_obj.x_train\n",
521 |         "        self.y_train = preprocess_obj.y_train\n",
522 |         "        self.x_valid = preprocess_obj.x_valid\n",
523 |         "        self.y_valid = preprocess_obj.y_valid\n",
524 |         "        \n",
525 |         "    def simple_rnn(self):\n",
526 |         "        self.model = Sequential()\n",
527 |         "        self.model.add(Embedding(len(preprocess_obj.word_index) + 1,100,input_length=preprocess_obj.max_len))\n",
528 |         "        self.model.add(SimpleRNN(100))\n",
529 |         "        self.model.add(Dense(len(load_data_obj.cat_to_intent), activation='sigmoid'))\n",
530 |         "        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
531 |         "        \n",
532 |         "        \n",
533 |         "    def model_train(self,batch_size,num_epoch):\n",
534 |         "        print(\"Fitting to model\")\n",
535 |         "        self.model.fit(self.x_train, self.y_train, batch_size=batch_size, epochs=num_epoch, validation_data=[self.x_valid, self.y_valid])\n",
536 |         "        print(\"Model Training complete.\")\n",
537 |         "\n",
538 |         "    def save_model(self,model_name):    \n",
539 |         "        self.model.save(model_name+\".h5\")\n",
540 |         "        print(\"Model saved to Model folder.\")"
541 |       ],
542 |       "execution_count": 24,
543 |       "outputs": []
544 |     },
545 |     {
546 |       "cell_type": "code",
547 |       "metadata": {
548 |         "id": "Tye8X7FFa7Pz",
549 |         "colab_type": "code",
550 |         "colab": {
551 |           "base_uri": "https://localhost:8080/",
552 |           "height": 235
553 |         },
554 |         "outputId": "3e26c58d-4bef-4bd1-f789-9662a0243bef"
555 |       },
556 |       "source": [
557 |         "model_obj = DesignModel()\n",
558 |         "model_obj.simple_rnn()\n",
559 |         "model_obj.model_train(64,5)\n",
560 |         "model_obj.save_model(\"srnn\")"
561 |       ],
562 |       "execution_count": 25,
563 |       "outputs": [
564 |         {
565 |           "output_type": "stream",
566 |           "text": [
567 |             "Fitting to model\n",
568 |             "Epoch 1/5\n",
569 |             "194/194 [==============================] - 9s 46ms/step - loss: 0.8717 - accuracy: 0.8039 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
570 |             "Epoch 2/5\n",
571 |             "194/194 [==============================] - 9s 46ms/step - loss: 0.0855 - accuracy: 0.9852 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
572 |             "Epoch 3/5\n",
573 |             "194/194 [==============================] - 9s 46ms/step - loss: 0.0321 - accuracy: 0.9948 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
574 |             "Epoch 4/5\n",
575 |             "194/194 [==============================] - 9s 45ms/step - loss: 0.0157 - accuracy: 0.9980 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
576 |             "Epoch 5/5\n",
577 |             "194/194 [==============================] - 9s 45ms/step - loss: 0.0098 - accuracy: 0.9990 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
578 |             "Model Training complete.\n",
579 |             "Model saved to Model folder.\n"
580 |           ],
581 |           "name": "stdout"
582 |         }
583 |       ]
584 |     },
585 |     {
586 |       "cell_type": "code",
587 |       "metadata": {
588 |         "id": "sZ1dlcGta7P1",
589 |         "colab_type": "code",
590 |         "colab": {}
591 |       },
592 |       "source": [
593 |         "class Evaluation():\n",
594 |         "    def get_accuracy(self,actuals, predictions):\n",
595 |         "        acc = accuracy_score(actuals, predictions)\n",
596 |         "        return acc"
597 |       ],
598 |       "execution_count": 26,
599 |       "outputs": []
600 |     },
601 |     {
602 |       "cell_type": "code",
603 |       "metadata": {
604 |         "id": "UdXO8h31a7P3",
605 |         "colab_type": "code",
606 |         "colab": {}
607 |       },
608 |       "source": [
609 |         "class Prediction():\n",
610 |         "    def __init__(self,model_name):\n",
611 |         "        self.model = load_model(model_name+\".h5\")\n",
612 |         "        self.tokenizer = preprocess_obj.tokenizer\n",
613 |         "        self.max_len = preprocess_obj.max_len\n",
614 |         "        \n",
615 |         "    def predict_validation(self):\n",
616 |         "        self.xtest = load_data_obj.validation_data_frame['query'].tolist()\n",
617 |         "        self.ytest = load_data_obj.validation_data_frame['category'].tolist()\n",
618 |         "        self.xtest = self.tokenizer.texts_to_sequences(self.xtest)\n",
619 |         "        self.xtest = pad_sequences(self.xtest, maxlen=self.max_len)\n",
620 |         "        self.ypred = self.model.predict(self.xtest)\n",
621 |         "        self.ypred = [np.argmax(item) for item in self.ypred]\n",
622 |         "    \n",
623 |         "    def predict(self,query):\n",
624 |         "        query_seq = self.tokenizer.texts_to_sequences([query])\n",
625 |         "        query_pad = pad_sequences(query_seq, maxlen=self.max_len)\n",
626 |         "        pred = self.model.predict(query_pad)\n",
627 |         "        pred = np.argmax(pred)\n",
628 |         "        result = load_data_obj.cat_to_intent[pred]\n",
629 |         "        return result"
630 |       ],
631 |       "execution_count": 27,
632 |       "outputs": []
633 |     },
634 |     {
635 |       "cell_type": "code",
636 |       "metadata": {
637 |         "id": "1QAb7Mr-a7P5",
638 |         "colab_type": "code",
639 |         "colab": {}
640 |       },
641 |       "source": [
642 |         "pred_obj = Prediction(\"srnn\")\n",
643 |         "pred_obj.predict_validation()"
644 |       ],
645 |       "execution_count": 28,
646 |       "outputs": []
647 |     },
648 |     {
649 |       "cell_type": "code",
650 |       "metadata": {
651 |         "id": "8bX7S8VFa7P6",
652 |         "colab_type": "code",
653 |         "colab": {
654 |           "base_uri": "https://localhost:8080/",
655 |           "height": 101
656 |         },
657 |         "outputId": "909f7973-a5de-44cd-b027-7a55a13d5efd"
658 |       },
659 |       "source": [
660 |         "querylist = [\n",
661 |         "    'rate The Gift: Imagination and the Erotic Life of Property five stars',\n",
662 |         "     'table for Breadline Cafe in Minnesota next friday',\n",
663 |         "     'Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?',\n",
664 |         "     'Play some sixties songs on Google Music',\n",
665 |         "     'rate this textbook four out of 6']\n",
666 |         "for query in querylist:\n",
667 |         "    result = pred_obj.predict(query)\n",
668 |         "    print(\"Intent: \"+str(result)+\"\\tQuery: \"+str(query))"
669 |       ],
670 |       "execution_count": 29,
671 |       "outputs": [
672 |         {
673 |           "output_type": "stream",
674 |           "text": [
675 |             "Intent: RateBook\tQuery: rate The Gift: Imagination and the Erotic Life of Property five stars\n",
676 |             "Intent: BookRestaurant\tQuery: table for Breadline Cafe in Minnesota next friday\n",
677 |             "Intent: GetWeather\tQuery: Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?\n",
678 |             "Intent: PlayMusic\tQuery: Play some sixties songs on Google Music\n",
679 |             "Intent: RateBook\tQuery: rate this textbook four out of 6\n"
680 |           ],
681 |           "name": "stdout"
682 |         }
683 |       ]
684 |     },
685 |     {
686 |       "cell_type": "code",
687 |       "metadata": {
688 |         "id": "gCFpIFH_a7P8",
689 |         "colab_type": "code",
690 |         "colab": {
691 |           "base_uri": "https://localhost:8080/",
692 |           "height": 34
693 |         },
694 |         "outputId": "d82b6973-49a1-462a-a0ca-661357100520"
695 |       },
696 |       "source": [
697 |         "eval_obj = Evaluation()\n",
698 |         "acc = eval_obj.get_accuracy(pred_obj.ytest,pred_obj.ypred)\n",
699 |         "print(\"Auc: {:.2%}\".format(acc))\n"
700 |       ],
701 |       "execution_count": 30,
702 |       "outputs": [
703 |         {
704 |           "output_type": "stream",
705 |           "text": [
706 |             "Auc: 97.14%\n"
707 |           ],
708 |           "name": "stdout"
709 |         }
710 |       ]
711 |     }
712 |   ]
713 | }


--------------------------------------------------------------------------------
/1.7-entity-recognition/resume-entities-for-ner.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joydeb28/NLP-Notebooks/9669ec6e416e449f67cedb50c143fa4d94bbd44c/1.7-entity-recognition/resume-entities-for-ner.zip


--------------------------------------------------------------------------------
/1.8-next-word-prediction/cab_booking.txt:
--------------------------------------------------------------------------------
 1 | I would like to book a Cab
 2 | Can you please book a cab from Goa to Mumbai
 3 | I would like to book taxi for Chennai Airport
 4 | I want to take a cab for airport
 5 | Could you please book a cab from me
 6 | I need a cab urgent for airport
 7 | Can you arrange a cab as soon as possible
 8 | I would like to cancel my booking
 9 | Could yo please cancel my booking
10 | Can you please cancel my tomorrows booking
11 | I want to cancel my upcoming booking
12 | 


--------------------------------------------------------------------------------
/1.8-next-word-prediction/next_word_prediction_keras.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<h1>Next Word Prediction Model Using Tensorflow & keras<h1> "
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "<h5>Importing Libraries<h5>"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from numpy import array\n",
 24 |     "import numpy as np\n",
 25 |     "import tensorflow as tf\n",
 26 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
 27 |     "from tensorflow.keras.utils import to_categorical\n",
 28 |     "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 29 |     "from tensorflow.keras.models import Sequential\n",
 30 |     "from tensorflow.keras.layers import Dense\n",
 31 |     "from tensorflow.keras.layers import LSTM\n",
 32 |     "from tensorflow.keras.layers import Dropout\n",
 33 |     "from tensorflow.keras.layers import Embedding\n",
 34 |     "from tensorflow.keras.models import load_model\n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "<h5>Preprocessing Data<h5>"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "class Preprocessing():\n",
 51 |     "    \n",
 52 |     "    def __init__(self,input_file):\n",
 53 |     "        self.input_data_file = input_file\n",
 54 |     "        self.data = None\n",
 55 |     "        self.vocab_size = None\n",
 56 |     "        self.encoded_data = None\n",
 57 |     "        self.max_length = None\n",
 58 |     "        self.sequences = None\n",
 59 |     "        self.x = None\n",
 60 |     "        self.y = None\n",
 61 |     "        self.tokenizer = None\n",
 62 |     "    \n",
 63 |     "    def load_data(self):\n",
 64 |     "        fp = open(self.input_data_file,'r')\n",
 65 |     "        self.data = fp.read().splitlines()        \n",
 66 |     "        fp.close()\n",
 67 |     "        \n",
 68 |     "    def encode_data(self):\n",
 69 |     "        self.tokenizer = Tokenizer()\n",
 70 |     "        self.tokenizer.fit_on_texts(self.data)\n",
 71 |     "        self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
 72 |     "        self.vocab_size = len(self.tokenizer.word_counts)+1\n",
 73 |     "        \n",
 74 |     "    def generate_sequence(self):\n",
 75 |     "        seq_list = list()\n",
 76 |     "        for item in self.encoded_data:\n",
 77 |     "            l = len(item)\n",
 78 |     "            for id in range(1,l):\n",
 79 |     "                seq_list.append(item[:id+1])\n",
 80 |     "        self.max_length = max([len(seq) for seq in seq_list])\n",
 81 |     "        self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n",
 82 |     "        self.sequences = array(self.sequences)\n",
 83 |     "            \n",
 84 |     "    def get_data(self):\n",
 85 |     "        self.x = self.sequences[:,:-1]\n",
 86 |     "        self.y = self.sequences[:,-1]\n",
 87 |     "        self.y = to_categorical(self.y,num_classes=self.vocab_size)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "pr = Preprocessing('cab_booking.txt')\n",
 97 |     "pr.load_data()\n",
 98 |     "pr.encode_data()\n",
 99 |     "pr.generate_sequence()\n",
100 |     "pr.get_data()"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "<h3>Model"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "class Model():\n",
117 |     "    def __init__(self):\n",
118 |     "        self.model = None\n",
119 |     "        self.history = None\n",
120 |     "        self.x = None\n",
121 |     "        self.y = None\n",
122 |     "        self.vocab_size = pr.vocab_size\n",
123 |     "        self.max_len = pr.max_length\n",
124 |     "        \n",
125 |     "        \n",
126 |     "    def create_model(self):\n",
127 |     "        self.model = Sequential()\n",
128 |     "        self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n",
129 |     "        self.model.add(LSTM(50))\n",
130 |     "        self.model.add(Dropout(0.1))\n",
131 |     "        self.model.add(Dense(self.vocab_size,activation='softmax'))\n",
132 |     "        self.model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
133 |     "        print(self.model.summary())\n",
134 |     "    def run(self,epochs,batch_size):\n",
135 |     "        self.history = self.model.fit(self.x,self.y,epochs=epochs,batch_size=batch_size,validation_split=0.2)\n",
136 |     "        \n",
137 |     "    def save(self):\n",
138 |     "        self.model.save(\"word_prediction_model.h5\")\n",
139 |     "        "
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "model_obj = Model()\n",
149 |     "model_obj.x = pr.x\n",
150 |     "model_obj.y = pr.y\n",
151 |     "model_obj.create_model()"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "model_obj.run(700,2)\n",
161 |     "model_obj.save()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "<h4>Prediction"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "class Prediction():\n",
178 |     "    def __init__(self,tokenizer,max_len):\n",
179 |     "        self.model = None\n",
180 |     "        self.tokenizer = tokenizer\n",
181 |     "        self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n",
182 |     "        self.max_length = max_len\n",
183 |     "    \n",
184 |     "    def load_model(self):\n",
185 |     "        self.model = load_model(\"word_prediction_model.h5\")\n",
186 |     "        \n",
187 |     "    def predict_words(self,text,num_words):\n",
188 |     "        encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n",
189 |     "        padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n",
190 |     "        y_preds = self.model.predict(padded_data)\n",
191 |     "        y_preds = np.argsort(-y_preds)\n",
192 |     "        y_preds = y_preds[0][:num_words]\n",
193 |     "        possible_words = [self.idx2word[item] for item in y_preds]\n",
194 |     "        print(text,possible_words)\n",
195 |     "        print(possible_words)\n"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "pred = Prediction(pr.tokenizer,pr.max_length)    \n",
205 |     "pred.load_model()"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "pred.predict_words(\"I would like to\",2)\n",
215 |     "pred.predict_words(\"can you please\",2)"
216 |    ]
217 |   }
218 |  ],
219 |  "metadata": {
220 |   "kernelspec": {
221 |    "display_name": "Python 3",
222 |    "language": "python",
223 |    "name": "python3"
224 |   },
225 |   "language_info": {
226 |    "codemirror_mode": {
227 |     "name": "ipython",
228 |     "version": 3
229 |    },
230 |    "file_extension": ".py",
231 |    "mimetype": "text/x-python",
232 |    "name": "python",
233 |    "nbconvert_exporter": "python",
234 |    "pygments_lexer": "ipython3",
235 |    "version": "3.6.9"
236 |   }
237 |  },
238 |  "nbformat": 4,
239 |  "nbformat_minor": 2
240 | }
241 | 


--------------------------------------------------------------------------------
/1.9-smart-compose/README.md:
--------------------------------------------------------------------------------
1 | Data Source
2 | https://www.usingenglish.com/articles/100-most-useful-emailing-phrases.html
3 | http://english.teamdev.com/resources/useful-phrases
4 | https://blog.talaera.com/business-emails-phrases
5 | 


--------------------------------------------------------------------------------
/1.9-smart-compose/data/dataset.txt:
--------------------------------------------------------------------------------
  1 | content
  2 | Dear Sir/Madam
  3 | Dear
  4 | Hello
  5 | Hi
  6 | Hi Team
  7 | Good morning Team
  8 | Good afternoon
  9 | I recently read about that
 10 | I recently heard about that
 11 | Thank you for taking the time to write to us
 12 | Thank you for taking the time to give us some feedback
 13 | Thank you for your email
 14 | Congratulations on
 15 | Hope you're having a great day!
 16 | Hope you're feeling great!
 17 | Explaining Why You're Writing
 18 | I wanted to tell you that
 19 | I am writing to tell you about
 20 | I'm writing to tell you that
 21 | This email is to confirm that
 22 | We're sending you this email because
 23 | In this email, we wanted to tell you about
 24 | We are writing to
 25 | We wish to inform you of
 26 | I'm writing concerning
 27 | I'm writing just so you know
 28 | I'm writing to remind you about
 29 | I'm writing to let you know that
 30 | This email is just to let you know that
 31 | Just a quick reminder that
 32 | I would like to inform you that
 33 | This is just to let you know that
 34 | Wanted to give you a friendly reminder that
 35 | I am contacting you for the following reason
 36 | Just a quick email to see how you're doing
 37 | I just wanted to let you know that
 38 | So happy we reconnected after this time
 39 | So glad that we're in touch again
 40 | Can't believe it's been a year since we last spoke! Feels like yesterday
 41 | Glad you're back in our life!
 42 | Glad to see our old friends again!
 43 | It's always nice to get in touch with old friends!
 44 | Long time no see! Glad to hear from you again
 45 | I highly recommend visiting our new Knowledge Base
 46 | I highly recommend checking out this new article
 47 | Here is a copy of the information
 48 | We brought together some of the best tutorials which
 49 | What's new since you left? Lots Starting with
 50 | We're confident you'll see big improvements since
 51 | We know you're busy but we'd hate to see you miss out on this opportunity!
 52 | You can do so much with
 53 | Learn how to
 54 | We thought you might find this useful
 55 | Referring to the Previous Contact
 56 | Thank you for your letter
 57 | Thank you for contacting us
 58 | In reply to your request
 59 | Thank you for your letter regarding
 60 | Regarding our telephone conversation yesterday
 61 | Further to our meeting last week
 62 | I would just like to confirm the main points we discussed on Tuesday
 63 | I'm writing in reply to your email
 64 | In reply to your email
 65 | We understand from your email that you're interested in
 66 | We talked last week about
 67 | We had a phone call
 68 | It was nice to hear from you
 69 | I was glad to catch up
 70 | Making a Request
 71 | We would appreciate it if you would
 72 | I would be grateful if you could
 73 | Could you possibly tell us
 74 | In addition, I would like to receive
 75 | It would be helpful if you could send us
 76 | I am interested in receiving
 77 | I would appreciate your attention to this matter
 78 | Please let me know what action you propose to take
 79 | I would be grateful if you could send me further information about
 80 | Would it be possible to have a quick chat?
 81 | Would you mind if I took the day off
 82 | I was hoping you could do something
 83 | What would you like to do next?
 84 | Could you please send me the mail
 85 | Let's discuss your next step
 86 | It would be great if you could
 87 | Would you mind having a quick chat?
 88 | I was wondering if you could
 89 | Could you confirm these details?
 90 | Would you like me to send you the link?
 91 | Here are the details on
 92 | Furthermore
 93 | In addition, I would like to
 94 | We're glad the issues got sorted out despite the delay
 95 | For example
 96 | For instance
 97 | In other words
 98 | In order to fix this bug, we would need to research it a bit further
 99 | That's why
100 | I'm pleased to hear that
101 | First of all
102 | Firstly
103 | Secondly
104 | There seems to be a problem with the new feature
105 | Here are the possible solutions
106 | While running the tests we've discovered that it is not working properly
107 | Which option would you like us to work on?
108 | What would you like to go with?
109 | Which solution works best for you?
110 | We've found a bug
111 | Here's how we would like to take care of this issue
112 | How would you like us to solve this issue?
113 | We can see three options
114 | There are two ways to solve this
115 | We've come up with a workaround for this issue
116 | This solution is better but it will take longer to implement
117 | If we go with the first option we might run into some problems with in the future
118 | Could you please clarify what you would like us to do about it?
119 | If I understood you correctly you would like us to
120 | What exactly do you mean by?
121 | Could you please clarify when you would like us to finish this?
122 | When exactly are you expecting to have this feature?
123 | Could you explain what you mean by
124 | Could you be more specific?
125 | Could you please repeat it?
126 | Could you repeat what you said?
127 | Could you give us some more details?
128 | When would it be convenient for you too?
129 | Which option would work best for you?
130 | What would you like us to do next?
131 | Would you like to?
132 | Would you prefer to?
133 | Would you rather or?
134 | How would you feel about?
135 | What do you feel is the next step?
136 | Is it possible to?
137 | Could you check it, please?
138 | Just book time on my calendar and I can answer all your questions
139 | Help us give you the best advice by telling us a bit more about your project
140 | I'd love it if you could walk me through your project
141 | This may be a great time to take a look at our Knowledge Base
142 | If you're interested drop me a line and we can have a quick chat to discuss your further steps
143 | Come check out what's new and get inspired!
144 | Could you please keep us updated on this?
145 | If you have any questions please email or call me
146 | Please feel free to contact me anytime
147 | If there's anything I can do for you please let me know
148 | You can drop a mail if there's anything you'd like to discuss
149 | Feel free to call me
150 | Let's discuss this at the meeting if you don't mind
151 | At our last meeting, we talked about
152 | At the meeting, we agreed to
153 | We'd like to have a meeting about
154 | Let's have a meeting sometime this week
155 | How about taking this over at a meeting?
156 | Why don't we talk this over at a meeting?
157 | I'd be glad to tell you more about this at the meeting today
158 | This issue came up at the meeting we had on
159 | Let's have a meeting to discuss this issue
160 | I've set up a meeting
161 | Our company would be pleased to work with you
162 | If there's anything I can help you with just let me know
163 | We would be happy to help
164 | Thanking
165 | Thank you for your consideration
166 | I appreciate that you took the time to give me these details
167 | Thanks for taking the time to give us your feedback
168 | Thank you for writing to us
169 | Thanks a lot for everything
170 | Thank you for your time
171 | Thank you very much for
172 | Many thanks for
173 | You're so helpful
174 | That's thoughtful of you
175 | I appreciate your help
176 | Thank you for your patience
177 | Thank you for clearing this up
178 | Thank you for helping us in this matter
179 | We are pleased to announce that
180 | We are pleased to inform you that
181 | We have some good news for you
182 | It is my pleasure to let you know that
183 | I'm glad to tell you that
184 | You will be pleased to learn that
185 | We regret to inform you that
186 | I regret to inform you that due to a mistake in our database
187 | Unfortunately, we cannot
188 | we are unable to
189 | After careful consideration, we have decided to
190 | I'm afraid it would be impossible to do
191 | Despite my best efforts it has proved to be impossible to
192 | I'm afraid I've got some bad news for you
193 | We apologize for the delay
194 | I regret any inconvenience caused by
195 | I apologize for the problems you've had
196 | Please accept my apologies
197 | Sorry for any inconveniences this situation may have caused
198 | I would like to apologize for the delay
199 | I would like to apologize for the inconvenience
200 | Once again I apologize for any inconveniences
201 | We are sorry for the delay
202 | I'd like to apologize for making you wait
203 | Sorry to keep you waiting
204 | I'm sorry but
205 | Sorry again for
206 | Please confirm
207 | We'll get back to you as soon as we can
208 | Thank you for your order
209 | We're glad that you chose us to help you with this!
210 | I am attaching
211 | Please find the attachment
212 | You will find attached
213 | I've attached the file for your review
214 | The attached file contains
215 | Here's the attachment we discussed
216 | Please take a look at the attached file
217 | Take a look at the attachment I've attached to this email
218 | I've attached
219 | If we can be of any further assistance please let us know
220 | For further details
221 | If you require more information
222 | Thank you for taking this into consideration
223 | We hope you are happy with this arrangement
224 | We look forward to a successful working relationship in the future
225 | We would be very pleased to do business with your company
226 | I would be happy to have an opportunity to work with your firm
227 | I look forward to seeing you next week
228 | Looking forward to hearing from you
229 | I would appreciate your reply
230 | I look forward to doing business with you in the future
231 | I enjoyed working with you and look forward to
232 | Thank you once more for your help in this matter
233 | If you require any further information please let me know
234 | Let me know if you need any help
235 | If I can help in any way please do not hesitate to contact me
236 | If there's anything I can do to help you just drop me a line
237 | Do not hesitate to contact us again
238 | if there's anything we can help you with
239 | Thank you for your help
240 | I'd love to hear your feedback
241 | Hope to hear from you soon
242 | Thank you for your cooperation
243 | I'd appreciate your reply
244 | Please let me know what you think
245 | Thanks again
246 | Thank you for taking your time
247 | Happy holidays!
248 | Sincerely
249 | Yours sincerely
250 | Sincerely yours
251 | Yours faithfully
252 | Kind regards
253 | Yours truly
254 | Many thanks
255 | Regards
256 | Best regards
257 | With best wishes
258 | Best wishes
259 | Best
260 | All the best
261 | Thanks
262 | Have a great weekend!
263 | Have a wonderful day!
264 | Have a productive day!
265 | I hope you had a good weekend
266 | I hope you had a great trip
267 | Hope you had a nice break
268 | I hope you are well
269 | I hope all is well
270 | Hope you're enjoying your holiday
271 | I hope this email finds you well
272 | I hope you enjoyed the event
273 | I'm glad we had a chance to chat at the convention
274 | It was great to see you on Thursday
275 | It was a pleasure to meet you yesterday
276 | I am writing to you about our last meeting
277 | I am writing to you with regards to concerning
278 | I am writing to you regarding
279 | I am writing to ask
280 | I am writing to let you know
281 | I am writing to confirm
282 | I am writing to check
283 | I am writing to invite you
284 | I am writing to update you on
285 | I am writing to you to follow up on
286 | I am contacting you to inform
287 | I am reaching out because
288 | This is just a quick note to
289 | This is just a quick reminder
290 | I wanted to let you know that
291 | Might I take a moment of your time to
292 | I just got your request for
293 | I just read your email about
294 | As we discussed I would like to send you
295 | Thank you for your email about
296 | Thanks for your email
297 | Thanks for your feedback on
298 | Thanks for your invitation
299 | Thanks for your suggestion
300 | Thanks for sending
301 | Thanks for asking about
302 | Thanks for your quick reply
303 | Thanks for getting back to me so quickly
304 | Thank you for reaching out to me
305 | 1d Apologizing
306 | Sorry for my late reply
307 | Sorry it took me so long to get back to you
308 | I apologize for the late response
309 | Sorry it's been so long since my last email
310 | I was sorry to hear about
311 | Please accept our apologies for any inconvenience caused
312 | I'm enclosing the file
313 | The parts in bold are the changes I made
314 | The parts in red are the changes I made
315 | The parts in blue are the changes we made
316 | Here's the document that you asked for
317 | Please take a look at the file I've attached to this email
318 | Could you please?
319 | Could you possibly tell me?
320 | Can you please fill out this form?
321 | I'd appreciate it if you could
322 | I'd be very grateful if you could
323 | It would be very helpful if you could send
324 | If possible I'd like to know more about
325 | Please find my two main questions below
326 | 2c Asking for clarifications
327 | I didn't fully understand
328 | Could you please explain that again?
329 | I didn't quite get your point
330 | Could you repeat what you said about it?
331 | If you could please shed some light on this topic I would appreciate it
332 | Could you please clarify?
333 | If I understood you correctly you would like me to
334 | What exactly do you mean by
335 | In other words, would you like us to
336 | Thank you for letting me know
337 | Thank you for the heads up
338 | Thank you for the notice
339 | Please note
340 | Quick reminder
341 | Just a friendly reminder that
342 | Thank you for sharing
343 | I'd like to inform you that
344 | Thanks for keeping me in the loop
345 | Please keep me informed
346 | Please keep me posted
347 | Please keep me updated
348 | Please keep me in the loop
349 | Please let me know if this is OK with you
350 | What are your thoughts on this?
351 | What do you think?
352 | we're waiting for approval
353 | We just need the thumbs up
354 | We just need the the green light
355 | You totally have the green light!
356 | He approved of it so you can go ahead with the project
357 | I'd like to schedule a meeting if you are available
358 | I am available on
359 | if that's convenient for you
360 | Would you be available on
361 | If so I'll send you an invite shortly
362 | Can you make it on
363 | If so I'll book accordingly
364 | I'm afraid I can't make it on
365 | We need to reschedule our meeting
366 | We need to postpone our meeting
367 | We need to put back our meeting
368 | We need to cancel our meeting
369 | We need to move our meeting
370 | We need to rearrange our meeting
371 | We are sorry to inform you that the interview scheduled for
372 | We are sorry to inform you that the meeting scheduled for
373 | Unfortunately
374 | I'm afraid it will not be possible to
375 | Unfortunately, I have to tell you that
376 | I'm afraid that we can't
377 | I regret to inform you that
378 | After careful consideration, we have decided
379 | It's against company policy to
380 | I tried my best but
381 | Despite my best efforts
382 | I can't see how
383 | I'm sorry but it's out of my hands
384 | I'm afraid I won't be able to
385 | I'm sorry to tell you that
386 | Do you need a reply?
387 | Are you asking for a favor or you are meeting soon?
388 | These sentences are perfect for those moments!
389 | Looking forward to hearing from you soon
390 | I look forward to hearing from you soon
391 | Please let me know if this works
392 | Please let me know if you are available
393 | Please let me know if that sounds good
394 | Please let me know if you can
395 | Please let me know if you can help
396 | Please let me know if you need to reschedule
397 | I look forward to seeing
398 | I look forward to meeting you
399 | See you on next week
400 | Thank you in advance
401 | Thank you for everything
402 | Cheers
403 | Any feedback you can give me on this would be greatly appreciated
404 | Any feedback you can give me on this would be highly appreciated
405 | Any feedback you can give me on this would be much appreciated
406 | If you could have it ready
407 | I would appreciate it
408 | I would appreciate your help in this matter
409 | 3b Offering help or information
410 | I hope you find this helpful
411 | I hope it's clearer now
412 | I hope that answers all your questions
413 | If you have any questions
414 | If you have more questions
415 | In the meantime, if you need any more information
416 | If you need more information
417 | If you need more info
418 | If you need further information
419 | I know that's a lot to take in so let me know if anything I've said doesn't make sense
420 | please do not hesitate to contact me
421 | please feel free to contact me
422 | please feel free to get in touch
423 | please let me know
424 | drop me an email
425 | drop me a mail
426 | Thank you for your understanding
427 | Thanks again for your understanding
428 | Thanks for your patience
429 | Once again please accept our apologies for any inconvenience caused
430 | Once again please accept our apologies for the inconvenience caused
431 | Once again please accept our apologies for the delay
432 | Once again please accept our apologies for the misunderstanding
433 | I hope this is okay with you
434 | I hope we can find a solution soon
435 | I hope you can understand
436 | Sorry I couldn't be of more help
437 | Good morning
438 | Hope you're having a great!
439 | This email is to confirm that we've received your payment
440 | I'm sending you this email because
441 | In this email, I wanted to tell you about
442 | I highly recommend
443 | It was nice to hear from you yesterday
444 | I was glad to catch up yesterday
445 | Could you possibly tell us more
446 | Could you please send me the link
447 | Just wondered if you could send me a copy
448 | We're glad that the issues got sorted out despite the delay
449 | Talking about Problems and Solutions
450 | However, the second solution will take much longer and we cannot give even a rough estimate at the moment
451 | We'd like to research this problem a bit more to give you a more detailed list of options
452 | Could you please clarify what you would like us to do about
453 | I didn't quite get your point about
454 | Could you repeat what you said about
455 | Could you give us some more details on the
456 | When would it be convenient for you to
457 | Have you given any additional consideration to
458 | Could you do something?
459 | Talking about Meetings
460 | This issue came up at the meeting we had on Friday
461 | Here's the link
462 | You will be pleased to hear that
463 | Giving Bad News
464 | I'm afraid it would not be possible toThan
465 | That's not possible
466 | I can't see any way to
467 | It's out of my hands
468 | Talking about Vacations and Holidays
469 | I'm planning a vacation
470 | Would that be all right with you?
471 | We have a national holiday in our country on
472 | Therefore our office will not be working on that date
473 | I'm currently on vacation
474 | If you have questions please drop a mail
475 | When would it be all right for me to have a week-long vacation?
476 | I'm going to be on vacation
477 | going to have a day off
478 | Today I am not feeling well
479 | I'm on vacation now until
480 | I will read and answer all emails as soon as I get back
481 | If this is urgent please contact
482 | Please find attached
483 | The attached files contain
484 | Please take a look at the attachment
485 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLP-Tutorial
2 | Natural Language Processing
3 | https://www.topbots.com/generalized-language-models-tasks-datasets/
4 | 
5 | #### If you find this repository helpful, a star ⭐ would be greatly appreciated!
6 | #### Created by Joydeb Mondal
7 | 


--------------------------------------------------------------------------------
/simple-efficient-summarizer.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport os\nimport tensorflow as tf","execution_count":11,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n    def __init__(self):\n        data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n        self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n        ","execution_count":19,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling load data object"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":20,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4}


--------------------------------------------------------------------------------
	query	intent	category
8770	rate the current novel 5 stars	RateBook	4
6557	Find the schedule for Kingsman: The Secret Ser...	SearchScreeningEvent	3
721	find Bells Break Their Towers , a video game	SearchCreativeWork	0
229	show creativity of A Catholic Education	SearchCreativeWork	0
3680	Will it be warm in Powersville Guam 23 hours f...	GetWeather	1
	query	intent	category
699	I want to see Married to the Enemy 2 at a cine...	SearchScreeningEvent	3
22	Please look up the song The Mad Magician .	SearchCreativeWork	0
139	rate the current essay zero out of 6 stars	RateBook	4
599	Add the album to my Club Hits playlist.	AddToPlaylist	6
16	Please help me find the Late Night Heartbroken...	SearchCreativeWork	0