├── 1.1-language-model ├── data.txt ├── language_model_keras.ipynb ├── language_model_pytorch.ipynb └── language_model_torch.ipynb ├── 1.10-question-answering └── question_answering_keras.ipynb ├── 1.12-text-summarization └── text-summarizer.ipynb ├── 1.2-sentiment-analysis ├── sentiment_classfication_bert_keras.ipynb └── sentiment_classfication_pytorch.ipynb ├── 1.3-semantic-similarity ├── README.md ├── semantic-similarity-BERT.ipynb └── try_cf.ipynb ├── 1.4-machine-translation ├── neural_machine_translation.ipynb └── seq2seq.ipynb ├── 1.5-named-entity-recognition ├── data_making.py ├── ner_bert.ipynb ├── ner_keras.ipynb ├── simple_ner-2.0.ipynb └── simple_ner.ipynb ├── 1.6-intent-classification ├── README.md ├── intent_classfication_bert.ipynb ├── intent_classfication_bert_keras.ipynb ├── intent_classfication_keras.ipynb └── text-classification-with-bert-pytorch.ipynb ├── 1.7-entity-recognition ├── entity_recognition_keras.ipynb └── resume-entities-for-ner.zip ├── 1.8-next-word-prediction ├── cab_booking.txt └── next_word_prediction_keras.ipynb ├── 1.9-smart-compose ├── README.md ├── data │ └── dataset.txt └── smart_compose_keras.ipynb ├── README.md └── simple-efficient-summarizer.ipynb /1.1-language-model/data.txt: -------------------------------------------------------------------------------- 1 | Jack and Jill went up the hill 2 | To fetch a pail of water 3 | Jack fell down and broke his crown 4 | And Jill came tumbling after 5 | -------------------------------------------------------------------------------- /1.1-language-model/language_model_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Language Model Using TensorFlow & Keras
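This notebook trains a small word-level LSTM language model on the nursery rhyme in data.txt and then generates text one word at a time. The generation loop implemented by the Prediction class at the end of the notebook boils down to the sketch below. It is a minimal, illustrative sketch only: the generate function and its signature are not part of the notebook, and it assumes a trained Keras model plus a fitted Tokenizer are already available.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate(model, tokenizer, seed_text, num_words, max_length):
    # invert the tokenizer's word -> index mapping so predicted indices can be decoded
    idx2word = {v: k for k, v in tokenizer.word_index.items()}
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([encoded], maxlen=max_length - 1, padding='pre')
        probs = model.predict(padded)                 # shape: (1, vocab_size)
        next_id = int(np.argmax(probs, axis=-1)[0])
        word = idx2word.get(next_id)
        if word is None:                              # index 0 is the padding class and maps to no word
            break
        seed_text += ' ' + word
    return seed_text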

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "

Importing Libraries
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from numpy import array\n", 24 | "import numpy as np\n", 25 | "import tensorflow as tf\n", 26 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 27 | "from tensorflow.keras.utils import to_categorical\n", 28 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 29 | "from tensorflow.keras.models import Sequential\n", 30 | "from tensorflow.keras.layers import Dense\n", 31 | "from tensorflow.keras.layers import LSTM\n", 32 | "from tensorflow.keras.layers import Dropout\n", 33 | "from tensorflow.keras.layers import Embedding\n", 34 | "from tensorflow.keras.models import load_model\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "
Preprocessing Data
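The Preprocessing class below turns each line of data.txt into growing n-gram prefixes, pre-pads them to a common length, and splits every padded sequence into a context (all tokens but the last) and a one-hot next-word target. The same idea in a minimal standalone form is shown here; the example line is illustrative, and the actual word indices depend on the corpus the tokenizer is fitted on.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

line = "Jack and Jill went up the hill"
tok = Tokenizer()
tok.fit_on_texts([line])
encoded = tok.texts_to_sequences([line])[0]                    # one integer per word
prefixes = [encoded[:i + 1] for i in range(1, len(encoded))]   # growing n-gram prefixes
padded = pad_sequences(prefixes, maxlen=len(encoded), padding='pre')
X, y = padded[:, :-1], padded[:, -1]                           # context tokens and next-word index
y = to_categorical(y, num_classes=len(tok.word_counts) + 1)    # one-hot targets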
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 7, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "class Preprocessing():\n", 51 | " \n", 52 | " def __init__(self,input_file):\n", 53 | " self.input_data_file = input_file\n", 54 | " self.data = None\n", 55 | " self.vocab_size = None\n", 56 | " self.encoded_data = None\n", 57 | " self.max_length = None\n", 58 | " self.sequences = None\n", 59 | " self.x = None\n", 60 | " self.y = None\n", 61 | " self.tokenizer = None\n", 62 | " \n", 63 | " def load_data(self):\n", 64 | " fp = open(self.input_data_file,'r')\n", 65 | " self.data = fp.read().splitlines() \n", 66 | " fp.close()\n", 67 | " \n", 68 | " def encode_data(self):\n", 69 | " self.tokenizer = Tokenizer()\n", 70 | " self.tokenizer.fit_on_texts(self.data)\n", 71 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n", 72 | " print(self.encoded_data)\n", 73 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n", 74 | " \n", 75 | " def generate_sequence(self):\n", 76 | " seq_list = list()\n", 77 | " for item in self.encoded_data:\n", 78 | " l = len(item)\n", 79 | " for id in range(1,l):\n", 80 | " seq_list.append(item[:id+1])\n", 81 | " self.max_length = max([len(seq) for seq in seq_list])\n", 82 | " self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n", 83 | " print(self.sequences)\n", 84 | " self.sequences = array(self.sequences)\n", 85 | " \n", 86 | " def get_data(self):\n", 87 | " self.x = self.sequences[:,:-1]\n", 88 | " self.y = self.sequences[:,-1]\n", 89 | " print(\"y before:\",self.y)\n", 90 | " self.y = to_categorical(self.y,num_classes=self.vocab_size)\n", 91 | " print(\"y After:\",self.y)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "[[2, 1, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13], [2, 14, 15, 1, 16, 17, 18], [1, 3, 19, 20, 21]]\n", 104 | "[[ 0 0 0 0 0 2 1]\n", 105 | " [ 0 0 0 0 2 1 3]\n", 106 | " [ 0 0 0 2 1 3 4]\n", 107 | " [ 0 0 2 1 3 4 5]\n", 108 | " [ 0 2 1 3 4 5 6]\n", 109 | " [ 2 1 3 4 5 6 7]\n", 110 | " [ 0 0 0 0 0 8 9]\n", 111 | " [ 0 0 0 0 8 9 10]\n", 112 | " [ 0 0 0 8 9 10 11]\n", 113 | " [ 0 0 8 9 10 11 12]\n", 114 | " [ 0 8 9 10 11 12 13]\n", 115 | " [ 0 0 0 0 0 2 14]\n", 116 | " [ 0 0 0 0 2 14 15]\n", 117 | " [ 0 0 0 2 14 15 1]\n", 118 | " [ 0 0 2 14 15 1 16]\n", 119 | " [ 0 2 14 15 1 16 17]\n", 120 | " [ 2 14 15 1 16 17 18]\n", 121 | " [ 0 0 0 0 0 1 3]\n", 122 | " [ 0 0 0 0 1 3 19]\n", 123 | " [ 0 0 0 1 3 19 20]\n", 124 | " [ 0 0 1 3 19 20 21]]\n", 125 | "y before: [ 1 3 4 5 6 7 9 10 11 12 13 14 15 1 16 17 18 3 19 20 21]\n", 126 | "y After: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 127 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 128 | " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 129 | " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 130 | " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 131 | " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 132 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 133 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 134 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 135 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]\n", 136 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 137 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n", 138 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", 139 | " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 140 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n", 141 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", 142 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n", 143 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 144 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n", 145 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", 146 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "pr = Preprocessing('data.txt')\n", 152 | "pr.load_data()\n", 153 | "pr.encode_data()\n", 154 | "pr.generate_sequence()\n", 155 | "pr.get_data()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "

Model" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "class Model():\n", 172 | " def __init__(self,params):\n", 173 | " self.model = None\n", 174 | " self.history = None\n", 175 | " self.x = None\n", 176 | " self.y = None\n", 177 | " self.vocab_size = params['vocab_size']\n", 178 | " self.max_len = params['max_len']\n", 179 | " self.activation = params['activation']\n", 180 | " self.optimizer = params['optimizer']\n", 181 | " self.epochs = params['epochs']\n", 182 | " self.metrics = params['metrics']\n", 183 | " \n", 184 | " \n", 185 | " def create_model(self):\n", 186 | " self.model = Sequential()\n", 187 | " self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n", 188 | " self.model.add(LSTM(50))\n", 189 | " self.model.add(Dropout(0.1))\n", 190 | " self.model.add(Dense(self.vocab_size,activation=self.activation))\n", 191 | " self.model.compile(loss='categorical_crossentropy',optimizer=self.optimizer,metrics=self.metrics)\n", 192 | " print(self.model.summary())\n", 193 | " def run(self):\n", 194 | " self.history = self.model.fit(self.x,self.y,epochs=self.epochs)\n", 195 | " \n", 196 | " def save(self):\n", 197 | " self.model.save(\"lang_model.h5\")\n", 198 | " " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "params = {\"activation\":\"softmax\",\"epochs\":500,\"verbose\":2,\"loss\":\"categorical_crossentropy\",\n", 208 | " \"optimizer\":\"adam\",\"metrics\":['accuracy'],\"vocab_size\":pr.vocab_size,\"max_len\":pr.max_length}\n", 209 | "model_obj = Model(params)\n", 210 | "model_obj.x = pr.x\n", 211 | "model_obj.y = pr.y\n", 212 | "model_obj.create_model()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "model_obj.run()\n", 222 | "model_obj.save()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "

Prediction" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "class Prediction():\n", 239 | " def __init__(self,tokenizer,max_len):\n", 240 | " self.model = None\n", 241 | " self.tokenizer = tokenizer\n", 242 | " self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n", 243 | " self.max_length = max_len\n", 244 | " \n", 245 | " def load_model(self):\n", 246 | " self.model = load_model(\"lang_model.h5\")\n", 247 | " \n", 248 | " def predict_sequnce(self,text,num_words):\n", 249 | " for id in range(num_words):\n", 250 | " encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n", 251 | " padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n", 252 | " y_pred = self.model.predict(padded_data)\n", 253 | " y_pred = np.argmax(y_pred)\n", 254 | " predict_word = self.idx2word[y_pred]\n", 255 | " text += ' ' + predict_word\n", 256 | " return text" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "pred = Prediction(pr.tokenizer,pr.max_length) \n", 266 | "pred.load_model()\n", 267 | "print(pred.predict_sequnce(\"Jack and\",5))\n", 268 | "print(pred.predict_sequnce('And Jill', 4))\n", 269 | "print(pred.predict_sequnce('fell down', 5))\n", 270 | "print(pred.predict_sequnce('pail of', 3))" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "Python 3", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.6.9" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /1.1-language-model/language_model_pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn as nn\n", 11 | "import torch.nn.functional as F\n", 12 | "\n", 13 | "import numpy as np\n", 14 | "from collections import Counter\n", 15 | "import os" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 16, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "batch_size = 1\n", 25 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 17, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "Vocabulary size 22\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "class PreProcessing():\n", 43 | " \n", 44 | " def get_data_from_file(self,train_file, batch_size, seq_size):\n", 45 | " with open(train_file, 'r', encoding='utf-8') as f:\n", 46 | " text = f.read()\n", 47 | " text = text.split()\n", 48 | "\n", 49 | " word_counts = Counter(text)\n", 50 | " sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)\n", 51 | " int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}\n", 52 | " vocab_to_int = {w: k for k, w in int_to_vocab.items()}\n", 53 | " n_vocab = 
len(int_to_vocab)\n", 54 | "\n", 55 | " print('Vocabulary size', n_vocab)\n", 56 | "\n", 57 | " int_text = [vocab_to_int[w] for w in text]\n", 58 | " num_batches = int(len(int_text) / (seq_size * batch_size))\n", 59 | " in_text = int_text[:num_batches * batch_size * seq_size]\n", 60 | " out_text = np.zeros_like(in_text)\n", 61 | " out_text[:-1] = in_text[1:]\n", 62 | " out_text[-1] = in_text[0]\n", 63 | " in_text = np.reshape(in_text, (batch_size, -1))\n", 64 | " out_text = np.reshape(out_text, (batch_size, -1))\n", 65 | " return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text\n", 66 | "\n", 67 | "\n", 68 | " def get_batches(self,in_text, out_text, batch_size, seq_size):\n", 69 | " num_batches = np.prod(in_text.shape) // (seq_size * batch_size)\n", 70 | " for i in range(0, num_batches * seq_size, seq_size):\n", 71 | " yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]\n", 72 | " \n", 73 | " \n", 74 | "preprocess_obj = PreProcessing()\n", 75 | "int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = preprocess_obj.get_data_from_file(\"data.txt\",4,4)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 18, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "array([[ 0, 1, 2, 3],\n", 87 | " [ 4, 5, 6, 7],\n", 88 | " [ 8, 9, 10, 11],\n", 89 | " [12, 0, 13, 14]])" 90 | ] 91 | }, 92 | "execution_count": 18, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "in_text" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 19, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "array([[ 1, 2, 3, 4],\n", 110 | " [ 5, 6, 7, 8],\n", 111 | " [ 9, 10, 11, 12],\n", 112 | " [ 0, 13, 14, 0]])" 113 | ] 114 | }, 115 | "execution_count": 19, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "out_text" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 26, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "class RNNModule(nn.Module):\n", 131 | " def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):\n", 132 | " super(RNNModule, self).__init__()\n", 133 | " self.seq_size = seq_size\n", 134 | " self.lstm_size = lstm_size\n", 135 | " self.embedding = nn.Embedding(n_vocab, embedding_size)\n", 136 | " self.lstm = nn.LSTM(embedding_size,\n", 137 | " lstm_size,\n", 138 | " batch_first=True)\n", 139 | " self.dense = nn.Linear(lstm_size, n_vocab)\n", 140 | "\n", 141 | " def forward(self, x, prev_state):\n", 142 | " embed = self.embedding(x)\n", 143 | " output, state = self.lstm(embed, prev_state)\n", 144 | " logits = self.dense(output)\n", 145 | "\n", 146 | " return logits, state\n", 147 | "\n", 148 | " def zero_state(self, batch_size):\n", 149 | " return (torch.zeros(1, batch_size, self.lstm_size),\n", 150 | " torch.zeros(1, batch_size, self.lstm_size))\n", 151 | " \n", 152 | " def get_loss_and_train_op(self, net, lr=0.001):\n", 153 | " criterion = nn.CrossEntropyLoss()\n", 154 | " optimizer = torch.optim.Adam(net.parameters(), lr=lr)\n", 155 | "\n", 156 | " return criterion, optimizer\n", 157 | " \n", 158 | " def train(self):\n", 159 | " iteration = 0\n", 160 | " gradients_norm=5\n", 161 | " for e in range(200):\n", 162 | " batches = preprocess_obj.get_batches(in_text, out_text, batch_size, seq_size)\n", 163 | " state_h, state_c = net.zero_state(batch_size)\n", 164 | " state_h = state_h.to(device)\n", 165 | " state_c = 
state_c.to(device)\n", 166 | " for x, y in batches:\n", 167 | " iteration += 1\n", 168 | " net.train()\n", 169 | "\n", 170 | " optimizer.zero_grad()\n", 171 | "\n", 172 | " x = torch.tensor(x).to(device)\n", 173 | " y = torch.tensor(y).to(device)\n", 174 | "\n", 175 | " logits, (state_h, state_c) = net(x, (state_h, state_c))\n", 176 | " loss = criterion(logits.transpose(1, 2), y)\n", 177 | "\n", 178 | " loss_value = loss.item()\n", 179 | "\n", 180 | " loss.backward()\n", 181 | "\n", 182 | " state_h = state_h.detach()\n", 183 | " state_c = state_c.detach()\n", 184 | "\n", 185 | " _ = torch.nn.utils.clip_grad_norm_(\n", 186 | " net.parameters(), gradients_norm)\n", 187 | "\n", 188 | " optimizer.step()\n", 189 | "\n", 190 | " if iteration % 100 == 0:\n", 191 | " print('Epoch: {}/{}'.format(e, 200),\n", 192 | " 'Iteration: {}'.format(iteration),\n", 193 | " 'Loss: {}'.format(loss_value))\n", 194 | "\n", 195 | " if iteration % 1000 == 0:\n", 196 | " torch.save(net.state_dict(),\n", 197 | " 'checkpoint_pt/model-{}.pth'.format(iteration))\n", 198 | "seq_size = 4\n", 199 | "embedding_size = 22\n", 200 | "lstm_size = 64\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 27, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "net = RNNModule(n_vocab, seq_size,embedding_size, lstm_size)\n", 210 | "net = net.to(device)\n", 211 | "criterion, optimizer = net.get_loss_and_train_op(net, 0.01)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 28, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "ename": "RecursionError", 221 | "evalue": "maximum recursion depth exceeded", 222 | "output_type": "error", 223 | "traceback": [ 224 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 225 | "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)", 226 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 227 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 228 | "... 
last 1 frames repeated, from the frame below ...\n", 229 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 230 | "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "net.train()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.9" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /1.1-language-model/language_model_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Importing Libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 191, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import torch\n", 19 | "import torch.nn as nn\n", 20 | "import torch.optim as optim\n", 21 | "from torch.autograd import Variable" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 192, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "class Tokenizer():\n", 31 | " def fit_on_texts(self,list_data):\n", 32 | " word_list = \" \".join(list_data).split()\n", 33 | " self.word_counts = list(set(word_list))\n", 34 | " self.word_dict = {w: i for i, w in enumerate(self.word_counts)}\n", 35 | " self.number_dict = {i: w for i, w in enumerate(self.word_counts)}\n", 36 | " \n", 37 | " def texts_to_sequences(self,data):\n", 38 | " encoded_sequence = list()\n", 39 | " for item in data:\n", 40 | " encoded_sequence.append([self.word_dict[word] for word in item.split()])\n", 41 | " return encoded_sequence\n", 42 | " \n", 43 | "def pad_sequences(data,padding='pre',padding_value=0):\n", 44 | " sequence = None\n", 45 | " if isinstance(data,list):\n", 46 | " maxlen = max(len(item) for item in data)\n", 47 | " \n", 48 | " if padding == 'pre':\n", 49 | " for idx in range(len(data)):\n", 50 | " data[idx] = [padding_value]*(maxlen-len(data[idx])) + data[idx]\n", 51 | " else:\n", 52 | " for idx in range(len(data)):\n", 53 | " data[idx] = data[idx]+ [padding_value]*(maxlen-len(data[idx]))\n", 54 | " \n", 55 | " return data\n", 56 | "def to_categorical(data, nb_classes):\n", 57 | " targets = np.array(data).reshape(-1)\n", 58 | " return np.eye(nb_classes)[targets]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 195, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "class Preprocessing():\n", 68 | " \n", 69 | " def __init__(self,input_file):\n", 70 | " self.input_data_file = input_file\n", 71 | " self.data = None\n", 72 | " self.vocab_size = None\n", 73 | " self.encoded_data = None\n", 74 | " self.max_length = None\n", 75 | " self.sequences = None\n", 76 | " self.x = None\n", 77 | " self.y = None\n", 78 | " self.tokenizer = None\n", 79 | " \n", 80 | " def load_data(self):\n", 81 | " fp = open(self.input_data_file,'r')\n", 82 | " self.data = fp.read().splitlines() \n", 83 | " fp.close()\n", 84 | " \n", 85 | " def encode_data(self):\n", 86 | " self.tokenizer = Tokenizer()\n", 87 | " self.tokenizer.fit_on_texts(self.data)\n", 88 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n", 89 | " print(self.encoded_data)\n", 90 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n", 91 | " \n", 92 | " def generate_sequence(self):\n", 93 | " seq_list = list()\n", 94 | " for item in self.encoded_data:\n", 95 | " l = len(item)\n", 96 | " for id in range(1,l):\n", 97 | " seq_list.append(item[:id+1])\n", 98 | " #print(seq_list[0])\n", 99 | " print(seq_list)\n", 100 | " self.sequences = pad_sequences(seq_list,padding='pre', padding_value=0)\n", 101 | " print(self.sequences)\n", 102 | " self.sequences = array(self.sequences)\n", 103 | " \n", 104 | " def get_data(self):\n", 105 | " self.x = self.sequences[:,:-1]\n", 106 | " self.y = self.sequences[:,-1]\n", 107 | " print(self.y)\n", 108 | " self.y = to_categorical(self.y,nb_classes=self.vocab_size)\n", 109 | " print(\"Y:{}\".format(self.y))\n", 110 | " print(\"X:{}\".format(self.x))\n", 111 | " return self.x,self.y" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | 
"execution_count": 196, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "[[2, 12, 16, 4, 3, 14, 9], [17, 0, 18, 6, 19, 21], [2, 10, 8, 12, 11, 7, 15], [1, 16, 5, 20, 13]]\n", 124 | "[[2, 12], [2, 12, 16], [2, 12, 16, 4], [2, 12, 16, 4, 3], [2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [17, 0], [17, 0, 18], [17, 0, 18, 6], [17, 0, 18, 6, 19], [17, 0, 18, 6, 19, 21], [2, 10], [2, 10, 8], [2, 10, 8, 12], [2, 10, 8, 12, 11], [2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [1, 16], [1, 16, 5], [1, 16, 5, 20], [1, 16, 5, 20, 13]]\n", 125 | "[[0, 0, 0, 0, 0, 2, 12], [0, 0, 0, 0, 2, 12, 16], [0, 0, 0, 2, 12, 16, 4], [0, 0, 2, 12, 16, 4, 3], [0, 2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [0, 0, 0, 0, 0, 17, 0], [0, 0, 0, 0, 17, 0, 18], [0, 0, 0, 17, 0, 18, 6], [0, 0, 17, 0, 18, 6, 19], [0, 17, 0, 18, 6, 19, 21], [0, 0, 0, 0, 0, 2, 10], [0, 0, 0, 0, 2, 10, 8], [0, 0, 0, 2, 10, 8, 12], [0, 0, 2, 10, 8, 12, 11], [0, 2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [0, 0, 0, 0, 0, 1, 16], [0, 0, 0, 0, 1, 16, 5], [0, 0, 0, 1, 16, 5, 20], [0, 0, 1, 16, 5, 20, 13]]\n", 126 | "[12 16 4 3 14 9 0 18 6 19 21 10 8 12 11 7 15 16 5 20 13]\n", 127 | "Y:[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 128 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", 129 | " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 130 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 131 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 132 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 133 | " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 134 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", 135 | " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 136 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n", 137 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", 138 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 139 | " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 140 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 141 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 142 | " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 143 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n", 144 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", 145 | " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 146 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n", 147 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]]\n", 148 | "X:[[ 0 0 0 0 0 2]\n", 149 | " [ 0 0 0 0 2 12]\n", 150 | " [ 0 0 0 2 12 16]\n", 151 | " [ 0 0 2 12 16 4]\n", 152 | " [ 0 2 12 16 4 3]\n", 153 | " [ 2 12 16 4 3 14]\n", 154 | " [ 0 0 0 0 0 17]\n", 155 | " [ 0 0 0 0 17 0]\n", 156 | " [ 0 0 0 17 0 18]\n", 157 | " [ 0 0 17 0 18 6]\n", 158 | " [ 0 17 0 18 6 19]\n", 159 | " [ 0 0 0 0 0 2]\n", 160 | " [ 0 0 0 0 2 10]\n", 161 | " [ 0 0 0 2 10 8]\n", 162 | " [ 0 0 2 10 8 12]\n", 163 | " [ 0 2 10 8 12 11]\n", 164 | " [ 2 10 8 12 11 7]\n", 165 | " [ 0 0 0 0 0 1]\n", 166 | " [ 0 0 0 0 1 16]\n", 167 | " [ 0 0 0 1 16 5]\n", 168 | " [ 0 0 1 16 5 20]]\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "pr = Preprocessing('data.txt')\n", 174 | "pr.load_data()\n", 175 | "pr.encode_data()\n", 176 | "pr.generate_sequence()\n", 177 | "x,y = pr.get_data()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 184, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "n_step = 1 # n-1 in paper\n", 187 | "n_hidden = 1 # h in paper\n", 188 | "m = 1 # m in paper\n", 189 | "n_class = pr.vocab_size\n", 190 | "dtype = torch.FloatTensor\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 178, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "class NNLM(nn.Module):\n", 200 | " def __init__(self):\n", 201 | " super(NNLM, self).__init__()\n", 202 | " self.C = nn.Embedding(n_class, m)\n", 203 | " self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))\n", 204 | " self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))\n", 205 | " self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))\n", 206 | " self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))\n", 207 | " self.b = nn.Parameter(torch.randn(n_class).type(dtype))\n", 208 | "\n", 209 | " def forward(self, X):\n", 210 | " X = self.C(X)\n", 211 | " X = X.view(-1, n_step * m) # [batch_size, n_step * n_class]\n", 212 | " tanh = torch.tanh(self.d + torch.mm(X, self.H)) # [batch_size, n_hidden]\n", 213 | " output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U) # [batch_size, n_class]\n", 214 | " return output\n", 215 | " \n", 216 | "def train(x,y):\n", 217 | " model = NNLM()\n", 218 | " criterion = nn.CrossEntropyLoss()\n", 219 | " optimizer = optim.Adam(model.parameters(), lr=0.001)\n", 220 | " # Training\n", 221 | " for epoch in range(100):\n", 222 | "\n", 223 | " optimizer.zero_grad()\n", 224 | " output = model(x)\n", 225 | "\n", 226 | " # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\n", 227 | " loss = criterion(output, y)\n", 228 | " if (epoch + 1)%1000 == 0:\n", 229 | " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", 230 | "\n", 231 | " loss.backward()\n", 232 | " optimizer.step()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 179, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "ename": "ValueError", 242 | "evalue": "Expected input batch_size (126) to match target batch_size (21).", 243 | "output_type": "error", 244 | "traceback": [ 245 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 246 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 247 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 248 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;31m# output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m1000\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Epoch:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'%04d'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'cost ='\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'{:.6f}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 249 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 550\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 551\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 250 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m 930\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 931\u001b[0m return F.cross_entropy(input, target, weight=self.weight,\n\u001b[0;32m--> 932\u001b[0;31m ignore_index=self.ignore_index, reduction=self.reduction)\n\u001b[0m\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 934\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 251 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m 2315\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msize_average\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2316\u001b[0m \u001b[0mreduction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlegacy_get_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize_average\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2317\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlog_softmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2319\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 252 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m 2111\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m 
\u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2112\u001b[0m raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'\n\u001b[0;32m-> 2113\u001b[0;31m .format(input.size(0), target.size(0)))\n\u001b[0m\u001b[1;32m 2114\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2115\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_enum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 253 | "\u001b[0;31mValueError\u001b[0m: Expected input batch_size (126) to match target batch_size (21)." 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "x = Variable(torch.LongTensor(x))\n", 259 | "y = Variable(torch.LongTensor(y))\n", 260 | "train(x,y)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Python 3", 274 | "language": "python", 275 | "name": "python3" 276 | }, 277 | "language_info": { 278 | "codemirror_mode": { 279 | "name": "ipython", 280 | "version": 3 281 | }, 282 | "file_extension": ".py", 283 | "mimetype": "text/x-python", 284 | "name": "python", 285 | "nbconvert_exporter": "python", 286 | "pygments_lexer": "ipython3", 287 | "version": "3.6.9" 288 | } 289 | }, 290 | "nbformat": 4, 291 | "nbformat_minor": 2 292 | } 293 | -------------------------------------------------------------------------------- /1.12-text-summarization/text-summarizer.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport os\nimport tensorflow as tf\nfrom sklearn.model_selection import train_test_split\n\nfrom tensorflow.keras.preprocessing.text import Tokenizer \nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\n\nfrom tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint","execution_count":39,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Adding Attention Layer as its not a part of the keras\nhttps://www.kaggle.com/kweku20/attention"},{"metadata":{"trusted":true},"cell_type":"code","source":"from shutil import copyfile\ncopyfile(src = \"/kaggle/input/attention/attention.py\", dst = \"/kaggle/working/attention.py\")\nfrom attention import 
AttentionLayer","execution_count":40,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n def __init__(self):\n data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n print(data.head())\n self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n ","execution_count":41,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling load data object"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":42,"outputs":[{"output_type":"stream","text":" Id ProductId UserId ProfileName \\\n0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian \n1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa \n2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres \"Natalia Corres\" \n3 4 B000UA0QIQ A395BORC6FGVXV Karl \n4 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham \"M. Wassir\" \n\n HelpfulnessNumerator HelpfulnessDenominator Score Time \\\n0 1 1 5 1303862400 \n1 0 0 1 1346976000 \n2 1 1 4 1219017600 \n3 3 3 2 1307923200 \n4 0 0 5 1350777600 \n\n Summary Text \n0 Good Quality Dog Food I have bought several of the Vitality canned d... \n1 Not as Advertised Product arrived labeled as Jumbo Salted Peanut... \n2 \"Delight\" says it all This is a confection that has been around a fe... \n3 Cough Medicine If you are looking for the secret ingredient i... \n4 Great taffy Great taffy at a great price. There was a wid... \n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"class PreprocessingData():\n def __init__(self):\n self.max_in_len = 100\n self.max_tar_len = 10\n \n def preprocess_data(self,data):\n data.dropna(axis=0,inplace=True)\n data['Summary'] = data['Summary'].apply(lambda x : 'start '+ x + ' end')\n return data\n \n def get_data(self,data):\n x_train,x_val,y_train,y_val = train_test_split(np.array(data['Text']),np.array(data['Summary']),test_size=0.1,random_state=0,shuffle=True)\n return x_train,x_val,y_train,y_val\n \n def encode_data(self,data,x_train,x_val,y_train,y_val):\n \n # Input Encoding\n in_tokenizer = Tokenizer() \n in_tokenizer.fit_on_texts(data[\"Text\"].tolist())\n\n x_train_seq = in_tokenizer.texts_to_sequences(x_train) \n x_val_seq = in_tokenizer.texts_to_sequences(x_val)\n\n x_train = pad_sequences(x_train_seq, maxlen = self.max_in_len, padding='post')\n x_val = pad_sequences(x_val_seq, maxlen = self.max_in_len, padding='post')\n\n self.in_voc = len(in_tokenizer.word_counts) + 1\n \n # Target Encoding\n tar_tokenizer = Tokenizer() \n tar_tokenizer.fit_on_texts(data[\"Summary\"].tolist())\n\n y_train_seq = tar_tokenizer.texts_to_sequences(y_train) \n y_val_seq = tar_tokenizer.texts_to_sequences(y_val)\n\n y_train = pad_sequences(y_train_seq, maxlen = self.max_tar_len, padding='post')\n y_val = pad_sequences(y_val_seq, maxlen = self.max_tar_len, padding='post')\n\n self.tar_voc = len(tar_tokenizer.word_counts) + 1\n return x_train,x_val,y_train,y_val","execution_count":43,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling preprocessing module on loading data"},{"metadata":{"trusted":true},"cell_type":"code","source":"preprocessing_data = PreprocessingData()\ndata = preprocessing_data.preprocess_data(data)\nx_train,x_val,y_train,y_val = 
preprocessing_data.get_data(data)\nx_train,x_val,y_train,y_val = preprocessing_data.encode_data(data,x_train,x_val,y_train,y_val)","execution_count":44,"outputs":[{"output_type":"error","ename":"TypeError","evalue":"unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocess_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m\u001b[0m in \u001b[0;36mencode_data\u001b[0;34m(self, data, x_train, x_val, y_train, y_val)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mx_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpad_sequences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val_seq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_in_len\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_voc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0min_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_counts\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;31m# Target Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'"]}]},{"metadata":{},"cell_type":"markdown","source":"Model Creation"},{"metadata":{"trusted":true},"cell_type":"code","source":"class Model():\n def __init__(self):\n self.model = None\n \n def define_model(self):\n raise NotImplementedError","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 
3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /1.2-sentiment-analysis/sentiment_classfication_bert_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "import pandas as pd\n", 11 | "import tensorflow_hub as hub\n", 12 | "import bert\n", 13 | "import os\n", 14 | "import re\n", 15 | "import numpy as np\n", 16 | "from tqdm import tqdm\n", 17 | "from tqdm import tqdm_notebook\n", 18 | "from tensorflow.keras import backend as K\n", 19 | "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n", 20 | "from tensorflow.keras.models import Sequential, Model" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "class LoadData():\n", 30 | " def __init__(self,csv_file):\n", 31 | " self.df = pd.read_csv(os.path.join(os.getcwd(),csv_file))\n", 32 | " self.train_df = None\n", 33 | " self.test_df = None\n", 34 | " def load_data(self):\n", 35 | " self.df.columns = ['sentence','sentiment']\n", 36 | " self.train_df = self.df[self.df['sentiment']=='positive']\n", 37 | " self.test_df = self.df[self.df['sentiment']=='negative']\n", 38 | " self.train_df.loc[self.train_df['sentiment']=='positive','polarity'] = 1\n", 39 | " self.test_df.loc[self.test_df['sentiment']=='negative','polarity'] = 0\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "loaddata_obj = LoadData(\"imdb_dataset_small.csv\") \n", 49 | "loaddata_obj.load_data()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "loaddata_obj.train_df.head()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "loaddata_obj.test_df.head()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "class BertModel(object):\n", 77 | " \n", 78 | " def __init__(self):\n", 79 | " \n", 80 | " self.max_len = 128\n", 81 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n", 82 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n", 83 | " \n", 84 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n", 85 | "\n", 86 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n", 87 | "\n", 88 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n", 89 | "\n", 90 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n", 91 | " \n", 92 | " def get_masks(self,tokens, max_seq_length):\n", 93 | " return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n", 94 | "\n", 95 | " def get_segments(self,tokens, max_seq_length):\n", 96 | " \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n", 97 | " segments = []\n", 98 | " current_segment_id = 0\n", 99 | " for 
token in tokens:\n", 100 | " segments.append(current_segment_id)\n", 101 | " if token == \"[SEP]\":\n", 102 | " current_segment_id = 1\n", 103 | " return segments + [0] * (max_seq_length - len(tokens))\n", 104 | " \n", 105 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n", 106 | " \"\"\"Token ids from Tokenizer vocab\"\"\"\n", 107 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n", 108 | " input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n", 109 | " return input_ids\n", 110 | " def create_single_input(self,sentence,maxlen):\n", 111 | "\n", 112 | " stokens = self.tokenizer.tokenize(sentence)\n", 113 | "\n", 114 | " stokens = stokens[:maxlen]\n", 115 | "\n", 116 | " stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n", 117 | "\n", 118 | " ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n", 119 | " masks = self.get_masks(stokens, self.max_len)\n", 120 | " segments = self.get_segments(stokens, self.max_len)\n", 121 | "\n", 122 | " return ids,masks,segments\n", 123 | "\n", 124 | " def create_input_array(self,sentences):\n", 125 | " \n", 126 | " input_ids, input_masks, input_segments = [], [], []\n", 127 | "\n", 128 | " for sentence in tqdm(sentences,position=0, leave=True):\n", 129 | " ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n", 130 | "\n", 131 | " input_ids.append(ids)\n", 132 | " input_masks.append(masks)\n", 133 | " input_segments.append(segments)\n", 134 | " \n", 135 | " tensor = [np.asarray(input_ids, dtype=np.int32), \n", 136 | " np.asarray(input_masks, dtype=np.int32), \n", 137 | " np.asarray(input_segments, dtype=np.int32)]\n", 138 | " return tensor" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "class PreprocessingBertData():\n", 148 | " \n", 149 | " def prepare_data_x(self,train_sentences):\n", 150 | " x = bert_model_obj.get_input_array(train_sentences)\n", 151 | " return x\n", 152 | " \n", 153 | " def prepare_data_y(self,train_labels):\n", 154 | " y = list()\n", 155 | " for item in train_labels:\n", 156 | " label = item\n", 157 | " y.append(label)\n", 158 | " y = np.array(y)\n", 159 | " return y\n", 160 | " " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "class BertModel(object):\n", 170 | " \n", 171 | " def __init__(self):\n", 172 | " \n", 173 | " self.max_len = 128\n", 174 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n", 175 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n", 176 | " \n", 177 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n", 178 | "\n", 179 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n", 180 | "\n", 181 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n", 182 | "\n", 183 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n", 184 | " \n", 185 | " def get_masks(self,tokens, max_seq_length):\n", 186 | " mask_data = [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n", 187 | " return mask_data\n", 188 | "\n", 189 | " def get_segments(self,tokens, max_seq_length):\n", 190 | " '''\n", 191 | " Segments: 0 for the first sequence, \n", 192 | " 1 for the second\n", 193 | " '''\n", 194 | " segments = []\n", 195 | " segment_id = 0\n", 196 | " for token in tokens:\n", 197 | " segments.append(current_segment_id)\n", 198 | " if token == \"[SEP]\":\n", 
199 | " segment_id = 1\n", 200 | " '''Remaining are padded with 0'''\n", 201 | " remaining_segment = [0] * (max_seq_length - len(tokens))\n", 202 | " segment_data = segments + remaining_segment\n", 203 | " return segment_data\n", 204 | " \n", 205 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n", 206 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n", 207 | " remaining_ids = [0] * (max_seq_length-len(token_ids))\n", 208 | " input_ids = token_ids + remaining_ids\n", 209 | " return input_ids\n", 210 | " \n", 211 | " def get_input_data(self,sentence,maxlen):\n", 212 | "\n", 213 | " sent_token = self.tokenizer.tokenize(sentence)\n", 214 | "\n", 215 | " sent_token = sent_token[:maxlen]\n", 216 | "\n", 217 | " sent_token = [\"[CLS]\"] + sent_token + [\"[SEP]\"]\n", 218 | "\n", 219 | " id = self.get_ids(sent_token, self.tokenizer, self.max_len)\n", 220 | " mask = self.get_masks(sent_token, self.max_len)\n", 221 | " segment = self.get_segments(sent_token, self.max_len)\n", 222 | " input_data = [id,mask,segment]\n", 223 | " return input_data\n", 224 | "\n", 225 | " def get_input_array(self,sentences):\n", 226 | " \n", 227 | " input_ids, input_masks, input_segments = [], [], []\n", 228 | "\n", 229 | " for sentence in tqdm(sentences,position=0, leave=True):\n", 230 | " ids,masks,segments=self.get_input_data(sentence,self.max_len-2)\n", 231 | "\n", 232 | " input_ids.append(ids)\n", 233 | " input_masks.append(masks)\n", 234 | " input_segments.append(segments)\n", 235 | " \n", 236 | " input_array = [np.asarray(input_ids, dtype=np.int32),np.asarray(input_masks, dtype=np.int32), np.asarray(input_segments, dtype=np.int32)]\n", 237 | " return input_array" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "bert_model_obj = BertModel()\n", 247 | "preprocess_bert_data_obj = PreprocessingBertData()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "train_sentences = loaddata_obj.train_df[\"sentence\"].tolist()\n", 257 | "train_labels = loaddata_obj.train_df[\"polarity\"].tolist()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n", 267 | "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n", 268 | "\n", 269 | "train_input_ids, train_input_masks, train_segment_ids = x\n", 270 | "train_labels = y" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "class DesignModel():\n", 280 | " def __init__(self):\n", 281 | " self.model = None \n", 282 | " self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n", 283 | " self.train_labels = train_labels\n", 284 | " \n", 285 | " def bert_model(self,max_seq_length): \n", 286 | " in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n", 287 | " in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n", 288 | " in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n", 289 | " \n", 290 | " bert_inputs = [in_id, in_mask, in_segment]\n", 291 | " bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n", 292 | " \n", 293 | " bert_output = 
tf.keras.layers.GlobalAveragePooling1D()(sequence_output)\n", 294 | " bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n", 295 | " bert_outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\", name=\"dense_output\")(bert_sequence_output)\n", 296 | " self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n", 297 | " \n", 298 | " self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 299 | " \n", 300 | " self.model.summary()\n", 301 | " \n", 302 | " def model_train(self,batch_size,num_epoch):\n", 303 | " print(\"Fitting to model\")\n", 304 | " \n", 305 | " self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n", 306 | " \n", 307 | " print(\"Model Training complete.\")\n", 308 | "\n", 309 | " def save_model(self,model,model_name): \n", 310 | " self.model.save(model_name+\".h5\")\n", 311 | " print(\"Model saved to Model folder.\")" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "model_obj = DesignModel()\n", 321 | "model_obj.bert_model(bert_model_obj.max_len)\n", 322 | "model_obj.model_train(32,1)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "model_obj.save_model(model_obj.model,\"bert\")" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "class Evaluation():\n", 341 | " def get_accuracy(self,actuals, predictions):\n", 342 | " acc = accuracy_score(actuals, predictions)\n", 343 | " return acc" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "class Prediction():\n", 353 | " def __init__(self):\n", 354 | " self.model = model_obj.model\n", 355 | " \n", 356 | " def predict_validation(self):\n", 357 | " valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n", 358 | " valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n", 359 | "\n", 360 | " preprocess_bert_data_obj = PreprocessingBertData()\n", 361 | " val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n", 362 | " prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n", 363 | " return valid_labels,prediction_labels\n", 364 | " \n", 365 | " \n", 366 | " def predict(self,query):\n", 367 | " query_seq = bert_model_obj.get_input_array([query])\n", 368 | " pred = self.model.predict(query_seq)\n", 369 | " pred = np.argmax(pred)\n", 370 | " result = load_data_obj.cat_to_intent[pred]\n", 371 | " return result" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "pred_obj = Prediction()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "eval_obj = Evaluation()\n", 390 | "ytest,ypred = pred_obj.predict_validation()\n", 391 | "acc = eval_obj.get_accuracy(ytest,ypred)\n", 392 | "print(\"Auc: {:.2%}\".format(acc))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [] 401 | } 402 | ], 403 | "metadata": { 404 | "kernelspec": { 405 | "display_name": "Python 3", 406 | 
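A hedged sketch of the classification head from the bert_model cell above: as written, GlobalAveragePooling1D is applied to `sequence_output`, a name that is never defined (the layer call returned `bert_pooled_output, bert_sequence_output`), and the final Dense layer consumes the raw sequence output, so the pooling/dropout branch is never used and the output keeps a per-token shape that does not match the scalar polarity labels under binary_crossentropy. The sketch below reuses the notebook's `bert_model_obj`; variable names are illustrative only.

import tensorflow as tf
from tensorflow.keras.layers import Input

def build_sentiment_head(max_seq_length):
    # Three integer inputs, exactly as prepared by get_input_array above.
    in_id      = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
    in_mask    = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_masks")
    in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    pooled_output, sequence_output = bert_model_obj.bert_module(bert_inputs)

    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)   # pool the token vectors
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(1, activation="sigmoid", name="dense_output")(x)  # feed the pooled branch

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=out)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

Two smaller points on the prediction and evaluation cells: with a single sigmoid output, `np.argmax(pred)` always returns 0, so predictions are normally thresholded instead, e.g. `int(pred[0, 0] > 0.5)`; and the value printed as "Auc" at the end is the accuracy returned by `accuracy_score`.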
"language": "python", 407 | "name": "python3" 408 | }, 409 | "language_info": { 410 | "codemirror_mode": { 411 | "name": "ipython", 412 | "version": 3 413 | }, 414 | "file_extension": ".py", 415 | "mimetype": "text/x-python", 416 | "name": "python", 417 | "nbconvert_exporter": "python", 418 | "pygments_lexer": "ipython3", 419 | "version": "3.6.9" 420 | } 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 2 424 | } 425 | -------------------------------------------------------------------------------- /1.3-semantic-similarity/README.md: -------------------------------------------------------------------------------- 1 | pip install -U sentence-transformers scipy 2 | -------------------------------------------------------------------------------- /1.3-semantic-similarity/semantic-similarity-BERT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sentence_transformers import SentenceTransformer\n", 10 | "import scipy\n", 11 | "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n", 12 | "# Corpus with example sentences\n", 13 | "\n", 14 | "corpus = [\n", 15 | " 'A man is eating a food.',\n", 16 | " 'A man is eating a piece of bread.',\n", 17 | " 'The girl is carrying a baby.',\n", 18 | " 'A man is riding a horse.',\n", 19 | " 'A woman is playing violin.',\n", 20 | " 'Two men pushed carts through the woods.',\n", 21 | " 'A man is riding a white horse on an enclosed ground.',\n", 22 | " 'A monkey is playing drums.',\n", 23 | " 'A cheetah is running behind its prey.']\n", 24 | "queries = ['A man is eating pasta.', \n", 25 | " 'Someone in a gorilla costume is playing a set of drums.', \n", 26 | " 'A cheetah chases prey on across a field.']" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 8, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "corpus_embeddings = embedder.encode(corpus)\n", 36 | "query_embeddings = embedder.encode(queries)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 22, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "[0.21805174 0.15202328 1.04767431 0.89392366 0.96026727 0.79048636\n", 49 | " 0.8414415 0.80550679 0.90363039] A man is eating pasta.\n", 50 | "The girl is carrying a baby.\n", 51 | "[0.80833937 0.8089816 0.76493318 0.79766881 0.92636551 0.84321454\n", 52 | " 0.80365482 0.20152853 0.71403489] Someone in a gorilla costume is playing a set of drums.\n", 53 | "A woman is playing violin.\n", 54 | "[0.97539473 0.95483563 0.87328057 0.7070155 0.94015294 0.63376342\n", 55 | " 0.72819954 0.6939273 0.09933373] A cheetah chases prey on across a field.\n", 56 | "A man is eating a food.\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "import numpy as np\n", 62 | "for query, query_embedding in zip(queries, query_embeddings):\n", 63 | " distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n", 64 | " print(distances,query)\n", 65 | " print(corpus[np.argmax(distances)])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 
| "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.9" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /1.3-semantic-similarity/try_cf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 38, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sentence_transformers import SentenceTransformer\n", 10 | "import scipy\n", 11 | "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n", 12 | "# Corpus with example sentences\n", 13 | "\n", 14 | "corpus = [\n", 15 | " 'i would like to clean my XYZ',\n", 16 | " 'book an appointment for XYZ cleaning',\n", 17 | " 'schedule a XYZ cleaning services',\n", 18 | " 'looking for XYZ cleaninng services',\n", 19 | " 'want an appointment for XYZ cleaning'\n", 20 | "]\n", 21 | "queries = ['i would like to clean my XYZ','book a slot for XYZ cleaning','looking for XYZ cleaninng services']" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 39, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "corpus_embeddings = embedder.encode(corpus)\n", 31 | "query_embeddings = embedder.encode(queries)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 40, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "[0. 0.31504001 0.23922292 0.20180429 0.21156411]\n", 44 | "matched sent: book an appointment for XYZ cleaning , id: 1\n", 45 | "query: i would like to clean my XYZ\n", 46 | "[0.249604 0.09753854 0.14501476 0.18097553 0.14041007]\n", 47 | "matched sent: i would like to clean my XYZ , id: 0\n", 48 | "query: book a slot for XYZ cleaning\n", 49 | "[2.01804328e-01 2.37920637e-01 1.28938306e-01 1.11133325e-13\n", 50 | " 1.53037771e-01]\n", 51 | "matched sent: book an appointment for XYZ cleaning , id: 1\n", 52 | "query: looking for XYZ cleaninng services\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "import numpy as np\n", 58 | "for query, query_embedding in zip(queries, query_embeddings):\n", 59 | " distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n", 60 | " print(distances)\n", 61 | " print(\"matched sent: \",corpus[np.argmax(distances)],\", id: \",np.argmax(distances))\n", 62 | " print(\"query: \",query)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.6.9" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 2 94 | } 95 | -------------------------------------------------------------------------------- /1.5-named-entity-recognition/data_making.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #!/usr/bin/env python3 3 | 4 | """ 5 | Created on Mon Aug 6 
19:40:26 2018 6 | 7 | @author: joy 8 | """ 9 | import json 10 | abs_path1 = "benchmarking_data/Train/" 11 | 12 | abs_path2 = "benchmarking_data/Validate/" 13 | import re 14 | reg = re.compile('[A-z]*\_([A-z]*)\_[A-z]*') 15 | reg2 = re.compile('[A-z]*\_([A-z]*)') 16 | 17 | 18 | def make_data_for_intent_from_json(json_file,txt_file): 19 | 20 | json_d = json.load(open(abs_path1+json_file)) 21 | json_dict = json_d[reg.match(json_file).group(1)] 22 | 23 | wr = open("Intent_Data/"+txt_file,'w') 24 | 25 | for i in json_dict: 26 | each_list = i['data'] 27 | sent ="" 28 | for i in each_list: 29 | sent = sent + i['text']+ " " 30 | sent =sent[:-1] 31 | for i in range(3): 32 | sent = sent.replace(" "," ") 33 | wr.write(sent) 34 | wr.write('\n') 35 | print(sent) 36 | 37 | 38 | def make_data_from_json(json_file,txt_file): 39 | 40 | json_d = json.load(open(abs_path2+json_file)) 41 | json_dict = json_d[reg2.match(json_file).group(1)] 42 | 43 | wr = open(abs_path2+txt_file,'w') 44 | 45 | for i in json_dict: 46 | each_list = i['data'] 47 | for i in each_list: 48 | try: 49 | words = i['text'].split() 50 | print(words[0]+' '+'B-'+i['entity']) 51 | wr.write(words[0]+' '+'B-'+i['entity']) 52 | wr.write('\n') 53 | for word in words[1:]: 54 | print(word+' '+'I-'+i['entity']) 55 | wr.write(word+' '+'I-'+i['entity']) 56 | wr.write('\n') 57 | #print(i['text']+'\t'+i['entity']) 58 | 59 | except: 60 | words = i['text'].split() 61 | for word in words: 62 | print(word+' '+'O') 63 | wr.write(word+' '+'O') 64 | wr.write('\n') 65 | print('\n') 66 | wr.write('\n') 67 | 68 | 69 | def make_data_from_json_train(json_file,txt_file): 70 | 71 | json_d = json.load(open(abs_path1+json_file)) 72 | json_dict = json_d[reg.match(json_file).group(1)] 73 | 74 | wr = open(abs_path1+txt_file,'w') 75 | 76 | for i in json_dict: 77 | each_list = i['data'] 78 | for i in each_list: 79 | try: 80 | words = i['text'].split() 81 | print(words[0]+' '+'B-'+i['entity']) 82 | wr.write(words[0]+' '+'B-'+i['entity']) 83 | wr.write('\n') 84 | for word in words[1:]: 85 | print(word+' '+'I-'+i['entity']) 86 | wr.write(word+' '+'I-'+i['entity']) 87 | wr.write('\n') 88 | #print(i['text']+'\t'+i['entity']) 89 | 90 | except: 91 | words = i['text'].split() 92 | for word in words: 93 | print(word+' '+'O') 94 | wr.write(word+' '+'O') 95 | wr.write('\n') 96 | print('\n') 97 | wr.write('\n') 98 | 99 | import nltk 100 | def make_data_from_json_train_pos(json_file,txt_file): 101 | 102 | json_d = json.load(open(abs_path2+json_file)) 103 | json_dict = json_d[reg2.match(json_file).group(1)] 104 | 105 | wr = open(abs_path2+txt_file,'w') 106 | 107 | for i in json_dict: 108 | each_list = i['data'] 109 | sent = "" 110 | for i in each_list: 111 | sent = sent+i['text']+" " 112 | sent = sent.replace(" "," ") 113 | if sent[-1]==" ": 114 | sent = sent[:-1] 115 | words = [] 116 | pos_tags = nltk.pos_tag(sent.split()) 117 | print(pos_tags,sent) 118 | pos_tag_dict = {j:k for j,k in pos_tags} 119 | for i in each_list: 120 | try: 121 | 122 | words = i['text'].split() 123 | print(words[0]+' '+pos_tag_dict[words[0]]+" "+'B-'+i['entity']) 124 | wr.write(words[0]+" "+pos_tag_dict[words[0]]+" "+'B-'+i['entity']) 125 | wr.write('\n') 126 | for word in words[1:]: 127 | print(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity']) 128 | wr.write(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity']) 129 | wr.write('\n') 130 | #print(i['text']+'\t'+i['entity']) 131 | 132 | except: 133 | words = i['text'].split() 134 | for word in words: 135 | print(word+' '+pos_tag_dict[word]+" "+'O') 136 | 
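                # In these converter functions the try-branch handles data segments that carry an
                # 'entity' key and writes one "word POS B-<entity>" line for the first token and
                # "word POS I-<entity>" lines for the rest; segments without an 'entity' key raise
                # a KeyError and fall through to this bare except, where every token is tagged "O".
                # A blank line is written after each sentence, producing the CoNLL-style files that
                # the NER notebooks read back in.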
wr.write(word+' '+pos_tag_dict[word]+" "+'O') 137 | wr.write('\n') 138 | print('\n') 139 | wr.write('\n') 140 | 141 | 142 | import re 143 | import json 144 | import os 145 | def make_data_from_snips(input_path): 146 | 147 | for r,d,f in os.walk(input_path): 148 | 149 | for filename in f: 150 | label = os.path.basename(r) 151 | source = os.path.join(r,filename) 152 | 153 | 154 | 155 | if os.path.splitext(filename)[-1] != '.txt': 156 | continue 157 | 158 | 159 | 160 | 161 | read_file = open(source) 162 | 163 | 164 | pattern = re.compile(r'(?:[[])(?P.*?)(?:[]])(?:[(])(?P.*?)(?:[)])') 165 | 166 | corpus = dict() 167 | corpus[label] = list() 168 | for i in read_file: 169 | data = list() 170 | 171 | it = pattern.finditer(i) 172 | 173 | sent_len = len(i.strip()) 174 | 175 | if sent_len == 0: 176 | continue 177 | 178 | last_span = 0 179 | for m in it: 180 | 181 | head = i[last_span:m.span()[0]] 182 | obj = dict() 183 | if head.strip(): 184 | obj['text'] = head 185 | 186 | data.append(obj) 187 | 188 | obj = dict() 189 | obj['text'] = m.group('value') 190 | obj['entity'] = m.group('name') 191 | 192 | data.append(obj) 193 | 194 | last_span = m.span()[1] 195 | if last_span: 196 | obj = dict() 197 | if i[last_span :].strip(): 198 | obj['text'] = i[last_span :] 199 | data.append(obj) 200 | 201 | if data: 202 | 203 | corpus[label].append({'data': data}) 204 | 205 | with open(os.path.join(r,filename.split()[0] + '.json'),'w',encoding='utf-8') as fp: 206 | json.dump(corpus,fp) 207 | 208 | 209 | 210 | 211 | 212 | #make_data("book_restaurant_train.csv","book_restaurant_train.txt") 213 | ''' 214 | make_data_from_json_train_pos("train_AddToPlaylist_full.json","train_AddToPlaylist_full.txt") 215 | make_data_from_json_train_pos("train_BookRestaurant_full.json","train_BookRestaurant_full.txt") 216 | make_data_from_json_train_pos("train_GetWeather_full.json","train_GetWeather_full.txt") 217 | make_data_from_json_train_pos("train_PlayMusic_full.json","train_PlayMusic_full.txt") 218 | make_data_from_json_train_pos("train_RateBook_full.json","train_RateBook_full.txt") 219 | make_data_from_json_train_pos("train_SearchCreativeWork_full.json","train_SearchCreativeWork_full.txt") 220 | make_data_from_json_train_pos("train_SearchScreeningEvent_full.json","train_SearchScreeningEvent_full.txt") 221 | ''' 222 | 223 | make_data_from_json_train_pos("validate_AddToPlaylist.json","validate_AddToPlaylist.txt") 224 | make_data_from_json_train_pos("validate_BookRestaurant.json","validate_BookRestaurant.txt") 225 | make_data_from_json_train_pos("validate_GetWeather.json","validate_GetWeather.txt") 226 | make_data_from_json_train_pos("validate_PlayMusic.json","validate_PlayMusic.txt") 227 | make_data_from_json_train_pos("validate_RateBook.json","validate_RateBook.txt") 228 | make_data_from_json_train_pos("validate_SearchCreativeWork.json","validate_SearchCreativeWork.txt") 229 | make_data_from_json_train_pos("validate_SearchScreeningEvent.json","validate_SearchScreeningEvent.txt") 230 | 231 | 232 | 233 | #make_data_from_snips("flight_data") 234 | -------------------------------------------------------------------------------- /1.5-named-entity-recognition/ner_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "For data i used https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines\n", 8 | "Then you can run data_making.py" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 
| "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from numpy import array\n", 18 | "import tensorflow as tf\n", 19 | "import glob\n", 20 | "import numpy as np\n", 21 | "import pickle\n", 22 | "\n", 23 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 24 | "from tensorflow.keras.utils import to_categorical\n", 25 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 26 | "from tensorflow.keras.models import Sequential,Model\n", 27 | "from tensorflow.keras.layers import Dense\n", 28 | "from tensorflow.keras.layers import LSTM\n", 29 | "from tensorflow.keras.layers import Input\n", 30 | "from tensorflow.keras.layers import Dropout\n", 31 | "from tensorflow.keras.layers import Embedding\n", 32 | "from tensorflow.keras.layers import TimeDistributed\n", 33 | "from tensorflow.keras.layers import Conv1D\n", 34 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n", 35 | "from tensorflow.keras.utils import Progbar\n", 36 | "from tensorflow.keras.models import load_model\n", 37 | "\n", 38 | "from tensorflow.keras.initializers import RandomUniform\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "class LoadData():\n", 55 | " def __init__(self):\n", 56 | " self.train_files = None\n", 57 | " self.validation_files = None\n", 58 | " \n", 59 | " def get_data(self):\n", 60 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n", 61 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "load_data_obj = LoadData()\n", 71 | "load_data_obj.get_data()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "class Preprocessing():\n", 81 | " def __init__(self):\n", 82 | " self.word_embediings_model = open(\"embeddings/glove.6B.100d.txt\", encoding=\"utf-8\")\n", 83 | " \n", 84 | " \n", 85 | " def sentence_from_file(self,filename):\n", 86 | " f = open(filename)\n", 87 | " single_file_sentences = []\n", 88 | " sentence_list = []\n", 89 | " for line in f:\n", 90 | " if len(line)==0 or line[0]==\"\\n\":\n", 91 | " if len(sentence_list) > 0:\n", 92 | " single_file_sentences.append(sentence_list)\n", 93 | " sentence_list = []\n", 94 | " continue\n", 95 | " splits = line.split(' ')\n", 96 | " sentence_list.append([splits[0],splits[1],splits[-1]])\n", 97 | "\n", 98 | " if len(sentence_list) >0:\n", 99 | " single_file_sentences.append(sentence_list)\n", 100 | " sentence_list = []\n", 101 | " return single_file_sentences\n", 102 | "\n", 103 | " def get_case_value(self,word, case_dict): \n", 104 | " case_value = 'other'\n", 105 | "\n", 106 | " count_digits = 0\n", 107 | " for char in word:\n", 108 | " if char.isdigit():\n", 109 | " count_digits += 1\n", 110 | "\n", 111 | " if word.isdigit():\n", 112 | " case_value = 'number'\n", 113 | " elif count_digits / float(len(word)) > 0.5:\n", 114 | " case_value = 'fraction'\n", 115 | " elif word.islower():\n", 116 | " case_value = 'lower'\n", 117 | " elif word.isupper():\n", 118 | " case_value = 'upper'\n", 119 | " elif word[0].isupper():\n", 120 | " case_value = 'title'\n", 121 | " elif 
count_digits > 0:\n", 122 | " case_value = 'leters_digit'\n", 123 | "\n", 124 | " return case_dict[case_value]\n", 125 | "\n", 126 | "\n", 127 | " def createBatches(self,data):\n", 128 | " l = []\n", 129 | " for i in data:\n", 130 | " l.append(len(i[0]))\n", 131 | " l = set(l)\n", 132 | " batches = []\n", 133 | " batch_len = []\n", 134 | " z = 0\n", 135 | " for i in l:\n", 136 | " for batch in data:\n", 137 | " if len(batch[0]) == i:\n", 138 | " batches.append(batch)\n", 139 | " z += 1\n", 140 | " batch_len.append(z)\n", 141 | " return batches,batch_len\n", 142 | "\n", 143 | " def create_tensors(self,sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id):\n", 144 | " #paddingIdx = word2Idx['PAD_TKN']\n", 145 | " unknownIdx = word_to_id['UNK_TKN']\n", 146 | "\n", 147 | " dataset = []\n", 148 | "\n", 149 | " word_count = 0\n", 150 | " unknownword_count = 0\n", 151 | "\n", 152 | " for sentence in sentences:\n", 153 | " word_indices = [] \n", 154 | " char_indices = []\n", 155 | " case_indices = []\n", 156 | " label_indices = []\n", 157 | " pos_indices = []\n", 158 | "\n", 159 | " for word,char,pos,label in sentence: \n", 160 | "\n", 161 | " word_count += 1\n", 162 | " if word in word_to_id:\n", 163 | " word_index = word_to_id[word]\n", 164 | " elif word.lower() in word_to_id:\n", 165 | " word_index = word_to_id[word.lower()] \n", 166 | " else:\n", 167 | " word_index = unknownIdx\n", 168 | " unknownword_count += 1\n", 169 | " \n", 170 | " char_index = []\n", 171 | " for x in char:\n", 172 | " char_index.append(char_to_id[x])\n", 173 | " \n", 174 | " word_indices.append(word_index)\n", 175 | " case_indices.append(self.get_case_value(word, case_to_id))\n", 176 | " pos_indices.append(pos_to_id[pos.replace('\\n','')])\n", 177 | " char_indices.append(char_index)\n", 178 | " label_indices.append(label_to_id[label])\n", 179 | " print([word_indices, case_indices, char_indices, pos_indices, label_indices])\n", 180 | " dataset.append([word_indices, case_indices, char_indices, pos_indices, label_indices]) \n", 181 | " return dataset\n", 182 | "\n", 183 | "\n", 184 | " def addCharInformatioin(self,Sentences):\n", 185 | " for i,sentence in enumerate(Sentences):\n", 186 | " for j,data in enumerate(sentence):\n", 187 | " chars = [c for c in data[0]]\n", 188 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n", 189 | " return Sentences\n", 190 | "\n", 191 | " def padding(self,Sentences):\n", 192 | " maxlen = 52\n", 193 | " for sentence in Sentences:\n", 194 | " char = sentence[2]\n", 195 | " for x in char:\n", 196 | " maxlen = max(maxlen,len(x))\n", 197 | " for i,sentence in enumerate(Sentences):\n", 198 | " Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')\n", 199 | " return Sentences\n", 200 | " \n", 201 | " def get_word_embeddings(self,list_sentences):\n", 202 | " wd_to_id = {}\n", 203 | " wd_em = []\n", 204 | " \n", 205 | " words = {}\n", 206 | " for sentence in list_sentences:\n", 207 | " for token,char,pos,label in sentence:\n", 208 | " words[token.lower()] = True\n", 209 | " \n", 210 | " for line in self.word_embediings_model:\n", 211 | " split = line.strip().split(\" \")\n", 212 | "\n", 213 | " if len(wd_to_id) == 0:\n", 214 | " wd_to_id[\"PAD_TKN\"] = len(wd_to_id)\n", 215 | " vector = np.zeros(len(split)-1) \n", 216 | " wd_em.append(vector)\n", 217 | "\n", 218 | " wd_to_id[\"UNK_TKN\"] = len(wd_to_id)\n", 219 | " vector = np.random.uniform(-0.25, 0.25, len(split)-1)\n", 220 | " wd_em.append(vector)\n", 221 | " if split[0].lower() in words:\n", 222 | " vector = 
np.array([float(num) for num in split[1:]])\n", 223 | " wd_em.append(vector)\n", 224 | " wd_to_id[split[0]] = len(wd_to_id)\n", 225 | "\n", 226 | " wd_em = np.array(wd_em)\n", 227 | " return wd_em,wd_to_id\n", 228 | " \n", 229 | " def get_feature_dict(self,sentences):\n", 230 | "\n", 231 | " labelSet = set()\n", 232 | " lb_to_id = {}\n", 233 | " for sentence in sentences:\n", 234 | " for token,char,pos,label in sentence:\n", 235 | " labelSet.add(label)\n", 236 | "\n", 237 | " for label in labelSet:\n", 238 | " lb_to_id[label] = len(lb_to_id)\n", 239 | "\n", 240 | " id_to_lb = {v: k for k, v in lb_to_id.items()}\n", 241 | "\n", 242 | " ch_to_id = {\"PADDING\":0, \"UNKNOWN\":1}\n", 243 | " for c in \" 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\\\"/\\\\%$`&=*+@^~|øæðş\":\n", 244 | " ch_to_id[c] = len(ch_to_id)\n", 245 | "\n", 246 | " cs_to_id = {\n", 247 | " 'number': 0, 'lower':1, 'upper':2, 'title':3, \n", 248 | " 'other':4, 'fraction':5, 'leters_digit': 6, \n", 249 | " 'PAD_TKN':7\n", 250 | " }\n", 251 | "\n", 252 | " pos_to_id = {\"$\":0, \"''\":1, \"(\":2, \")\":3, \",\":4, \"--\":5, \".\":6, \":\":7, \"CC\":8, \"CD\":9, \"DT\":10,\n", 253 | " \"EX\":11, \"FW\":12, \"IN\":13, \"JJ\":14, \"JJR\":15, \"JJS\":16, \"LS\":17, \"MD\":18, \"NN\":19,\n", 254 | " \"NNP\":20, \"NNPS\":21, \"NNS\":22, \"PDT\":23, \"POS\":24, \"PRP\":25, \"PRP$\":26, \"RB\":27, \n", 255 | " \"RBR\":28, \"RBS\":29, \"RP\":30, \"SYM\":31, \"TO\":32, \"UH\":33, \"VB\":34, \"VBD\":35, \"VBG\":36, \n", 256 | " \"VBN\":37, \"VBP\":38, \"VBZ\":39, \"WDT\":40, \"WP\":41, \"WP$\":42, \"WRB\":43, \"``\":44}\n", 257 | " \n", 258 | " return cs_to_id,pos_to_id,ch_to_id,lb_to_id,id_to_lb\n", 259 | " \n", 260 | " def make_batch(self,dataset):\n", 261 | " self.batch,self.batch_len = self.createBatches(dataset)\n", 262 | " return self.batch,self.batch_len\n", 263 | " \n", 264 | " def make_dataset(self,file_name):\n", 265 | " sentences = self.sentence_from_file(file_name)\n", 266 | " sentences = self.addCharInformatioin(sentences)\n", 267 | " return sentences\n", 268 | " \n", 269 | " def get_sentences(self,file_list):\n", 270 | " list_sentences = []\n", 271 | " for i in file_list:\n", 272 | " list_sentences+= self.make_dataset(i)\n", 273 | " return list_sentences\n", 274 | " " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "preprocess_obj = Preprocessing()\n", 284 | "train_sentences = preprocess_obj.get_sentences(load_data_obj.train_files)\n", 285 | "word_emb,word_to_id = preprocess_obj.get_word_embeddings(train_sentences)\n", 286 | "\n", 287 | "'''the below function is not requred for validation data, we will load the dictionaries for validation'''\n", 288 | "case_to_id,pos_to_id,char_to_id,label_to_id,id_to_label = preprocess_obj.get_feature_dict(train_sentences)\n", 289 | "train_data_set = preprocess_obj.padding(preprocess_obj.create_tensors(train_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n", 290 | "train_batch,train_batch_len = preprocess_obj.make_batch(train_data_set)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "class DesignModel():\n", 300 | " def __init__(self,params):\n", 301 | " self.model = None\n", 302 | " self.wd_em = word_emb\n", 303 | " self.caseEmbeddings = np.identity(len(case_to_id), dtype='float32')\n", 304 | " self.posEmbeddings = 
np.identity(len(pos_to_id), dtype='float32') \n", 305 | " self.ch_to_id = char_to_id\n", 306 | " self.lb_to_id = label_to_id\n", 307 | " self.params = params\n", 308 | " self.train_batch = train_batch\n", 309 | " self.train_batch_len = train_batch_len\n", 310 | "\n", 311 | " \n", 312 | " def iterate_minibatches(self,dataset,batch_len): \n", 313 | " start = 0\n", 314 | " for i in batch_len:\n", 315 | " tokens = []\n", 316 | " char = []\n", 317 | " labels = []\n", 318 | " casing = []\n", 319 | " pos_tags = []\n", 320 | " data = dataset[start:i]\n", 321 | " start = i\n", 322 | " for dt in data:\n", 323 | " t,c,ch,pos,l = dt\n", 324 | " l = np.expand_dims(l,-1)\n", 325 | " tokens.append(t)\n", 326 | " char.append(ch)\n", 327 | " labels.append(l)\n", 328 | " casing.append(c)\n", 329 | " pos_tags.append(pos)\n", 330 | " yield np.asarray(labels),np.asarray(tokens),np.asarray(casing), np.asarray(char), np.asarray(pos_tags)\n", 331 | " \n", 332 | " def BiRNN_model(self):\n", 333 | " \n", 334 | " input = Input(shape=(None,),dtype='int32')\n", 335 | "\n", 336 | " words = Embedding(input_dim=self.wd_em.shape[0], output_dim=self.wd_em.shape[1], weights=[self.wd_em], trainable=False)(input)\n", 337 | "\n", 338 | " csng_input = Input(shape=(None,), dtype='int32')\n", 339 | " csng = Embedding(output_dim = self.caseEmbeddings.shape[1], input_dim = self.caseEmbeddings.shape[0], weights = [self.caseEmbeddings], trainable=False)(csng_input)\n", 340 | "\n", 341 | "\n", 342 | " char_input=Input(shape=(None,52,))\n", 343 | " embed_char_out=TimeDistributed(Embedding(len(self.ch_to_id),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input)\n", 344 | " dropout= Dropout(self.params['dropout_rate'])(embed_char_out)\n", 345 | " conv1d_out = TimeDistributed(Conv1D(kernel_size=self.params['kernel_sizes_cnn'], filters=30, padding='same',activation=params['rnn_activation'], strides=1))(dropout)\n", 346 | " maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)\n", 347 | " char = TimeDistributed(Flatten())(maxpool_out)\n", 348 | " char = Dropout(self.params['dropout_rate'])(char)\n", 349 | "\n", 350 | " pos_input = Input(shape=(None,), dtype='int32')\n", 351 | " pos = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n", 352 | "\n", 353 | "\n", 354 | " output = concatenate([words, csng, char, pos])\n", 355 | " output = Bidirectional(LSTM(self.params['units_lstm'], return_sequences=True, dropout=self.params['dropout_rate'], recurrent_dropout=0.25))(output)\n", 356 | " output = TimeDistributed(Dense(len(self.lb_to_id), activation=self.params['rnn_activation']))(output)\n", 357 | " self.model = Model(inputs=[input, csng_input, char_input, pos_input], outputs=[output])\n", 358 | " self.model.compile(loss=self.params['loss'], optimizer=self.params['optimizer'],metrics=[\"accuracy\"])\n", 359 | "\n", 360 | " def train_model(self):\n", 361 | " \n", 362 | " for epoch in range(self.params['epochs']):\n", 363 | "\n", 364 | " print(\"Epoch %d/%d\"%(epoch+1, self.params['epochs']))\n", 365 | " a = Progbar(len(preprocess_obj.batch_len))\n", 366 | " res = None\n", 367 | " for i,batch in enumerate(self.iterate_minibatches(self.train_batch,self.train_batch_len)):\n", 368 | " labels, tkns, csng, char, pos = batch \n", 369 | " res = self.model.train_on_batch([tkns, csng, char, pos], labels)\n", 370 | " a.update(i)\n", 371 | " print(\"\\n\")\n", 372 | " 
print(self.model.metrics_names[0],\":\",res[0],self.model.metrics_names[1],\":\",res[1])\n", 373 | " print(' ')" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "params = {\n", 383 | " \"kernel_sizes_cnn\": 3,\n", 384 | " \"optimizer\": \"nadam\",\n", 385 | " \"cnn_activation\":\"tanh\",\n", 386 | " \"rnn_activation\":\"softmax\",\n", 387 | " \"units_lstm\" : 100,\n", 388 | " \"loss\": \"sparse_categorical_crossentropy\",\n", 389 | " \"text_size\": 50,\n", 390 | " \"dropout_rate\": 0.5,\n", 391 | " \"epochs\": 100,\n", 392 | " \"model_name\": \"cnn_model\",\n", 393 | " \"batch_size\": 32,\n", 394 | " \"verbose\": True,\n", 395 | " \"metrics\":[\"accuracy\"]\n", 396 | " }\n", 397 | "model_obj = DesignModel(params)\n", 398 | "model_obj.BiRNN_model()\n", 399 | "model_obj.train_model()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "class LoadAndSaveModels():\n", 409 | " \n", 410 | " def save_model(self,model,model_name):\n", 411 | " model.save(\"Model_Data/entity_models/\"+model_name+\".h5\")\n", 412 | " print(\"Model saved to Model folder.\")\n", 413 | " \n", 414 | " def save_dict(self, save_path,dictionaries): \n", 415 | " \n", 416 | " for item in dictionaries:\n", 417 | " \n", 418 | " with open(save_path+\"/\"+item[1]+\".txt\", \"wb\") as myFile:\n", 419 | " pickle.dump(item[0], myFile)\n", 420 | "\n", 421 | " print(\"Files saved.\")\n", 422 | " \n", 423 | " def load_dict(self,file):\n", 424 | " with open(file,\"rb\") as fp:\n", 425 | " dict = pickle.load(fp)\n", 426 | " return dict\n", 427 | " \n", 428 | " def load_model(self,model_name):\n", 429 | " model = load_model(model_name)\n", 430 | " return model\n" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "load_save = LoadAndSaveModels()\n", 440 | "load_save.save_model(model_obj.model,\"birnn\")\n", 441 | "dict = [(word_to_id,\"word_to_id\"),(label_to_id,\"label_to_id\"),(char_to_id,\"char_to_id\"),\n", 442 | " (id_to_label,\"id_to_label\"),(case_to_id,\"case_to_id\"),(pos_to_id,\"pos_to_id\")]\n", 443 | "load_save.save_dict(\"Model_Data/dict\",dict)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "load_save = LoadAndSaveModels()\n", 453 | "model = load_save.load_model(\"Model_Data/entity_models/birnn.h5\")\n", 454 | "word_to_id = load_save.load_dict(\"Model_Data/dict/word_to_id.txt\")\n", 455 | "case_to_id = load_save.load_dict(\"Model_Data/dict/case_to_id.txt\")\n", 456 | "pos_to_id = load_save.load_dict(\"Model_Data/dict/pos_to_id.txt\")\n", 457 | "char_to_id = load_save.load_dict(\"Model_Data/dict/char_to_id.txt\")\n", 458 | "label_to_id = load_save.load_dict(\"Model_Data/dict/label_to_id.txt\")\n", 459 | "id_to_label = load_save.load_dict(\"Model_Data/dict/id_to_label.txt\")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "validation_sentences = preprocess_obj.get_sentences(load_data_obj.validation_files)\n", 469 | "validation_set = preprocess_obj.padding(preprocess_obj.create_tensors(validation_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n", 470 | "validation_batch,validation_batch_len = 
preprocess_obj.make_batch(validation_set)\n" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "class Prediction():\n", 480 | " def __init__(self):\n", 481 | " self.case_to_id = case_to_id\n", 482 | " self.pos_to_id = pos_to_id\n", 483 | " self.char_to_id = char_to_id\n", 484 | " self.label_to_id = label_to_id\n", 485 | " self.id_to_label = id_to_label\n", 486 | " self.word_to_id = word_to_id\n", 487 | " def prediction(self,dataset,model):\n", 488 | " correct_labels = []\n", 489 | " predict_labels = []\n", 490 | " b = Progbar(len(dataset))\n", 491 | " for i,data in enumerate(dataset): \n", 492 | " tkns, csng, char,pos, labels = data\n", 493 | " tkns = np.asarray([tkns]) \n", 494 | " char = np.asarray([char])\n", 495 | " csng = np.asarray([csng])\n", 496 | " pos = np.asarray([pos])\n", 497 | " predict = model.predict([tkns, csng, char,pos], verbose=False)[0] \n", 498 | " predict = predict.argmax(axis=-1) \n", 499 | " correct_labels.append(labels)\n", 500 | " predict_labels.append(predict)\n", 501 | " b.update(i)\n", 502 | " return predict_labels, correct_labels\n", 503 | " \n", 504 | " def predict(self,sentence,model):\n", 505 | " sen_list = [[[i,'POS','O\\n'] for i in sentence.split()]]\n", 506 | " test_sent = preprocess_obj.addCharInformatioin(sen_list)\n", 507 | "\n", 508 | " predLabels = []\n", 509 | "\n", 510 | " test_set = preprocess_obj.padding(preprocess_obj.create_tensors(test_sent,self.word_to_id,\n", 511 | " self.case_to_id,self.pos_to_id,\n", 512 | " self.char_to_id,self.label_to_id))\n", 513 | " test_batch,test_batch_len = preprocess_obj.createBatches(test_set)\n", 514 | " for i,data in enumerate(test_batch):\n", 515 | " tokens, csng, char, pos, labels = data\n", 516 | " tokens = np.asarray([tokens]) \n", 517 | " char = np.asarray([char])\n", 518 | " csng = np.asarray([csng])\n", 519 | " pos = np.asarray([pos])\n", 520 | " pred = model.predict([tokens,csng, char,pos], verbose=False)[0] \n", 521 | " pred = pred.argmax(axis=-1) #Predict the classes \n", 522 | " predLabels.append(pred)\n", 523 | " entity_labels = []\n", 524 | " j = 0\n", 525 | " words_list = sentence.split()\n", 526 | " for i in predLabels[-1]:\n", 527 | " entity_labels.append((words_list[j],self.id_to_label[int(i)].replace(\"\\n\",\"\")))\n", 528 | " j+=1\n", 529 | "\n", 530 | " return entity_labels" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "pred_obj = Prediction()" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "sent = \"Add Richard McNamara newest song to the Just Smile playlist\"\n", 549 | "entity_label = pred_obj.predict(sent,model)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "entity_label" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "class Evaluate():\n", 568 | " def compute_precision(self,guessed_sentences, correct_sentences):\n", 569 | " assert(len(guessed_sentences) == len(correct_sentences))\n", 570 | " correctCount = 0\n", 571 | " count = 0\n", 572 | "\n", 573 | "\n", 574 | " for sentenceIdx in range(len(guessed_sentences)):\n", 575 | " guessed = guessed_sentences[sentenceIdx]\n", 576 | " 
correct = correct_sentences[sentenceIdx]\n", 577 | " assert(len(guessed) == len(correct))\n", 578 | " idx = 0\n", 579 | " while idx < len(guessed):\n", 580 | " if guessed[idx][0] == 'B': #A new chunk starts\n", 581 | " count += 1\n", 582 | "\n", 583 | " if guessed[idx] == correct[idx]:\n", 584 | " idx += 1\n", 585 | " correctlyFound = True\n", 586 | "\n", 587 | " while idx < len(guessed) and guessed[idx][0] == 'I': #Scan until it no longer starts with I\n", 588 | " if guessed[idx] != correct[idx]:\n", 589 | " correctlyFound = False\n", 590 | "\n", 591 | " idx += 1\n", 592 | "\n", 593 | " if idx < len(guessed):\n", 594 | " if correct[idx][0] == 'I': #The chunk in correct was longer\n", 595 | " correctlyFound = False\n", 596 | "\n", 597 | "\n", 598 | " if correctlyFound:\n", 599 | " correctCount += 1\n", 600 | " else:\n", 601 | " idx += 1\n", 602 | " else: \n", 603 | " idx += 1\n", 604 | "\n", 605 | " precision = 0\n", 606 | " if count > 0: \n", 607 | " precision = float(correctCount) / count\n", 608 | "\n", 609 | " return precision\n", 610 | " def get_metrics(self,predictions, correct, idx2Label): \n", 611 | " label_pred = [] \n", 612 | " for sentence in predictions:\n", 613 | " label_pred.append([idx2Label[element] for element in sentence])\n", 614 | "\n", 615 | " label_correct = [] \n", 616 | " for sentence in correct:\n", 617 | " label_correct.append([idx2Label[element] for element in sentence])\n", 618 | "\n", 619 | "\n", 620 | " #print label_pred\n", 621 | " #print label_correct\n", 622 | "\n", 623 | " prec = self.compute_precision(label_pred, label_correct)\n", 624 | " rec = self.compute_precision(label_correct, label_pred)\n", 625 | "\n", 626 | " f1 = 0\n", 627 | " if (rec+prec) > 0:\n", 628 | " f1 = 2.0 * prec * rec / (prec + rec);\n", 629 | "\n", 630 | " return prec, rec, f1" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "eval_obj = Evaluate()\n", 640 | "\n", 641 | "train_predict_labels, train_correct_labels = pred_obj.prediction(train_data_set,model)\n", 642 | "pre_train, rec_train, f1_train= eval_obj.get_metrics(train_predict_labels, train_correct_labels, id_to_label)\n", 643 | "print(\"Train-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_train, rec_train, f1_train))\n", 644 | " \n", 645 | "validation_predict_labels, validation_correct_labels = pred_obj.prediction(validation_set,model)\n", 646 | "pre_test, rec_test, f1_test= eval_obj.get_metrics(validation_predict_labels, validation_correct_labels, id_to_label)\n", 647 | "print(\"Validation-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_test, rec_test, f1_test))\n" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [] 656 | } 657 | ], 658 | "metadata": { 659 | "kernelspec": { 660 | "display_name": "Python 3", 661 | "language": "python", 662 | "name": "python3" 663 | }, 664 | "language_info": { 665 | "codemirror_mode": { 666 | "name": "ipython", 667 | "version": 3 668 | }, 669 | "file_extension": ".py", 670 | "mimetype": "text/x-python", 671 | "name": "python", 672 | "nbconvert_exporter": "python", 673 | "pygments_lexer": "ipython3", 674 | "version": "3.8.2" 675 | } 676 | }, 677 | "nbformat": 4, 678 | "nbformat_minor": 4 679 | } 680 | -------------------------------------------------------------------------------- /1.5-named-entity-recognition/simple_ner-2.0.ipynb: 
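(A note on the Evaluate cell of ner_keras.ipynb above.) compute_precision counts the chunks that start with a `B-` tag in the guessed sequence and treats a chunk as correct only when every tag in it matches; recall is obtained by calling the same function with the arguments swapped, and get_metrics combines the two as F1 = 2*P*R/(P+R). A worked micro-example with hypothetical tags:

    guessed = ['B-artist', 'I-artist', 'O', 'B-playlist']
    correct = ['B-artist', 'I-artist', 'O', 'O']
    # guessed contains 2 chunks, 1 matches exactly          -> precision = 1/2
    # swapping the arguments: correct has 1 chunk, 1 found  -> recall    = 1/1
    # F1 = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 2/3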
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from numpy import array\n", 10 | "import tensorflow as tf\n", 11 | "import glob\n", 12 | "import numpy as np\n", 13 | "import pickle\n", 14 | "from datetime import datetime\n", 15 | "import nltk \n", 16 | "\n", 17 | "from sklearn.metrics import accuracy_score\n", 18 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 19 | "from tensorflow.keras.utils import to_categorical\n", 20 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 21 | "from tensorflow.keras.models import Sequential,Model\n", 22 | "from tensorflow.keras.layers import Dense\n", 23 | "from tensorflow.keras.layers import LSTM\n", 24 | "from tensorflow.keras.layers import Input\n", 25 | "from tensorflow.keras.layers import Dropout\n", 26 | "from tensorflow.keras.layers import Embedding\n", 27 | "from tensorflow.keras.layers import TimeDistributed\n", 28 | "from tensorflow.keras.layers import Conv1D\n", 29 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n", 30 | "from tensorflow.keras.utils import Progbar\n", 31 | "from tensorflow.keras.models import load_model\n", 32 | "\n", 33 | "from tensorflow.keras.initializers import RandomUniform\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "class LoadData():\n", 43 | " def __init__(self):\n", 44 | " self.train_files = None\n", 45 | " self.validation_files = None\n", 46 | " \n", 47 | " def get_data(self):\n", 48 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n", 49 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n", 50 | " \n", 51 | " def sentence_from_file(self,filename):\n", 52 | " single_data_list = list()\n", 53 | " with open(filename) as fp:\n", 54 | " sentence_list = []\n", 55 | " lines = fp.readlines()\n", 56 | " for line in lines:\n", 57 | " splits = line.split(' ')\n", 58 | " if splits[0]=='\\n':\n", 59 | " #sent = \" \".join([word[0] for word in sentence_list])\n", 60 | " #single_data_list.append((sentence_list,sent))\n", 61 | " single_data_list.append(sentence_list)\n", 62 | " sentence_list = list()\n", 63 | " else:\n", 64 | " sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n", 65 | " \n", 66 | " return single_data_list\n", 67 | " \n", 68 | " def addCharInformatioin(self,Sentences):\n", 69 | " for i,sentence in enumerate(Sentences):\n", 70 | " for j,data in enumerate(sentence):\n", 71 | " chars = [c for c in data[0]]\n", 72 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n", 73 | " return Sentences\n", 74 | " \n", 75 | " def prepared_data(self,files):\n", 76 | " list_sentences = list()\n", 77 | " for each_file in files:\n", 78 | " sentences = self.sentence_from_file(each_file)\n", 79 | " #sentences = self.addCharInformatioin(sentences)\n", 80 | " list_sentences+= sentences\n", 81 | " return list_sentences\n", 82 | " " 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "load_data_obj = LoadData()\n", 92 | "load_data_obj.get_data()\n", 93 | "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n", 94 | "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n", 95 | 
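# Each element of trained_sen_list / validation_sen_list is one sentence, read from the
# space-separated files produced by data_making.py: a list of (word, POS-tag, BIO-label)
# triples, one per token. A hypothetical example of a single element:
#   [('Add', 'VB', 'O'), ('Kun', 'NNP', 'B-artist'), ('Aguero', 'NNP', 'I-artist'),
#    ('to', 'TO', 'O'), ('my', 'PRP$', 'O'), ('playlist', 'NN', 'O')]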
"print(trained_sen_list[:5])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "class Preprocessing():\n", 105 | " def __init__(self):\n", 106 | " self.max_len = len(max(trained_sen_list))\n", 107 | " \n", 108 | " def make_data(self,data_list):\n", 109 | " \n", 110 | " \n", 111 | " words = list()\n", 112 | " for each_sent in data_list:\n", 113 | " for each_item in each_sent:\n", 114 | " words.append(each_item[0])\n", 115 | " words = list(set(words))\n", 116 | "\n", 117 | " \n", 118 | " pos_tags = list()\n", 119 | " for each_sent in data_list:\n", 120 | " for each_item in each_sent:\n", 121 | " pos_tags.append(each_item[1])\n", 122 | " pos_tags = list(set(pos_tags))\n", 123 | " \n", 124 | " labels = list()\n", 125 | " for each_sent in data_list:\n", 126 | " for each_item in each_sent:\n", 127 | " labels.append(each_item[2])\n", 128 | " labels = list(set(labels))\n", 129 | " \n", 130 | " \n", 131 | " self.word2idx = {w: i for i, w in enumerate(words)}\n", 132 | " self.word2idx.update({\"PAD\": len(self.word2idx), \"UNK\": len(self.word2idx)+1})\n", 133 | " self.num_words = len(self.word2idx)\n", 134 | " \n", 135 | " self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n", 136 | " self.pos_tag2idx.update({\"PAD\": len(self.pos_tag2idx), \"UNK\": len(self.pos_tag2idx)+1})\n", 137 | " self.num_pos_tags = len(self.pos_tag2idx)\n", 138 | " \n", 139 | " self.label2idx = {t: i for i, t in enumerate(labels)}\n", 140 | " self.num_lables = len(self.label2idx)\n", 141 | " \n", 142 | " def word2features(self,data, word_dict):\n", 143 | " word = data[0]\n", 144 | " postag = data[1]\n", 145 | " binary_map = {True:0,False:1,None:2}\n", 146 | " features = [word_dict[word],binary_map[word.islower()], \n", 147 | " binary_map[word.isupper()], binary_map[word.istitle()], \n", 148 | " binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n", 149 | " return features\n", 150 | "\n", 151 | "\n", 152 | " def sent2features(self,sent,word_dict):\n", 153 | " sentence_features = list()\n", 154 | " for index in range(len(sent)):\n", 155 | " sentence_features.append(self.word2features(sent[index],word_dict))\n", 156 | " \n", 157 | " return sentence_features\n", 158 | "\n", 159 | " def sent2labels(self,sent):\n", 160 | " return [label for token, postag, label in sent]\n", 161 | "\n", 162 | " def sent2tokens(self,sent):\n", 163 | " return [token for token, postag, label in sent]\n", 164 | " \n", 165 | " def create_data(self,data_list):\n", 166 | " self.sentences = data_list\n", 167 | " maxlen = max([len(item) for item in data_list])\n", 168 | " self.max_len = maxlen\n", 169 | " wd = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n", 170 | " \n", 171 | " wd = pad_sequences(maxlen=maxlen, sequences=wd, padding=\"post\",value=self.word2idx[\"PAD\"])\n", 172 | " \n", 173 | " pos = [[self.pos_tag2idx[w[1]] for w in s] for s in self.sentences]\n", 174 | " pos = pad_sequences(maxlen=maxlen, sequences=pos, padding=\"post\",value=self.pos_tag2idx[\"PAD\"])\n", 175 | "\n", 176 | " y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n", 177 | " y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n", 178 | " return (wd,pos),y" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "preprocess_obj = Preprocessing()\n", 188 | 
"preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n", 189 | "x_train,y_train = preprocess_obj.create_data(trained_sen_list)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "class MyCallback(tf.keras.callbacks.Callback):\n", 199 | " def __init__(self, monitor='acc', baseline=0.95):\n", 200 | " self.monitor = monitor\n", 201 | " self.baseline = baseline\n", 202 | " self.training_stop = False\n", 203 | "\n", 204 | " def on_train_begin(self, logs={}):\n", 205 | " self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n", 206 | "\n", 207 | " def on_epoch_end(self, epoch, logs={}):\n", 208 | " if logs and logs.get(self.monitor) >= self.baseline:\n", 209 | " print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n", 210 | " self.training_stop = True\n", 211 | " \n", 212 | " if self.training_stop: \n", 213 | " self.model.stop_training = True\n", 214 | "\n", 215 | "\n", 216 | "class CreateModel():\n", 217 | " def __init__(self):\n", 218 | " self.model = None\n", 219 | " self.history = None\n", 220 | " self.x_train = x_train\n", 221 | " self.y_train = y_train\n", 222 | " self.max_len = preprocess_obj.max_len\n", 223 | " self.num_words = preprocess_obj.num_words\n", 224 | " self.num_labels = preprocess_obj.num_lables\n", 225 | " self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n", 226 | " \n", 227 | " def train(self):\n", 228 | " word_input = Input(shape=(self.max_len,))\n", 229 | " word_model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n", 230 | " \n", 231 | " pos_input = Input(shape=(None,), dtype='int32')\n", 232 | " pos_model = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n", 233 | "\n", 234 | " output = concatenate([word_model, pos_model])\n", 235 | " \n", 236 | " output = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(output)\n", 237 | " output = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(output)\n", 238 | " \n", 239 | " self.model = Model(inputs=[word_input, pos_input], outputs=[output])\n", 240 | " self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n", 241 | " \n", 242 | " def run(self,batch_size=32,epoch=5):\n", 243 | " logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", 244 | " logdir = \"logs_tensorboard\"\n", 245 | " tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n", 246 | " \n", 247 | " val_acc = 0.99\n", 248 | " monitor_param = 'val_acc'\n", 249 | " \n", 250 | " checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n", 251 | " \n", 252 | " #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n", 253 | " self.history = self.model.fit(self.x_train, self.y_train,\n", 254 | " batch_size=batch_size, epochs=epoch,\n", 255 | " validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n", 256 | " verbose=1)\n", 257 | " def save_model(self,model_file):\n", 258 | " self.model.save(model_file)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "model_obj = CreateModel()\n", 268 | "model_obj.train()\n", 269 | "model_obj.run(batch_size=32,epoch=100)\n", 270 | 
"model_obj.save_model(\"models/simple_ner_model_v2.h5\")" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "class Prediction():\n", 280 | " def __init__(self):\n", 281 | " self.word2idx = preprocess_obj.word2idx\n", 282 | " self.pos_tag2idx = preprocess_obj.pos_tag2idx\n", 283 | " self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n", 284 | " self.model = model_obj.model\n", 285 | " self.max_len = preprocess_obj.max_len\n", 286 | " def predict(self,texts):\n", 287 | " label_lists = list()\n", 288 | " for text in texts:\n", 289 | " words = text.split()\n", 290 | " tagged = nltk.pos_tag(words) \n", 291 | " \n", 292 | " wd = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n", 293 | " wd = pad_sequences(maxlen=self.max_len, sequences=wd,\n", 294 | " padding=\"post\", value=self.word2idx[\"PAD\"])\n", 295 | " \n", 296 | " pos = [[self.pos_tag2idx.get(item, self.pos_tag2idx[\"UNK\"]) for item in tagged]]\n", 297 | " pos = pad_sequences(maxlen=self.max_len, sequences=pos,\n", 298 | " padding=\"post\", value=self.pos_tag2idx[\"PAD\"])\n", 299 | " \n", 300 | " y_pred = self.model.predict([wd,pos])\n", 301 | " pred_index = np.argmax(y_pred, axis=-1)\n", 302 | " preds = pred_index.flatten().tolist()\n", 303 | " labels = [self.idx2label[ind] for ind in preds]\n", 304 | " label_lists.append(labels)\n", 305 | " \n", 306 | " print([(words[idx],labels[idx]) for idx in range(len(words))])\n", 307 | " return label_lists\n", 308 | " " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "#print(preprocess_obj.word2idx)\n", 318 | "pred_obj = Prediction()\n", 319 | "text = \"Play the last track from Beyonce off Spotify\"\n", 320 | "y_pred = pred_obj.predict([text])" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.6.9" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 2 352 | } 353 | -------------------------------------------------------------------------------- /1.5-named-entity-recognition/simple_ner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from numpy import array\n", 10 | "import tensorflow as tf\n", 11 | "import glob\n", 12 | "import numpy as np\n", 13 | "import pickle\n", 14 | "from datetime import datetime\n", 15 | "\n", 16 | "from sklearn.metrics import accuracy_score\n", 17 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 18 | "from tensorflow.keras.utils import to_categorical\n", 19 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 20 | "from tensorflow.keras.models import Sequential,Model\n", 21 | "from tensorflow.keras.layers import Dense\n", 22 | "from tensorflow.keras.layers import LSTM\n", 23 | "from 
tensorflow.keras.layers import Input\n", 24 | "from tensorflow.keras.layers import Dropout\n", 25 | "from tensorflow.keras.layers import Embedding\n", 26 | "from tensorflow.keras.layers import TimeDistributed\n", 27 | "from tensorflow.keras.layers import Conv1D\n", 28 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n", 29 | "from tensorflow.keras.utils import Progbar\n", 30 | "from tensorflow.keras.models import load_model\n", 31 | "\n", 32 | "from tensorflow.keras.initializers import RandomUniform\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "class LoadData():\n", 42 | " def __init__(self):\n", 43 | " self.train_files = None\n", 44 | " self.validation_files = None\n", 45 | " \n", 46 | " def get_data(self):\n", 47 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n", 48 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n", 49 | " \n", 50 | " def sentence_from_file(self,filename):\n", 51 | " single_data_list = list()\n", 52 | " with open(filename) as fp:\n", 53 | " sentence_list = []\n", 54 | " lines = fp.readlines()\n", 55 | " for line in lines:\n", 56 | " splits = line.split(' ')\n", 57 | " if splits[0]=='\\n':\n", 58 | " #sent = \" \".join([word[0] for word in sentence_list])\n", 59 | " #single_data_list.append((sentence_list,sent))\n", 60 | " single_data_list.append(sentence_list)\n", 61 | " sentence_list = list()\n", 62 | " else:\n", 63 | " sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n", 64 | " \n", 65 | " return single_data_list\n", 66 | " \n", 67 | " def addCharInformatioin(self,Sentences):\n", 68 | " for i,sentence in enumerate(Sentences):\n", 69 | " for j,data in enumerate(sentence):\n", 70 | " chars = [c for c in data[0]]\n", 71 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n", 72 | " return Sentences\n", 73 | " \n", 74 | " def prepared_data(self,files):\n", 75 | " list_sentences = list()\n", 76 | " for each_file in files:\n", 77 | " sentences = self.sentence_from_file(each_file)\n", 78 | " #sentences = self.addCharInformatioin(sentences)\n", 79 | " list_sentences+= sentences\n", 80 | " return list_sentences\n", 81 | " " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "load_data_obj = LoadData()\n", 91 | "load_data_obj.get_data()\n", 92 | "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n", 93 | "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n", 94 | "print(trained_sen_list[:5])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "class Preprocessing():\n", 104 | " def __init__(self):\n", 105 | " self.max_len = len(max(trained_sen_list))\n", 106 | " \n", 107 | " def make_data(self,data_list):\n", 108 | " \n", 109 | " \n", 110 | " words = list()\n", 111 | " for each_sent in data_list:\n", 112 | " for each_item in each_sent:\n", 113 | " words.append(each_item[0])\n", 114 | " words = list(set(words))\n", 115 | "\n", 116 | " \n", 117 | " pos_tags = list()\n", 118 | " for each_sent in data_list:\n", 119 | " for each_item in each_sent:\n", 120 | " pos_tags.append(each_item[1])\n", 121 | " pos_tags = list(set(pos_tags))\n", 122 | " \n", 123 | " labels = list()\n", 124 | " for each_sent in data_list:\n", 125 | " for 
each_item in each_sent:\n", 126 | " labels.append(each_item[2])\n", 127 | " labels = list(set(labels))\n", 128 | " \n", 129 | " self.word2idx = {\"PAD\": 0, \"UNK\": 1}\n", 130 | " self.word2idx.update({w: i for i, w in enumerate(words)})\n", 131 | " self.num_words = len(self.word2idx)\n", 132 | " \n", 133 | " self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n", 134 | " self.num_pos_tags = len(self.pos_tag2idx)\n", 135 | " \n", 136 | " self.label2idx = {t: i for i, t in enumerate(labels)}\n", 137 | " self.num_lables = len(self.label2idx)\n", 138 | " \n", 139 | " def word2features(self,data, word_dict):\n", 140 | " word = data[0]\n", 141 | " postag = data[1]\n", 142 | " binary_map = {True:0,False:1,None:2}\n", 143 | " features = [word_dict[word],binary_map[word.islower()], \n", 144 | " binary_map[word.isupper()], binary_map[word.istitle()], \n", 145 | " binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n", 146 | " return features\n", 147 | "\n", 148 | "\n", 149 | " def sent2features(self,sent,word_dict):\n", 150 | " sentence_features = list()\n", 151 | " for index in range(len(sent)):\n", 152 | " sentence_features.append(self.word2features(sent[index],word_dict))\n", 153 | " \n", 154 | " return sentence_features\n", 155 | "\n", 156 | " def sent2labels(self,sent):\n", 157 | " return [label for token, postag, label in sent]\n", 158 | "\n", 159 | " def sent2tokens(self,sent):\n", 160 | " return [token for token, postag, label in sent]\n", 161 | " \n", 162 | " def create_data(self,data_list):\n", 163 | " self.sentences = data_list\n", 164 | " maxlen = max([len(item) for item in data_list])\n", 165 | " self.max_len = maxlen\n", 166 | " x = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n", 167 | " #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.num_words - 1)\n", 168 | " x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.word2idx[\"PAD\"])\n", 169 | " #x = [self.sent2features(s,self.word2idx) for s in self.sentences]\n", 170 | " #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=[0,2,2,2,2,len(self.pos_tag2idx)])\n", 171 | " print(x[2])\n", 172 | " y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n", 173 | " y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n", 174 | " return x,y" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "preprocess_obj = Preprocessing()\n", 184 | "preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n", 185 | "x_train,y_train = preprocess_obj.create_data(trained_sen_list)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "class MyCallback(tf.keras.callbacks.Callback):\n", 195 | " def __init__(self, monitor='acc', baseline=0.95):\n", 196 | " self.monitor = monitor\n", 197 | " self.baseline = baseline\n", 198 | " self.training_stop = False\n", 199 | "\n", 200 | " def on_train_begin(self, logs={}):\n", 201 | " self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n", 202 | "\n", 203 | " def on_epoch_end(self, epoch, logs={}):\n", 204 | " if logs and logs.get(self.monitor) >= self.baseline:\n", 205 | " print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n", 206 | " self.training_stop = True\n", 207 | " \n", 208 | " if self.training_stop: \n", 209 | " 
self.model.stop_training = True\n", 210 | "\n", 211 | "\n", 212 | "class CreateModel():\n", 213 | " def __init__(self):\n", 214 | " self.model = None\n", 215 | " self.history = None\n", 216 | " self.x_train = x_train\n", 217 | " self.y_train = y_train\n", 218 | " self.max_len = preprocess_obj.max_len\n", 219 | " self.num_words = preprocess_obj.num_words\n", 220 | " self.num_labels = preprocess_obj.num_lables\n", 221 | " self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n", 222 | " \n", 223 | " def train(self):\n", 224 | " word_input = Input(shape=(self.max_len,))\n", 225 | " model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n", 226 | " model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)\n", 227 | " out = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(model)\n", 228 | " \n", 229 | " self.model = Model(word_input,out)\n", 230 | " self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n", 231 | " \n", 232 | " def run(self,batch_size=32,epoch=5):\n", 233 | " logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", 234 | " tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n", 235 | " \n", 236 | " val_acc = 0.99\n", 237 | " monitor_param = 'val_acc'\n", 238 | " \n", 239 | " checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n", 240 | " \n", 241 | " #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n", 242 | " self.history = self.model.fit(self.x_train, self.y_train,\n", 243 | " batch_size=batch_size, epochs=epoch,\n", 244 | " validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n", 245 | " verbose=1)\n", 246 | " def save_model(self,model_file):\n", 247 | " self.model.save(model_file)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "model_obj = CreateModel()\n", 257 | "model_obj.train()\n", 258 | "model_obj.run(batch_size=32,epoch=100)\n", 259 | "model_obj.save_model(\"models/simple_ner_model.h5\")" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "class Prediction():\n", 269 | " def __init__(self):\n", 270 | " self.word2idx = preprocess_obj.word2idx\n", 271 | " self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n", 272 | " self.model = model_obj.model\n", 273 | " self.max_len = preprocess_obj.max_len\n", 274 | " def predict(self,texts):\n", 275 | " label_lists = list()\n", 276 | " for text in texts:\n", 277 | " words = text.split()\n", 278 | " x = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n", 279 | " x = pad_sequences(maxlen=self.max_len, sequences=x,\n", 280 | " padding=\"post\", value=self.word2idx[\"PAD\"])\n", 281 | " y_pred = self.model.predict(x)\n", 282 | " print(\"Predicted Probabilities on Test Set:\\n\",y_pred.shape)\n", 283 | " # taking tag class with maximum probability\n", 284 | " pred_index = np.argmax(y_pred, axis=-1)\n", 285 | " print(\"Predicted tag indices: \\n\",pred_index.shape)\n", 286 | " preds = pred_index.flatten().tolist()\n", 287 | " labels = [self.idx2label[ind] for ind in preds]\n", 288 | " label_lists.append(labels)\n", 289 | " \n", 290 | " print([(words[idx],labels[idx]) for idx in range(len(words))])\n", 291 | " #print(labels)\n", 292 | " return 
label_lists\n", 293 | " " 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "#print(preprocess_obj.word2idx)\n", 303 | "pred_obj = Prediction()\n", 304 | "'''\n", 305 | "for item in validation_sen_list:\n", 306 | " sent = \" \".join([self.word2idx[w[0]] for w in s] for item in self.sentences])\n", 307 | " \n", 308 | "'''\n", 309 | "text = \"Play the last track from Beyoncé off Spotify\"\n", 310 | "y_pred = pred_obj.predict([text,text])" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.6.9" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /1.6-intent-classification/README.md: -------------------------------------------------------------------------------- 1 | Use the below link to get the data. 2 | https://www.kaggle.com/joydeb28/nlp-benchmarking-data-for-intent-and-entity 3 | -------------------------------------------------------------------------------- /1.6-intent-classification/intent_classfication_bert_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import numpy as np # linear algebra\n", 13 | "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", 14 | "import json\n", 15 | "import os\n", 16 | "from sklearn.metrics import roc_curve\n", 17 | "from sklearn.metrics import accuracy_score\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from tensorflow.keras.utils import to_categorical\n", 20 | "from tensorflow.keras.models import Sequential, Model\n", 21 | "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n", 22 | "from tensorflow.keras.optimizers import Adam\n", 23 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 24 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 25 | "import bert\n", 26 | "from tqdm import tqdm\n", 27 | "from tensorflow.keras import backend as K\n", 28 | "import tensorflow as tf\n", 29 | "import tensorflow_hub as hub\n", 30 | "print(\"TensorFlow Version:\",tf.__version__)\n", 31 | "print(\"Hub version: \",hub.__version__)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 39 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "class LoadingData():\n", 44 | " \n", 45 | " def __init__(self):\n", 46 | " train_file_path = os.path.join(\"benchmarking_data\",\"Train\")\n", 47 | " validation_file_path = os.path.join(\"benchmarking_data\",\"Validate\")\n", 48 | " category_id = 0\n", 49 | " self.cat_to_intent = {}\n", 50 | " self.intent_to_cat = {}\n", 51 | " \n", 52 | " for dirname, _, filenames in os.walk(train_file_path):\n", 53 | " for filename in filenames:\n", 54 | " file_path = os.path.join(dirname, filename)\n", 55 | " intent_id = filename.replace(\".json\",\"\")\n", 56 | " self.cat_to_intent[category_id] = intent_id\n", 57 | " self.intent_to_cat[intent_id] = category_id\n", 58 | " category_id+=1\n", 59 | " print(self.cat_to_intent)\n", 60 | " print(self.intent_to_cat)\n", 61 | " '''Training data'''\n", 62 | " training_data = list() \n", 63 | " for dirname, _, filenames in os.walk(train_file_path):\n", 64 | " for filename in filenames:\n", 65 | " file_path = os.path.join(dirname, filename)\n", 66 | " intent_id = filename.replace(\".json\",\"\")\n", 67 | " training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n", 68 | " self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category']) \n", 69 | " \n", 70 | " self.train_data_frame = self.train_data_frame.sample(frac = 1)\n", 71 | "\n", 72 | "\n", 73 | " \n", 74 | " '''Validation data'''\n", 75 | " validation_data = list() \n", 76 | " for dirname, _, filenames in os.walk(validation_file_path):\n", 77 | " for filename in filenames:\n", 78 | " file_path = os.path.join(dirname, filename)\n", 79 | " intent_id = filename.replace(\".json\",\"\")\n", 80 | " validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id]) \n", 81 | " self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n", 82 | "\n", 83 | " self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n", 84 | " \n", 85 | " \n", 86 | " def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n", 87 | " json_d = json.load(open(json_file)) \n", 88 | " \n", 89 | " json_dict = json_d[intent_id]\n", 90 | "\n", 91 | " sent_list = list()\n", 92 | " for i in json_dict:\n", 93 | " each_list = i['data']\n", 94 | " sent =\"\"\n", 95 | " for i in each_list:\n", 
96 | " sent = sent + i['text']+ \" \"\n", 97 | " sent =sent[:-1]\n", 98 | " for i in range(3):\n", 99 | " sent = sent.replace(\" \",\" \")\n", 100 | " sent_list.append((sent,intent_id,cat))\n", 101 | " return sent_list\n", 102 | " " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "load_data_obj = LoadingData()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "load_data_obj.train_data_frame.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "load_data_obj.validation_data_frame.head().values" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "class BertModel(object):\n", 139 | " \n", 140 | " def __init__(self):\n", 141 | " \n", 142 | " self.max_len = 128\n", 143 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n", 144 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n", 145 | " \n", 146 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n", 147 | "\n", 148 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n", 149 | "\n", 150 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n", 151 | "\n", 152 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n", 153 | " \n", 154 | " def get_masks(self,tokens, max_seq_length):\n", 155 | " return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n", 156 | "\n", 157 | " def get_segments(self,tokens, max_seq_length):\n", 158 | " \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n", 159 | " segments = []\n", 160 | " current_segment_id = 0\n", 161 | " for token in tokens:\n", 162 | " segments.append(current_segment_id)\n", 163 | " if token == \"[SEP]\":\n", 164 | " current_segment_id = 1\n", 165 | " return segments + [0] * (max_seq_length - len(tokens))\n", 166 | " \n", 167 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n", 168 | " \"\"\"Token ids from Tokenizer vocab\"\"\"\n", 169 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n", 170 | " input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n", 171 | " return input_ids\n", 172 | " def create_single_input(self,sentence,maxlen):\n", 173 | "\n", 174 | " stokens = self.tokenizer.tokenize(sentence)\n", 175 | "\n", 176 | " stokens = stokens[:maxlen]\n", 177 | "\n", 178 | " stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n", 179 | "\n", 180 | " ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n", 181 | " masks = self.get_masks(stokens, self.max_len)\n", 182 | " segments = self.get_segments(stokens, self.max_len)\n", 183 | "\n", 184 | " return ids,masks,segments\n", 185 | "\n", 186 | " def create_input_array(self,sentences):\n", 187 | " \n", 188 | " input_ids, input_masks, input_segments = [], [], []\n", 189 | "\n", 190 | " for sentence in tqdm(sentences,position=0, leave=True):\n", 191 | " ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n", 192 | "\n", 193 | " input_ids.append(ids)\n", 194 | " input_masks.append(masks)\n", 195 | " input_segments.append(segments)\n", 196 | " \n", 197 | " tensor = [np.asarray(input_ids, dtype=np.int32), \n", 198 | " np.asarray(input_masks, dtype=np.int32), \n", 199 | " 
np.asarray(input_segments, dtype=np.int32)]\n", 200 | "        return tensor" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class PreprocessingBertData():\n", 210 | "    \n", 211 | "    def prepare_data_x(self,train_sentences):\n", 212 | "        x = bert_model_obj.create_input_array(train_sentences)\n", 213 | "        return x\n", 214 | "    \n", 215 | "    def prepare_data_y(self,train_labels):\n", 216 | "        y = list()\n", 217 | "        for item in train_labels:\n", 218 | "            label = item\n", 219 | "            y.append(label)\n", 220 | "        y = np.array(y)\n", 221 | "        return y\n", 222 | "    \n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "bert_model_obj = BertModel()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "train_sentences = load_data_obj.train_data_frame[\"query\"].tolist()\n", 241 | "train_labels = load_data_obj.train_data_frame[\"category\"].tolist()\n", 242 | "\n", 243 | "preprocess_bert_data_obj = PreprocessingBertData()\n", 244 | "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n", 245 | "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n", 246 | "\n", 247 | "train_input_ids, train_input_masks, train_segment_ids = x\n", 248 | "train_labels = y\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "class DesignModel():\n", 258 | "    def __init__(self):\n", 259 | "        self.model = None\n", 260 | "        self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n", 261 | "        self.train_labels = train_labels\n", 262 | "    \n", 263 | "    def bert_model(self,max_seq_length): \n", 264 | "        in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n", 265 | "        in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n", 266 | "        in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n", 267 | "        \n", 268 | "        bert_inputs = [in_id, in_mask, in_segment]\n", 269 | "        bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n", 270 | "        \n", 271 | "        bert_output = tf.keras.layers.GlobalAveragePooling1D()(bert_sequence_output)\n", 272 | "        bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n", 273 | "        bert_outputs = tf.keras.layers.Dense(len(load_data_obj.cat_to_intent), activation=\"softmax\", name=\"dense_output\")(bert_output)\n", 274 | "        self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n", 275 | "        \n", 276 | "        self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),\n", 277 | "                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),\n", 278 | "                           metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name=\"acc\")])\n", 279 | "        \n", 280 | "        self.model.summary()\n", 281 | "    \n", 282 | "    def model_train(self,batch_size,num_epoch):\n", 283 | "        print(\"Fitting to model\")\n", 284 | "        self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n", 285 | "        print(\"Model Training complete.\")\n", 286 | "\n", 287 | "    def save_model(self,model,model_name): \n", 288 | "        self.model.save(model_name+\".h5\")\n", 289 | "        print(\"Model saved to Model folder.\")" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 |
"metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "model_obj = DesignModel()\n", 299 | "model_obj.bert_model(bert_model_obj.max_len)\n", 300 | "model_obj.model_train(32,1)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "model_obj.save_model(model_obj.model,\"bert\")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "class Evaluation():\n", 319 | " def get_accuracy(self,actuals, predictions):\n", 320 | " acc = accuracy_score(actuals, predictions)\n", 321 | " return acc" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "class Prediction():\n", 331 | " def __init__(self):\n", 332 | " self.model = model_obj.model\n", 333 | " \n", 334 | " def predict_validation(self):\n", 335 | " valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n", 336 | " valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n", 337 | "\n", 338 | " preprocess_bert_data_obj = PreprocessingBertData()\n", 339 | " val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n", 340 | " prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n", 341 | " return valid_labels,prediction_labels\n", 342 | " \n", 343 | " \n", 344 | " def predict(self,query):\n", 345 | " query_seq = bert_model_obj.create_input_array([query])\n", 346 | " pred = self.model.predict(query_seq)\n", 347 | " pred = np.argmax(pred)\n", 348 | " result = load_data_obj.cat_to_intent[pred]\n", 349 | " return result" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "pred_obj = Prediction()\n", 359 | "#pred_obj.predict_validation()" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "querylist = [['I want to see Medal for the General', 'SearchScreeningEvent', 1],\n", 369 | " ['Book a reservation for 5 people at the top-rated brasserie restaurant',\n", 370 | " 'BookRestaurant', 5],\n", 371 | " ['Can I put this tune onto my sin estres playlist?',\n", 372 | " 'AddToPlaylist', 6],\n", 373 | " ['add the artist Pete Murray to my relaxing playlist',\n", 374 | " 'AddToPlaylist', 6],\n", 375 | " ['Book me a reservation for a party of 3 at a pub in Northern Mariana Islands',\n", 376 | " 'BookRestaurant', 5]]\n", 377 | "for query in querylist:\n", 378 | " result = pred_obj.predict(query[0])\n", 379 | " print(\"Predicted Intent: \"+str(result)+\"\\tActual Intent: \"+(load_data_obj.cat_to_intent[query[2]])+\"\\tQuery: \"+str(query[0]))\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "eval_obj = Evaluation()\n", 389 | "ytest,ypred = pred_obj.predict_validation()\n", 390 | "acc = eval_obj.get_accuracy(ytest,ypred)\n", 391 | "print(\"Auc: {:.2%}\".format(acc))\n" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 
409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.6.9" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 4 416 | } 417 | -------------------------------------------------------------------------------- /1.6-intent-classification/intent_classfication_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.6.9" 21 | }, 22 | "colab": { 23 | "name": "intent_classfication_keras.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | }, 27 | "accelerator": "GPU" 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 34 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 35 | "id": "H9qTF3ffa7Pc", 36 | "colab_type": "code", 37 | "colab": {} 38 | }, 39 | "source": [ 40 | "# Data\n", 41 | "# " 42 | ], 43 | "execution_count": 1, 44 | "outputs": [] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "dVcaGQx5bw1n", 50 | "colab_type": "code", 51 | "colab": {} 52 | }, 53 | "source": [ 54 | "import numpy as np\n", 55 | "import pandas as pd\n", 56 | "import json\n", 57 | "import os\n", 58 | "import en_core_web_sm\n", 59 | "from sklearn.metrics import roc_curve\n", 60 | "from sklearn.metrics import accuracy_score\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "from tensorflow.keras.utils import to_categorical\n", 63 | "from tensorflow.keras.models import Sequential, Model, load_model\n", 64 | "from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Bidirectional, Activation\n", 65 | "from tensorflow.keras.optimizers import Adam\n", 66 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 67 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 68 | "from tensorflow.keras.layers import LSTM\n", 69 | "from tensorflow.keras.layers import SimpleRNN\n", 70 | "from tensorflow.keras.layers import Conv1D\n", 71 | "from tensorflow.keras.layers import Dropout\n", 72 | "from tensorflow.keras.layers import BatchNormalization\n", 73 | "from tensorflow.keras.layers import GlobalMaxPooling1D\n", 74 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 75 | "from tensorflow.keras.preprocessing.sequence import pad_sequences" 76 | ], 77 | "execution_count": 18, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "lystXd0mbNXk", 84 | "colab_type": "code", 85 | "colab": { 86 | "base_uri": "https://localhost:8080/", 87 | "height": 121 88 | }, 89 | "outputId": "a9dcaebf-a417-408f-8fa8-11ed934a3efb" 90 | }, 91 | "source": [ 92 | "from google.colab import drive\n", 93 | "drive.mount(\"/content/drive\")" 94 | ], 95 | "execution_count": 3, 96 | "outputs": [ 97 | { 98 | "output_type": "stream", 99 | "text": [ 100 | "Go to this URL in a browser: 
https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code\n", 101 | "\n", 102 | "Enter your authorization code:\n", 103 | "··········\n", 104 | "Mounted at /content/drive\n" 105 | ], 106 | "name": "stdout" 107 | } 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "metadata": { 113 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 114 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", 115 | "id": "jIf3M0bca7Pg", 116 | "colab_type": "code", 117 | "colab": {} 118 | }, 119 | "source": [ 120 | "class LoadingData():\n", 121 | " \n", 122 | " def __init__(self):\n", 123 | " data_dir = \"/content/drive/My Drive/Projects/Data\"\n", 124 | " train_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Train\")\n", 125 | " validation_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Validate\")\n", 126 | " category_id = 0\n", 127 | " self.cat_to_intent = {}\n", 128 | " self.intent_to_cat = {}\n", 129 | " \n", 130 | " for dirname, _, filenames in os.walk(train_file_path):\n", 131 | " for filename in filenames:\n", 132 | " file_path = os.path.join(dirname, filename)\n", 133 | " intent_id = filename.replace(\".json\",\"\")\n", 134 | " self.cat_to_intent[category_id] = intent_id\n", 135 | " self.intent_to_cat[intent_id] = category_id\n", 136 | " category_id+=1\n", 137 | " '''Training data'''\n", 138 | " training_data = list() \n", 139 | " for dirname, _, filenames in os.walk(train_file_path):\n", 140 | " for filename in filenames:\n", 141 | " file_path = os.path.join(dirname, filename)\n", 142 | " intent_id = filename.replace(\".json\",\"\")\n", 143 | " training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n", 144 | " self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category']) \n", 145 | " \n", 146 | " self.train_data_frame = self.train_data_frame.sample(frac = 1)\n", 147 | "\n", 148 | "\n", 149 | " \n", 150 | " '''Validation data'''\n", 151 | " validation_data = list() \n", 152 | " for dirname, _, filenames in os.walk(validation_file_path):\n", 153 | " for filename in filenames:\n", 154 | " file_path = os.path.join(dirname, filename)\n", 155 | " intent_id = filename.replace(\".json\",\"\")\n", 156 | " validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id]) \n", 157 | " self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n", 158 | "\n", 159 | " self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n", 160 | " \n", 161 | " \n", 162 | " def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n", 163 | " json_d = json.load(open(json_file)) \n", 164 | " \n", 165 | " json_dict = json_d[intent_id]\n", 166 | "\n", 167 | " sent_list = list()\n", 168 | " for i in json_dict:\n", 169 | " each_list = i['data']\n", 170 | " sent =\"\"\n", 171 | " for i in each_list:\n", 172 | " sent = sent + i['text']+ \" \"\n", 173 | " sent =sent[:-1]\n", 174 | " for i in range(3):\n", 175 | " sent = sent.replace(\" \",\" \")\n", 176 | " sent_list.append((sent,intent_id,cat))\n", 177 | " return sent_list\n", 178 | " " 179 | ], 180 
| "execution_count": 7, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "CpWQixmea7Pi", 187 | "colab_type": "code", 188 | "colab": {} 189 | }, 190 | "source": [ 191 | "load_data_obj = LoadingData()" 192 | ], 193 | "execution_count": 8, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "metadata": { 199 | "id": "Hy352jKEa7Pl", 200 | "colab_type": "code", 201 | "colab": { 202 | "base_uri": "https://localhost:8080/", 203 | "height": 195 204 | }, 205 | "outputId": "70dc188d-ad57-4a29-c184-aac5abb806ae" 206 | }, 207 | "source": [ 208 | "load_data_obj.train_data_frame.head()" 209 | ], 210 | "execution_count": 10, 211 | "outputs": [ 212 | { 213 | "output_type": "execute_result", 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "
queryintentcategory
8770rate the current novel 5 starsRateBook4
6557Find the schedule for Kingsman: The Secret Ser...SearchScreeningEvent3
721find Bells Break Their Towers , a video gameSearchCreativeWork0
229show creativity of A Catholic EducationSearchCreativeWork0
3680Will it be warm in Powersville Guam 23 hours f...GetWeather1
\n", 272 | "
" 273 | ], 274 | "text/plain": [ 275 | " query ... category\n", 276 | "8770 rate the current novel 5 stars ... 4\n", 277 | "6557 Find the schedule for Kingsman: The Secret Ser... ... 3\n", 278 | "721 find Bells Break Their Towers , a video game ... 0\n", 279 | "229 show creativity of A Catholic Education ... 0\n", 280 | "3680 Will it be warm in Powersville Guam 23 hours f... ... 1\n", 281 | "\n", 282 | "[5 rows x 3 columns]" 283 | ] 284 | }, 285 | "metadata": { 286 | "tags": [] 287 | }, 288 | "execution_count": 10 289 | } 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "GSDQwGEBa7Pn", 296 | "colab_type": "code", 297 | "colab": { 298 | "base_uri": "https://localhost:8080/", 299 | "height": 195 300 | }, 301 | "outputId": "c7ec5bc6-af6d-4899-d9f6-244d00b7369e" 302 | }, 303 | "source": [ 304 | "load_data_obj.validation_data_frame.head()" 305 | ], 306 | "execution_count": 11, 307 | "outputs": [ 308 | { 309 | "output_type": "execute_result", 310 | "data": { 311 | "text/html": [ 312 | "
\n", 313 | "\n", 326 | "\n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | "
queryintentcategory
699I want to see Married to the Enemy 2 at a cine...SearchScreeningEvent3
22Please look up the song The Mad Magician .SearchCreativeWork0
139rate the current essay zero out of 6 starsRateBook4
599Add the album to my Club Hits playlist.AddToPlaylist6
16Please help me find the Late Night Heartbroken...SearchCreativeWork0
\n", 368 | "
" 369 | ], 370 | "text/plain": [ 371 | " query ... category\n", 372 | "699 I want to see Married to the Enemy 2 at a cine... ... 3\n", 373 | "22 Please look up the song The Mad Magician . ... 0\n", 374 | "139 rate the current essay zero out of 6 stars ... 4\n", 375 | "599 Add the album to my Club Hits playlist. ... 6\n", 376 | "16 Please help me find the Late Night Heartbroken... ... 0\n", 377 | "\n", 378 | "[5 rows x 3 columns]" 379 | ] 380 | }, 381 | "metadata": { 382 | "tags": [] 383 | }, 384 | "execution_count": 11 385 | } 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "tRmVRiTCa7Pp", 392 | "colab_type": "code", 393 | "colab": {} 394 | }, 395 | "source": [ 396 | "class Preprocessing():\n", 397 | " def __init__(self):\n", 398 | " self.x_train = None\n", 399 | " self.y_train = None\n", 400 | " self.x_valid = None\n", 401 | " self.y_valid = None\n", 402 | " self.spacy_model = en_core_web_sm.load()\n", 403 | " self.tokenizer = None\n", 404 | "\n", 405 | " def createData(self):\n", 406 | " self.tokenizer = Tokenizer(num_words=None)\n", 407 | " self.max_len = 50\n", 408 | " self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(load_data_obj.train_data_frame['query'].tolist(),load_data_obj.train_data_frame['category'].tolist(),test_size=0.1)\n", 409 | " self.tokenizer.fit_on_texts(list(self.x_train) + list(self.x_valid))\n", 410 | " self.x_train = self.tokenizer.texts_to_sequences(self.x_train)\n", 411 | " self.x_valid = self.tokenizer.texts_to_sequences(self.x_valid)\n", 412 | "\n", 413 | " #zero pad the sequences\n", 414 | " self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)\n", 415 | " self.x_valid = pad_sequences(self.x_valid, maxlen=self.max_len)\n", 416 | " self.y_train = to_categorical(self.y_train)\n", 417 | " self.y_valid = to_categorical(self.y_valid)\n", 418 | " self.word_index = self.tokenizer.word_index\n", 419 | " \n", 420 | " def getSpacyEmbeddings(self,sentneces):\n", 421 | " sentences_vectors = list()\n", 422 | " for item in sentneces:\n", 423 | " query_vec = self.spacy_model(item) \n", 424 | " sentences_vectors.append(query_vec.vector)\n", 425 | " return sentences_vectors\n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " " 431 | ], 432 | "execution_count": 12, 433 | "outputs": [] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "metadata": { 438 | "id": "LoyTb5Gza7Pr", 439 | "colab_type": "code", 440 | "colab": {} 441 | }, 442 | "source": [ 443 | "preprocess_obj = Preprocessing()\n", 444 | "preprocess_obj.createData()" 445 | ], 446 | "execution_count": 13, 447 | "outputs": [] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "metadata": { 452 | "id": "rYI77Z4za7Pt", 453 | "colab_type": "code", 454 | "colab": { 455 | "base_uri": "https://localhost:8080/", 456 | "height": 34 457 | }, 458 | "outputId": "5bccac12-dacc-497c-8413-5734ca64df0b" 459 | }, 460 | "source": [ 461 | "preprocess_obj.y_train.shape" 462 | ], 463 | "execution_count": 14, 464 | "outputs": [ 465 | { 466 | "output_type": "execute_result", 467 | "data": { 468 | "text/plain": [ 469 | "(12405, 7)" 470 | ] 471 | }, 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "execution_count": 14 476 | } 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "metadata": { 482 | "id": "ZHI2TUvNa7Pv", 483 | "colab_type": "code", 484 | "colab": { 485 | "base_uri": "https://localhost:8080/", 486 | "height": 34 487 | }, 488 | "outputId": "4b1f3c97-2873-4c48-8d51-492e70e70828" 489 | }, 490 | "source": [ 491 | "preprocess_obj.y_valid.shape" 492 
| ], 493 | "execution_count": 15, 494 | "outputs": [ 495 | { 496 | "output_type": "execute_result", 497 | "data": { 498 | "text/plain": [ 499 | "(1379, 7)" 500 | ] 501 | }, 502 | "metadata": { 503 | "tags": [] 504 | }, 505 | "execution_count": 15 506 | } 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "metadata": { 512 | "id": "T3WXq62ha7Px", 513 | "colab_type": "code", 514 | "colab": {} 515 | }, 516 | "source": [ 517 | "class DesignModel():\n", 518 | " def __init__(self):\n", 519 | " self.model = None\n", 520 | " self.x_train = preprocess_obj.x_train\n", 521 | " self.y_train = preprocess_obj.y_train\n", 522 | " self.x_valid = preprocess_obj.x_valid\n", 523 | " self.y_valid = preprocess_obj.y_valid\n", 524 | " \n", 525 | " def simple_rnn(self):\n", 526 | " self.model = Sequential()\n", 527 | " self.model.add(Embedding(len(preprocess_obj.word_index) + 1,100,input_length=preprocess_obj.max_len))\n", 528 | " self.model.add(SimpleRNN(100))\n", 529 | " self.model.add(Dense(len(load_data_obj.cat_to_intent), activation='sigmoid'))\n", 530 | " self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 531 | " \n", 532 | " \n", 533 | " def model_train(self,batch_size,num_epoch):\n", 534 | " print(\"Fitting to model\")\n", 535 | " self.model.fit(self.x_train, self.y_train, batch_size=batch_size, epochs=num_epoch, validation_data=[self.x_valid, self.y_valid])\n", 536 | " print(\"Model Training complete.\")\n", 537 | "\n", 538 | " def save_model(self,model_name): \n", 539 | " self.model.save(model_name+\".h5\")\n", 540 | " print(\"Model saved to Model folder.\")" 541 | ], 542 | "execution_count": 24, 543 | "outputs": [] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "metadata": { 548 | "id": "Tye8X7FFa7Pz", 549 | "colab_type": "code", 550 | "colab": { 551 | "base_uri": "https://localhost:8080/", 552 | "height": 235 553 | }, 554 | "outputId": "3e26c58d-4bef-4bd1-f789-9662a0243bef" 555 | }, 556 | "source": [ 557 | "model_obj = DesignModel()\n", 558 | "model_obj.simple_rnn()\n", 559 | "model_obj.model_train(64,5)\n", 560 | "model_obj.save_model(\"srnn\")" 561 | ], 562 | "execution_count": 25, 563 | "outputs": [ 564 | { 565 | "output_type": "stream", 566 | "text": [ 567 | "Fitting to model\n", 568 | "Epoch 1/5\n", 569 | "194/194 [==============================] - 9s 46ms/step - loss: 0.8717 - accuracy: 0.8039 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", 570 | "Epoch 2/5\n", 571 | "194/194 [==============================] - 9s 46ms/step - loss: 0.0855 - accuracy: 0.9852 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", 572 | "Epoch 3/5\n", 573 | "194/194 [==============================] - 9s 46ms/step - loss: 0.0321 - accuracy: 0.9948 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", 574 | "Epoch 4/5\n", 575 | "194/194 [==============================] - 9s 45ms/step - loss: 0.0157 - accuracy: 0.9980 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", 576 | "Epoch 5/5\n", 577 | "194/194 [==============================] - 9s 45ms/step - loss: 0.0098 - accuracy: 0.9990 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", 578 | "Model Training complete.\n", 579 | "Model saved to Model folder.\n" 580 | ], 581 | "name": "stdout" 582 | } 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "metadata": { 588 | "id": "sZ1dlcGta7P1", 589 | "colab_type": "code", 590 | "colab": {} 591 | }, 592 | "source": [ 593 | "class Evaluation():\n", 594 | " def get_accuracy(self,actuals, predictions):\n", 595 | " acc = accuracy_score(actuals, 
predictions)\n", 596 | " return acc" 597 | ], 598 | "execution_count": 26, 599 | "outputs": [] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "metadata": { 604 | "id": "UdXO8h31a7P3", 605 | "colab_type": "code", 606 | "colab": {} 607 | }, 608 | "source": [ 609 | "class Prediction():\n", 610 | " def __init__(self,model_name):\n", 611 | " self.model = load_model(model_name+\".h5\")\n", 612 | " self.tokenizer = preprocess_obj.tokenizer\n", 613 | " self.max_len = preprocess_obj.max_len\n", 614 | " \n", 615 | " def predict_validation(self):\n", 616 | " self.xtest = load_data_obj.validation_data_frame['query'].tolist()\n", 617 | " self.ytest = load_data_obj.validation_data_frame['category'].tolist()\n", 618 | " self.xtest = self.tokenizer.texts_to_sequences(self.xtest)\n", 619 | " self.xtest = pad_sequences(self.xtest, maxlen=self.max_len)\n", 620 | " self.ypred = self.model.predict(self.xtest)\n", 621 | " self.ypred = [np.argmax(item) for item in self.ypred]\n", 622 | " \n", 623 | " def predict(self,query):\n", 624 | " query_seq = self.tokenizer.texts_to_sequences([query])\n", 625 | " query_pad = pad_sequences(query_seq, maxlen=self.max_len)\n", 626 | " pred = self.model.predict(query_pad)\n", 627 | " pred = np.argmax(pred)\n", 628 | " result = load_data_obj.cat_to_intent[pred]\n", 629 | " return result" 630 | ], 631 | "execution_count": 27, 632 | "outputs": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "metadata": { 637 | "id": "1QAb7Mr-a7P5", 638 | "colab_type": "code", 639 | "colab": {} 640 | }, 641 | "source": [ 642 | "pred_obj = Prediction(\"srnn\")\n", 643 | "pred_obj.predict_validation()" 644 | ], 645 | "execution_count": 28, 646 | "outputs": [] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "metadata": { 651 | "id": "8bX7S8VFa7P6", 652 | "colab_type": "code", 653 | "colab": { 654 | "base_uri": "https://localhost:8080/", 655 | "height": 101 656 | }, 657 | "outputId": "909f7973-a5de-44cd-b027-7a55a13d5efd" 658 | }, 659 | "source": [ 660 | "querylist = [\n", 661 | " 'rate The Gift: Imagination and the Erotic Life of Property five stars',\n", 662 | " 'table for Breadline Cafe in Minnesota next friday',\n", 663 | " 'Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?',\n", 664 | " 'Play some sixties songs on Google Music',\n", 665 | " 'rate this textbook four out of 6']\n", 666 | "for query in querylist:\n", 667 | " result = pred_obj.predict(query)\n", 668 | " print(\"Intent: \"+str(result)+\"\\tQuery: \"+str(query))" 669 | ], 670 | "execution_count": 29, 671 | "outputs": [ 672 | { 673 | "output_type": "stream", 674 | "text": [ 675 | "Intent: RateBook\tQuery: rate The Gift: Imagination and the Erotic Life of Property five stars\n", 676 | "Intent: BookRestaurant\tQuery: table for Breadline Cafe in Minnesota next friday\n", 677 | "Intent: GetWeather\tQuery: Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?\n", 678 | "Intent: PlayMusic\tQuery: Play some sixties songs on Google Music\n", 679 | "Intent: RateBook\tQuery: rate this textbook four out of 6\n" 680 | ], 681 | "name": "stdout" 682 | } 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "metadata": { 688 | "id": "gCFpIFH_a7P8", 689 | "colab_type": "code", 690 | "colab": { 691 | "base_uri": "https://localhost:8080/", 692 | "height": 34 693 | }, 694 | "outputId": "d82b6973-49a1-462a-a0ca-661357100520" 695 | }, 696 | "source": [ 697 | "eval_obj = Evaluation()\n", 698 | "acc = eval_obj.get_accuracy(pred_obj.ytest,pred_obj.ypred)\n", 699 | "print(\"Auc: {:.2%}\".format(acc))\n" 700 | ], 
701 | "execution_count": 30, 702 | "outputs": [ 703 | { 704 | "output_type": "stream", 705 | "text": [ 706 | "Auc: 97.14%\n" 707 | ], 708 | "name": "stdout" 709 | } 710 | ] 711 | } 712 | ] 713 | } -------------------------------------------------------------------------------- /1.7-entity-recognition/resume-entities-for-ner.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joydeb28/NLP-Notebooks/9669ec6e416e449f67cedb50c143fa4d94bbd44c/1.7-entity-recognition/resume-entities-for-ner.zip -------------------------------------------------------------------------------- /1.8-next-word-prediction/cab_booking.txt: -------------------------------------------------------------------------------- 1 | I would like to book a Cab 2 | Can you please book a cab from Goa to Mumbai 3 | I would like to book taxi for Chennai Airport 4 | I want to take a cab for airport 5 | Could you please book a cab from me 6 | I need a cab urgent for airport 7 | Can you arrange a cab as soon as possible 8 | I would like to cancel my booking 9 | Could yo please cancel my booking 10 | Can you please cancel my tomorrows booking 11 | I want to cancel my upcoming booking 12 | -------------------------------------------------------------------------------- /1.8-next-word-prediction/next_word_prediction_keras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Next Word Prediction Model Using Tensorflow & keras

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "

Importing Libraries
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from numpy import array\n", 24 | "import numpy as np\n", 25 | "import tensorflow as tf\n", 26 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 27 | "from tensorflow.keras.utils import to_categorical\n", 28 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 29 | "from tensorflow.keras.models import Sequential\n", 30 | "from tensorflow.keras.layers import Dense\n", 31 | "from tensorflow.keras.layers import LSTM\n", 32 | "from tensorflow.keras.layers import Dropout\n", 33 | "from tensorflow.keras.layers import Embedding\n", 34 | "from tensorflow.keras.models import load_model\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "
Preprocessing Data
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "class Preprocessing():\n", 51 | " \n", 52 | " def __init__(self,input_file):\n", 53 | " self.input_data_file = input_file\n", 54 | " self.data = None\n", 55 | " self.vocab_size = None\n", 56 | " self.encoded_data = None\n", 57 | " self.max_length = None\n", 58 | " self.sequences = None\n", 59 | " self.x = None\n", 60 | " self.y = None\n", 61 | " self.tokenizer = None\n", 62 | " \n", 63 | " def load_data(self):\n", 64 | " fp = open(self.input_data_file,'r')\n", 65 | " self.data = fp.read().splitlines() \n", 66 | " fp.close()\n", 67 | " \n", 68 | " def encode_data(self):\n", 69 | " self.tokenizer = Tokenizer()\n", 70 | " self.tokenizer.fit_on_texts(self.data)\n", 71 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n", 72 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n", 73 | " \n", 74 | " def generate_sequence(self):\n", 75 | " seq_list = list()\n", 76 | " for item in self.encoded_data:\n", 77 | " l = len(item)\n", 78 | " for id in range(1,l):\n", 79 | " seq_list.append(item[:id+1])\n", 80 | " self.max_length = max([len(seq) for seq in seq_list])\n", 81 | " self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n", 82 | " self.sequences = array(self.sequences)\n", 83 | " \n", 84 | " def get_data(self):\n", 85 | " self.x = self.sequences[:,:-1]\n", 86 | " self.y = self.sequences[:,-1]\n", 87 | " self.y = to_categorical(self.y,num_classes=self.vocab_size)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "pr = Preprocessing('cab_booking.txt')\n", 97 | "pr.load_data()\n", 98 | "pr.encode_data()\n", 99 | "pr.generate_sequence()\n", 100 | "pr.get_data()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "

Model" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "class Model():\n", 117 | " def __init__(self):\n", 118 | " self.model = None\n", 119 | " self.history = None\n", 120 | " self.x = None\n", 121 | " self.y = None\n", 122 | " self.vocab_size = pr.vocab_size\n", 123 | " self.max_len = pr.max_length\n", 124 | " \n", 125 | " \n", 126 | " def create_model(self):\n", 127 | " self.model = Sequential()\n", 128 | " self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n", 129 | " self.model.add(LSTM(50))\n", 130 | " self.model.add(Dropout(0.1))\n", 131 | " self.model.add(Dense(self.vocab_size,activation='softmax'))\n", 132 | " self.model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])\n", 133 | " print(self.model.summary())\n", 134 | " def run(self,epochs,batch_size):\n", 135 | " self.history = self.model.fit(self.x,self.y,epochs=epochs,batch_size=batch_size,validation_split=0.2)\n", 136 | " \n", 137 | " def save(self):\n", 138 | " self.model.save(\"word_prediction_model.h5\")\n", 139 | " " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "model_obj = Model()\n", 149 | "model_obj.x = pr.x\n", 150 | "model_obj.y = pr.y\n", 151 | "model_obj.create_model()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "model_obj.run(700,2)\n", 161 | "model_obj.save()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "

Prediction" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "class Prediction():\n", 178 | " def __init__(self,tokenizer,max_len):\n", 179 | " self.model = None\n", 180 | " self.tokenizer = tokenizer\n", 181 | " self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n", 182 | " self.max_length = max_len\n", 183 | " \n", 184 | " def load_model(self):\n", 185 | " self.model = load_model(\"word_prediction_model.h5\")\n", 186 | " \n", 187 | " def predict_words(self,text,num_words):\n", 188 | " encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n", 189 | " padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n", 190 | " y_preds = self.model.predict(padded_data)\n", 191 | " y_preds = np.argsort(-y_preds)\n", 192 | " y_preds = y_preds[0][:num_words]\n", 193 | " possible_words = [self.idx2word[item] for item in y_preds]\n", 194 | " print(text,possible_words)\n", 195 | " print(possible_words)\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "pred = Prediction(pr.tokenizer,pr.max_length) \n", 205 | "pred.load_model()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "pred.predict_words(\"I would like to\",2)\n", 215 | "pred.predict_words(\"can you please\",2)" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.6.9" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 2 240 | } 241 | -------------------------------------------------------------------------------- /1.9-smart-compose/README.md: -------------------------------------------------------------------------------- 1 | Data Source 2 | https://www.usingenglish.com/articles/100-most-useful-emailing-phrases.html 3 | http://english.teamdev.com/resources/useful-phrases 4 | https://blog.talaera.com/business-emails-phrases 5 | -------------------------------------------------------------------------------- /1.9-smart-compose/data/dataset.txt: -------------------------------------------------------------------------------- 1 | content 2 | Dear Sir/Madam 3 | Dear 4 | Hello 5 | Hi 6 | Hi Team 7 | Good morning Team 8 | Good afternoon 9 | I recently read about that 10 | I recently heard about that 11 | Thank you for taking the time to write to us 12 | Thank you for taking the time to give us some feedback 13 | Thank you for your email 14 | Congratulations on 15 | Hope you're having a great day! 16 | Hope you're feeling great! 
17 | Explaining Why You're Writing 18 | I wanted to tell you that 19 | I am writing to tell you about 20 | I'm writing to tell you that 21 | This email is to confirm that 22 | We're sending you this email because 23 | In this email, we wanted to tell you about 24 | We are writing to 25 | We wish to inform you of 26 | I'm writing concerning 27 | I'm writing just so you know 28 | I'm writing to remind you about 29 | I'm writing to let you know that 30 | This email is just to let you know that 31 | Just a quick reminder that 32 | I would like to inform you that 33 | This is just to let you know that 34 | Wanted to give you a friendly reminder that 35 | I am contacting you for the following reason 36 | Just a quick email to see how you're doing 37 | I just wanted to let you know that 38 | So happy we reconnected after this time 39 | So glad that we're in touch again 40 | Can't believe it's been a year since we last spoke! Feels like yesterday 41 | Glad you're back in our life! 42 | Glad to see our old friends again! 43 | It's always nice to get in touch with old friends! 44 | Long time no see! Glad to hear from you again 45 | I highly recommend visiting our new Knowledge Base 46 | I highly recommend checking out this new article 47 | Here is a copy of the information 48 | We brought together some of the best tutorials which 49 | What's new since you left? Lots Starting with 50 | We're confident you'll see big improvements since 51 | We know you're busy but we'd hate to see you miss out on this opportunity! 52 | You can do so much with 53 | Learn how to 54 | We thought you might find this useful 55 | Referring to the Previous Contact 56 | Thank you for your letter 57 | Thank you for contacting us 58 | In reply to your request 59 | Thank you for your letter regarding 60 | Regarding our telephone conversation yesterday 61 | Further to our meeting last week 62 | I would just like to confirm the main points we discussed on Tuesday 63 | I'm writing in reply to your email 64 | In reply to your email 65 | We understand from your email that you're interested in 66 | We talked last week about 67 | We had a phone call 68 | It was nice to hear from you 69 | I was glad to catch up 70 | Making a Request 71 | We would appreciate it if you would 72 | I would be grateful if you could 73 | Could you possibly tell us 74 | In addition, I would like to receive 75 | It would be helpful if you could send us 76 | I am interested in receiving 77 | I would appreciate your attention to this matter 78 | Please let me know what action you propose to take 79 | I would be grateful if you could send me further information about 80 | Would it be possible to have a quick chat? 81 | Would you mind if I took the day off 82 | I was hoping you could do something 83 | What would you like to do next? 84 | Could you please send me the mail 85 | Let's discuss your next step 86 | It would be great if you could 87 | Would you mind having a quick chat? 88 | I was wondering if you could 89 | Could you confirm these details? 90 | Would you like me to send you the link? 
91 | Here are the details on 92 | Furthermore 93 | In addition, I would like to 94 | We're glad the issues got sorted out despite the delay 95 | For example 96 | For instance 97 | In other words 98 | In order to fix this bug, we would need to research it a bit further 99 | That's why 100 | I'm pleased to hear that 101 | First of all 102 | Firstly 103 | Secondly 104 | There seems to be a problem with the new feature 105 | Here are the possible solutions 106 | While running the tests we've discovered that it is not working properly 107 | Which option would you like us to work on? 108 | What would you like to go with? 109 | Which solution works best for you? 110 | We've found a bug 111 | Here's how we would like to take care of this issue 112 | How would you like us to solve this issue? 113 | We can see three options 114 | There are two ways to solve this 115 | We've come up with a workaround for this issue 116 | This solution is better but it will take longer to implement 117 | If we go with the first option we might run into some problems in the future 118 | Could you please clarify what you would like us to do about it? 119 | If I understood you correctly you would like us to 120 | What exactly do you mean by? 121 | Could you please clarify when you would like us to finish this? 122 | When exactly are you expecting to have this feature? 123 | Could you explain what you mean by 124 | Could you be more specific? 125 | Could you please repeat it? 126 | Could you repeat what you said? 127 | Could you give us some more details? 128 | When would it be convenient for you too? 129 | Which option would work best for you? 130 | What would you like us to do next? 131 | Would you like to? 132 | Would you prefer to? 133 | Would you rather or? 134 | How would you feel about? 135 | What do you feel is the next step? 136 | Is it possible to? 137 | Could you check it, please? 138 | Just book time on my calendar and I can answer all your questions 139 | Help us give you the best advice by telling us a bit more about your project 140 | I'd love it if you could walk me through your project 141 | This may be a great time to take a look at our Knowledge Base 142 | If you're interested drop me a line and we can have a quick chat to discuss your further steps 143 | Come check out what's new and get inspired! 144 | Could you please keep us updated on this? 145 | If you have any questions please email or call me 146 | Please feel free to contact me anytime 147 | If there's anything I can do for you please let me know 148 | You can drop a mail if there's anything you'd like to discuss 149 | Feel free to call me 150 | Let's discuss this at the meeting if you don't mind 151 | At our last meeting, we talked about 152 | At the meeting, we agreed to 153 | We'd like to have a meeting about 154 | Let's have a meeting sometime this week 155 | How about talking this over at a meeting? 156 | Why don't we talk this over at a meeting?
157 | I'd be glad to tell you more about this at the meeting today 158 | This issue came up at the meeting we had on 159 | Let's have a meeting to discuss this issue 160 | I've set up a meeting 161 | Our company would be pleased to work with you 162 | If there's anything I can help you with just let me know 163 | We would be happy to help 164 | Thanking 165 | Thank you for your consideration 166 | I appreciate that you took the time to give me these details 167 | Thanks for taking the time to give us your feedback 168 | Thank you for writing to us 169 | Thanks a lot for everything 170 | Thank you for your time 171 | Thank you very much for 172 | Many thanks for 173 | You're so helpful 174 | That's thoughtful of you 175 | I appreciate your help 176 | Thank you for your patience 177 | Thank you for clearing this up 178 | Thank you for helping us in this matter 179 | We are pleased to announce that 180 | We are pleased to inform you that 181 | We have some good news for you 182 | It is my pleasure to let you know that 183 | I'm glad to tell you that 184 | You will be pleased to learn that 185 | We regret to inform you that 186 | I regret to inform you that due to a mistake in our database 187 | Unfortunately, we cannot 188 | we are unable to 189 | After careful consideration, we have decided to 190 | I'm afraid it would be impossible to do 191 | Despite my best efforts it has proved to be impossible to 192 | I'm afraid I've got some bad news for you 193 | We apologize for the delay 194 | I regret any inconvenience caused by 195 | I apologize for the problems you've had 196 | Please accept my apologies 197 | Sorry for any inconveniences this situation may have caused 198 | I would like to apologize for the delay 199 | I would like to apologize for the inconvenience 200 | Once again I apologize for any inconveniences 201 | We are sorry for the delay 202 | I'd like to apologize for making you wait 203 | Sorry to keep you waiting 204 | I'm sorry but 205 | Sorry again for 206 | Please confirm 207 | We'll get back to you as soon as we can 208 | Thank you for your order 209 | We're glad that you chose us to help you with this! 
210 | I am attaching 211 | Please find the attachment 212 | You will find attached 213 | I've attached the file for your review 214 | The attached file contains 215 | Here's the attachment we discussed 216 | Please take a look at the attached file 217 | Take a look at the attachment I've attached to this email 218 | I've attached 219 | If we can be of any further assistance please let us know 220 | For further details 221 | If you require more information 222 | Thank you for taking this into consideration 223 | We hope you are happy with this arrangement 224 | We look forward to a successful working relationship in the future 225 | We would be very pleased to do business with your company 226 | I would be happy to have an opportunity to work with your firm 227 | I look forward to seeing you next week 228 | Looking forward to hearing from you 229 | I would appreciate your reply 230 | I look forward to doing business with you in the future 231 | I enjoyed working with you and look forward to 232 | Thank you once more for your help in this matter 233 | If you require any further information please let me know 234 | Let me know if you need any help 235 | If I can help in any way please do not hesitate to contact me 236 | If there's anything I can do to help you just drop me a line 237 | Do not hesitate to contact us again 238 | if there's anything we can help you with 239 | Thank you for your help 240 | I'd love to hear your feedback 241 | Hope to hear from you soon 242 | Thank you for your cooperation 243 | I'd appreciate your reply 244 | Please let me know what you think 245 | Thanks again 246 | Thank you for taking your time 247 | Happy holidays! 248 | Sincerely 249 | Yours sincerely 250 | Sincerely yours 251 | Yours faithfully 252 | Kind regards 253 | Yours truly 254 | Many thanks 255 | Regards 256 | Best regards 257 | With best wishes 258 | Best wishes 259 | Best 260 | All the best 261 | Thanks 262 | Have a great weekend! 263 | Have a wonderful day! 264 | Have a productive day! 
265 | I hope you had a good weekend 266 | I hope you had a great trip 267 | Hope you had a nice break 268 | I hope you are well 269 | I hope all is well 270 | Hope you're enjoying your holiday 271 | I hope this email finds you well 272 | I hope you enjoyed the event 273 | I'm glad we had a chance to chat at the convention 274 | It was great to see you on Thursday 275 | It was a pleasure to meet you yesterday 276 | I am writing to you about our last meeting 277 | I am writing to you with regards to 278 | I am writing to you regarding 279 | I am writing to ask 280 | I am writing to let you know 281 | I am writing to confirm 282 | I am writing to check 283 | I am writing to invite you 284 | I am writing to update you on 285 | I am writing to you to follow up on 286 | I am contacting you to inform 287 | I am reaching out because 288 | This is just a quick note to 289 | This is just a quick reminder 290 | I wanted to let you know that 291 | Might I take a moment of your time to 292 | I just got your request for 293 | I just read your email about 294 | As we discussed I would like to send you 295 | Thank you for your email about 296 | Thanks for your email 297 | Thanks for your feedback on 298 | Thanks for your invitation 299 | Thanks for your suggestion 300 | Thanks for sending 301 | Thanks for asking about 302 | Thanks for your quick reply 303 | Thanks for getting back to me so quickly 304 | Thank you for reaching out to me 305 | Apologizing 306 | Sorry for my late reply 307 | Sorry it took me so long to get back to you 308 | I apologize for the late response 309 | Sorry it's been so long since my last email 310 | I was sorry to hear about 311 | Please accept our apologies for any inconvenience caused 312 | I'm enclosing the file 313 | The parts in bold are the changes I made 314 | The parts in red are the changes I made 315 | The parts in blue are the changes we made 316 | Here's the document that you asked for 317 | Please take a look at the file I've attached to this email 318 | Could you please? 319 | Could you possibly tell me? 320 | Can you please fill out this form? 321 | I'd appreciate it if you could 322 | I'd be very grateful if you could 323 | It would be very helpful if you could send 324 | If possible I'd like to know more about 325 | Please find my two main questions below 326 | Asking for clarifications 327 | I didn't fully understand 328 | Could you please explain that again? 329 | I didn't quite get your point 330 | Could you repeat what you said about it? 331 | If you could please shed some light on this topic I would appreciate it 332 | Could you please clarify? 333 | If I understood you correctly you would like me to 334 | What exactly do you mean by 335 | In other words, would you like us to 336 | Thank you for letting me know 337 | Thank you for the heads up 338 | Thank you for the notice 339 | Please note 340 | Quick reminder 341 | Just a friendly reminder that 342 | Thank you for sharing 343 | I'd like to inform you that 344 | Thanks for keeping me in the loop 345 | Please keep me informed 346 | Please keep me posted 347 | Please keep me updated 348 | Please keep me in the loop 349 | Please let me know if this is OK with you 350 | What are your thoughts on this? 351 | What do you think? 352 | We're waiting for approval 353 | We just need the thumbs up 354 | We just need the green light 355 | You totally have the green light!
356 | He approved of it so you can go ahead with the project 357 | I'd like to schedule a meeting if you are available 358 | I am available on 359 | if that's convenient for you 360 | Would you be available on 361 | If so I'll send you an invite shortly 362 | Can you make it on 363 | If so I'll book accordingly 364 | I'm afraid I can't make it on 365 | We need to reschedule our meeting 366 | We need to postpone our meeting 367 | We need to put back our meeting 368 | We need to cancel our meeting 369 | We need to move our meeting 370 | We need to rearrange our meeting 371 | We are sorry to inform you that the interview scheduled for 372 | We are sorry to inform you that the meeting scheduled for 373 | Unfortunately 374 | I'm afraid it will not be possible to 375 | Unfortunately, I have to tell you that 376 | I'm afraid that we can't 377 | I regret to inform you that 378 | After careful consideration, we have decided 379 | It's against company policy to 380 | I tried my best but 381 | Despite my best efforts 382 | I can't see how 383 | I'm sorry but it's out of my hands 384 | I'm afraid I won't be able to 385 | I'm sorry to tell you that 386 | Do you need a reply? 387 | Are you asking for a favor or are you meeting soon? 388 | These sentences are perfect for those moments! 389 | Looking forward to hearing from you soon 390 | I look forward to hearing from you soon 391 | Please let me know if this works 392 | Please let me know if you are available 393 | Please let me know if that sounds good 394 | Please let me know if you can 395 | Please let me know if you can help 396 | Please let me know if you need to reschedule 397 | I look forward to seeing 398 | I look forward to meeting you 399 | See you next week 400 | Thank you in advance 401 | Thank you for everything 402 | Cheers 403 | Any feedback you can give me on this would be greatly appreciated 404 | Any feedback you can give me on this would be highly appreciated 405 | Any feedback you can give me on this would be much appreciated 406 | If you could have it ready 407 | I would appreciate it 408 | I would appreciate your help in this matter 409 | Offering help or information 410 | I hope you find this helpful 411 | I hope it's clearer now 412 | I hope that answers all your questions 413 | If you have any questions 414 | If you have more questions 415 | In the meantime, if you need any more information 416 | If you need more information 417 | If you need more info 418 | If you need further information 419 | I know that's a lot to take in so let me know if anything I've said doesn't make sense 420 | please do not hesitate to contact me 421 | please feel free to contact me 422 | please feel free to get in touch 423 | please let me know 424 | drop me an email 425 | drop me a mail 426 | Thank you for your understanding 427 | Thanks again for your understanding 428 | Thanks for your patience 429 | Once again please accept our apologies for any inconvenience caused 430 | Once again please accept our apologies for the inconvenience caused 431 | Once again please accept our apologies for the delay 432 | Once again please accept our apologies for the misunderstanding 433 | I hope this is okay with you 434 | I hope we can find a solution soon 435 | I hope you can understand 436 | Sorry I couldn't be of more help 437 | Good morning 438 | Hope you're having a great day!
439 | This email is to confirm that we've received your payment 440 | I'm sending you this email because 441 | In this email, I wanted to tell you about 442 | I highly recommend 443 | It was nice to hear from you yesterday 444 | I was glad to catch up yesterday 445 | Could you possibly tell us more 446 | Could you please send me the link 447 | Just wondered if you could send me a copy 448 | We're glad that the issues got sorted out despite the delay 449 | Talking about Problems and Solutions 450 | However, the second solution will take much longer and we cannot give even a rough estimate at the moment 451 | We'd like to research this problem a bit more to give you a more detailed list of options 452 | Could you please clarify what you would like us to do about 453 | I didn't quite get your point about 454 | Could you repeat what you said about 455 | Could you give us some more details on the 456 | When would it be convenient for you to 457 | Have you given any additional consideration to 458 | Could you do something? 459 | Talking about Meetings 460 | This issue came up at the meeting we had on Friday 461 | Here's the link 462 | You will be pleased to hear that 463 | Giving Bad News 464 | I'm afraid it would not be possible to 465 | That's not possible 466 | I can't see any way to 467 | It's out of my hands 468 | Talking about Vacations and Holidays 469 | I'm planning a vacation 470 | Would that be all right with you? 471 | We have a national holiday in our country on 472 | Therefore our office will not be working on that date 473 | I'm currently on vacation 474 | If you have questions please drop a mail 475 | When would it be all right for me to have a week-long vacation? 476 | I'm going to be on vacation 477 | going to have a day off 478 | Today I am not feeling well 479 | I'm on vacation now until 480 | I will read and answer all emails as soon as I get back 481 | If this is urgent please contact 482 | Please find attached 483 | The attached files contain 484 | Please take a look at the attachment 485 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Tutorial 2 | Natural Language Processing 3 | https://www.topbots.com/generalized-language-models-tasks-datasets/ 4 | 5 | #### If you find this repository helpful, a star ⭐ would be greatly appreciated! 6 | #### Created by Joydeb Mondal 7 | -------------------------------------------------------------------------------- /simple-efficient-summarizer.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport os\nimport tensorflow as tf","execution_count":11,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n    def __init__(self):\n        # read the Amazon Fine Food Reviews dataset\n        data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n        # drop the metadata columns, keeping only the Summary and Text columns\n        self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n","execution_count":19,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Creating the LoadData object and extracting the reviews dataframe"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":20,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4} --------------------------------------------------------------------------------