├── 1.1-language-model
│   ├── data.txt
│   ├── language_model_keras.ipynb
│   ├── language_model_pytorch.ipynb
│   └── language_model_torch.ipynb
├── 1.10-question-answering
│   └── question_answering_keras.ipynb
├── 1.12-text-summarization
│   └── text-summarizer.ipynb
├── 1.2-sentiment-analysis
│   ├── sentiment_classfication_bert_keras.ipynb
│   └── sentiment_classfication_pytorch.ipynb
├── 1.3-semantic-similarity
│   ├── README.md
│   ├── semantic-similarity-BERT.ipynb
│   └── try_cf.ipynb
├── 1.4-machine-translation
│   ├── neural_machine_translation.ipynb
│   └── seq2seq.ipynb
├── 1.5-named-entity-recognition
│   ├── data_making.py
│   ├── ner_bert.ipynb
│   ├── ner_keras.ipynb
│   ├── simple_ner-2.0.ipynb
│   └── simple_ner.ipynb
├── 1.6-intent-classification
│   ├── README.md
│   ├── intent_classfication_bert.ipynb
│   ├── intent_classfication_bert_keras.ipynb
│   ├── intent_classfication_keras.ipynb
│   └── text-classification-with-bert-pytorch.ipynb
├── 1.7-entity-recognition
│   ├── entity_recognition_keras.ipynb
│   └── resume-entities-for-ner.zip
├── 1.8-next-word-prediction
│   ├── cab_booking.txt
│   └── next_word_prediction_keras.ipynb
├── 1.9-smart-compose
│   ├── README.md
│   ├── data
│   │   └── dataset.txt
│   └── smart_compose_keras.ipynb
├── README.md
└── simple-efficient-summarizer.ipynb
/1.1-language-model/data.txt:
--------------------------------------------------------------------------------
1 | Jack and Jill went up the hill
2 | To fetch a pail of water
3 | Jack fell down and broke his crown
4 | And Jill came tumbling after
5 |
--------------------------------------------------------------------------------
/1.1-language-model/language_model_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
      7 |     "Language Model Using TensorFlow & Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Importing Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from numpy import array\n",
24 | "import numpy as np\n",
25 | "import tensorflow as tf\n",
26 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
27 | "from tensorflow.keras.utils import to_categorical\n",
28 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
29 | "from tensorflow.keras.models import Sequential\n",
30 | "from tensorflow.keras.layers import Dense\n",
31 | "from tensorflow.keras.layers import LSTM\n",
32 | "from tensorflow.keras.layers import Dropout\n",
33 | "from tensorflow.keras.layers import Embedding\n",
34 | "from tensorflow.keras.models import load_model\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "Preprocessing Data"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 7,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "class Preprocessing():\n",
51 | " \n",
52 | " def __init__(self,input_file):\n",
53 | " self.input_data_file = input_file\n",
54 | " self.data = None\n",
55 | " self.vocab_size = None\n",
56 | " self.encoded_data = None\n",
57 | " self.max_length = None\n",
58 | " self.sequences = None\n",
59 | " self.x = None\n",
60 | " self.y = None\n",
61 | " self.tokenizer = None\n",
62 | " \n",
63 | " def load_data(self):\n",
64 | " fp = open(self.input_data_file,'r')\n",
65 | " self.data = fp.read().splitlines() \n",
66 | " fp.close()\n",
67 | " \n",
68 | " def encode_data(self):\n",
69 | " self.tokenizer = Tokenizer()\n",
70 | " self.tokenizer.fit_on_texts(self.data)\n",
71 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
72 | " print(self.encoded_data)\n",
73 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n",
74 | " \n",
75 | " def generate_sequence(self):\n",
76 | " seq_list = list()\n",
77 | " for item in self.encoded_data:\n",
78 | " l = len(item)\n",
79 | " for id in range(1,l):\n",
80 | " seq_list.append(item[:id+1])\n",
81 | " self.max_length = max([len(seq) for seq in seq_list])\n",
82 | " self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n",
83 | " print(self.sequences)\n",
84 | " self.sequences = array(self.sequences)\n",
85 | " \n",
86 | " def get_data(self):\n",
87 | " self.x = self.sequences[:,:-1]\n",
88 | " self.y = self.sequences[:,-1]\n",
89 | " print(\"y before:\",self.y)\n",
90 | " self.y = to_categorical(self.y,num_classes=self.vocab_size)\n",
91 | " print(\"y After:\",self.y)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 8,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "[[2, 1, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13], [2, 14, 15, 1, 16, 17, 18], [1, 3, 19, 20, 21]]\n",
104 | "[[ 0 0 0 0 0 2 1]\n",
105 | " [ 0 0 0 0 2 1 3]\n",
106 | " [ 0 0 0 2 1 3 4]\n",
107 | " [ 0 0 2 1 3 4 5]\n",
108 | " [ 0 2 1 3 4 5 6]\n",
109 | " [ 2 1 3 4 5 6 7]\n",
110 | " [ 0 0 0 0 0 8 9]\n",
111 | " [ 0 0 0 0 8 9 10]\n",
112 | " [ 0 0 0 8 9 10 11]\n",
113 | " [ 0 0 8 9 10 11 12]\n",
114 | " [ 0 8 9 10 11 12 13]\n",
115 | " [ 0 0 0 0 0 2 14]\n",
116 | " [ 0 0 0 0 2 14 15]\n",
117 | " [ 0 0 0 2 14 15 1]\n",
118 | " [ 0 0 2 14 15 1 16]\n",
119 | " [ 0 2 14 15 1 16 17]\n",
120 | " [ 2 14 15 1 16 17 18]\n",
121 | " [ 0 0 0 0 0 1 3]\n",
122 | " [ 0 0 0 0 1 3 19]\n",
123 | " [ 0 0 0 1 3 19 20]\n",
124 | " [ 0 0 1 3 19 20 21]]\n",
125 | "y before: [ 1 3 4 5 6 7 9 10 11 12 13 14 15 1 16 17 18 3 19 20 21]\n",
126 | "y After: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
127 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
128 | " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
129 | " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
130 | " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
131 | " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
132 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
133 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
134 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
135 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
136 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
137 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
138 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
139 | " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
140 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
141 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
142 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
143 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
144 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n",
145 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
146 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]\n"
147 | ]
148 | }
149 | ],
150 | "source": [
151 | "pr = Preprocessing('data.txt')\n",
152 | "pr.load_data()\n",
153 | "pr.encode_data()\n",
154 | "pr.generate_sequence()\n",
155 | "pr.get_data()"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Model"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "class Model():\n",
172 | " def __init__(self,params):\n",
173 | " self.model = None\n",
174 | " self.history = None\n",
175 | " self.x = None\n",
176 | " self.y = None\n",
177 | " self.vocab_size = params['vocab_size']\n",
178 | " self.max_len = params['max_len']\n",
179 | " self.activation = params['activation']\n",
180 | " self.optimizer = params['optimizer']\n",
181 | " self.epochs = params['epochs']\n",
182 | " self.metrics = params['metrics']\n",
183 | " \n",
184 | " \n",
185 | " def create_model(self):\n",
186 | " self.model = Sequential()\n",
187 | " self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n",
188 | " self.model.add(LSTM(50))\n",
189 | " self.model.add(Dropout(0.1))\n",
190 | " self.model.add(Dense(self.vocab_size,activation=self.activation))\n",
191 | " self.model.compile(loss='categorical_crossentropy',optimizer=self.optimizer,metrics=self.metrics)\n",
192 | " print(self.model.summary())\n",
193 | " def run(self):\n",
194 | " self.history = self.model.fit(self.x,self.y,epochs=self.epochs)\n",
195 | " \n",
196 | " def save(self):\n",
197 | " self.model.save(\"lang_model.h5\")\n",
198 | " "
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "params = {\"activation\":\"softmax\",\"epochs\":500,\"verbose\":2,\"loss\":\"categorical_crossentropy\",\n",
208 | " \"optimizer\":\"adam\",\"metrics\":['accuracy'],\"vocab_size\":pr.vocab_size,\"max_len\":pr.max_length}\n",
209 | "model_obj = Model(params)\n",
210 | "model_obj.x = pr.x\n",
211 | "model_obj.y = pr.y\n",
212 | "model_obj.create_model()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "model_obj.run()\n",
222 | "model_obj.save()"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "Prediction"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "class Prediction():\n",
239 | " def __init__(self,tokenizer,max_len):\n",
240 | " self.model = None\n",
241 | " self.tokenizer = tokenizer\n",
242 | " self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n",
243 | " self.max_length = max_len\n",
244 | " \n",
245 | " def load_model(self):\n",
246 | " self.model = load_model(\"lang_model.h5\")\n",
247 | " \n",
    248 |     "    def predict_sequence(self,text,num_words):\n",
249 | " for id in range(num_words):\n",
250 | " encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n",
251 | " padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n",
252 | " y_pred = self.model.predict(padded_data)\n",
253 | " y_pred = np.argmax(y_pred)\n",
254 | " predict_word = self.idx2word[y_pred]\n",
255 | " text += ' ' + predict_word\n",
256 | " return text"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "pred = Prediction(pr.tokenizer,pr.max_length) \n",
266 | "pred.load_model()\n",
    267 |     "print(pred.predict_sequence(\"Jack and\",5))\n",
    268 |     "print(pred.predict_sequence('And Jill', 4))\n",
    269 |     "print(pred.predict_sequence('fell down', 5))\n",
    270 |     "print(pred.predict_sequence('pail of', 3))"
271 | ]
272 | }
273 | ],
274 | "metadata": {
275 | "kernelspec": {
276 | "display_name": "Python 3",
277 | "language": "python",
278 | "name": "python3"
279 | },
280 | "language_info": {
281 | "codemirror_mode": {
282 | "name": "ipython",
283 | "version": 3
284 | },
285 | "file_extension": ".py",
286 | "mimetype": "text/x-python",
287 | "name": "python",
288 | "nbconvert_exporter": "python",
289 | "pygments_lexer": "ipython3",
290 | "version": "3.6.9"
291 | }
292 | },
293 | "nbformat": 4,
294 | "nbformat_minor": 2
295 | }
296 |
--------------------------------------------------------------------------------
/1.1-language-model/language_model_pytorch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "import torch.nn as nn\n",
11 | "import torch.nn.functional as F\n",
12 | "\n",
13 | "import numpy as np\n",
14 | "from collections import Counter\n",
15 | "import os"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 16,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "batch_size = 1\n",
25 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 17,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "Vocabulary size 22\n"
38 | ]
39 | }
40 | ],
41 | "source": [
42 | "class PreProcessing():\n",
43 | " \n",
44 | " def get_data_from_file(self,train_file, batch_size, seq_size):\n",
45 | " with open(train_file, 'r', encoding='utf-8') as f:\n",
46 | " text = f.read()\n",
47 | " text = text.split()\n",
48 | "\n",
49 | " word_counts = Counter(text)\n",
50 | " sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)\n",
51 | " int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}\n",
52 | " vocab_to_int = {w: k for k, w in int_to_vocab.items()}\n",
53 | " n_vocab = len(int_to_vocab)\n",
54 | "\n",
55 | " print('Vocabulary size', n_vocab)\n",
56 | "\n",
57 | " int_text = [vocab_to_int[w] for w in text]\n",
58 | " num_batches = int(len(int_text) / (seq_size * batch_size))\n",
59 | " in_text = int_text[:num_batches * batch_size * seq_size]\n",
60 | " out_text = np.zeros_like(in_text)\n",
61 | " out_text[:-1] = in_text[1:]\n",
62 | " out_text[-1] = in_text[0]\n",
63 | " in_text = np.reshape(in_text, (batch_size, -1))\n",
64 | " out_text = np.reshape(out_text, (batch_size, -1))\n",
65 | " return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text\n",
66 | "\n",
67 | "\n",
68 | " def get_batches(self,in_text, out_text, batch_size, seq_size):\n",
69 | " num_batches = np.prod(in_text.shape) // (seq_size * batch_size)\n",
70 | " for i in range(0, num_batches * seq_size, seq_size):\n",
71 | " yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]\n",
72 | " \n",
73 | " \n",
74 | "preprocess_obj = PreProcessing()\n",
75 | "int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = preprocess_obj.get_data_from_file(\"data.txt\",4,4)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 18,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "array([[ 0, 1, 2, 3],\n",
87 | " [ 4, 5, 6, 7],\n",
88 | " [ 8, 9, 10, 11],\n",
89 | " [12, 0, 13, 14]])"
90 | ]
91 | },
92 | "execution_count": 18,
93 | "metadata": {},
94 | "output_type": "execute_result"
95 | }
96 | ],
97 | "source": [
98 | "in_text"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 19,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/plain": [
109 | "array([[ 1, 2, 3, 4],\n",
110 | " [ 5, 6, 7, 8],\n",
111 | " [ 9, 10, 11, 12],\n",
112 | " [ 0, 13, 14, 0]])"
113 | ]
114 | },
115 | "execution_count": 19,
116 | "metadata": {},
117 | "output_type": "execute_result"
118 | }
119 | ],
120 | "source": [
121 | "out_text"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 26,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "class RNNModule(nn.Module):\n",
131 | " def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):\n",
132 | " super(RNNModule, self).__init__()\n",
133 | " self.seq_size = seq_size\n",
134 | " self.lstm_size = lstm_size\n",
135 | " self.embedding = nn.Embedding(n_vocab, embedding_size)\n",
136 | " self.lstm = nn.LSTM(embedding_size,\n",
137 | " lstm_size,\n",
138 | " batch_first=True)\n",
139 | " self.dense = nn.Linear(lstm_size, n_vocab)\n",
140 | "\n",
141 | " def forward(self, x, prev_state):\n",
142 | " embed = self.embedding(x)\n",
143 | " output, state = self.lstm(embed, prev_state)\n",
144 | " logits = self.dense(output)\n",
145 | "\n",
146 | " return logits, state\n",
147 | "\n",
148 | " def zero_state(self, batch_size):\n",
149 | " return (torch.zeros(1, batch_size, self.lstm_size),\n",
150 | " torch.zeros(1, batch_size, self.lstm_size))\n",
151 | " \n",
152 | " def get_loss_and_train_op(self, net, lr=0.001):\n",
153 | " criterion = nn.CrossEntropyLoss()\n",
154 | " optimizer = torch.optim.Adam(net.parameters(), lr=lr)\n",
155 | "\n",
156 | " return criterion, optimizer\n",
157 | " \n",
158 | " def train(self):\n",
159 | " iteration = 0\n",
160 | " gradients_norm=5\n",
161 | " for e in range(200):\n",
162 | " batches = preprocess_obj.get_batches(in_text, out_text, batch_size, seq_size)\n",
163 | " state_h, state_c = net.zero_state(batch_size)\n",
164 | " state_h = state_h.to(device)\n",
165 | " state_c = state_c.to(device)\n",
166 | " for x, y in batches:\n",
167 | " iteration += 1\n",
168 | " net.train()\n",
169 | "\n",
170 | " optimizer.zero_grad()\n",
171 | "\n",
172 | " x = torch.tensor(x).to(device)\n",
173 | " y = torch.tensor(y).to(device)\n",
174 | "\n",
175 | " logits, (state_h, state_c) = net(x, (state_h, state_c))\n",
176 | " loss = criterion(logits.transpose(1, 2), y)\n",
177 | "\n",
178 | " loss_value = loss.item()\n",
179 | "\n",
180 | " loss.backward()\n",
181 | "\n",
182 | " state_h = state_h.detach()\n",
183 | " state_c = state_c.detach()\n",
184 | "\n",
185 | " _ = torch.nn.utils.clip_grad_norm_(\n",
186 | " net.parameters(), gradients_norm)\n",
187 | "\n",
188 | " optimizer.step()\n",
189 | "\n",
190 | " if iteration % 100 == 0:\n",
191 | " print('Epoch: {}/{}'.format(e, 200),\n",
192 | " 'Iteration: {}'.format(iteration),\n",
193 | " 'Loss: {}'.format(loss_value))\n",
194 | "\n",
195 | " if iteration % 1000 == 0:\n",
196 | " torch.save(net.state_dict(),\n",
197 | " 'checkpoint_pt/model-{}.pth'.format(iteration))\n",
198 | "seq_size = 4\n",
199 | "embedding_size = 22\n",
200 | "lstm_size = 64\n"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 27,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "net = RNNModule(n_vocab, seq_size,embedding_size, lstm_size)\n",
210 | "net = net.to(device)\n",
211 | "criterion, optimizer = net.get_loss_and_train_op(net, 0.01)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 28,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "ename": "RecursionError",
221 | "evalue": "maximum recursion depth exceeded",
222 | "output_type": "error",
223 | "traceback": [
224 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
225 | "\u001b[0;31mRecursionError\u001b[0m Traceback (most recent call last)",
226 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
227 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
228 | "... last 1 frames repeated, from the frame below ...\n",
229 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbatches\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0miteration\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
230 | "\u001b[0;31mRecursionError\u001b[0m: maximum recursion depth exceeded"
231 | ]
232 | }
233 | ],
234 | "source": [
235 | "net.train()"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": []
244 | }
245 | ],
246 | "metadata": {
247 | "kernelspec": {
248 | "display_name": "Python 3",
249 | "language": "python",
250 | "name": "python3"
251 | },
252 | "language_info": {
253 | "codemirror_mode": {
254 | "name": "ipython",
255 | "version": 3
256 | },
257 | "file_extension": ".py",
258 | "mimetype": "text/x-python",
259 | "name": "python",
260 | "nbconvert_exporter": "python",
261 | "pygments_lexer": "ipython3",
262 | "version": "3.6.9"
263 | }
264 | },
265 | "nbformat": 4,
266 | "nbformat_minor": 2
267 | }
268 |
--------------------------------------------------------------------------------
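The RecursionError recorded above comes from RNNModule defining its own train(self): the name shadows nn.Module.train() (the training-mode switch), so the net.train() call inside the loop re-enters the same method until the recursion limit is hit. A minimal sketch of one way around it is to keep the loop outside the class, so net.train() keeps its usual meaning; the function name run_training is illustrative, and the batch size handed to get_data_from_file (4) and the global batch_size (1) used for zero_state would also need to agree before the hidden-state shapes line up.

    import torch
    import torch.nn as nn

    def run_training(net, preprocess_obj, in_text, out_text,
                     batch_size, seq_size, device, epochs=200, clip_norm=5):
        # Keeping the loop outside nn.Module means net.train() is the usual
        # mode switch rather than a call to this function itself.
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
        iteration = 0
        for e in range(epochs):
            state_h, state_c = net.zero_state(batch_size)
            state_h, state_c = state_h.to(device), state_c.to(device)
            for x, y in preprocess_obj.get_batches(in_text, out_text, batch_size, seq_size):
                iteration += 1
                net.train()                      # mode switch, not recursion
                optimizer.zero_grad()
                x = torch.tensor(x).to(device)
                y = torch.tensor(y).to(device)
                logits, (state_h, state_c) = net(x, (state_h, state_c))
                loss = criterion(logits.transpose(1, 2), y)
                loss.backward()
                state_h, state_c = state_h.detach(), state_c.detach()
                torch.nn.utils.clip_grad_norm_(net.parameters(), clip_norm)
                optimizer.step()
                if iteration % 100 == 0:
                    print('Epoch {}/{} iteration {} loss {:.4f}'.format(
                        e, epochs, iteration, loss.item()))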
/1.1-language-model/language_model_torch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Importing Libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 191,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import torch\n",
19 | "import torch.nn as nn\n",
20 | "import torch.optim as optim\n",
21 | "from torch.autograd import Variable"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 192,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "class Tokenizer():\n",
31 | " def fit_on_texts(self,list_data):\n",
32 | " word_list = \" \".join(list_data).split()\n",
33 | " self.word_counts = list(set(word_list))\n",
34 | " self.word_dict = {w: i for i, w in enumerate(self.word_counts)}\n",
35 | " self.number_dict = {i: w for i, w in enumerate(self.word_counts)}\n",
36 | " \n",
37 | " def texts_to_sequences(self,data):\n",
38 | " encoded_sequence = list()\n",
39 | " for item in data:\n",
40 | " encoded_sequence.append([self.word_dict[word] for word in item.split()])\n",
41 | " return encoded_sequence\n",
42 | " \n",
43 | "def pad_sequences(data,padding='pre',padding_value=0):\n",
44 | " sequence = None\n",
45 | " if isinstance(data,list):\n",
46 | " maxlen = max(len(item) for item in data)\n",
47 | " \n",
48 | " if padding == 'pre':\n",
49 | " for idx in range(len(data)):\n",
50 | " data[idx] = [padding_value]*(maxlen-len(data[idx])) + data[idx]\n",
51 | " else:\n",
52 | " for idx in range(len(data)):\n",
53 | " data[idx] = data[idx]+ [padding_value]*(maxlen-len(data[idx]))\n",
54 | " \n",
55 | " return data\n",
56 | "def to_categorical(data, nb_classes):\n",
57 | " targets = np.array(data).reshape(-1)\n",
58 | " return np.eye(nb_classes)[targets]"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 195,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "class Preprocessing():\n",
68 | " \n",
69 | " def __init__(self,input_file):\n",
70 | " self.input_data_file = input_file\n",
71 | " self.data = None\n",
72 | " self.vocab_size = None\n",
73 | " self.encoded_data = None\n",
74 | " self.max_length = None\n",
75 | " self.sequences = None\n",
76 | " self.x = None\n",
77 | " self.y = None\n",
78 | " self.tokenizer = None\n",
79 | " \n",
80 | " def load_data(self):\n",
81 | " fp = open(self.input_data_file,'r')\n",
82 | " self.data = fp.read().splitlines() \n",
83 | " fp.close()\n",
84 | " \n",
85 | " def encode_data(self):\n",
86 | " self.tokenizer = Tokenizer()\n",
87 | " self.tokenizer.fit_on_texts(self.data)\n",
88 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
89 | " print(self.encoded_data)\n",
90 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n",
91 | " \n",
92 | " def generate_sequence(self):\n",
93 | " seq_list = list()\n",
94 | " for item in self.encoded_data:\n",
95 | " l = len(item)\n",
96 | " for id in range(1,l):\n",
97 | " seq_list.append(item[:id+1])\n",
98 | " #print(seq_list[0])\n",
99 | " print(seq_list)\n",
100 | " self.sequences = pad_sequences(seq_list,padding='pre', padding_value=0)\n",
101 | " print(self.sequences)\n",
    102 |     "        self.sequences = np.array(self.sequences)\n",
103 | " \n",
104 | " def get_data(self):\n",
105 | " self.x = self.sequences[:,:-1]\n",
106 | " self.y = self.sequences[:,-1]\n",
107 | " print(self.y)\n",
108 | " self.y = to_categorical(self.y,nb_classes=self.vocab_size)\n",
109 | " print(\"Y:{}\".format(self.y))\n",
110 | " print(\"X:{}\".format(self.x))\n",
111 | " return self.x,self.y"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 196,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "[[2, 12, 16, 4, 3, 14, 9], [17, 0, 18, 6, 19, 21], [2, 10, 8, 12, 11, 7, 15], [1, 16, 5, 20, 13]]\n",
124 | "[[2, 12], [2, 12, 16], [2, 12, 16, 4], [2, 12, 16, 4, 3], [2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [17, 0], [17, 0, 18], [17, 0, 18, 6], [17, 0, 18, 6, 19], [17, 0, 18, 6, 19, 21], [2, 10], [2, 10, 8], [2, 10, 8, 12], [2, 10, 8, 12, 11], [2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [1, 16], [1, 16, 5], [1, 16, 5, 20], [1, 16, 5, 20, 13]]\n",
125 | "[[0, 0, 0, 0, 0, 2, 12], [0, 0, 0, 0, 2, 12, 16], [0, 0, 0, 2, 12, 16, 4], [0, 0, 2, 12, 16, 4, 3], [0, 2, 12, 16, 4, 3, 14], [2, 12, 16, 4, 3, 14, 9], [0, 0, 0, 0, 0, 17, 0], [0, 0, 0, 0, 17, 0, 18], [0, 0, 0, 17, 0, 18, 6], [0, 0, 17, 0, 18, 6, 19], [0, 17, 0, 18, 6, 19, 21], [0, 0, 0, 0, 0, 2, 10], [0, 0, 0, 0, 2, 10, 8], [0, 0, 0, 2, 10, 8, 12], [0, 0, 2, 10, 8, 12, 11], [0, 2, 10, 8, 12, 11, 7], [2, 10, 8, 12, 11, 7, 15], [0, 0, 0, 0, 0, 1, 16], [0, 0, 0, 0, 1, 16, 5], [0, 0, 0, 1, 16, 5, 20], [0, 0, 1, 16, 5, 20, 13]]\n",
126 | "[12 16 4 3 14 9 0 18 6 19 21 10 8 12 11 7 15 16 5 20 13]\n",
127 | "Y:[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
128 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
129 | " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
130 | " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
131 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
132 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
133 | " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
134 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
135 | " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
136 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
137 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
138 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
139 | " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
140 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
141 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
142 | " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
143 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
144 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
145 | " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
146 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n",
147 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n",
148 | "X:[[ 0 0 0 0 0 2]\n",
149 | " [ 0 0 0 0 2 12]\n",
150 | " [ 0 0 0 2 12 16]\n",
151 | " [ 0 0 2 12 16 4]\n",
152 | " [ 0 2 12 16 4 3]\n",
153 | " [ 2 12 16 4 3 14]\n",
154 | " [ 0 0 0 0 0 17]\n",
155 | " [ 0 0 0 0 17 0]\n",
156 | " [ 0 0 0 17 0 18]\n",
157 | " [ 0 0 17 0 18 6]\n",
158 | " [ 0 17 0 18 6 19]\n",
159 | " [ 0 0 0 0 0 2]\n",
160 | " [ 0 0 0 0 2 10]\n",
161 | " [ 0 0 0 2 10 8]\n",
162 | " [ 0 0 2 10 8 12]\n",
163 | " [ 0 2 10 8 12 11]\n",
164 | " [ 2 10 8 12 11 7]\n",
165 | " [ 0 0 0 0 0 1]\n",
166 | " [ 0 0 0 0 1 16]\n",
167 | " [ 0 0 0 1 16 5]\n",
168 | " [ 0 0 1 16 5 20]]\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "pr = Preprocessing('data.txt')\n",
174 | "pr.load_data()\n",
175 | "pr.encode_data()\n",
176 | "pr.generate_sequence()\n",
177 | "x,y = pr.get_data()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 184,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "n_step = 1 # n-1 in paper\n",
187 | "n_hidden = 1 # h in paper\n",
188 | "m = 1 # m in paper\n",
189 | "n_class = pr.vocab_size\n",
190 | "dtype = torch.FloatTensor\n"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 178,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "class NNLM(nn.Module):\n",
200 | " def __init__(self):\n",
201 | " super(NNLM, self).__init__()\n",
202 | " self.C = nn.Embedding(n_class, m)\n",
203 | " self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))\n",
204 | " self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))\n",
205 | " self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))\n",
206 | " self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))\n",
207 | " self.b = nn.Parameter(torch.randn(n_class).type(dtype))\n",
208 | "\n",
209 | " def forward(self, X):\n",
210 | " X = self.C(X)\n",
211 | " X = X.view(-1, n_step * m) # [batch_size, n_step * n_class]\n",
212 | " tanh = torch.tanh(self.d + torch.mm(X, self.H)) # [batch_size, n_hidden]\n",
213 | " output = self.b + torch.mm(X, self.W) + torch.mm(tanh, self.U) # [batch_size, n_class]\n",
214 | " return output\n",
215 | " \n",
216 | "def train(x,y):\n",
217 | " model = NNLM()\n",
218 | " criterion = nn.CrossEntropyLoss()\n",
219 | " optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
220 | " # Training\n",
221 | " for epoch in range(100):\n",
222 | "\n",
223 | " optimizer.zero_grad()\n",
224 | " output = model(x)\n",
225 | "\n",
226 | " # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\n",
227 | " loss = criterion(output, y)\n",
228 | " if (epoch + 1)%1000 == 0:\n",
229 | " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
230 | "\n",
231 | " loss.backward()\n",
232 | " optimizer.step()"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 179,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "ename": "ValueError",
242 | "evalue": "Expected input batch_size (126) to match target batch_size (21).",
243 | "output_type": "error",
244 | "traceback": [
245 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
246 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
247 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLongTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
248 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;31m# output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m1000\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Epoch:'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'%04d'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mepoch\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'cost ='\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'{:.6f}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
249 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 550\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 551\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
250 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m 930\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 931\u001b[0m return F.cross_entropy(input, target, weight=self.weight,\n\u001b[0;32m--> 932\u001b[0;31m ignore_index=self.ignore_index, reduction=self.reduction)\n\u001b[0m\u001b[1;32m 933\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 934\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
251 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m 2315\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msize_average\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2316\u001b[0m \u001b[0mreduction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlegacy_get_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize_average\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2317\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlog_softmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2319\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
252 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m 2111\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2112\u001b[0m raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'\n\u001b[0;32m-> 2113\u001b[0;31m .format(input.size(0), target.size(0)))\n\u001b[0m\u001b[1;32m 2114\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2115\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_enum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
253 | "\u001b[0;31mValueError\u001b[0m: Expected input batch_size (126) to match target batch_size (21)."
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "x = Variable(torch.LongTensor(x))\n",
259 | "y = Variable(torch.LongTensor(y))\n",
260 | "train(x,y)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": []
269 | }
270 | ],
271 | "metadata": {
272 | "kernelspec": {
273 | "display_name": "Python 3",
274 | "language": "python",
275 | "name": "python3"
276 | },
277 | "language_info": {
278 | "codemirror_mode": {
279 | "name": "ipython",
280 | "version": 3
281 | },
282 | "file_extension": ".py",
283 | "mimetype": "text/x-python",
284 | "name": "python",
285 | "nbconvert_exporter": "python",
286 | "pygments_lexer": "ipython3",
287 | "version": "3.6.9"
288 | }
289 | },
290 | "nbformat": 4,
291 | "nbformat_minor": 2
292 | }
293 |
--------------------------------------------------------------------------------
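The ValueError recorded above is a shape mismatch rather than anything specific to the NNLM architecture: x holds 21 rows of 6 context tokens while n_step is set to 1, so X.view(-1, n_step * m) flattens every token into its own row (21 × 6 = 126), and y has been one-hot encoded before being cast to LongTensor even though nn.CrossEntropyLoss expects integer class indices. Below is a sketch of a corrected setup, assuming the notebook's NNLM class and the x, y arrays returned by pr.get_data(); the sizes chosen for the module-level globals the class reads are illustrative.

    import torch
    import torch.nn as nn
    import torch.optim as optim

    # NNLM reads these globals when it is constructed, so set them first;
    # n_step must equal the number of context columns in x.
    n_step = 6                 # = pr.max_length - 1 for this data
    m = 10                     # embedding size (illustrative)
    n_hidden = 32              # hidden size (illustrative)
    n_class = pr.vocab_size
    dtype = torch.FloatTensor

    x_t = torch.LongTensor(x)                  # (21, 6) context token indices
    y_t = torch.LongTensor(y.argmax(axis=1))   # back to class indices, shape (21,)

    model = NNLM()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(100):
        optimizer.zero_grad()
        output = model(x_t)                    # (21, n_class) once n_step matches x
        loss = criterion(output, y_t)
        loss.backward()
        optimizer.step()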
/1.12-text-summarization/text-summarizer.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport os\nimport tensorflow as tf\nfrom sklearn.model_selection import train_test_split\n\nfrom tensorflow.keras.preprocessing.text import Tokenizer \nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\n\nfrom tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint","execution_count":39,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Adding Attention Layer as its not a part of the keras\nhttps://www.kaggle.com/kweku20/attention"},{"metadata":{"trusted":true},"cell_type":"code","source":"from shutil import copyfile\ncopyfile(src = \"/kaggle/input/attention/attention.py\", dst = \"/kaggle/working/attention.py\")\nfrom attention import AttentionLayer","execution_count":40,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n def __init__(self):\n data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n print(data.head())\n self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n ","execution_count":41,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling load data object"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":42,"outputs":[{"output_type":"stream","text":" Id ProductId UserId ProfileName \\\n0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian \n1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa \n2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres \"Natalia Corres\" \n3 4 B000UA0QIQ A395BORC6FGVXV Karl \n4 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham \"M. Wassir\" \n\n HelpfulnessNumerator HelpfulnessDenominator Score Time \\\n0 1 1 5 1303862400 \n1 0 0 1 1346976000 \n2 1 1 4 1219017600 \n3 3 3 2 1307923200 \n4 0 0 5 1350777600 \n\n Summary Text \n0 Good Quality Dog Food I have bought several of the Vitality canned d... \n1 Not as Advertised Product arrived labeled as Jumbo Salted Peanut... \n2 \"Delight\" says it all This is a confection that has been around a fe... \n3 Cough Medicine If you are looking for the secret ingredient i... \n4 Great taffy Great taffy at a great price. There was a wid... 
\n","name":"stdout"}]},{"metadata":{"trusted":true},"cell_type":"code","source":"class PreprocessingData():\n def __init__(self):\n self.max_in_len = 100\n self.max_tar_len = 10\n \n def preprocess_data(self,data):\n data.dropna(axis=0,inplace=True)\n data['Summary'] = data['Summary'].apply(lambda x : 'start '+ x + ' end')\n return data\n \n def get_data(self,data):\n x_train,x_val,y_train,y_val = train_test_split(np.array(data['Text']),np.array(data['Summary']),test_size=0.1,random_state=0,shuffle=True)\n return x_train,x_val,y_train,y_val\n \n def encode_data(self,data,x_train,x_val,y_train,y_val):\n \n # Input Encoding\n in_tokenizer = Tokenizer() \n in_tokenizer.fit_on_texts(data[\"Text\"].tolist())\n\n x_train_seq = in_tokenizer.texts_to_sequences(x_train) \n x_val_seq = in_tokenizer.texts_to_sequences(x_val)\n\n x_train = pad_sequences(x_train_seq, maxlen = self.max_in_len, padding='post')\n x_val = pad_sequences(x_val_seq, maxlen = self.max_in_len, padding='post')\n\n self.in_voc = len(in_tokenizer.word_counts) + 1\n \n # Target Encoding\n tar_tokenizer = Tokenizer() \n tar_tokenizer.fit_on_texts(data[\"Summary\"].tolist())\n\n y_train_seq = tar_tokenizer.texts_to_sequences(y_train) \n y_val_seq = tar_tokenizer.texts_to_sequences(y_val)\n\n y_train = pad_sequences(y_train_seq, maxlen = self.max_tar_len, padding='post')\n y_val = pad_sequences(y_val_seq, maxlen = self.max_tar_len, padding='post')\n\n self.tar_voc = len(tar_tokenizer.word_counts) + 1\n return x_train,x_val,y_train,y_val","execution_count":43,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling preprocessing module on loading data"},{"metadata":{"trusted":true},"cell_type":"code","source":"preprocessing_data = PreprocessingData()\ndata = preprocessing_data.preprocess_data(data)\nx_train,x_val,y_train,y_val = preprocessing_data.get_data(data)\nx_train,x_val,y_train,y_val = preprocessing_data.encode_data(data,x_train,x_val,y_train,y_val)","execution_count":44,"outputs":[{"output_type":"error","ename":"TypeError","evalue":"unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocess_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mpreprocessing_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m\u001b[0m in \u001b[0;36mencode_data\u001b[0;34m(self, data, x_train, x_val, y_train, y_val)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mx_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpad_sequences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val_seq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_in_len\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'post'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_voc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0min_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_counts\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;31m# Target Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'collections.OrderedDict' and 'int'"]}]},{"metadata":{},"cell_type":"markdown","source":"Model Creation"},{"metadata":{"trusted":true},"cell_type":"code","source":"class Model():\n def __init__(self):\n self.model = None\n \n def define_model(self):\n raise NotImplementedError","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4}
--------------------------------------------------------------------------------
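Two details are worth flagging in this notebook: the TypeError shown above is stale output (the encode_data source in the same cell already computes len(in_tokenizer.word_counts) + 1 rather than adding 1 to the OrderedDict itself), and the local class Model() shadows the tensorflow.keras.models.Model imported at the top. Its define_model stub only raises NotImplementedError; the sketch below shows one conventional way it could be wired up, assuming the AttentionLayer copied in from the Kaggle attention.py follows the usual [encoder_outputs, decoder_outputs] -> (context, weights) interface; latent_dim, emb_dim and the optimizer are illustrative choices, not taken from the notebook.

    from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
    from tensorflow.keras.models import Model as KerasModel   # aliased to avoid the name clash
    from attention import AttentionLayer

    def define_model(in_voc, tar_voc, max_in_len, latent_dim=128, emb_dim=64):
        # Encoder: embed the review text, return per-timestep outputs plus final states.
        encoder_inputs = Input(shape=(max_in_len,))
        enc_emb = Embedding(in_voc, emb_dim, trainable=True)(encoder_inputs)
        encoder_outputs, state_h, state_c = LSTM(
            latent_dim, return_sequences=True, return_state=True)(enc_emb)

        # Decoder: embed the summary tokens (teacher forcing), start from encoder states.
        decoder_inputs = Input(shape=(None,))
        dec_emb = Embedding(tar_voc, emb_dim, trainable=True)(decoder_inputs)
        decoder_outputs, _, _ = LSTM(
            latent_dim, return_sequences=True, return_state=True)(
            dec_emb, initial_state=[state_h, state_c])

        # Attention over encoder outputs, concatenated with decoder outputs before the softmax.
        attn_out, _ = AttentionLayer(name='attention_layer')([encoder_outputs, decoder_outputs])
        decoder_concat = Concatenate(axis=-1)([decoder_outputs, attn_out])
        outputs = TimeDistributed(Dense(tar_voc, activation='softmax'))(decoder_concat)

        model = KerasModel([encoder_inputs, decoder_inputs], outputs)
        model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
        return model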
/1.2-sentiment-analysis/sentiment_classfication_bert_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import tensorflow as tf\n",
10 | "import pandas as pd\n",
11 | "import tensorflow_hub as hub\n",
12 | "import bert\n",
13 | "import os\n",
14 | "import re\n",
15 | "import numpy as np\n",
16 | "from tqdm import tqdm\n",
17 | "from tqdm import tqdm_notebook\n",
18 | "from tensorflow.keras import backend as K\n",
19 | "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n",
20 | "from tensorflow.keras.models import Sequential, Model"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "class LoadData():\n",
30 | " def __init__(self,csv_file):\n",
31 | " self.df = pd.read_csv(os.path.join(os.getcwd(),csv_file))\n",
32 | " self.train_df = None\n",
33 | " self.test_df = None\n",
34 | " def load_data(self):\n",
35 | " self.df.columns = ['sentence','sentiment']\n",
36 | " self.train_df = self.df[self.df['sentiment']=='positive']\n",
37 | " self.test_df = self.df[self.df['sentiment']=='negative']\n",
38 | " self.train_df.loc[self.train_df['sentiment']=='positive','polarity'] = 1\n",
39 | " self.test_df.loc[self.test_df['sentiment']=='negative','polarity'] = 0\n"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "loaddata_obj = LoadData(\"imdb_dataset_small.csv\") \n",
49 | "loaddata_obj.load_data()"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "loaddata_obj.train_df.head()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "loaddata_obj.test_df.head()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "class BertModel(object):\n",
77 | " \n",
78 | " def __init__(self):\n",
79 | " \n",
80 | " self.max_len = 128\n",
81 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
82 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
83 | " \n",
84 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
85 | "\n",
86 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
87 | "\n",
88 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
89 | "\n",
90 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
91 | " \n",
92 | " def get_masks(self,tokens, max_seq_length):\n",
93 | " return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
94 | "\n",
95 | " def get_segments(self,tokens, max_seq_length):\n",
96 | " \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n",
97 | " segments = []\n",
98 | " current_segment_id = 0\n",
99 | " for token in tokens:\n",
100 | " segments.append(current_segment_id)\n",
101 | " if token == \"[SEP]\":\n",
102 | " current_segment_id = 1\n",
103 | " return segments + [0] * (max_seq_length - len(tokens))\n",
104 | " \n",
105 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n",
106 | " \"\"\"Token ids from Tokenizer vocab\"\"\"\n",
107 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
108 | " input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n",
109 | " return input_ids\n",
110 | " def create_single_input(self,sentence,maxlen):\n",
111 | "\n",
112 | " stokens = self.tokenizer.tokenize(sentence)\n",
113 | "\n",
114 | " stokens = stokens[:maxlen]\n",
115 | "\n",
116 | " stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n",
117 | "\n",
118 | " ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n",
119 | " masks = self.get_masks(stokens, self.max_len)\n",
120 | " segments = self.get_segments(stokens, self.max_len)\n",
121 | "\n",
122 | " return ids,masks,segments\n",
123 | "\n",
124 | " def create_input_array(self,sentences):\n",
125 | " \n",
126 | " input_ids, input_masks, input_segments = [], [], []\n",
127 | "\n",
128 | " for sentence in tqdm(sentences,position=0, leave=True):\n",
129 | " ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n",
130 | "\n",
131 | " input_ids.append(ids)\n",
132 | " input_masks.append(masks)\n",
133 | " input_segments.append(segments)\n",
134 | " \n",
135 | " tensor = [np.asarray(input_ids, dtype=np.int32), \n",
136 | " np.asarray(input_masks, dtype=np.int32), \n",
137 | " np.asarray(input_segments, dtype=np.int32)]\n",
138 | " return tensor"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "class PreprocessingBertData():\n",
148 | " \n",
149 | " def prepare_data_x(self,train_sentences):\n",
150 | " x = bert_model_obj.get_input_array(train_sentences)\n",
151 | " return x\n",
152 | " \n",
153 | " def prepare_data_y(self,train_labels):\n",
154 | " y = list()\n",
155 | " for item in train_labels:\n",
156 | " label = item\n",
157 | " y.append(label)\n",
158 | " y = np.array(y)\n",
159 | " return y\n",
160 | " "
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "class BertModel(object):\n",
170 | " \n",
171 | " def __init__(self):\n",
172 | " \n",
173 | " self.max_len = 128\n",
174 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
175 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
176 | " \n",
177 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
178 | "\n",
179 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
180 | "\n",
181 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
182 | "\n",
183 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
184 | " \n",
185 | " def get_masks(self,tokens, max_seq_length):\n",
186 | " mask_data = [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
187 | " return mask_data\n",
188 | "\n",
189 | " def get_segments(self,tokens, max_seq_length):\n",
190 | " '''\n",
191 | " Segments: 0 for the first sequence, \n",
192 | " 1 for the second\n",
193 | " '''\n",
194 | " segments = []\n",
195 | " segment_id = 0\n",
196 | " for token in tokens:\n",
    197 |     "            segments.append(segment_id)\n",
198 | " if token == \"[SEP]\":\n",
199 | " segment_id = 1\n",
200 | " '''Remaining are padded with 0'''\n",
201 | " remaining_segment = [0] * (max_seq_length - len(tokens))\n",
202 | " segment_data = segments + remaining_segment\n",
203 | " return segment_data\n",
204 | " \n",
205 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n",
206 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
207 | " remaining_ids = [0] * (max_seq_length-len(token_ids))\n",
208 | " input_ids = token_ids + remaining_ids\n",
209 | " return input_ids\n",
210 | " \n",
211 | " def get_input_data(self,sentence,maxlen):\n",
212 | "\n",
213 | " sent_token = self.tokenizer.tokenize(sentence)\n",
214 | "\n",
215 | " sent_token = sent_token[:maxlen]\n",
216 | "\n",
217 | " sent_token = [\"[CLS]\"] + sent_token + [\"[SEP]\"]\n",
218 | "\n",
219 | " id = self.get_ids(sent_token, self.tokenizer, self.max_len)\n",
220 | " mask = self.get_masks(sent_token, self.max_len)\n",
221 | " segment = self.get_segments(sent_token, self.max_len)\n",
222 | " input_data = [id,mask,segment]\n",
223 | " return input_data\n",
224 | "\n",
225 | " def get_input_array(self,sentences):\n",
226 | " \n",
227 | " input_ids, input_masks, input_segments = [], [], []\n",
228 | "\n",
229 | " for sentence in tqdm(sentences,position=0, leave=True):\n",
230 | " ids,masks,segments=self.get_input_data(sentence,self.max_len-2)\n",
231 | "\n",
232 | " input_ids.append(ids)\n",
233 | " input_masks.append(masks)\n",
234 | " input_segments.append(segments)\n",
235 | " \n",
236 | " input_array = [np.asarray(input_ids, dtype=np.int32),np.asarray(input_masks, dtype=np.int32), np.asarray(input_segments, dtype=np.int32)]\n",
237 | " return input_array"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "bert_model_obj = BertModel()\n",
247 | "preprocess_bert_data_obj = PreprocessingBertData()"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "train_sentences = loaddata_obj.train_df[\"sentence\"].tolist()\n",
257 | "train_labels = loaddata_obj.train_df[\"polarity\"].tolist()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n",
267 | "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n",
268 | "\n",
269 | "train_input_ids, train_input_masks, train_segment_ids = x\n",
270 | "train_labels = y"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "class DesignModel():\n",
280 | " def __init__(self):\n",
281 | " self.model = None \n",
282 | " self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n",
283 | " self.train_labels = train_labels\n",
284 | " \n",
285 | " def bert_model(self,max_seq_length): \n",
286 | " in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n",
287 | " in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n",
288 | " in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n",
289 | " \n",
290 | " bert_inputs = [in_id, in_mask, in_segment]\n",
291 | " bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n",
292 | " \n",
293 | " bert_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)\n",
294 | " bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n",
295 | " bert_outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\", name=\"dense_output\")(bert_sequence_output)\n",
296 | " self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n",
297 | " \n",
298 | " self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
299 | " \n",
300 | " self.model.summary()\n",
301 | " \n",
302 | " def model_train(self,batch_size,num_epoch):\n",
303 | " print(\"Fitting to model\")\n",
304 | " \n",
305 | " self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n",
306 | " \n",
307 | " print(\"Model Training complete.\")\n",
308 | "\n",
309 | " def save_model(self,model,model_name): \n",
310 | " self.model.save(model_name+\".h5\")\n",
311 | " print(\"Model saved to Model folder.\")"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "model_obj = DesignModel()\n",
321 | "model_obj.bert_model(bert_model_obj.max_len)\n",
322 | "model_obj.model_train(32,1)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "model_obj.save_model(model_obj.model,\"bert\")"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "class Evaluation():\n",
341 | " def get_accuracy(self,actuals, predictions):\n",
342 | " acc = accuracy_score(actuals, predictions)\n",
343 | " return acc"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "class Prediction():\n",
353 | " def __init__(self):\n",
354 | " self.model = model_obj.model\n",
355 | " \n",
356 | " def predict_validation(self):\n",
357 | " valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n",
358 | " valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n",
359 | "\n",
360 | " preprocess_bert_data_obj = PreprocessingBertData()\n",
361 | " val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n",
362 | " prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n",
363 | " return valid_labels,prediction_labels\n",
364 | " \n",
365 | " \n",
366 | " def predict(self,query):\n",
367 | " query_seq = bert_model_obj.get_input_array([query])\n",
368 | " pred = self.model.predict(query_seq)\n",
369 | " pred = np.argmax(pred)\n",
370 | " result = load_data_obj.cat_to_intent[pred]\n",
371 | " return result"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {},
378 | "outputs": [],
379 | "source": [
380 | "pred_obj = Prediction()"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "eval_obj = Evaluation()\n",
390 | "ytest,ypred = pred_obj.predict_validation()\n",
391 | "acc = eval_obj.get_accuracy(ytest,ypred)\n",
392 | "print(\"Auc: {:.2%}\".format(acc))"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {},
399 | "outputs": [],
400 | "source": []
401 | }
402 | ],
403 | "metadata": {
404 | "kernelspec": {
405 | "display_name": "Python 3",
406 | "language": "python",
407 | "name": "python3"
408 | },
409 | "language_info": {
410 | "codemirror_mode": {
411 | "name": "ipython",
412 | "version": 3
413 | },
414 | "file_extension": ".py",
415 | "mimetype": "text/x-python",
416 | "name": "python",
417 | "nbconvert_exporter": "python",
418 | "pygments_lexer": "ipython3",
419 | "version": "3.6.9"
420 | }
421 | },
422 | "nbformat": 4,
423 | "nbformat_minor": 2
424 | }
425 |
--------------------------------------------------------------------------------
/1.3-semantic-similarity/README.md:
--------------------------------------------------------------------------------
1 | pip install -U sentence-transformers scipy
2 |
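3 | A minimal sketch of the similarity lookup done in the notebooks in this folder (the model name is the one the notebooks use; the sentences are toy examples; a smaller cosine distance means a closer match):
4 | 
5 | ```python
6 | from sentence_transformers import SentenceTransformer
7 | from scipy.spatial.distance import cdist
8 | import numpy as np
9 | 
10 | embedder = SentenceTransformer('bert-base-nli-mean-tokens')
11 | corpus = ['A man is eating food.', 'A monkey is playing drums.']
12 | query = 'Someone is playing a drum kit.'
13 | 
14 | # encode() returns one embedding vector per input sentence
15 | corpus_embeddings = embedder.encode(corpus)
16 | query_embedding = embedder.encode([query])
17 | 
18 | # cosine distance: 0 means identical direction, larger means less similar
19 | distances = cdist(query_embedding, corpus_embeddings, 'cosine')[0]
20 | print(corpus[np.argmin(distances)])
21 | ```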
--------------------------------------------------------------------------------
/1.3-semantic-similarity/semantic-similarity-BERT.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from sentence_transformers import SentenceTransformer\n",
10 | "import scipy\n",
11 | "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n",
12 | "# Corpus with example sentences\n",
13 | "\n",
14 | "corpus = [\n",
15 | " 'A man is eating a food.',\n",
16 | " 'A man is eating a piece of bread.',\n",
17 | " 'The girl is carrying a baby.',\n",
18 | " 'A man is riding a horse.',\n",
19 | " 'A woman is playing violin.',\n",
20 | " 'Two men pushed carts through the woods.',\n",
21 | " 'A man is riding a white horse on an enclosed ground.',\n",
22 | " 'A monkey is playing drums.',\n",
23 | " 'A cheetah is running behind its prey.']\n",
24 | "queries = ['A man is eating pasta.', \n",
25 | " 'Someone in a gorilla costume is playing a set of drums.', \n",
26 | " 'A cheetah chases prey on across a field.']"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 8,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "corpus_embeddings = embedder.encode(corpus)\n",
36 | "query_embeddings = embedder.encode(queries)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 22,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "[0.21805174 0.15202328 1.04767431 0.89392366 0.96026727 0.79048636\n",
49 | " 0.8414415 0.80550679 0.90363039] A man is eating pasta.\n",
50 | "The girl is carrying a baby.\n",
51 | "[0.80833937 0.8089816 0.76493318 0.79766881 0.92636551 0.84321454\n",
52 | " 0.80365482 0.20152853 0.71403489] Someone in a gorilla costume is playing a set of drums.\n",
53 | "A woman is playing violin.\n",
54 | "[0.97539473 0.95483563 0.87328057 0.7070155 0.94015294 0.63376342\n",
55 | " 0.72819954 0.6939273 0.09933373] A cheetah chases prey on across a field.\n",
56 | "A man is eating a food.\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "import numpy as np\n",
62 | "for query, query_embedding in zip(queries, query_embeddings):\n",
63 | " distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n",
64 | " print(distances,query)\n",
65 | " print(corpus[np.argmax(distances)])"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": []
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "Python 3",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.6.9"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 2
97 | }
98 |
--------------------------------------------------------------------------------
/1.3-semantic-similarity/try_cf.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 38,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from sentence_transformers import SentenceTransformer\n",
10 | "import scipy\n",
11 | "embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n",
12 | "# Corpus with example sentences\n",
13 | "\n",
14 | "corpus = [\n",
15 | " 'i would like to clean my XYZ',\n",
16 | " 'book an appointment for XYZ cleaning',\n",
17 | " 'schedule a XYZ cleaning services',\n",
18 | " 'looking for XYZ cleaninng services',\n",
19 | " 'want an appointment for XYZ cleaning'\n",
20 | "]\n",
21 | "queries = ['i would like to clean my XYZ','book a slot for XYZ cleaning','looking for XYZ cleaninng services']"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 39,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "corpus_embeddings = embedder.encode(corpus)\n",
31 | "query_embeddings = embedder.encode(queries)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 40,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "[0. 0.31504001 0.23922292 0.20180429 0.21156411]\n",
44 | "matched sent: book an appointment for XYZ cleaning , id: 1\n",
45 | "query: i would like to clean my XYZ\n",
46 | "[0.249604 0.09753854 0.14501476 0.18097553 0.14041007]\n",
47 | "matched sent: i would like to clean my XYZ , id: 0\n",
48 | "query: book a slot for XYZ cleaning\n",
49 | "[2.01804328e-01 2.37920637e-01 1.28938306e-01 1.11133325e-13\n",
50 | " 1.53037771e-01]\n",
51 | "matched sent: book an appointment for XYZ cleaning , id: 1\n",
52 | "query: looking for XYZ cleaninng services\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "import numpy as np\n",
58 | "for query, query_embedding in zip(queries, query_embeddings):\n",
59 | " distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n",
60 | " print(distances)\n",
61 | " print(\"matched sent: \",corpus[np.argmax(distances)],\", id: \",np.argmax(distances))\n",
62 | " print(\"query: \",query)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": []
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.6.9"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 2
94 | }
95 |
--------------------------------------------------------------------------------
/1.5-named-entity-recognition/data_making.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #!/usr/bin/env python3
3 |
4 | """
5 | Created on Mon Aug 6 19:40:26 2018
6 |
7 | @author: joy
8 | """
9 | import json
10 | abs_path1 = "benchmarking_data/Train/"
11 |
12 | abs_path2 = "benchmarking_data/Validate/"
13 | import re
14 | reg = re.compile(r'[A-Za-z]*_([A-Za-z]*)_[A-Za-z]*')
15 | reg2 = re.compile(r'[A-Za-z]*_([A-Za-z]*)')
16 |
17 |
18 | def make_data_for_intent_from_json(json_file,txt_file):
19 |
20 | json_d = json.load(open(abs_path1+json_file))
21 | json_dict = json_d[reg.match(json_file).group(1)]
22 |
23 | wr = open("Intent_Data/"+txt_file,'w')
24 |
25 | for i in json_dict:
26 | each_list = i['data']
27 | sent =""
28 | for i in each_list:
29 | sent = sent + i['text']+ " "
30 | sent =sent[:-1]
31 | for i in range(3):
32 | sent = sent.replace(" "," ")
33 | wr.write(sent)
34 | wr.write('\n')
35 | print(sent)
36 |
37 |
38 | def make_data_from_json(json_file,txt_file):
39 |
40 | json_d = json.load(open(abs_path2+json_file))
41 | json_dict = json_d[reg2.match(json_file).group(1)]
42 |
43 | wr = open(abs_path2+txt_file,'w')
44 |
45 | for i in json_dict:
46 | each_list = i['data']
47 | for i in each_list:
48 | try:
49 | words = i['text'].split()
50 | print(words[0]+' '+'B-'+i['entity'])
51 | wr.write(words[0]+' '+'B-'+i['entity'])
52 | wr.write('\n')
53 | for word in words[1:]:
54 | print(word+' '+'I-'+i['entity'])
55 | wr.write(word+' '+'I-'+i['entity'])
56 | wr.write('\n')
57 | #print(i['text']+'\t'+i['entity'])
58 |
59 | except:
60 | words = i['text'].split()
61 | for word in words:
62 | print(word+' '+'O')
63 | wr.write(word+' '+'O')
64 | wr.write('\n')
65 | print('\n')
66 | wr.write('\n')
67 |
68 |
69 | def make_data_from_json_train(json_file,txt_file):
70 |
71 | json_d = json.load(open(abs_path1+json_file))
72 | json_dict = json_d[reg.match(json_file).group(1)]
73 |
74 | wr = open(abs_path1+txt_file,'w')
75 |
76 | for i in json_dict:
77 | each_list = i['data']
78 | for i in each_list:
79 | try:
80 | words = i['text'].split()
81 | print(words[0]+' '+'B-'+i['entity'])
82 | wr.write(words[0]+' '+'B-'+i['entity'])
83 | wr.write('\n')
84 | for word in words[1:]:
85 | print(word+' '+'I-'+i['entity'])
86 | wr.write(word+' '+'I-'+i['entity'])
87 | wr.write('\n')
88 | #print(i['text']+'\t'+i['entity'])
89 |
90 | except:
91 | words = i['text'].split()
92 | for word in words:
93 | print(word+' '+'O')
94 | wr.write(word+' '+'O')
95 | wr.write('\n')
96 | print('\n')
97 | wr.write('\n')
98 |
99 | import nltk
100 | def make_data_from_json_train_pos(json_file,txt_file):
101 |
102 | json_d = json.load(open(abs_path2+json_file))
103 | json_dict = json_d[reg2.match(json_file).group(1)]
104 |
105 | wr = open(abs_path2+txt_file,'w')
106 |
107 | for i in json_dict:
108 | each_list = i['data']
109 | sent = ""
110 | for i in each_list:
111 | sent = sent+i['text']+" "
112 | sent = sent.replace(" "," ")
113 | if sent[-1]==" ":
114 | sent = sent[:-1]
115 | words = []
116 | pos_tags = nltk.pos_tag(sent.split())
117 | print(pos_tags,sent)
118 | pos_tag_dict = {j:k for j,k in pos_tags}
119 | for i in each_list:
120 | try:
121 |
122 | words = i['text'].split()
123 | print(words[0]+' '+pos_tag_dict[words[0]]+" "+'B-'+i['entity'])
124 | wr.write(words[0]+" "+pos_tag_dict[words[0]]+" "+'B-'+i['entity'])
125 | wr.write('\n')
126 | for word in words[1:]:
127 | print(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity'])
128 | wr.write(word+' '+pos_tag_dict[word]+" "+'I-'+i['entity'])
129 | wr.write('\n')
130 | #print(i['text']+'\t'+i['entity'])
131 |
132 | except:
133 | words = i['text'].split()
134 | for word in words:
135 | print(word+' '+pos_tag_dict[word]+" "+'O')
136 | wr.write(word+' '+pos_tag_dict[word]+" "+'O')
137 | wr.write('\n')
138 | print('\n')
139 | wr.write('\n')
140 |
141 |
142 | import re
143 | import json
144 | import os
145 | def make_data_from_snips(input_path):
146 |
147 | for r,d,f in os.walk(input_path):
148 |
149 | for filename in f:
150 | label = os.path.basename(r)
151 | source = os.path.join(r,filename)
152 |
153 |
154 |
155 | if os.path.splitext(filename)[-1] != '.txt':
156 | continue
157 |
158 |
159 |
160 |
161 | read_file = open(source)
162 |
163 | # each raw line marks slots as "[slot value](slot_name)"; capture both the value and the name
164 | pattern = re.compile(r'(?:[[])(?P<value>.*?)(?:[]])(?:[(])(?P<name>.*?)(?:[)])')
165 |
166 | corpus = dict()
167 | corpus[label] = list()
168 | for i in read_file:
169 | data = list()
170 |
171 | it = pattern.finditer(i)
172 |
173 | sent_len = len(i.strip())
174 |
175 | if sent_len == 0:
176 | continue
177 |
178 | last_span = 0
179 | for m in it:
180 |
181 | head = i[last_span:m.span()[0]]
182 | obj = dict()
183 | if head.strip():
184 | obj['text'] = head
185 |
186 | data.append(obj)
187 |
188 | obj = dict()
189 | obj['text'] = m.group('value')
190 | obj['entity'] = m.group('name')
191 |
192 | data.append(obj)
193 |
194 | last_span = m.span()[1]
195 | if last_span:
196 | obj = dict()
197 | if i[last_span :].strip():
198 | obj['text'] = i[last_span :]
199 | data.append(obj)
200 |
201 | if data:
202 |
203 | corpus[label].append({'data': data})
204 |
205 | with open(os.path.join(r,filename.split()[0] + '.json'),'w',encoding='utf-8') as fp:
206 | json.dump(corpus,fp)
207 |
208 |
209 |
210 |
211 |
212 | #make_data("book_restaurant_train.csv","book_restaurant_train.txt")
213 | '''
214 | make_data_from_json_train_pos("train_AddToPlaylist_full.json","train_AddToPlaylist_full.txt")
215 | make_data_from_json_train_pos("train_BookRestaurant_full.json","train_BookRestaurant_full.txt")
216 | make_data_from_json_train_pos("train_GetWeather_full.json","train_GetWeather_full.txt")
217 | make_data_from_json_train_pos("train_PlayMusic_full.json","train_PlayMusic_full.txt")
218 | make_data_from_json_train_pos("train_RateBook_full.json","train_RateBook_full.txt")
219 | make_data_from_json_train_pos("train_SearchCreativeWork_full.json","train_SearchCreativeWork_full.txt")
220 | make_data_from_json_train_pos("train_SearchScreeningEvent_full.json","train_SearchScreeningEvent_full.txt")
221 | '''
222 |
223 | make_data_from_json_train_pos("validate_AddToPlaylist.json","validate_AddToPlaylist.txt")
224 | make_data_from_json_train_pos("validate_BookRestaurant.json","validate_BookRestaurant.txt")
225 | make_data_from_json_train_pos("validate_GetWeather.json","validate_GetWeather.txt")
226 | make_data_from_json_train_pos("validate_PlayMusic.json","validate_PlayMusic.txt")
227 | make_data_from_json_train_pos("validate_RateBook.json","validate_RateBook.txt")
228 | make_data_from_json_train_pos("validate_SearchCreativeWork.json","validate_SearchCreativeWork.txt")
229 | make_data_from_json_train_pos("validate_SearchScreeningEvent.json","validate_SearchScreeningEvent.txt")
230 |
231 |
232 |
233 | #make_data_from_snips("flight_data")
234 |
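235 | # Usage notes:
236 | # - The SNIPS benchmark JSON files are expected under benchmarking_data/Train/ and
237 | #   benchmarking_data/Validate/ (abs_path1 / abs_path2 above).
238 | # - make_data_from_json_train_pos writes one "word POS-tag BIO-label" token per line,
239 | #   with a blank line between sentences; this is the format the NER notebooks read.
240 | # - make_data_for_intent_from_json writes one plain sentence per line into Intent_Data/,
241 | #   intended for the intent-classification notebooks.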
--------------------------------------------------------------------------------
/1.5-named-entity-recognition/ner_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "For data i used https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines\n",
8 | "Then you can run data_making.py"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "from numpy import array\n",
18 | "import tensorflow as tf\n",
19 | "import glob\n",
20 | "import numpy as np\n",
21 | "import pickle\n",
22 | "\n",
23 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
24 | "from tensorflow.keras.utils import to_categorical\n",
25 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
26 | "from tensorflow.keras.models import Sequential,Model\n",
27 | "from tensorflow.keras.layers import Dense\n",
28 | "from tensorflow.keras.layers import LSTM\n",
29 | "from tensorflow.keras.layers import Input\n",
30 | "from tensorflow.keras.layers import Dropout\n",
31 | "from tensorflow.keras.layers import Embedding\n",
32 | "from tensorflow.keras.layers import TimeDistributed\n",
33 | "from tensorflow.keras.layers import Conv1D\n",
34 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
35 | "from tensorflow.keras.utils import Progbar\n",
36 | "from tensorflow.keras.models import load_model\n",
37 | "\n",
38 | "from tensorflow.keras.initializers import RandomUniform\n"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": []
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "class LoadData():\n",
55 | " def __init__(self):\n",
56 | " self.train_files = None\n",
57 | " self.validation_files = None\n",
58 | " \n",
59 | " def get_data(self):\n",
60 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
61 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "load_data_obj = LoadData()\n",
71 | "load_data_obj.get_data()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "class Preprocessing():\n",
81 | " def __init__(self):\n",
82 | " self.word_embediings_model = open(\"embeddings/glove.6B.100d.txt\", encoding=\"utf-8\")\n",
83 | " \n",
84 | " \n",
85 | " def sentence_from_file(self,filename):\n",
86 | " f = open(filename)\n",
87 | " single_file_sentences = []\n",
88 | " sentence_list = []\n",
89 | " for line in f:\n",
90 | " if len(line)==0 or line[0]==\"\\n\":\n",
91 | " if len(sentence_list) > 0:\n",
92 | " single_file_sentences.append(sentence_list)\n",
93 | " sentence_list = []\n",
94 | " continue\n",
95 | " splits = line.split(' ')\n",
96 | " sentence_list.append([splits[0],splits[1],splits[-1]])\n",
97 | "\n",
98 | " if len(sentence_list) >0:\n",
99 | " single_file_sentences.append(sentence_list)\n",
100 | " sentence_list = []\n",
101 | " return single_file_sentences\n",
102 | "\n",
103 | " def get_case_value(self,word, case_dict): \n",
104 | " case_value = 'other'\n",
105 | "\n",
106 | " count_digits = 0\n",
107 | " for char in word:\n",
108 | " if char.isdigit():\n",
109 | " count_digits += 1\n",
110 | "\n",
111 | " if word.isdigit():\n",
112 | " case_value = 'number'\n",
113 | " elif count_digits / float(len(word)) > 0.5:\n",
114 | " case_value = 'fraction'\n",
115 | " elif word.islower():\n",
116 | " case_value = 'lower'\n",
117 | " elif word.isupper():\n",
118 | " case_value = 'upper'\n",
119 | " elif word[0].isupper():\n",
120 | " case_value = 'title'\n",
121 | " elif count_digits > 0:\n",
122 | " case_value = 'leters_digit'\n",
123 | "\n",
124 | " return case_dict[case_value]\n",
125 | "\n",
126 | "\n",
127 | " def createBatches(self,data):\n",
128 | " l = []\n",
129 | " for i in data:\n",
130 | " l.append(len(i[0]))\n",
131 | " l = set(l)\n",
132 | " batches = []\n",
133 | " batch_len = []\n",
134 | " z = 0\n",
135 | " for i in l:\n",
136 | " for batch in data:\n",
137 | " if len(batch[0]) == i:\n",
138 | " batches.append(batch)\n",
139 | " z += 1\n",
140 | " batch_len.append(z)\n",
141 | " return batches,batch_len\n",
142 | "\n",
143 | " def create_tensors(self,sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id):\n",
144 | " #paddingIdx = word2Idx['PAD_TKN']\n",
145 | " unknownIdx = word_to_id['UNK_TKN']\n",
146 | "\n",
147 | " dataset = []\n",
148 | "\n",
149 | " word_count = 0\n",
150 | " unknownword_count = 0\n",
151 | "\n",
152 | " for sentence in sentences:\n",
153 | " word_indices = [] \n",
154 | " char_indices = []\n",
155 | " case_indices = []\n",
156 | " label_indices = []\n",
157 | " pos_indices = []\n",
158 | "\n",
159 | " for word,char,pos,label in sentence: \n",
160 | "\n",
161 | " word_count += 1\n",
162 | " if word in word_to_id:\n",
163 | " word_index = word_to_id[word]\n",
164 | " elif word.lower() in word_to_id:\n",
165 | " word_index = word_to_id[word.lower()] \n",
166 | " else:\n",
167 | " word_index = unknownIdx\n",
168 | " unknownword_count += 1\n",
169 | " \n",
170 | " char_index = []\n",
171 | " for x in char:\n",
172 | " char_index.append(char_to_id[x])\n",
173 | " \n",
174 | " word_indices.append(word_index)\n",
175 | " case_indices.append(self.get_case_value(word, case_to_id))\n",
176 | " pos_indices.append(pos_to_id[pos.replace('\\n','')])\n",
177 | " char_indices.append(char_index)\n",
178 | " label_indices.append(label_to_id[label])\n",
179 | " print([word_indices, case_indices, char_indices, pos_indices, label_indices])\n",
180 | " dataset.append([word_indices, case_indices, char_indices, pos_indices, label_indices]) \n",
181 | " return dataset\n",
182 | "\n",
183 | "\n",
184 | " def addCharInformatioin(self,Sentences):\n",
185 | " for i,sentence in enumerate(Sentences):\n",
186 | " for j,data in enumerate(sentence):\n",
187 | " chars = [c for c in data[0]]\n",
188 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
189 | " return Sentences\n",
190 | "\n",
191 | " def padding(self,Sentences):\n",
192 | " maxlen = 52\n",
193 | " for sentence in Sentences:\n",
194 | " char = sentence[2]\n",
195 | " for x in char:\n",
196 | " maxlen = max(maxlen,len(x))\n",
197 | " for i,sentence in enumerate(Sentences):\n",
198 | " Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')\n",
199 | " return Sentences\n",
200 | " \n",
201 | " def get_word_embeddings(self,list_sentences):\n",
202 | " wd_to_id = {}\n",
203 | " wd_em = []\n",
204 | " \n",
205 | " words = {}\n",
206 | " for sentence in list_sentences:\n",
207 | " for token,char,pos,label in sentence:\n",
208 | " words[token.lower()] = True\n",
209 | " \n",
210 | " for line in self.word_embediings_model:\n",
211 | " split = line.strip().split(\" \")\n",
212 | "\n",
213 | " if len(wd_to_id) == 0:\n",
214 | " wd_to_id[\"PAD_TKN\"] = len(wd_to_id)\n",
215 | " vector = np.zeros(len(split)-1) \n",
216 | " wd_em.append(vector)\n",
217 | "\n",
218 | " wd_to_id[\"UNK_TKN\"] = len(wd_to_id)\n",
219 | " vector = np.random.uniform(-0.25, 0.25, len(split)-1)\n",
220 | " wd_em.append(vector)\n",
221 | " if split[0].lower() in words:\n",
222 | " vector = np.array([float(num) for num in split[1:]])\n",
223 | " wd_em.append(vector)\n",
224 | " wd_to_id[split[0]] = len(wd_to_id)\n",
225 | "\n",
226 | " wd_em = np.array(wd_em)\n",
227 | " return wd_em,wd_to_id\n",
228 | " \n",
229 | " def get_feature_dict(self,sentences):\n",
230 | "\n",
231 | " labelSet = set()\n",
232 | " lb_to_id = {}\n",
233 | " for sentence in sentences:\n",
234 | " for token,char,pos,label in sentence:\n",
235 | " labelSet.add(label)\n",
236 | "\n",
237 | " for label in labelSet:\n",
238 | " lb_to_id[label] = len(lb_to_id)\n",
239 | "\n",
240 | " id_to_lb = {v: k for k, v in lb_to_id.items()}\n",
241 | "\n",
242 | " ch_to_id = {\"PADDING\":0, \"UNKNOWN\":1}\n",
243 | " for c in \" 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\\\"/\\\\%$`&=*+@^~|øæðş\":\n",
244 | " ch_to_id[c] = len(ch_to_id)\n",
245 | "\n",
246 | " cs_to_id = {\n",
247 | " 'number': 0, 'lower':1, 'upper':2, 'title':3, \n",
248 | " 'other':4, 'fraction':5, 'leters_digit': 6, \n",
249 | " 'PAD_TKN':7\n",
250 | " }\n",
251 | "\n",
252 | " pos_to_id = {\"$\":0, \"''\":1, \"(\":2, \")\":3, \",\":4, \"--\":5, \".\":6, \":\":7, \"CC\":8, \"CD\":9, \"DT\":10,\n",
253 | " \"EX\":11, \"FW\":12, \"IN\":13, \"JJ\":14, \"JJR\":15, \"JJS\":16, \"LS\":17, \"MD\":18, \"NN\":19,\n",
254 | " \"NNP\":20, \"NNPS\":21, \"NNS\":22, \"PDT\":23, \"POS\":24, \"PRP\":25, \"PRP$\":26, \"RB\":27, \n",
255 | " \"RBR\":28, \"RBS\":29, \"RP\":30, \"SYM\":31, \"TO\":32, \"UH\":33, \"VB\":34, \"VBD\":35, \"VBG\":36, \n",
256 | " \"VBN\":37, \"VBP\":38, \"VBZ\":39, \"WDT\":40, \"WP\":41, \"WP$\":42, \"WRB\":43, \"``\":44}\n",
257 | " \n",
258 | " return cs_to_id,pos_to_id,ch_to_id,lb_to_id,id_to_lb\n",
259 | " \n",
260 | " def make_batch(self,dataset):\n",
261 | " self.batch,self.batch_len = self.createBatches(dataset)\n",
262 | " return self.batch,self.batch_len\n",
263 | " \n",
264 | " def make_dataset(self,file_name):\n",
265 | " sentences = self.sentence_from_file(file_name)\n",
266 | " sentences = self.addCharInformatioin(sentences)\n",
267 | " return sentences\n",
268 | " \n",
269 | " def get_sentences(self,file_list):\n",
270 | " list_sentences = []\n",
271 | " for i in file_list:\n",
272 | " list_sentences+= self.make_dataset(i)\n",
273 | " return list_sentences\n",
274 | " "
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "preprocess_obj = Preprocessing()\n",
284 | "train_sentences = preprocess_obj.get_sentences(load_data_obj.train_files)\n",
285 | "word_emb,word_to_id = preprocess_obj.get_word_embeddings(train_sentences)\n",
286 | "\n",
287 | "'''the below function is not requred for validation data, we will load the dictionaries for validation'''\n",
288 | "case_to_id,pos_to_id,char_to_id,label_to_id,id_to_label = preprocess_obj.get_feature_dict(train_sentences)\n",
289 | "train_data_set = preprocess_obj.padding(preprocess_obj.create_tensors(train_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n",
290 | "train_batch,train_batch_len = preprocess_obj.make_batch(train_data_set)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "class DesignModel():\n",
300 | " def __init__(self,params):\n",
301 | " self.model = None\n",
302 | " self.wd_em = word_emb\n",
303 | " self.caseEmbeddings = np.identity(len(case_to_id), dtype='float32')\n",
304 | " self.posEmbeddings = np.identity(len(pos_to_id), dtype='float32') \n",
305 | " self.ch_to_id = char_to_id\n",
306 | " self.lb_to_id = label_to_id\n",
307 | " self.params = params\n",
308 | " self.train_batch = train_batch\n",
309 | " self.train_batch_len = train_batch_len\n",
310 | "\n",
311 | " \n",
312 | " def iterate_minibatches(self,dataset,batch_len): \n",
313 | " start = 0\n",
314 | " for i in batch_len:\n",
315 | " tokens = []\n",
316 | " char = []\n",
317 | " labels = []\n",
318 | " casing = []\n",
319 | " pos_tags = []\n",
320 | " data = dataset[start:i]\n",
321 | " start = i\n",
322 | " for dt in data:\n",
323 | " t,c,ch,pos,l = dt\n",
324 | " l = np.expand_dims(l,-1)\n",
325 | " tokens.append(t)\n",
326 | " char.append(ch)\n",
327 | " labels.append(l)\n",
328 | " casing.append(c)\n",
329 | " pos_tags.append(pos)\n",
330 | " yield np.asarray(labels),np.asarray(tokens),np.asarray(casing), np.asarray(char), np.asarray(pos_tags)\n",
331 | " \n",
332 | " def BiRNN_model(self):\n",
333 | " \n",
334 | " input = Input(shape=(None,),dtype='int32')\n",
335 | "\n",
336 | " words = Embedding(input_dim=self.wd_em.shape[0], output_dim=self.wd_em.shape[1], weights=[self.wd_em], trainable=False)(input)\n",
337 | "\n",
338 | " csng_input = Input(shape=(None,), dtype='int32')\n",
339 | " csng = Embedding(output_dim = self.caseEmbeddings.shape[1], input_dim = self.caseEmbeddings.shape[0], weights = [self.caseEmbeddings], trainable=False)(csng_input)\n",
340 | "\n",
341 | "\n",
342 | " char_input=Input(shape=(None,52,))\n",
343 | " embed_char_out=TimeDistributed(Embedding(len(self.ch_to_id),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input)\n",
344 | " dropout= Dropout(self.params['dropout_rate'])(embed_char_out)\n",
345 | " conv1d_out = TimeDistributed(Conv1D(kernel_size=self.params['kernel_sizes_cnn'], filters=30, padding='same',activation=params['rnn_activation'], strides=1))(dropout)\n",
346 | " maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)\n",
347 | " char = TimeDistributed(Flatten())(maxpool_out)\n",
348 | " char = Dropout(self.params['dropout_rate'])(char)\n",
349 | "\n",
350 | " pos_input = Input(shape=(None,), dtype='int32')\n",
351 | " pos = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n",
352 | "\n",
353 | "\n",
354 | " output = concatenate([words, csng, char, pos])\n",
355 | " output = Bidirectional(LSTM(self.params['units_lstm'], return_sequences=True, dropout=self.params['dropout_rate'], recurrent_dropout=0.25))(output)\n",
356 | " output = TimeDistributed(Dense(len(self.lb_to_id), activation=self.params['rnn_activation']))(output)\n",
357 | " self.model = Model(inputs=[input, csng_input, char_input, pos_input], outputs=[output])\n",
358 | " self.model.compile(loss=self.params['loss'], optimizer=self.params['optimizer'],metrics=[\"accuracy\"])\n",
359 | "\n",
360 | " def train_model(self):\n",
361 | " \n",
362 | " for epoch in range(self.params['epochs']):\n",
363 | "\n",
364 | " print(\"Epoch %d/%d\"%(epoch+1, self.params['epochs']))\n",
365 | " a = Progbar(len(preprocess_obj.batch_len))\n",
366 | " res = None\n",
367 | " for i,batch in enumerate(self.iterate_minibatches(self.train_batch,self.train_batch_len)):\n",
368 | " labels, tkns, csng, char, pos = batch \n",
369 | " res = self.model.train_on_batch([tkns, csng, char, pos], labels)\n",
370 | " a.update(i)\n",
371 | " print(\"\\n\")\n",
372 | " print(self.model.metrics_names[0],\":\",res[0],self.model.metrics_names[1],\":\",res[1])\n",
373 | " print(' ')"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "params = {\n",
383 | " \"kernel_sizes_cnn\": 3,\n",
384 | " \"optimizer\": \"nadam\",\n",
385 | " \"cnn_activation\":\"tanh\",\n",
386 | " \"rnn_activation\":\"softmax\",\n",
387 | " \"units_lstm\" : 100,\n",
388 | " \"loss\": \"sparse_categorical_crossentropy\",\n",
389 | " \"text_size\": 50,\n",
390 | " \"dropout_rate\": 0.5,\n",
391 | " \"epochs\": 100,\n",
392 | " \"model_name\": \"cnn_model\",\n",
393 | " \"batch_size\": 32,\n",
394 | " \"verbose\": True,\n",
395 | " \"metrics\":[\"accuracy\"]\n",
396 | " }\n",
397 | "model_obj = DesignModel(params)\n",
398 | "model_obj.BiRNN_model()\n",
399 | "model_obj.train_model()"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "class LoadAndSaveModels():\n",
409 | " \n",
410 | " def save_model(self,model,model_name):\n",
411 | " model.save(\"Model_Data/entity_models/\"+model_name+\".h5\")\n",
412 | " print(\"Model saved to Model folder.\")\n",
413 | " \n",
414 | " def save_dict(self, save_path,dictionaries): \n",
415 | " \n",
416 | " for item in dictionaries:\n",
417 | " \n",
418 | " with open(save_path+\"/\"+item[1]+\".txt\", \"wb\") as myFile:\n",
419 | " pickle.dump(item[0], myFile)\n",
420 | "\n",
421 | " print(\"Files saved.\")\n",
422 | " \n",
423 | " def load_dict(self,file):\n",
424 | " with open(file,\"rb\") as fp:\n",
425 | " dict = pickle.load(fp)\n",
426 | " return dict\n",
427 | " \n",
428 | " def load_model(self,model_name):\n",
429 | " model = load_model(model_name)\n",
430 | " return model\n"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "load_save = LoadAndSaveModels()\n",
440 | "load_save.save_model(model_obj.model,\"birnn\")\n",
441 | "dict = [(word_to_id,\"word_to_id\"),(label_to_id,\"label_to_id\"),(char_to_id,\"char_to_id\"),\n",
442 | " (id_to_label,\"id_to_label\"),(case_to_id,\"case_to_id\"),(pos_to_id,\"pos_to_id\")]\n",
443 | "load_save.save_dict(\"Model_Data/dict\",dict)"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": null,
449 | "metadata": {},
450 | "outputs": [],
451 | "source": [
452 | "load_save = LoadAndSaveModels()\n",
453 | "model = load_save.load_model(\"Model_Data/entity_models/birnn.h5\")\n",
454 | "word_to_id = load_save.load_dict(\"Model_Data/dict/word_to_id.txt\")\n",
455 | "case_to_id = load_save.load_dict(\"Model_Data/dict/case_to_id.txt\")\n",
456 | "pos_to_id = load_save.load_dict(\"Model_Data/dict/pos_to_id.txt\")\n",
457 | "char_to_id = load_save.load_dict(\"Model_Data/dict/char_to_id.txt\")\n",
458 | "label_to_id = load_save.load_dict(\"Model_Data/dict/label_to_id.txt\")\n",
459 | "id_to_label = load_save.load_dict(\"Model_Data/dict/id_to_label.txt\")"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "validation_sentences = preprocess_obj.get_sentences(load_data_obj.validation_files)\n",
469 | "validation_set = preprocess_obj.padding(preprocess_obj.create_tensors(validation_sentences,word_to_id,case_to_id,pos_to_id,char_to_id,label_to_id))\n",
470 | "validation_batch,validation_batch_len = preprocess_obj.make_batch(validation_set)\n"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "class Prediction():\n",
480 | " def __init__(self):\n",
481 | " self.case_to_id = case_to_id\n",
482 | " self.pos_to_id = pos_to_id\n",
483 | " self.char_to_id = char_to_id\n",
484 | " self.label_to_id = label_to_id\n",
485 | " self.id_to_label = id_to_label\n",
486 | " self.word_to_id = word_to_id\n",
487 | " def prediction(self,dataset,model):\n",
488 | " correct_labels = []\n",
489 | " predict_labels = []\n",
490 | " b = Progbar(len(dataset))\n",
491 | " for i,data in enumerate(dataset): \n",
492 | " tkns, csng, char,pos, labels = data\n",
493 | " tkns = np.asarray([tkns]) \n",
494 | " char = np.asarray([char])\n",
495 | " csng = np.asarray([csng])\n",
496 | " pos = np.asarray([pos])\n",
497 | " predict = model.predict([tkns, csng, char,pos], verbose=False)[0] \n",
498 | " predict = predict.argmax(axis=-1) \n",
499 | " correct_labels.append(labels)\n",
500 | " predict_labels.append(predict)\n",
501 | " b.update(i)\n",
502 | " return predict_labels, correct_labels\n",
503 | " \n",
504 | " def predict(self,sentence,model):\n",
505 | " sen_list = [[[i,'POS','O\\n'] for i in sentence.split()]]\n",
506 | " test_sent = preprocess_obj.addCharInformatioin(sen_list)\n",
507 | "\n",
508 | " predLabels = []\n",
509 | "\n",
510 | " test_set = preprocess_obj.padding(preprocess_obj.create_tensors(test_sent,self.word_to_id,\n",
511 | " self.case_to_id,self.pos_to_id,\n",
512 | " self.char_to_id,self.label_to_id))\n",
513 | " test_batch,test_batch_len = preprocess_obj.createBatches(test_set)\n",
514 | " for i,data in enumerate(test_batch):\n",
515 | " tokens, csng, char, pos, labels = data\n",
516 | " tokens = np.asarray([tokens]) \n",
517 | " char = np.asarray([char])\n",
518 | " csng = np.asarray([csng])\n",
519 | " pos = np.asarray([pos])\n",
520 | " pred = model.predict([tokens,csng, char,pos], verbose=False)[0] \n",
521 | " pred = pred.argmax(axis=-1) #Predict the classes \n",
522 | " predLabels.append(pred)\n",
523 | " entity_labels = []\n",
524 | " j = 0\n",
525 | " words_list = sentence.split()\n",
526 | " for i in predLabels[-1]:\n",
527 | " entity_labels.append((words_list[j],self.id_to_label[int(i)].replace(\"\\n\",\"\")))\n",
528 | " j+=1\n",
529 | "\n",
530 | " return entity_labels"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "pred_obj = Prediction()"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": null,
545 | "metadata": {},
546 | "outputs": [],
547 | "source": [
548 | "sent = \"Add Richard McNamara newest song to the Just Smile playlist\"\n",
549 | "entity_label = pred_obj.predict(sent,model)"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": null,
555 | "metadata": {},
556 | "outputs": [],
557 | "source": [
558 | "entity_label"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "class Evaluate():\n",
568 | " def compute_precision(self,guessed_sentences, correct_sentences):\n",
569 | " assert(len(guessed_sentences) == len(correct_sentences))\n",
570 | " correctCount = 0\n",
571 | " count = 0\n",
572 | "\n",
573 | "\n",
574 | " for sentenceIdx in range(len(guessed_sentences)):\n",
575 | " guessed = guessed_sentences[sentenceIdx]\n",
576 | " correct = correct_sentences[sentenceIdx]\n",
577 | " assert(len(guessed) == len(correct))\n",
578 | " idx = 0\n",
579 | " while idx < len(guessed):\n",
580 | " if guessed[idx][0] == 'B': #A new chunk starts\n",
581 | " count += 1\n",
582 | "\n",
583 | " if guessed[idx] == correct[idx]:\n",
584 | " idx += 1\n",
585 | " correctlyFound = True\n",
586 | "\n",
587 | " while idx < len(guessed) and guessed[idx][0] == 'I': #Scan until it no longer starts with I\n",
588 | " if guessed[idx] != correct[idx]:\n",
589 | " correctlyFound = False\n",
590 | "\n",
591 | " idx += 1\n",
592 | "\n",
593 | " if idx < len(guessed):\n",
594 | " if correct[idx][0] == 'I': #The chunk in correct was longer\n",
595 | " correctlyFound = False\n",
596 | "\n",
597 | "\n",
598 | " if correctlyFound:\n",
599 | " correctCount += 1\n",
600 | " else:\n",
601 | " idx += 1\n",
602 | " else: \n",
603 | " idx += 1\n",
604 | "\n",
605 | " precision = 0\n",
606 | " if count > 0: \n",
607 | " precision = float(correctCount) / count\n",
608 | "\n",
609 | " return precision\n",
610 | " def get_metrics(self,predictions, correct, idx2Label): \n",
611 | " label_pred = [] \n",
612 | " for sentence in predictions:\n",
613 | " label_pred.append([idx2Label[element] for element in sentence])\n",
614 | "\n",
615 | " label_correct = [] \n",
616 | " for sentence in correct:\n",
617 | " label_correct.append([idx2Label[element] for element in sentence])\n",
618 | "\n",
619 | "\n",
620 | " #print label_pred\n",
621 | " #print label_correct\n",
622 | "\n",
623 | " prec = self.compute_precision(label_pred, label_correct)\n",
624 | " rec = self.compute_precision(label_correct, label_pred)\n",
625 | "\n",
626 | " f1 = 0\n",
627 | " if (rec+prec) > 0:\n",
628 | " f1 = 2.0 * prec * rec / (prec + rec);\n",
629 | "\n",
630 | " return prec, rec, f1"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": null,
636 | "metadata": {},
637 | "outputs": [],
638 | "source": [
639 | "eval_obj = Evaluate()\n",
640 | "\n",
641 | "train_predict_labels, train_correct_labels = pred_obj.prediction(train_data_set,model)\n",
642 | "pre_train, rec_train, f1_train= eval_obj.get_metrics(train_predict_labels, train_correct_labels, id_to_label)\n",
643 | "print(\"Train-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_train, rec_train, f1_train))\n",
644 | " \n",
645 | "validation_predict_labels, validation_correct_labels = pred_obj.prediction(validation_set,model)\n",
646 | "pre_test, rec_test, f1_test= eval_obj.get_metrics(validation_predict_labels, validation_correct_labels, id_to_label)\n",
647 | "print(\"Validation-Data: Precision: %.3f, Recall: %.3f, F1 Score: %.3f\" % (pre_test, rec_test, f1_test))\n"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": null,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": []
656 | }
657 | ],
658 | "metadata": {
659 | "kernelspec": {
660 | "display_name": "Python 3",
661 | "language": "python",
662 | "name": "python3"
663 | },
664 | "language_info": {
665 | "codemirror_mode": {
666 | "name": "ipython",
667 | "version": 3
668 | },
669 | "file_extension": ".py",
670 | "mimetype": "text/x-python",
671 | "name": "python",
672 | "nbconvert_exporter": "python",
673 | "pygments_lexer": "ipython3",
674 | "version": "3.8.2"
675 | }
676 | },
677 | "nbformat": 4,
678 | "nbformat_minor": 4
679 | }
680 |
--------------------------------------------------------------------------------
/1.5-named-entity-recognition/simple_ner-2.0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from numpy import array\n",
10 | "import tensorflow as tf\n",
11 | "import glob\n",
12 | "import numpy as np\n",
13 | "import pickle\n",
14 | "from datetime import datetime\n",
15 | "import nltk \n",
16 | "\n",
17 | "from sklearn.metrics import accuracy_score\n",
18 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
19 | "from tensorflow.keras.utils import to_categorical\n",
20 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
21 | "from tensorflow.keras.models import Sequential,Model\n",
22 | "from tensorflow.keras.layers import Dense\n",
23 | "from tensorflow.keras.layers import LSTM\n",
24 | "from tensorflow.keras.layers import Input\n",
25 | "from tensorflow.keras.layers import Dropout\n",
26 | "from tensorflow.keras.layers import Embedding\n",
27 | "from tensorflow.keras.layers import TimeDistributed\n",
28 | "from tensorflow.keras.layers import Conv1D\n",
29 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
30 | "from tensorflow.keras.utils import Progbar\n",
31 | "from tensorflow.keras.models import load_model\n",
32 | "\n",
33 | "from tensorflow.keras.initializers import RandomUniform\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "class LoadData():\n",
43 | " def __init__(self):\n",
44 | " self.train_files = None\n",
45 | " self.validation_files = None\n",
46 | " \n",
47 | " def get_data(self):\n",
48 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
49 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n",
50 | " \n",
51 | " def sentence_from_file(self,filename):\n",
52 | " single_data_list = list()\n",
53 | " with open(filename) as fp:\n",
54 | " sentence_list = []\n",
55 | " lines = fp.readlines()\n",
56 | " for line in lines:\n",
57 | " splits = line.split(' ')\n",
58 | " if splits[0]=='\\n':\n",
59 | " #sent = \" \".join([word[0] for word in sentence_list])\n",
60 | " #single_data_list.append((sentence_list,sent))\n",
61 | " single_data_list.append(sentence_list)\n",
62 | " sentence_list = list()\n",
63 | " else:\n",
64 | " sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n",
65 | " \n",
66 | " return single_data_list\n",
67 | " \n",
68 | " def addCharInformatioin(self,Sentences):\n",
69 | " for i,sentence in enumerate(Sentences):\n",
70 | " for j,data in enumerate(sentence):\n",
71 | " chars = [c for c in data[0]]\n",
72 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
73 | " return Sentences\n",
74 | " \n",
75 | " def prepared_data(self,files):\n",
76 | " list_sentences = list()\n",
77 | " for each_file in files:\n",
78 | " sentences = self.sentence_from_file(each_file)\n",
79 | " #sentences = self.addCharInformatioin(sentences)\n",
80 | " list_sentences+= sentences\n",
81 | " return list_sentences\n",
82 | " "
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "load_data_obj = LoadData()\n",
92 | "load_data_obj.get_data()\n",
93 | "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n",
94 | "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n",
95 | "print(trained_sen_list[:5])"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "class Preprocessing():\n",
105 | " def __init__(self):\n",
106 | " self.max_len = len(max(trained_sen_list))\n",
107 | " \n",
108 | " def make_data(self,data_list):\n",
109 | " \n",
110 | " \n",
111 | " words = list()\n",
112 | " for each_sent in data_list:\n",
113 | " for each_item in each_sent:\n",
114 | " words.append(each_item[0])\n",
115 | " words = list(set(words))\n",
116 | "\n",
117 | " \n",
118 | " pos_tags = list()\n",
119 | " for each_sent in data_list:\n",
120 | " for each_item in each_sent:\n",
121 | " pos_tags.append(each_item[1])\n",
122 | " pos_tags = list(set(pos_tags))\n",
123 | " \n",
124 | " labels = list()\n",
125 | " for each_sent in data_list:\n",
126 | " for each_item in each_sent:\n",
127 | " labels.append(each_item[2])\n",
128 | " labels = list(set(labels))\n",
129 | " \n",
130 | " \n",
131 | " self.word2idx = {w: i for i, w in enumerate(words)}\n",
132 | " self.word2idx.update({\"PAD\": len(self.word2idx), \"UNK\": len(self.word2idx)+1})\n",
133 | " self.num_words = len(self.word2idx)\n",
134 | " \n",
135 | " self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n",
136 | " self.pos_tag2idx.update({\"PAD\": len(self.pos_tag2idx), \"UNK\": len(self.pos_tag2idx)+1})\n",
137 | " self.num_pos_tags = len(self.pos_tag2idx)\n",
138 | " \n",
139 | " self.label2idx = {t: i for i, t in enumerate(labels)}\n",
140 | " self.num_lables = len(self.label2idx)\n",
141 | " \n",
142 | " def word2features(self,data, word_dict):\n",
143 | " word = data[0]\n",
144 | " postag = data[1]\n",
145 | " binary_map = {True:0,False:1,None:2}\n",
146 | " features = [word_dict[word],binary_map[word.islower()], \n",
147 | " binary_map[word.isupper()], binary_map[word.istitle()], \n",
148 | " binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n",
149 | " return features\n",
150 | "\n",
151 | "\n",
152 | " def sent2features(self,sent,word_dict):\n",
153 | " sentence_features = list()\n",
154 | " for index in range(len(sent)):\n",
155 | " sentence_features.append(self.word2features(sent[index],word_dict))\n",
156 | " \n",
157 | " return sentence_features\n",
158 | "\n",
159 | " def sent2labels(self,sent):\n",
160 | " return [label for token, postag, label in sent]\n",
161 | "\n",
162 | " def sent2tokens(self,sent):\n",
163 | " return [token for token, postag, label in sent]\n",
164 | " \n",
165 | " def create_data(self,data_list):\n",
166 | " self.sentences = data_list\n",
167 | " maxlen = max([len(item) for item in data_list])\n",
168 | " self.max_len = maxlen\n",
169 | " wd = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n",
170 | " \n",
171 | " wd = pad_sequences(maxlen=maxlen, sequences=wd, padding=\"post\",value=self.word2idx[\"PAD\"])\n",
172 | " \n",
173 | " pos = [[self.pos_tag2idx[w[1]] for w in s] for s in self.sentences]\n",
174 | " pos = pad_sequences(maxlen=maxlen, sequences=pos, padding=\"post\",value=self.pos_tag2idx[\"PAD\"])\n",
175 | "\n",
176 | " y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n",
177 | " y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n",
178 | " return (wd,pos),y"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "preprocess_obj = Preprocessing()\n",
188 | "preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n",
189 | "x_train,y_train = preprocess_obj.create_data(trained_sen_list)"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "class MyCallback(tf.keras.callbacks.Callback):\n",
199 | " def __init__(self, monitor='acc', baseline=0.95):\n",
200 | " self.monitor = monitor\n",
201 | " self.baseline = baseline\n",
202 | " self.training_stop = False\n",
203 | "\n",
204 | " def on_train_begin(self, logs={}):\n",
205 | " self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n",
206 | "\n",
207 | " def on_epoch_end(self, epoch, logs={}):\n",
208 | " if logs and logs.get(self.monitor) >= self.baseline:\n",
209 | " print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n",
210 | " self.training_stop = True\n",
211 | " \n",
212 | " if self.training_stop: \n",
213 | " self.model.stop_training = True\n",
214 | "\n",
215 | "\n",
216 | "class CreateModel():\n",
217 | " def __init__(self):\n",
218 | " self.model = None\n",
219 | " self.history = None\n",
220 | " self.x_train = x_train\n",
221 | " self.y_train = y_train\n",
222 | " self.max_len = preprocess_obj.max_len\n",
223 | " self.num_words = preprocess_obj.num_words\n",
224 | " self.num_labels = preprocess_obj.num_lables\n",
225 | " self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n",
226 | " \n",
227 | " def train(self):\n",
228 | " word_input = Input(shape=(self.max_len,))\n",
229 | " word_model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n",
230 | " \n",
231 | " pos_input = Input(shape=(None,), dtype='int32')\n",
232 | " pos_model = Embedding(output_dim = self.posEmbeddings.shape[1], input_dim = self.posEmbeddings.shape[0], weights = [self.posEmbeddings], trainable=False)(pos_input)\n",
233 | "\n",
234 | " output = concatenate([word_model, pos_model])\n",
235 | " \n",
236 | " output = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(output)\n",
237 | " output = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(output)\n",
238 | " \n",
239 | " self.model = Model(inputs=[word_input, pos_input], outputs=[output])\n",
240 | " self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n",
241 | " \n",
242 | " def run(self,batch_size=32,epoch=5):\n",
243 | " logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
244 | " logdir = \"logs_tensorboard\"\n",
245 | " tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n",
246 | " \n",
247 | " val_acc = 0.99\n",
248 | " monitor_param = 'val_acc'\n",
249 | " \n",
250 | " checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n",
251 | " \n",
252 | " #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n",
253 | " self.history = self.model.fit(self.x_train, self.y_train,\n",
254 | " batch_size=batch_size, epochs=epoch,\n",
255 | " validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n",
256 | " verbose=1)\n",
257 | " def save_model(self,model_file):\n",
258 | " self.model.save(model_file)"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "model_obj = CreateModel()\n",
268 | "model_obj.train()\n",
269 | "model_obj.run(batch_size=32,epoch=100)\n",
270 | "model_obj.save_model(\"models/simple_ner_model_v2.h5\")"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "class Prediction():\n",
280 | " def __init__(self):\n",
281 | " self.word2idx = preprocess_obj.word2idx\n",
282 | " self.pos_tag2idx = preprocess_obj.pos_tag2idx\n",
283 | " self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n",
284 | " self.model = model_obj.model\n",
285 | " self.max_len = preprocess_obj.max_len\n",
286 | " def predict(self,texts):\n",
287 | " label_lists = list()\n",
288 | " for text in texts:\n",
289 | " words = text.split()\n",
290 | " tagged = nltk.pos_tag(words) \n",
291 | " \n",
292 | " wd = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n",
293 | " wd = pad_sequences(maxlen=self.max_len, sequences=wd,\n",
294 | " padding=\"post\", value=self.word2idx[\"PAD\"])\n",
295 | " \n",
296 | " pos = [[self.pos_tag2idx.get(item, self.pos_tag2idx[\"UNK\"]) for item in tagged]]\n",
297 | " pos = pad_sequences(maxlen=self.max_len, sequences=pos,\n",
298 | " padding=\"post\", value=self.pos_tag2idx[\"PAD\"])\n",
299 | " \n",
300 | " y_pred = self.model.predict([wd,pos])\n",
301 | " pred_index = np.argmax(y_pred, axis=-1)\n",
302 | " preds = pred_index.flatten().tolist()\n",
303 | " labels = [self.idx2label[ind] for ind in preds]\n",
304 | " label_lists.append(labels)\n",
305 | " \n",
306 | " print([(words[idx],labels[idx]) for idx in range(len(words))])\n",
307 | " return label_lists\n",
308 | " "
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "#print(preprocess_obj.word2idx)\n",
318 | "pred_obj = Prediction()\n",
319 | "text = \"Play the last track from Beyonce off Spotify\"\n",
320 | "y_pred = pred_obj.predict([text])"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": []
329 | }
330 | ],
331 | "metadata": {
332 | "kernelspec": {
333 | "display_name": "Python 3",
334 | "language": "python",
335 | "name": "python3"
336 | },
337 | "language_info": {
338 | "codemirror_mode": {
339 | "name": "ipython",
340 | "version": 3
341 | },
342 | "file_extension": ".py",
343 | "mimetype": "text/x-python",
344 | "name": "python",
345 | "nbconvert_exporter": "python",
346 | "pygments_lexer": "ipython3",
347 | "version": "3.6.9"
348 | }
349 | },
350 | "nbformat": 4,
351 | "nbformat_minor": 2
352 | }
353 |
--------------------------------------------------------------------------------
/1.5-named-entity-recognition/simple_ner.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from numpy import array\n",
10 | "import tensorflow as tf\n",
11 | "import glob\n",
12 | "import numpy as np\n",
13 | "import pickle\n",
14 | "from datetime import datetime\n",
15 | "\n",
16 | "from sklearn.metrics import accuracy_score\n",
17 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
18 | "from tensorflow.keras.utils import to_categorical\n",
19 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
20 | "from tensorflow.keras.models import Sequential,Model\n",
21 | "from tensorflow.keras.layers import Dense\n",
22 | "from tensorflow.keras.layers import LSTM\n",
23 | "from tensorflow.keras.layers import Input\n",
24 | "from tensorflow.keras.layers import Dropout\n",
25 | "from tensorflow.keras.layers import Embedding\n",
26 | "from tensorflow.keras.layers import TimeDistributed\n",
27 | "from tensorflow.keras.layers import Conv1D\n",
28 | "from tensorflow.keras.layers import LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate\n",
29 | "from tensorflow.keras.utils import Progbar\n",
30 | "from tensorflow.keras.models import load_model\n",
31 | "\n",
32 | "from tensorflow.keras.initializers import RandomUniform\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "class LoadData():\n",
42 | " def __init__(self):\n",
43 | " self.train_files = None\n",
44 | " self.validation_files = None\n",
45 | " \n",
46 | " def get_data(self):\n",
47 | " self.train_files = glob.glob(\"benchmarking_data/Train//*.txt\")\n",
48 | " self.validation_files = glob.glob(\"benchmarking_data/Validate//*.txt\")\n",
49 | " \n",
50 | " def sentence_from_file(self,filename):\n",
51 | " single_data_list = list()\n",
52 | " with open(filename) as fp:\n",
53 | " sentence_list = []\n",
54 | " lines = fp.readlines()\n",
55 | " for line in lines:\n",
56 | " splits = line.split(' ')\n",
57 | " if splits[0]=='\\n':\n",
58 | " #sent = \" \".join([word[0] for word in sentence_list])\n",
59 | " #single_data_list.append((sentence_list,sent))\n",
60 | " single_data_list.append(sentence_list)\n",
61 | " sentence_list = list()\n",
62 | " else:\n",
63 | " sentence_list.append((splits[0],splits[1],splits[-1].replace('\\n','')))\n",
64 | " \n",
65 | " return single_data_list\n",
66 | " \n",
67 | " def addCharInformatioin(self,Sentences):\n",
68 | " for i,sentence in enumerate(Sentences):\n",
69 | " for j,data in enumerate(sentence):\n",
70 | " chars = [c for c in data[0]]\n",
71 | " Sentences[i][j] = [data[0],chars,data[1],data[2]]\n",
72 | " return Sentences\n",
73 | " \n",
74 | " def prepared_data(self,files):\n",
75 | " list_sentences = list()\n",
76 | " for each_file in files:\n",
77 | " sentences = self.sentence_from_file(each_file)\n",
78 | " #sentences = self.addCharInformatioin(sentences)\n",
79 | " list_sentences+= sentences\n",
80 | " return list_sentences\n",
81 | " "
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "load_data_obj = LoadData()\n",
91 | "load_data_obj.get_data()\n",
92 | "trained_sen_list = load_data_obj.prepared_data(load_data_obj.train_files)\n",
93 | "validation_sen_list = load_data_obj.prepared_data(load_data_obj.validation_files)\n",
94 | "print(trained_sen_list[:5])"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "class Preprocessing():\n",
104 | " def __init__(self):\n",
105 | " self.max_len = len(max(trained_sen_list))\n",
106 | " \n",
107 | " def make_data(self,data_list):\n",
108 | " \n",
109 | " \n",
110 | " words = list()\n",
111 | " for each_sent in data_list:\n",
112 | " for each_item in each_sent:\n",
113 | " words.append(each_item[0])\n",
114 | " words = list(set(words))\n",
115 | "\n",
116 | " \n",
117 | " pos_tags = list()\n",
118 | " for each_sent in data_list:\n",
119 | " for each_item in each_sent:\n",
120 | " pos_tags.append(each_item[1])\n",
121 | " pos_tags = list(set(pos_tags))\n",
122 | " \n",
123 | " labels = list()\n",
124 | " for each_sent in data_list:\n",
125 | " for each_item in each_sent:\n",
126 | " labels.append(each_item[2])\n",
127 | " labels = list(set(labels))\n",
128 | " \n",
129 | " self.word2idx = {\"PAD\": 0, \"UNK\": 1}\n",
130 | " self.word2idx.update({w: i for i, w in enumerate(words)})\n",
131 | " self.num_words = len(self.word2idx)\n",
132 | " \n",
133 | " self.pos_tag2idx = {t: i for i, t in enumerate(pos_tags)}\n",
134 | " self.num_pos_tags = len(self.pos_tag2idx)\n",
135 | " \n",
136 | " self.label2idx = {t: i for i, t in enumerate(labels)}\n",
137 | " self.num_lables = len(self.label2idx)\n",
138 | " \n",
139 | " def word2features(self,data, word_dict):\n",
140 | " word = data[0]\n",
141 | " postag = data[1]\n",
142 | " binary_map = {True:0,False:1,None:2}\n",
143 | " features = [word_dict[word],binary_map[word.islower()], \n",
144 | " binary_map[word.isupper()], binary_map[word.istitle()], \n",
145 | " binary_map[word.isdigit()], self.pos_tag2idx[postag] ]\n",
146 | " return features\n",
147 | "\n",
148 | "\n",
149 | " def sent2features(self,sent,word_dict):\n",
150 | " sentence_features = list()\n",
151 | " for index in range(len(sent)):\n",
152 | " sentence_features.append(self.word2features(sent[index],word_dict))\n",
153 | " \n",
154 | " return sentence_features\n",
155 | "\n",
156 | " def sent2labels(self,sent):\n",
157 | " return [label for token, postag, label in sent]\n",
158 | "\n",
159 | " def sent2tokens(self,sent):\n",
160 | " return [token for token, postag, label in sent]\n",
161 | " \n",
162 | " def create_data(self,data_list):\n",
163 | " self.sentences = data_list\n",
164 | " maxlen = max([len(item) for item in data_list])\n",
165 | " self.max_len = maxlen\n",
166 | " x = [[self.word2idx[w[0]] for w in s] for s in self.sentences]\n",
167 | " #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.num_words - 1)\n",
168 | " x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=self.word2idx[\"PAD\"])\n",
169 | " #x = [self.sent2features(s,self.word2idx) for s in self.sentences]\n",
170 | " #x = pad_sequences(maxlen=maxlen, sequences=x, padding=\"post\",value=[0,2,2,2,2,len(self.pos_tag2idx)])\n",
171 | " print(x[2])\n",
172 | " y = [[self.label2idx[w[2]] for w in s] for s in self.sentences]\n",
173 | " y = pad_sequences(maxlen=maxlen, sequences=y, padding=\"post\", value=self.label2idx[\"O\"])\n",
174 | " return x,y"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "preprocess_obj = Preprocessing()\n",
184 | "preprocess_obj.make_data(trained_sen_list+validation_sen_list)\n",
185 | "x_train,y_train = preprocess_obj.create_data(trained_sen_list)"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "class MyCallback(tf.keras.callbacks.Callback):\n",
195 | " def __init__(self, monitor='acc', baseline=0.95):\n",
196 | " self.monitor = monitor\n",
197 | " self.baseline = baseline\n",
198 | " self.training_stop = False\n",
199 | "\n",
200 | " def on_train_begin(self, logs={}):\n",
201 | " self.history={'loss': [],'acc': [],'val_loss': [],'val_acc': []}\n",
202 | "\n",
203 | " def on_epoch_end(self, epoch, logs={}):\n",
204 | " if logs and logs.get(self.monitor) >= self.baseline:\n",
205 | " print(\"\\nReached %2.2f%% accuracy, so stopping training!!\" %(self.baseline*100))\n",
206 | " self.training_stop = True\n",
207 | " \n",
208 | " if self.training_stop: \n",
209 | " self.model.stop_training = True\n",
210 | "\n",
211 | "\n",
212 | "class CreateModel():\n",
213 | " def __init__(self):\n",
214 | " self.model = None\n",
215 | " self.history = None\n",
216 | " self.x_train = x_train\n",
217 | " self.y_train = y_train\n",
218 | " self.max_len = preprocess_obj.max_len\n",
219 | " self.num_words = preprocess_obj.num_words\n",
220 | " self.num_labels = preprocess_obj.num_lables\n",
221 | " self.posEmbeddings = np.identity(len(preprocess_obj.pos_tag2idx), dtype='float32') \n",
222 | " \n",
223 | " def train(self):\n",
224 | " word_input = Input(shape=(self.max_len,))\n",
225 | " model = Embedding(input_dim=self.num_words, output_dim=50, input_length=self.max_len)(word_input)\n",
226 | " model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)\n",
227 | " out = TimeDistributed(Dense(self.num_labels, activation=\"softmax\"))(model)\n",
228 | " \n",
229 | " self.model = Model(word_input,out)\n",
230 | " self.model.compile(loss=\"sparse_categorical_crossentropy\", optimizer='nadam',metrics=[\"acc\"])\n",
231 | " \n",
232 | " def run(self,batch_size=32,epoch=5):\n",
233 | " logdir = \"logs_tensorboard/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
234 | " tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)\n",
235 | " \n",
236 | " val_acc = 0.99\n",
237 | " monitor_param = 'val_acc'\n",
238 | " \n",
239 | " checkpoint = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')\n",
240 | " \n",
241 | " #checkpoint = MyCallback(monitor=monitor_param,baseline=val_acc) \n",
242 | " self.history = self.model.fit(self.x_train, self.y_train,\n",
243 | " batch_size=batch_size, epochs=epoch,\n",
244 | " validation_split=0.1,callbacks=[checkpoint,tensorboard_callback],\n",
245 | " verbose=1)\n",
246 | " def save_model(self,model_file):\n",
247 | " self.model.save(model_file)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "model_obj = CreateModel()\n",
257 | "model_obj.train()\n",
258 | "model_obj.run(batch_size=32,epoch=100)\n",
259 | "model_obj.save_model(\"models/simple_ner_model.h5\")"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "class Prediction():\n",
269 | " def __init__(self):\n",
270 | " self.word2idx = preprocess_obj.word2idx\n",
271 | " self.idx2label = {v: k for k,v in preprocess_obj.label2idx.items()}\n",
272 | " self.model = model_obj.model\n",
273 | " self.max_len = preprocess_obj.max_len\n",
274 | " def predict(self,texts):\n",
275 | " label_lists = list()\n",
276 | " for text in texts:\n",
277 | " words = text.split()\n",
278 | " x = [[self.word2idx.get(word, self.word2idx[\"UNK\"]) for word in words]]\n",
279 | " x = pad_sequences(maxlen=self.max_len, sequences=x,\n",
280 | " padding=\"post\", value=self.word2idx[\"PAD\"])\n",
281 | " y_pred = self.model.predict(x)\n",
282 | " print(\"Predicted Probabilities on Test Set:\\n\",y_pred.shape)\n",
283 | " # taking tag class with maximum probability\n",
284 | " pred_index = np.argmax(y_pred, axis=-1)\n",
285 | " print(\"Predicted tag indices: \\n\",pred_index.shape)\n",
286 | " preds = pred_index.flatten().tolist()\n",
287 | " labels = [self.idx2label[ind] for ind in preds]\n",
288 | " label_lists.append(labels)\n",
289 | " \n",
290 | " print([(words[idx],labels[idx]) for idx in range(len(words))])\n",
291 | " #print(labels)\n",
292 | " return label_lists\n",
293 | " "
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "#print(preprocess_obj.word2idx)\n",
303 | "pred_obj = Prediction()\n",
304 | "'''\n",
305 | "for item in validation_sen_list:\n",
306 | " sent = \" \".join([self.word2idx[w[0]] for w in s] for item in self.sentences])\n",
307 | " \n",
308 | "'''\n",
309 | "text = \"Play the last track from Beyoncé off Spotify\"\n",
310 | "y_pred = pred_obj.predict([text,text])"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": []
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.6.9"
338 | }
339 | },
340 | "nbformat": 4,
341 | "nbformat_minor": 2
342 | }
343 |
--------------------------------------------------------------------------------
/1.6-intent-classification/README.md:
--------------------------------------------------------------------------------
1 | Use the link below to get the data.
2 | https://www.kaggle.com/joydeb28/nlp-benchmarking-data-for-intent-and-entity
3 |
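4 | If you use the Kaggle CLI (any other download method works too), the dataset can be fetched and unpacked roughly like this:
5 | 
6 |     kaggle datasets download -d joydeb28/nlp-benchmarking-data-for-intent-and-entity
7 |     unzip nlp-benchmarking-data-for-intent-and-entity.zip -d benchmarking_data
8 | 
9 | The notebooks in this folder read the intent JSON files from benchmarking_data/Train and benchmarking_data/Validate, so adjust the extraction path if the archive unpacks differently.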
--------------------------------------------------------------------------------
/1.6-intent-classification/intent_classfication_bert_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import numpy as np # linear algebra\n",
13 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
14 | "import json\n",
15 | "import os\n",
16 | "from sklearn.metrics import roc_curve\n",
17 | "from sklearn.metrics import accuracy_score\n",
18 | "from sklearn.model_selection import train_test_split\n",
19 | "from tensorflow.keras.utils import to_categorical\n",
20 | "from tensorflow.keras.models import Sequential, Model\n",
21 | "from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout\n",
22 | "from tensorflow.keras.optimizers import Adam\n",
23 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
24 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
25 | "import bert\n",
26 | "from tqdm import tqdm\n",
27 | "from tensorflow.keras import backend as K\n",
28 | "import tensorflow as tf\n",
29 | "import tensorflow_hub as hub\n",
30 | "print(\"TensorFlow Version:\",tf.__version__)\n",
31 | "print(\"Hub version: \",hub.__version__)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
39 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
40 | },
41 | "outputs": [],
42 | "source": [
43 | "class LoadingData():\n",
44 | " \n",
45 | " def __init__(self):\n",
46 | " train_file_path = os.path.join(\"benchmarking_data\",\"Train\")\n",
47 | " validation_file_path = os.path.join(\"benchmarking_data\",\"Validate\")\n",
48 | " category_id = 0\n",
49 | " self.cat_to_intent = {}\n",
50 | " self.intent_to_cat = {}\n",
51 | " \n",
52 | " for dirname, _, filenames in os.walk(train_file_path):\n",
53 | " for filename in filenames:\n",
54 | " file_path = os.path.join(dirname, filename)\n",
55 | " intent_id = filename.replace(\".json\",\"\")\n",
56 | " self.cat_to_intent[category_id] = intent_id\n",
57 | " self.intent_to_cat[intent_id] = category_id\n",
58 | " category_id+=1\n",
59 | " print(self.cat_to_intent)\n",
60 | " print(self.intent_to_cat)\n",
61 | " '''Training data'''\n",
62 | " training_data = list() \n",
63 | " for dirname, _, filenames in os.walk(train_file_path):\n",
64 | " for filename in filenames:\n",
65 | " file_path = os.path.join(dirname, filename)\n",
66 | " intent_id = filename.replace(\".json\",\"\")\n",
67 | " training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n",
68 | " self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category']) \n",
69 | " \n",
70 | " self.train_data_frame = self.train_data_frame.sample(frac = 1)\n",
71 | "\n",
72 | "\n",
73 | " \n",
74 | " '''Validation data'''\n",
75 | " validation_data = list() \n",
76 | " for dirname, _, filenames in os.walk(validation_file_path):\n",
77 | " for filename in filenames:\n",
78 | " file_path = os.path.join(dirname, filename)\n",
79 | " intent_id = filename.replace(\".json\",\"\")\n",
80 | " validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id]) \n",
81 | " self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n",
82 | "\n",
83 | " self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n",
84 | " \n",
85 | " \n",
86 | " def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n",
87 | " json_d = json.load(open(json_file)) \n",
88 | " \n",
89 | " json_dict = json_d[intent_id]\n",
90 | "\n",
91 | " sent_list = list()\n",
92 | " for i in json_dict:\n",
93 | " each_list = i['data']\n",
94 | " sent =\"\"\n",
95 | " for i in each_list:\n",
96 | " sent = sent + i['text']+ \" \"\n",
97 | " sent =sent[:-1]\n",
98 | " for i in range(3):\n",
99 | " sent = sent.replace(\" \",\" \")\n",
100 | " sent_list.append((sent,intent_id,cat))\n",
101 | " return sent_list\n",
102 | " "
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "load_data_obj = LoadingData()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "load_data_obj.train_data_frame.head()"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "load_data_obj.validation_data_frame.head().values"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "class BertModel(object):\n",
139 | " \n",
140 | " def __init__(self):\n",
141 | " \n",
142 | " self.max_len = 128\n",
143 | " bert_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1\"\n",
144 | " FullTokenizer=bert.bert_tokenization.FullTokenizer\n",
145 | " \n",
146 | " self.bert_module = hub.KerasLayer(bert_path,trainable=True)\n",
147 | "\n",
148 | " self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()\n",
149 | "\n",
150 | " self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()\n",
151 | "\n",
152 | " self.tokenizer = FullTokenizer(self.vocab_file,self.do_lower_case)\n",
153 | " \n",
154 | " def get_masks(self,tokens, max_seq_length):\n",
155 | " return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))\n",
156 | "\n",
157 | " def get_segments(self,tokens, max_seq_length):\n",
158 | " \"\"\"Segments: 0 for the first sequence, 1 for the second\"\"\"\n",
159 | " segments = []\n",
160 | " current_segment_id = 0\n",
161 | " for token in tokens:\n",
162 | " segments.append(current_segment_id)\n",
163 | " if token == \"[SEP]\":\n",
164 | " current_segment_id = 1\n",
165 | " return segments + [0] * (max_seq_length - len(tokens))\n",
166 | " \n",
167 | " def get_ids(self,tokens, tokenizer, max_seq_length):\n",
168 | " \"\"\"Token ids from Tokenizer vocab\"\"\"\n",
169 | " token_ids = tokenizer.convert_tokens_to_ids(tokens,)\n",
170 | " input_ids = token_ids + [0] * (max_seq_length-len(token_ids))\n",
171 | " return input_ids\n",
172 | " def create_single_input(self,sentence,maxlen):\n",
173 | "\n",
174 | " stokens = self.tokenizer.tokenize(sentence)\n",
175 | "\n",
176 | " stokens = stokens[:maxlen]\n",
177 | "\n",
178 | " stokens = [\"[CLS]\"] + stokens + [\"[SEP]\"]\n",
179 | "\n",
180 | " ids = self.get_ids(stokens, self.tokenizer, self.max_len)\n",
181 | " masks = self.get_masks(stokens, self.max_len)\n",
182 | " segments = self.get_segments(stokens, self.max_len)\n",
183 | "\n",
184 | " return ids,masks,segments\n",
185 | "\n",
186 | " def create_input_array(self,sentences):\n",
187 | " \n",
188 | " input_ids, input_masks, input_segments = [], [], []\n",
189 | "\n",
190 | " for sentence in tqdm(sentences,position=0, leave=True):\n",
191 | " ids,masks,segments=self.create_single_input(sentence,self.max_len-2)\n",
192 | "\n",
193 | " input_ids.append(ids)\n",
194 | " input_masks.append(masks)\n",
195 | " input_segments.append(segments)\n",
196 | " \n",
197 | " tensor = [np.asarray(input_ids, dtype=np.int32), \n",
198 | " np.asarray(input_masks, dtype=np.int32), \n",
199 | " np.asarray(input_segments, dtype=np.int32)]\n",
200 | " return tensor"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "class PreprocessingBertData():\n",
210 | " \n",
211 | " def prepare_data_x(self,train_sentences):\n",
212 | " x = bert_model_obj.create_input_array(train_sentences)\n",
213 | " return x\n",
214 | " \n",
215 | " def prepare_data_y(self,train_labels):\n",
216 | " y = list()\n",
217 | " for item in train_labels:\n",
218 | " label = item\n",
219 | " y.append(label)\n",
220 | " y = np.array(y)\n",
221 | " return y\n",
222 | " \n"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "bert_model_obj = BertModel()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "train_sentences = load_data_obj.train_data_frame[\"query\"].tolist()\n",
241 | "train_labels = load_data_obj.train_data_frame[\"category\"].tolist()\n",
242 | "\n",
243 | "preprocess_bert_data_obj = PreprocessingBertData()\n",
244 | "x = preprocess_bert_data_obj.prepare_data_x(train_sentences)\n",
245 | "y = preprocess_bert_data_obj.prepare_data_y(train_labels)\n",
246 | "\n",
247 | "train_input_ids, train_input_masks, train_segment_ids = x\n",
248 | "train_labels = y\n"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "class DesignModel():\n",
258 | " def __init__(self):\n",
259 | " self.model = None \n",
260 | " self.train_data = [train_input_ids, train_input_masks, train_segment_ids]\n",
261 | " self.train_labels = train_labels\n",
262 | " \n",
263 | " def bert_model(self,max_seq_length): \n",
264 | " in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_ids\")\n",
265 | " in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"input_masks\")\n",
266 | " in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name=\"segment_ids\")\n",
267 | " \n",
268 | " bert_inputs = [in_id, in_mask, in_segment]\n",
269 | " bert_pooled_output, bert_sequence_output = bert_model_obj.bert_module(bert_inputs)\n",
270 | " \n",
271 | " bert_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)\n",
272 | " bert_output = tf.keras.layers.Dropout(0.2)(bert_output)\n",
273 | " bert_outputs = tf.keras.layers.Dense(len(load_data_obj.cat_to_intent), activation=\"softmax\", name=\"dense_output\")(x)\n",
274 | " self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=bert_outputs)\n",
275 | " \n",
276 | " self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),\n",
277 | " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
278 | " metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name=\"acc\")])\n",
279 | " \n",
280 | " self.model.summary()\n",
281 | " \n",
282 | " def model_train(self,batch_size,num_epoch):\n",
283 | " print(\"Fitting to model\")\n",
284 | " self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)\n",
285 | " print(\"Model Training complete.\")\n",
286 | "\n",
287 | " def save_model(self,model,model_name): \n",
288 | " self.model.save(model_name+\".h5\")\n",
289 | " print(\"Model saved to Model folder.\")"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "model_obj = DesignModel()\n",
299 | "model_obj.bert_model(bert_model_obj.max_len)\n",
300 | "model_obj.model_train(32,1)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "model_obj.save_model(model_obj.model,\"bert\")"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "class Evaluation():\n",
319 | " def get_accuracy(self,actuals, predictions):\n",
320 | " acc = accuracy_score(actuals, predictions)\n",
321 | " return acc"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "class Prediction():\n",
331 | " def __init__(self):\n",
332 | " self.model = model_obj.model\n",
333 | " \n",
334 | " def predict_validation(self):\n",
335 | " valid_sentences = load_data_obj.validation_data_frame[\"query\"].tolist()\n",
336 | " valid_labels = load_data_obj.validation_data_frame[\"category\"].tolist()\n",
337 | "\n",
338 | " preprocess_bert_data_obj = PreprocessingBertData()\n",
339 | " val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)\n",
340 | " prediction_labels = list(self.model.predict(val_x).argmax(axis=-1))\n",
341 | " return valid_labels,prediction_labels\n",
342 | " \n",
343 | " \n",
344 | " def predict(self,query):\n",
345 | " query_seq = bert_model_obj.create_input_array([query])\n",
346 | " pred = self.model.predict(query_seq)\n",
347 | " pred = np.argmax(pred)\n",
348 | " result = load_data_obj.cat_to_intent[pred]\n",
349 | " return result"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "pred_obj = Prediction()\n",
359 | "#pred_obj.predict_validation()"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "querylist = [['I want to see Medal for the General', 'SearchScreeningEvent', 1],\n",
369 | " ['Book a reservation for 5 people at the top-rated brasserie restaurant',\n",
370 | " 'BookRestaurant', 5],\n",
371 | " ['Can I put this tune onto my sin estres playlist?',\n",
372 | " 'AddToPlaylist', 6],\n",
373 | " ['add the artist Pete Murray to my relaxing playlist',\n",
374 | " 'AddToPlaylist', 6],\n",
375 | " ['Book me a reservation for a party of 3 at a pub in Northern Mariana Islands',\n",
376 | " 'BookRestaurant', 5]]\n",
377 | "for query in querylist:\n",
378 | " result = pred_obj.predict(query[0])\n",
379 | " print(\"Predicted Intent: \"+str(result)+\"\\tActual Intent: \"+(load_data_obj.cat_to_intent[query[2]])+\"\\tQuery: \"+str(query[0]))\n"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "eval_obj = Evaluation()\n",
389 | "ytest,ypred = pred_obj.predict_validation()\n",
390 | "acc = eval_obj.get_accuracy(ytest,ypred)\n",
391 | "print(\"Auc: {:.2%}\".format(acc))\n"
392 | ]
393 | }
394 | ],
395 | "metadata": {
396 | "kernelspec": {
397 | "display_name": "Python 3",
398 | "language": "python",
399 | "name": "python3"
400 | },
401 | "language_info": {
402 | "codemirror_mode": {
403 | "name": "ipython",
404 | "version": 3
405 | },
406 | "file_extension": ".py",
407 | "mimetype": "text/x-python",
408 | "name": "python",
409 | "nbconvert_exporter": "python",
410 | "pygments_lexer": "ipython3",
411 | "version": "3.6.9"
412 | }
413 | },
414 | "nbformat": 4,
415 | "nbformat_minor": 4
416 | }
417 |
--------------------------------------------------------------------------------
/1.6-intent-classification/intent_classfication_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.6.9"
21 | },
22 | "colab": {
23 | "name": "intent_classfication_keras.ipynb",
24 | "provenance": [],
25 | "collapsed_sections": []
26 | },
27 | "accelerator": "GPU"
28 | },
29 | "cells": [
30 | {
31 | "cell_type": "code",
32 | "metadata": {
33 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
34 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
35 | "id": "H9qTF3ffa7Pc",
36 | "colab_type": "code",
37 | "colab": {}
38 | },
39 | "source": [
40 | "# Data\n",
41 | "# "
42 | ],
43 | "execution_count": 1,
44 | "outputs": []
45 | },
46 | {
47 | "cell_type": "code",
48 | "metadata": {
49 | "id": "dVcaGQx5bw1n",
50 | "colab_type": "code",
51 | "colab": {}
52 | },
53 | "source": [
54 | "import numpy as np\n",
55 | "import pandas as pd\n",
56 | "import json\n",
57 | "import os\n",
58 | "import en_core_web_sm\n",
59 | "from sklearn.metrics import roc_curve\n",
60 | "from sklearn.metrics import accuracy_score\n",
61 | "from sklearn.model_selection import train_test_split\n",
62 | "from tensorflow.keras.utils import to_categorical\n",
63 | "from tensorflow.keras.models import Sequential, Model, load_model\n",
64 | "from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Bidirectional, Activation\n",
65 | "from tensorflow.keras.optimizers import Adam\n",
66 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
67 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
68 | "from tensorflow.keras.layers import LSTM\n",
69 | "from tensorflow.keras.layers import SimpleRNN\n",
70 | "from tensorflow.keras.layers import Conv1D\n",
71 | "from tensorflow.keras.layers import Dropout\n",
72 | "from tensorflow.keras.layers import BatchNormalization\n",
73 | "from tensorflow.keras.layers import GlobalMaxPooling1D\n",
74 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
75 | "from tensorflow.keras.preprocessing.sequence import pad_sequences"
76 | ],
77 | "execution_count": 18,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "code",
82 | "metadata": {
83 | "id": "lystXd0mbNXk",
84 | "colab_type": "code",
85 | "colab": {
86 | "base_uri": "https://localhost:8080/",
87 | "height": 121
88 | },
89 | "outputId": "a9dcaebf-a417-408f-8fa8-11ed934a3efb"
90 | },
91 | "source": [
92 | "from google.colab import drive\n",
93 | "drive.mount(\"/content/drive\")"
94 | ],
95 | "execution_count": 3,
96 | "outputs": [
97 | {
98 | "output_type": "stream",
99 | "text": [
100 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code\n",
101 | "\n",
102 | "Enter your authorization code:\n",
103 | "··········\n",
104 | "Mounted at /content/drive\n"
105 | ],
106 | "name": "stdout"
107 | }
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "metadata": {
113 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
114 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
115 | "id": "jIf3M0bca7Pg",
116 | "colab_type": "code",
117 | "colab": {}
118 | },
119 | "source": [
120 | "class LoadingData():\n",
121 | " \n",
122 | " def __init__(self):\n",
123 | " data_dir = \"/content/drive/My Drive/Projects/Data\"\n",
124 | " train_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Train\")\n",
125 | " validation_file_path = os.path.join(data_dir,\"benchmarking_data\",\"Validate\")\n",
126 | " category_id = 0\n",
127 | " self.cat_to_intent = {}\n",
128 | " self.intent_to_cat = {}\n",
129 | " \n",
130 | " for dirname, _, filenames in os.walk(train_file_path):\n",
131 | " for filename in filenames:\n",
132 | " file_path = os.path.join(dirname, filename)\n",
133 | " intent_id = filename.replace(\".json\",\"\")\n",
134 | " self.cat_to_intent[category_id] = intent_id\n",
135 | " self.intent_to_cat[intent_id] = category_id\n",
136 | " category_id+=1\n",
137 | " '''Training data'''\n",
138 | " training_data = list() \n",
139 | " for dirname, _, filenames in os.walk(train_file_path):\n",
140 | " for filename in filenames:\n",
141 | " file_path = os.path.join(dirname, filename)\n",
142 | " intent_id = filename.replace(\".json\",\"\")\n",
143 | " training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])\n",
144 | " self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category']) \n",
145 | " \n",
146 | " self.train_data_frame = self.train_data_frame.sample(frac = 1)\n",
147 | "\n",
148 | "\n",
149 | " \n",
150 | " '''Validation data'''\n",
151 | " validation_data = list() \n",
152 | " for dirname, _, filenames in os.walk(validation_file_path):\n",
153 | " for filename in filenames:\n",
154 | " file_path = os.path.join(dirname, filename)\n",
155 | " intent_id = filename.replace(\".json\",\"\")\n",
156 | " validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id]) \n",
157 | " self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])\n",
158 | "\n",
159 | " self.validation_data_frame = self.validation_data_frame.sample(frac = 1)\n",
160 | " \n",
161 | " \n",
162 | " def make_data_for_intent_from_json(self,json_file,intent_id,cat):\n",
163 | " json_d = json.load(open(json_file)) \n",
164 | " \n",
165 | " json_dict = json_d[intent_id]\n",
166 | "\n",
167 | " sent_list = list()\n",
168 | " for i in json_dict:\n",
169 | " each_list = i['data']\n",
170 | " sent =\"\"\n",
171 | " for i in each_list:\n",
172 | " sent = sent + i['text']+ \" \"\n",
173 | " sent =sent[:-1]\n",
174 | " for i in range(3):\n",
175 | " sent = sent.replace(\" \",\" \")\n",
176 | " sent_list.append((sent,intent_id,cat))\n",
177 | " return sent_list\n",
178 | " "
179 | ],
180 | "execution_count": 7,
181 | "outputs": []
182 | },
183 | {
184 | "cell_type": "code",
185 | "metadata": {
186 | "id": "CpWQixmea7Pi",
187 | "colab_type": "code",
188 | "colab": {}
189 | },
190 | "source": [
191 | "load_data_obj = LoadingData()"
192 | ],
193 | "execution_count": 8,
194 | "outputs": []
195 | },
196 | {
197 | "cell_type": "code",
198 | "metadata": {
199 | "id": "Hy352jKEa7Pl",
200 | "colab_type": "code",
201 | "colab": {
202 | "base_uri": "https://localhost:8080/",
203 | "height": 195
204 | },
205 | "outputId": "70dc188d-ad57-4a29-c184-aac5abb806ae"
206 | },
207 | "source": [
208 | "load_data_obj.train_data_frame.head()"
209 | ],
210 | "execution_count": 10,
211 | "outputs": [
212 | {
213 | "output_type": "execute_result",
214 | "data": {
215 | "text/html": [
216 | "\n",
217 | "\n",
230 | "
\n",
231 | " \n",
232 | " \n",
233 | " | \n",
234 | " query | \n",
235 | " intent | \n",
236 | " category | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " 8770 | \n",
242 | " rate the current novel 5 stars | \n",
243 | " RateBook | \n",
244 | " 4 | \n",
245 | "
\n",
246 | " \n",
247 | " 6557 | \n",
248 | " Find the schedule for Kingsman: The Secret Ser... | \n",
249 | " SearchScreeningEvent | \n",
250 | " 3 | \n",
251 | "
\n",
252 | " \n",
253 | " 721 | \n",
254 | " find Bells Break Their Towers , a video game | \n",
255 | " SearchCreativeWork | \n",
256 | " 0 | \n",
257 | "
\n",
258 | " \n",
259 | " 229 | \n",
260 | " show creativity of A Catholic Education | \n",
261 | " SearchCreativeWork | \n",
262 | " 0 | \n",
263 | "
\n",
264 | " \n",
265 | " 3680 | \n",
266 | " Will it be warm in Powersville Guam 23 hours f... | \n",
267 | " GetWeather | \n",
268 | " 1 | \n",
269 | "
\n",
270 | " \n",
271 | "
\n",
272 | "
"
273 | ],
274 | "text/plain": [
275 | " query ... category\n",
276 | "8770 rate the current novel 5 stars ... 4\n",
277 | "6557 Find the schedule for Kingsman: The Secret Ser... ... 3\n",
278 | "721 find Bells Break Their Towers , a video game ... 0\n",
279 | "229 show creativity of A Catholic Education ... 0\n",
280 | "3680 Will it be warm in Powersville Guam 23 hours f... ... 1\n",
281 | "\n",
282 | "[5 rows x 3 columns]"
283 | ]
284 | },
285 | "metadata": {
286 | "tags": []
287 | },
288 | "execution_count": 10
289 | }
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "metadata": {
295 | "id": "GSDQwGEBa7Pn",
296 | "colab_type": "code",
297 | "colab": {
298 | "base_uri": "https://localhost:8080/",
299 | "height": 195
300 | },
301 | "outputId": "c7ec5bc6-af6d-4899-d9f6-244d00b7369e"
302 | },
303 | "source": [
304 | "load_data_obj.validation_data_frame.head()"
305 | ],
306 | "execution_count": 11,
307 | "outputs": [
308 | {
309 | "output_type": "execute_result",
310 | "data": {
311 | "text/html": [
312 | "\n",
313 | "\n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " | \n",
330 | " query | \n",
331 | " intent | \n",
332 | " category | \n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " 699 | \n",
338 | " I want to see Married to the Enemy 2 at a cine... | \n",
339 | " SearchScreeningEvent | \n",
340 | " 3 | \n",
341 | "
\n",
342 | " \n",
343 | " 22 | \n",
344 | " Please look up the song The Mad Magician . | \n",
345 | " SearchCreativeWork | \n",
346 | " 0 | \n",
347 | "
\n",
348 | " \n",
349 | " 139 | \n",
350 | " rate the current essay zero out of 6 stars | \n",
351 | " RateBook | \n",
352 | " 4 | \n",
353 | "
\n",
354 | " \n",
355 | " 599 | \n",
356 | " Add the album to my Club Hits playlist. | \n",
357 | " AddToPlaylist | \n",
358 | " 6 | \n",
359 | "
\n",
360 | " \n",
361 | " 16 | \n",
362 | " Please help me find the Late Night Heartbroken... | \n",
363 | " SearchCreativeWork | \n",
364 | " 0 | \n",
365 | "
\n",
366 | " \n",
367 | "
\n",
368 | "
"
369 | ],
370 | "text/plain": [
371 | " query ... category\n",
372 | "699 I want to see Married to the Enemy 2 at a cine... ... 3\n",
373 | "22 Please look up the song The Mad Magician . ... 0\n",
374 | "139 rate the current essay zero out of 6 stars ... 4\n",
375 | "599 Add the album to my Club Hits playlist. ... 6\n",
376 | "16 Please help me find the Late Night Heartbroken... ... 0\n",
377 | "\n",
378 | "[5 rows x 3 columns]"
379 | ]
380 | },
381 | "metadata": {
382 | "tags": []
383 | },
384 | "execution_count": 11
385 | }
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "metadata": {
391 | "id": "tRmVRiTCa7Pp",
392 | "colab_type": "code",
393 | "colab": {}
394 | },
395 | "source": [
396 | "class Preprocessing():\n",
397 | " def __init__(self):\n",
398 | " self.x_train = None\n",
399 | " self.y_train = None\n",
400 | " self.x_valid = None\n",
401 | " self.y_valid = None\n",
402 | " self.spacy_model = en_core_web_sm.load()\n",
403 | " self.tokenizer = None\n",
404 | "\n",
405 | " def createData(self):\n",
406 | " self.tokenizer = Tokenizer(num_words=None)\n",
407 | " self.max_len = 50\n",
408 | " self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(load_data_obj.train_data_frame['query'].tolist(),load_data_obj.train_data_frame['category'].tolist(),test_size=0.1)\n",
409 | " self.tokenizer.fit_on_texts(list(self.x_train) + list(self.x_valid))\n",
410 | " self.x_train = self.tokenizer.texts_to_sequences(self.x_train)\n",
411 | " self.x_valid = self.tokenizer.texts_to_sequences(self.x_valid)\n",
412 | "\n",
413 | " #zero pad the sequences\n",
414 | " self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)\n",
415 | " self.x_valid = pad_sequences(self.x_valid, maxlen=self.max_len)\n",
416 | " self.y_train = to_categorical(self.y_train)\n",
417 | " self.y_valid = to_categorical(self.y_valid)\n",
418 | " self.word_index = self.tokenizer.word_index\n",
419 | " \n",
420 | " def getSpacyEmbeddings(self,sentneces):\n",
421 | " sentences_vectors = list()\n",
422 | " for item in sentneces:\n",
423 | " query_vec = self.spacy_model(item) \n",
424 | " sentences_vectors.append(query_vec.vector)\n",
425 | " return sentences_vectors\n",
426 | " \n",
427 | " \n",
428 | " \n",
429 | " \n",
430 | " "
431 | ],
432 | "execution_count": 12,
433 | "outputs": []
434 | },
435 | {
436 | "cell_type": "code",
437 | "metadata": {
438 | "id": "LoyTb5Gza7Pr",
439 | "colab_type": "code",
440 | "colab": {}
441 | },
442 | "source": [
443 | "preprocess_obj = Preprocessing()\n",
444 | "preprocess_obj.createData()"
445 | ],
446 | "execution_count": 13,
447 | "outputs": []
448 | },
449 | {
450 | "cell_type": "code",
451 | "metadata": {
452 | "id": "rYI77Z4za7Pt",
453 | "colab_type": "code",
454 | "colab": {
455 | "base_uri": "https://localhost:8080/",
456 | "height": 34
457 | },
458 | "outputId": "5bccac12-dacc-497c-8413-5734ca64df0b"
459 | },
460 | "source": [
461 | "preprocess_obj.y_train.shape"
462 | ],
463 | "execution_count": 14,
464 | "outputs": [
465 | {
466 | "output_type": "execute_result",
467 | "data": {
468 | "text/plain": [
469 | "(12405, 7)"
470 | ]
471 | },
472 | "metadata": {
473 | "tags": []
474 | },
475 | "execution_count": 14
476 | }
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "metadata": {
482 | "id": "ZHI2TUvNa7Pv",
483 | "colab_type": "code",
484 | "colab": {
485 | "base_uri": "https://localhost:8080/",
486 | "height": 34
487 | },
488 | "outputId": "4b1f3c97-2873-4c48-8d51-492e70e70828"
489 | },
490 | "source": [
491 | "preprocess_obj.y_valid.shape"
492 | ],
493 | "execution_count": 15,
494 | "outputs": [
495 | {
496 | "output_type": "execute_result",
497 | "data": {
498 | "text/plain": [
499 | "(1379, 7)"
500 | ]
501 | },
502 | "metadata": {
503 | "tags": []
504 | },
505 | "execution_count": 15
506 | }
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "metadata": {
512 | "id": "T3WXq62ha7Px",
513 | "colab_type": "code",
514 | "colab": {}
515 | },
516 | "source": [
517 | "class DesignModel():\n",
518 | " def __init__(self):\n",
519 | " self.model = None\n",
520 | " self.x_train = preprocess_obj.x_train\n",
521 | " self.y_train = preprocess_obj.y_train\n",
522 | " self.x_valid = preprocess_obj.x_valid\n",
523 | " self.y_valid = preprocess_obj.y_valid\n",
524 | " \n",
525 | " def simple_rnn(self):\n",
526 | " self.model = Sequential()\n",
527 | " self.model.add(Embedding(len(preprocess_obj.word_index) + 1,100,input_length=preprocess_obj.max_len))\n",
528 | " self.model.add(SimpleRNN(100))\n",
529 | " self.model.add(Dense(len(load_data_obj.cat_to_intent), activation='sigmoid'))\n",
530 | " self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
531 | " \n",
532 | " \n",
533 | " def model_train(self,batch_size,num_epoch):\n",
534 | " print(\"Fitting to model\")\n",
535 | " self.model.fit(self.x_train, self.y_train, batch_size=batch_size, epochs=num_epoch, validation_data=[self.x_valid, self.y_valid])\n",
536 | " print(\"Model Training complete.\")\n",
537 | "\n",
538 | " def save_model(self,model_name): \n",
539 | " self.model.save(model_name+\".h5\")\n",
540 | " print(\"Model saved to Model folder.\")"
541 | ],
542 | "execution_count": 24,
543 | "outputs": []
544 | },
545 | {
546 | "cell_type": "code",
547 | "metadata": {
548 | "id": "Tye8X7FFa7Pz",
549 | "colab_type": "code",
550 | "colab": {
551 | "base_uri": "https://localhost:8080/",
552 | "height": 235
553 | },
554 | "outputId": "3e26c58d-4bef-4bd1-f789-9662a0243bef"
555 | },
556 | "source": [
557 | "model_obj = DesignModel()\n",
558 | "model_obj.simple_rnn()\n",
559 | "model_obj.model_train(64,5)\n",
560 | "model_obj.save_model(\"srnn\")"
561 | ],
562 | "execution_count": 25,
563 | "outputs": [
564 | {
565 | "output_type": "stream",
566 | "text": [
567 | "Fitting to model\n",
568 | "Epoch 1/5\n",
569 | "194/194 [==============================] - 9s 46ms/step - loss: 0.8717 - accuracy: 0.8039 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
570 | "Epoch 2/5\n",
571 | "194/194 [==============================] - 9s 46ms/step - loss: 0.0855 - accuracy: 0.9852 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
572 | "Epoch 3/5\n",
573 | "194/194 [==============================] - 9s 46ms/step - loss: 0.0321 - accuracy: 0.9948 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
574 | "Epoch 4/5\n",
575 | "194/194 [==============================] - 9s 45ms/step - loss: 0.0157 - accuracy: 0.9980 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
576 | "Epoch 5/5\n",
577 | "194/194 [==============================] - 9s 45ms/step - loss: 0.0098 - accuracy: 0.9990 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
578 | "Model Training complete.\n",
579 | "Model saved to Model folder.\n"
580 | ],
581 | "name": "stdout"
582 | }
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "metadata": {
588 | "id": "sZ1dlcGta7P1",
589 | "colab_type": "code",
590 | "colab": {}
591 | },
592 | "source": [
593 | "class Evaluation():\n",
594 | " def get_accuracy(self,actuals, predictions):\n",
595 | " acc = accuracy_score(actuals, predictions)\n",
596 | " return acc"
597 | ],
598 | "execution_count": 26,
599 | "outputs": []
600 | },
601 | {
602 | "cell_type": "code",
603 | "metadata": {
604 | "id": "UdXO8h31a7P3",
605 | "colab_type": "code",
606 | "colab": {}
607 | },
608 | "source": [
609 | "class Prediction():\n",
610 | " def __init__(self,model_name):\n",
611 | " self.model = load_model(model_name+\".h5\")\n",
612 | " self.tokenizer = preprocess_obj.tokenizer\n",
613 | " self.max_len = preprocess_obj.max_len\n",
614 | " \n",
615 | " def predict_validation(self):\n",
616 | " self.xtest = load_data_obj.validation_data_frame['query'].tolist()\n",
617 | " self.ytest = load_data_obj.validation_data_frame['category'].tolist()\n",
618 | " self.xtest = self.tokenizer.texts_to_sequences(self.xtest)\n",
619 | " self.xtest = pad_sequences(self.xtest, maxlen=self.max_len)\n",
620 | " self.ypred = self.model.predict(self.xtest)\n",
621 | " self.ypred = [np.argmax(item) for item in self.ypred]\n",
622 | " \n",
623 | " def predict(self,query):\n",
624 | " query_seq = self.tokenizer.texts_to_sequences([query])\n",
625 | " query_pad = pad_sequences(query_seq, maxlen=self.max_len)\n",
626 | " pred = self.model.predict(query_pad)\n",
627 | " pred = np.argmax(pred)\n",
628 | " result = load_data_obj.cat_to_intent[pred]\n",
629 | " return result"
630 | ],
631 | "execution_count": 27,
632 | "outputs": []
633 | },
634 | {
635 | "cell_type": "code",
636 | "metadata": {
637 | "id": "1QAb7Mr-a7P5",
638 | "colab_type": "code",
639 | "colab": {}
640 | },
641 | "source": [
642 | "pred_obj = Prediction(\"srnn\")\n",
643 | "pred_obj.predict_validation()"
644 | ],
645 | "execution_count": 28,
646 | "outputs": []
647 | },
648 | {
649 | "cell_type": "code",
650 | "metadata": {
651 | "id": "8bX7S8VFa7P6",
652 | "colab_type": "code",
653 | "colab": {
654 | "base_uri": "https://localhost:8080/",
655 | "height": 101
656 | },
657 | "outputId": "909f7973-a5de-44cd-b027-7a55a13d5efd"
658 | },
659 | "source": [
660 | "querylist = [\n",
661 | " 'rate The Gift: Imagination and the Erotic Life of Property five stars',\n",
662 | " 'table for Breadline Cafe in Minnesota next friday',\n",
663 | " 'Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?',\n",
664 | " 'Play some sixties songs on Google Music',\n",
665 | " 'rate this textbook four out of 6']\n",
666 | "for query in querylist:\n",
667 | " result = pred_obj.predict(query)\n",
668 | " print(\"Intent: \"+str(result)+\"\\tQuery: \"+str(query))"
669 | ],
670 | "execution_count": 29,
671 | "outputs": [
672 | {
673 | "output_type": "stream",
674 | "text": [
675 | "Intent: RateBook\tQuery: rate The Gift: Imagination and the Erotic Life of Property five stars\n",
676 | "Intent: BookRestaurant\tQuery: table for Breadline Cafe in Minnesota next friday\n",
677 | "Intent: GetWeather\tQuery: Will it be hot at 13:19 in De Funiak Springs Serbia and Montenegro ?\n",
678 | "Intent: PlayMusic\tQuery: Play some sixties songs on Google Music\n",
679 | "Intent: RateBook\tQuery: rate this textbook four out of 6\n"
680 | ],
681 | "name": "stdout"
682 | }
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "metadata": {
688 | "id": "gCFpIFH_a7P8",
689 | "colab_type": "code",
690 | "colab": {
691 | "base_uri": "https://localhost:8080/",
692 | "height": 34
693 | },
694 | "outputId": "d82b6973-49a1-462a-a0ca-661357100520"
695 | },
696 | "source": [
697 | "eval_obj = Evaluation()\n",
698 | "acc = eval_obj.get_accuracy(pred_obj.ytest,pred_obj.ypred)\n",
699 | "print(\"Auc: {:.2%}\".format(acc))\n"
700 | ],
701 | "execution_count": 30,
702 | "outputs": [
703 | {
704 | "output_type": "stream",
705 | "text": [
706 | "Auc: 97.14%\n"
707 | ],
708 | "name": "stdout"
709 | }
710 | ]
711 | }
712 | ]
713 | }
--------------------------------------------------------------------------------
/1.7-entity-recognition/resume-entities-for-ner.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joydeb28/NLP-Notebooks/9669ec6e416e449f67cedb50c143fa4d94bbd44c/1.7-entity-recognition/resume-entities-for-ner.zip
--------------------------------------------------------------------------------
/1.8-next-word-prediction/cab_booking.txt:
--------------------------------------------------------------------------------
1 | I would like to book a Cab
2 | Can you please book a cab from Goa to Mumbai
3 | I would like to book taxi for Chennai Airport
4 | I want to take a cab for airport
5 | Could you please book a cab for me
6 | I need a cab urgent for airport
7 | Can you arrange a cab as soon as possible
8 | I would like to cancel my booking
9 | Could you please cancel my booking
10 | Can you please cancel my tomorrows booking
11 | I want to cancel my upcoming booking
12 |
--------------------------------------------------------------------------------
/1.8-next-word-prediction/next_word_prediction_keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Next Word Prediction Model Using Tensorflow & keras "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Importing Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from numpy import array\n",
24 | "import numpy as np\n",
25 | "import tensorflow as tf\n",
26 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
27 | "from tensorflow.keras.utils import to_categorical\n",
28 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
29 | "from tensorflow.keras.models import Sequential\n",
30 | "from tensorflow.keras.layers import Dense\n",
31 | "from tensorflow.keras.layers import LSTM\n",
32 | "from tensorflow.keras.layers import Dropout\n",
33 | "from tensorflow.keras.layers import Embedding\n",
34 | "from tensorflow.keras.models import load_model\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "Preprocessing Data"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "class Preprocessing():\n",
51 | " \n",
52 | " def __init__(self,input_file):\n",
53 | " self.input_data_file = input_file\n",
54 | " self.data = None\n",
55 | " self.vocab_size = None\n",
56 | " self.encoded_data = None\n",
57 | " self.max_length = None\n",
58 | " self.sequences = None\n",
59 | " self.x = None\n",
60 | " self.y = None\n",
61 | " self.tokenizer = None\n",
62 | " \n",
63 | " def load_data(self):\n",
64 | " fp = open(self.input_data_file,'r')\n",
65 | " self.data = fp.read().splitlines() \n",
66 | " fp.close()\n",
67 | " \n",
68 | " def encode_data(self):\n",
69 | " self.tokenizer = Tokenizer()\n",
70 | " self.tokenizer.fit_on_texts(self.data)\n",
71 | " self.encoded_data = self.tokenizer.texts_to_sequences(self.data)\n",
72 | " self.vocab_size = len(self.tokenizer.word_counts)+1\n",
73 | " \n",
74 | " def generate_sequence(self):\n",
75 | " seq_list = list()\n",
76 | " for item in self.encoded_data:\n",
77 | " l = len(item)\n",
78 | " for id in range(1,l):\n",
79 | " seq_list.append(item[:id+1])\n",
80 | " self.max_length = max([len(seq) for seq in seq_list])\n",
81 | " self.sequences = pad_sequences(seq_list, maxlen=self.max_length, padding='pre')\n",
82 | " self.sequences = array(self.sequences)\n",
83 | " \n",
84 | " def get_data(self):\n",
85 | " self.x = self.sequences[:,:-1]\n",
86 | " self.y = self.sequences[:,-1]\n",
87 | " self.y = to_categorical(self.y,num_classes=self.vocab_size)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "pr = Preprocessing('cab_booking.txt')\n",
97 | "pr.load_data()\n",
98 | "pr.encode_data()\n",
99 | "pr.generate_sequence()\n",
100 | "pr.get_data()"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "Model"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "class Model():\n",
117 | " def __init__(self):\n",
118 | " self.model = None\n",
119 | " self.history = None\n",
120 | " self.x = None\n",
121 | " self.y = None\n",
122 | " self.vocab_size = pr.vocab_size\n",
123 | " self.max_len = pr.max_length\n",
124 | " \n",
125 | " \n",
126 | " def create_model(self):\n",
127 | " self.model = Sequential()\n",
128 | " self.model.add(Embedding(self.vocab_size,10,input_length=self.max_len-1))\n",
129 | " self.model.add(LSTM(50))\n",
130 | " self.model.add(Dropout(0.1))\n",
131 | " self.model.add(Dense(self.vocab_size,activation='softmax'))\n",
132 | " self.model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
133 | " print(self.model.summary())\n",
134 | " def run(self,epochs,batch_size):\n",
135 | " self.history = self.model.fit(self.x,self.y,epochs=epochs,batch_size=batch_size,validation_split=0.2)\n",
136 | " \n",
137 | " def save(self):\n",
138 | " self.model.save(\"word_prediction_model.h5\")\n",
139 | " "
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "model_obj = Model()\n",
149 | "model_obj.x = pr.x\n",
150 | "model_obj.y = pr.y\n",
151 | "model_obj.create_model()"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "model_obj.run(700,2)\n",
161 | "model_obj.save()"
162 | ]
163 | },
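164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "Optional check (an added sketch, not part of the original flow): once `fit` has finished, the `History` object stored on `model_obj.history` records the loss/accuracy curves, so printing the final value of each metric gives a quick view of how training went."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "# Print the last recorded value of every metric Keras tracked during training\n",
178 | "# (metric key names vary slightly across TF/Keras versions, so iterate instead of hard-coding them)\n",
179 | "for name, values in model_obj.history.history.items():\n",
180 | "    print(name, round(values[-1], 4))"
181 | ]
182 | },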
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "Prediction"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "class Prediction():\n",
178 | " def __init__(self,tokenizer,max_len):\n",
179 | " self.model = None\n",
180 | " self.tokenizer = tokenizer\n",
181 | " self.idx2word = {v:k for k,v in self.tokenizer.word_index.items()}\n",
182 | " self.max_length = max_len\n",
183 | " \n",
184 | " def load_model(self):\n",
185 | " self.model = load_model(\"word_prediction_model.h5\")\n",
186 | " \n",
187 | " def predict_words(self,text,num_words):\n",
188 | " encoded_data = self.tokenizer.texts_to_sequences([text])[0]\n",
189 | " padded_data = pad_sequences([encoded_data],maxlen = self.max_length-1,padding='pre')\n",
190 | " y_preds = self.model.predict(padded_data)\n",
191 | " y_preds = np.argsort(-y_preds)\n",
192 | " y_preds = y_preds[0][:num_words]\n",
193 | " possible_words = [self.idx2word[item] for item in y_preds]\n",
194 | " print(text,possible_words)\n",
195 | " print(possible_words)\n"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "pred = Prediction(pr.tokenizer,pr.max_length) \n",
205 | "pred.load_model()"
206 | ]
207 | },
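208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "A small added sketch (not part of the original notebook): besides suggesting single next words, the same model can produce a greedy multi-word completion by repeatedly appending the most probable word. The helper name `generate_greedy` is just illustrative; it reuses the `pred` object defined above."
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "# Greedy multi-word completion sketch: feed the text back in and append the top prediction each step\n",
222 | "def generate_greedy(prediction, seed_text, num_words=3):\n",
223 | "    text = seed_text\n",
224 | "    for _ in range(num_words):\n",
225 | "        encoded = prediction.tokenizer.texts_to_sequences([text])[0]\n",
226 | "        padded = pad_sequences([encoded], maxlen=prediction.max_length - 1, padding='pre')\n",
227 | "        next_id = int(np.argmax(prediction.model.predict(padded), axis=-1)[0])\n",
228 | "        if next_id not in prediction.idx2word:  # e.g. the padding id 0\n",
229 | "            break\n",
230 | "        text = text + ' ' + prediction.idx2word[next_id]\n",
231 | "    return text\n",
232 | "\n",
233 | "generate_greedy(pred, \"I would like to\", num_words=3)"
234 | ]
235 | },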
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "pred.predict_words(\"I would like to\",2)\n",
215 | "pred.predict_words(\"can you please\",2)"
216 | ]
217 | }
218 | ],
219 | "metadata": {
220 | "kernelspec": {
221 | "display_name": "Python 3",
222 | "language": "python",
223 | "name": "python3"
224 | },
225 | "language_info": {
226 | "codemirror_mode": {
227 | "name": "ipython",
228 | "version": 3
229 | },
230 | "file_extension": ".py",
231 | "mimetype": "text/x-python",
232 | "name": "python",
233 | "nbconvert_exporter": "python",
234 | "pygments_lexer": "ipython3",
235 | "version": "3.6.9"
236 | }
237 | },
238 | "nbformat": 4,
239 | "nbformat_minor": 2
240 | }
241 |
--------------------------------------------------------------------------------
/1.9-smart-compose/README.md:
--------------------------------------------------------------------------------
1 | Data Sources
2 | https://www.usingenglish.com/articles/100-most-useful-emailing-phrases.html
3 | http://english.teamdev.com/resources/useful-phrases
4 | https://blog.talaera.com/business-emails-phrases
5 |
--------------------------------------------------------------------------------
/1.9-smart-compose/data/dataset.txt:
--------------------------------------------------------------------------------
1 | content
2 | Dear Sir/Madam
3 | Dear
4 | Hello
5 | Hi
6 | Hi Team
7 | Good morning Team
8 | Good afternoon
9 | I recently read about that
10 | I recently heard about that
11 | Thank you for taking the time to write to us
12 | Thank you for taking the time to give us some feedback
13 | Thank you for your email
14 | Congratulations on
15 | Hope you're having a great day!
16 | Hope you're feeling great!
17 | Explaining Why You're Writing
18 | I wanted to tell you that
19 | I am writing to tell you about
20 | I'm writing to tell you that
21 | This email is to confirm that
22 | We're sending you this email because
23 | In this email, we wanted to tell you about
24 | We are writing to
25 | We wish to inform you of
26 | I'm writing concerning
27 | I'm writing just so you know
28 | I'm writing to remind you about
29 | I'm writing to let you know that
30 | This email is just to let you know that
31 | Just a quick reminder that
32 | I would like to inform you that
33 | This is just to let you know that
34 | Wanted to give you a friendly reminder that
35 | I am contacting you for the following reason
36 | Just a quick email to see how you're doing
37 | I just wanted to let you know that
38 | So happy we reconnected after this time
39 | So glad that we're in touch again
40 | Can't believe it's been a year since we last spoke! Feels like yesterday
41 | Glad you're back in our life!
42 | Glad to see our old friends again!
43 | It's always nice to get in touch with old friends!
44 | Long time no see! Glad to hear from you again
45 | I highly recommend visiting our new Knowledge Base
46 | I highly recommend checking out this new article
47 | Here is a copy of the information
48 | We brought together some of the best tutorials which
49 | What's new since you left? Lots! Starting with
50 | We're confident you'll see big improvements since
51 | We know you're busy but we'd hate to see you miss out on this opportunity!
52 | You can do so much with
53 | Learn how to
54 | We thought you might find this useful
55 | Referring to the Previous Contact
56 | Thank you for your letter
57 | Thank you for contacting us
58 | In reply to your request
59 | Thank you for your letter regarding
60 | Regarding our telephone conversation yesterday
61 | Further to our meeting last week
62 | I would just like to confirm the main points we discussed on Tuesday
63 | I'm writing in reply to your email
64 | In reply to your email
65 | We understand from your email that you're interested in
66 | We talked last week about
67 | We had a phone call
68 | It was nice to hear from you
69 | I was glad to catch up
70 | Making a Request
71 | We would appreciate it if you would
72 | I would be grateful if you could
73 | Could you possibly tell us
74 | In addition, I would like to receive
75 | It would be helpful if you could send us
76 | I am interested in receiving
77 | I would appreciate your attention to this matter
78 | Please let me know what action you propose to take
79 | I would be grateful if you could send me further information about
80 | Would it be possible to have a quick chat?
81 | Would you mind if I took the day off
82 | I was hoping you could do something
83 | What would you like to do next?
84 | Could you please send me the mail
85 | Let's discuss your next step
86 | It would be great if you could
87 | Would you mind having a quick chat?
88 | I was wondering if you could
89 | Could you confirm these details?
90 | Would you like me to send you the link?
91 | Here are the details on
92 | Furthermore
93 | In addition, I would like to
94 | We're glad the issues got sorted out despite the delay
95 | For example
96 | For instance
97 | In other words
98 | In order to fix this bug, we would need to research it a bit further
99 | That's why
100 | I'm pleased to hear that
101 | First of all
102 | Firstly
103 | Secondly
104 | There seems to be a problem with the new feature
105 | Here are the possible solutions
106 | While running the tests we've discovered that it is not working properly
107 | Which option would you like us to work on?
108 | What would you like to go with?
109 | Which solution works best for you?
110 | We've found a bug
111 | Here's how we would like to take care of this issue
112 | How would you like us to solve this issue?
113 | We can see three options
114 | There are two ways to solve this
115 | We've come up with a workaround for this issue
116 | This solution is better but it will take longer to implement
117 | If we go with the first option we might run into some problems with it in the future
118 | Could you please clarify what you would like us to do about it?
119 | If I understood you correctly you would like us to
120 | What exactly do you mean by?
121 | Could you please clarify when you would like us to finish this?
122 | When exactly are you expecting to have this feature?
123 | Could you explain what you mean by
124 | Could you be more specific?
125 | Could you please repeat it?
126 | Could you repeat what you said?
127 | Could you give us some more details?
128 | When would it be convenient for you too?
129 | Which option would work best for you?
130 | What would you like us to do next?
131 | Would you like to?
132 | Would you prefer to?
133 | Would you rather or?
134 | How would you feel about?
135 | What do you feel is the next step?
136 | Is it possible to?
137 | Could you check it, please?
138 | Just book time on my calendar and I can answer all your questions
139 | Help us give you the best advice by telling us a bit more about your project
140 | I'd love it if you could walk me through your project
141 | This may be a great time to take a look at our Knowledge Base
142 | If you're interested drop me a line and we can have a quick chat to discuss your further steps
143 | Come check out what's new and get inspired!
144 | Could you please keep us updated on this?
145 | If you have any questions please email or call me
146 | Please feel free to contact me anytime
147 | If there's anything I can do for you please let me know
148 | You can drop a mail if there's anything you'd like to discuss
149 | Feel free to call me
150 | Let's discuss this at the meeting if you don't mind
151 | At our last meeting, we talked about
152 | At the meeting, we agreed to
153 | We'd like to have a meeting about
154 | Let's have a meeting sometime this week
155 | How about taking this over at a meeting?
156 | Why don't we talk this over at a meeting?
157 | I'd be glad to tell you more about this at the meeting today
158 | This issue came up at the meeting we had on
159 | Let's have a meeting to discuss this issue
160 | I've set up a meeting
161 | Our company would be pleased to work with you
162 | If there's anything I can help you with just let me know
163 | We would be happy to help
164 | Thanking
165 | Thank you for your consideration
166 | I appreciate that you took the time to give me these details
167 | Thanks for taking the time to give us your feedback
168 | Thank you for writing to us
169 | Thanks a lot for everything
170 | Thank you for your time
171 | Thank you very much for
172 | Many thanks for
173 | You're so helpful
174 | That's thoughtful of you
175 | I appreciate your help
176 | Thank you for your patience
177 | Thank you for clearing this up
178 | Thank you for helping us in this matter
179 | We are pleased to announce that
180 | We are pleased to inform you that
181 | We have some good news for you
182 | It is my pleasure to let you know that
183 | I'm glad to tell you that
184 | You will be pleased to learn that
185 | We regret to inform you that
186 | I regret to inform you that due to a mistake in our database
187 | Unfortunately, we cannot
188 | we are unable to
189 | After careful consideration, we have decided to
190 | I'm afraid it would be impossible to do
191 | Despite my best efforts it has proved to be impossible to
192 | I'm afraid I've got some bad news for you
193 | We apologize for the delay
194 | I regret any inconvenience caused by
195 | I apologize for the problems you've had
196 | Please accept my apologies
197 | Sorry for any inconveniences this situation may have caused
198 | I would like to apologize for the delay
199 | I would like to apologize for the inconvenience
200 | Once again I apologize for any inconveniences
201 | We are sorry for the delay
202 | I'd like to apologize for making you wait
203 | Sorry to keep you waiting
204 | I'm sorry but
205 | Sorry again for
206 | Please confirm
207 | We'll get back to you as soon as we can
208 | Thank you for your order
209 | We're glad that you chose us to help you with this!
210 | I am attaching
211 | Please find the attachment
212 | You will find attached
213 | I've attached the file for your review
214 | The attached file contains
215 | Here's the attachment we discussed
216 | Please take a look at the attached file
217 | Take a look at the attachment I've attached to this email
218 | I've attached
219 | If we can be of any further assistance please let us know
220 | For further details
221 | If you require more information
222 | Thank you for taking this into consideration
223 | We hope you are happy with this arrangement
224 | We look forward to a successful working relationship in the future
225 | We would be very pleased to do business with your company
226 | I would be happy to have an opportunity to work with your firm
227 | I look forward to seeing you next week
228 | Looking forward to hearing from you
229 | I would appreciate your reply
230 | I look forward to doing business with you in the future
231 | I enjoyed working with you and look forward to
232 | Thank you once more for your help in this matter
233 | If you require any further information please let me know
234 | Let me know if you need any help
235 | If I can help in any way please do not hesitate to contact me
236 | If there's anything I can do to help you just drop me a line
237 | Do not hesitate to contact us again
238 | if there's anything we can help you with
239 | Thank you for your help
240 | I'd love to hear your feedback
241 | Hope to hear from you soon
242 | Thank you for your cooperation
243 | I'd appreciate your reply
244 | Please let me know what you think
245 | Thanks again
246 | Thank you for taking your time
247 | Happy holidays!
248 | Sincerely
249 | Yours sincerely
250 | Sincerely yours
251 | Yours faithfully
252 | Kind regards
253 | Yours truly
254 | Many thanks
255 | Regards
256 | Best regards
257 | With best wishes
258 | Best wishes
259 | Best
260 | All the best
261 | Thanks
262 | Have a great weekend!
263 | Have a wonderful day!
264 | Have a productive day!
265 | I hope you had a good weekend
266 | I hope you had a great trip
267 | Hope you had a nice break
268 | I hope you are well
269 | I hope all is well
270 | Hope you're enjoying your holiday
271 | I hope this email finds you well
272 | I hope you enjoyed the event
273 | I'm glad we had a chance to chat at the convention
274 | It was great to see you on Thursday
275 | It was a pleasure to meet you yesterday
276 | I am writing to you about our last meeting
277 | I am writing to you with regards to
278 | I am writing to you regarding
279 | I am writing to ask
280 | I am writing to let you know
281 | I am writing to confirm
282 | I am writing to check
283 | I am writing to invite you
284 | I am writing to update you on
285 | I am writing to you to follow up on
286 | I am contacting you to inform
287 | I am reaching out because
288 | This is just a quick note to
289 | This is just a quick reminder
290 | I wanted to let you know that
291 | Might I take a moment of your time to
292 | I just got your request for
293 | I just read your email about
294 | As we discussed I would like to send you
295 | Thank you for your email about
296 | Thanks for your email
297 | Thanks for your feedback on
298 | Thanks for your invitation
299 | Thanks for your suggestion
300 | Thanks for sending
301 | Thanks for asking about
302 | Thanks for your quick reply
303 | Thanks for getting back to me so quickly
304 | Thank you for reaching out to me
305 | Apologizing
306 | Sorry for my late reply
307 | Sorry it took me so long to get back to you
308 | I apologize for the late response
309 | Sorry it's been so long since my last email
310 | I was sorry to hear about
311 | Please accept our apologies for any inconvenience caused
312 | I'm enclosing the file
313 | The parts in bold are the changes I made
314 | The parts in red are the changes I made
315 | The parts in blue are the changes we made
316 | Here's the document that you asked for
317 | Please take a look at the file I've attached to this email
318 | Could you please?
319 | Could you possibly tell me?
320 | Can you please fill out this form?
321 | I'd appreciate it if you could
322 | I'd be very grateful if you could
323 | It would be very helpful if you could send
324 | If possible I'd like to know more about
325 | Please find my two main questions below
326 | Asking for clarifications
327 | I didn't fully understand
328 | Could you please explain that again?
329 | I didn't quite get your point
330 | Could you repeat what you said about it?
331 | If you could please shed some light on this topic I would appreciate it
332 | Could you please clarify?
333 | If I understood you correctly you would like me to
334 | What exactly do you mean by
335 | In other words, would you like us to
336 | Thank you for letting me know
337 | Thank you for the heads up
338 | Thank you for the notice
339 | Please note
340 | Quick reminder
341 | Just a friendly reminder that
342 | Thank you for sharing
343 | I'd like to inform you that
344 | Thanks for keeping me in the loop
345 | Please keep me informed
346 | Please keep me posted
347 | Please keep me updated
348 | Please keep me in the loop
349 | Please let me know if this is OK with you
350 | What are your thoughts on this?
351 | What do you think?
352 | we're waiting for approval
353 | We just need the thumbs up
354 | We just need the green light
355 | You totally have the green light!
356 | He approved of it so you can go ahead with the project
357 | I'd like to schedule a meeting if you are available
358 | I am available on
359 | if that's convenient for you
360 | Would you be available on
361 | If so I'll send you an invite shortly
362 | Can you make it on
363 | If so I'll book accordingly
364 | I'm afraid I can't make it on
365 | We need to reschedule our meeting
366 | We need to postpone our meeting
367 | We need to put back our meeting
368 | We need to cancel our meeting
369 | We need to move our meeting
370 | We need to rearrange our meeting
371 | We are sorry to inform you that the interview scheduled for
372 | We are sorry to inform you that the meeting scheduled for
373 | Unfortunately
374 | I'm afraid it will not be possible to
375 | Unfortunately, I have to tell you that
376 | I'm afraid that we can't
377 | I regret to inform you that
378 | After careful consideration, we have decided
379 | It's against company policy to
380 | I tried my best but
381 | Despite my best efforts
382 | I can't see how
383 | I'm sorry but it's out of my hands
384 | I'm afraid I won't be able to
385 | I'm sorry to tell you that
386 | Do you need a reply?
387 | Are you asking for a favor or are you meeting soon?
388 | These sentences are perfect for those moments!
389 | Looking forward to hearing from you soon
390 | I look forward to hearing from you soon
391 | Please let me know if this works
392 | Please let me know if you are available
393 | Please let me know if that sounds good
394 | Please let me know if you can
395 | Please let me know if you can help
396 | Please let me know if you need to reschedule
397 | I look forward to seeing
398 | I look forward to meeting you
399 | See you next week
400 | Thank you in advance
401 | Thank you for everything
402 | Cheers
403 | Any feedback you can give me on this would be greatly appreciated
404 | Any feedback you can give me on this would be highly appreciated
405 | Any feedback you can give me on this would be much appreciated
406 | If you could have it ready
407 | I would appreciate it
408 | I would appreciate your help in this matter
409 | Offering help or information
410 | I hope you find this helpful
411 | I hope it's clearer now
412 | I hope that answers all your questions
413 | If you have any questions
414 | If you have more questions
415 | In the meantime, if you need any more information
416 | If you need more information
417 | If you need more info
418 | If you need further information
419 | I know that's a lot to take in so let me know if anything I've said doesn't make sense
420 | please do not hesitate to contact me
421 | please feel free to contact me
422 | please feel free to get in touch
423 | please let me know
424 | drop me an email
425 | drop me a mail
426 | Thank you for your understanding
427 | Thanks again for your understanding
428 | Thanks for your patience
429 | Once again please accept our apologies for any inconvenience caused
430 | Once again please accept our apologies for the inconvenience caused
431 | Once again please accept our apologies for the delay
432 | Once again please accept our apologies for the misunderstanding
433 | I hope this is okay with you
434 | I hope we can find a solution soon
435 | I hope you can understand
436 | Sorry I couldn't be of more help
437 | Good morning
438 | Hope you're having a great day!
439 | This email is to confirm that we've received your payment
440 | I'm sending you this email because
441 | In this email, I wanted to tell you about
442 | I highly recommend
443 | It was nice to hear from you yesterday
444 | I was glad to catch up yesterday
445 | Could you possibly tell us more
446 | Could you please send me the link
447 | Just wondered if you could send me a copy
448 | We're glad that the issues got sorted out despite the delay
449 | Talking about Problems and Solutions
450 | However, the second solution will take much longer and we cannot give even a rough estimate at the moment
451 | We'd like to research this problem a bit more to give you a more detailed list of options
452 | Could you please clarify what you would like us to do about
453 | I didn't quite get your point about
454 | Could you repeat what you said about
455 | Could you give us some more details on the
456 | When would it be convenient for you to
457 | Have you given any additional consideration to
458 | Could you do something?
459 | Talking about Meetings
460 | This issue came up at the meeting we had on Friday
461 | Here's the link
462 | You will be pleased to hear that
463 | Giving Bad News
464 | I'm afraid it would not be possible to
465 | That's not possible
466 | I can't see any way to
467 | It's out of my hands
468 | Talking about Vacations and Holidays
469 | I'm planning a vacation
470 | Would that be all right with you?
471 | We have a national holiday in our country on
472 | Therefore our office will not be working on that date
473 | I'm currently on vacation
474 | If you have questions please drop a mail
475 | When would it be all right for me to have a week-long vacation?
476 | I'm going to be on vacation
477 | going to have a day off
478 | Today I am not feeling well
479 | I'm on vacation now until
480 | I will read and answer all emails as soon as I get back
481 | If this is urgent please contact
482 | Please find attached
483 | The attached files contain
484 | Please take a look at the attachment
485 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLP-Tutorial
2 | Tutorials and example notebooks for common Natural Language Processing tasks.
3 | Background reading: https://www.topbots.com/generalized-language-models-tasks-datasets/
4 |
5 | #### If you find this repository helpful, a star ⭐ would be greatly appreciated!
6 | #### Created by Joydeb Mondal
7 |
--------------------------------------------------------------------------------
/simple-efficient-summarizer.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"Data\nAmazon fine food reviews from Kaggle"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport os\nimport tensorflow as tf","execution_count":11,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Loading the data"},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class LoadData():\n def __init__(self):\n data = pd.read_csv(\"/kaggle/input/amazon-fine-food-reviews/Reviews.csv\")\n self.data = data.drop([\"Id\",\"ProductId\",\"UserId\",\"ProfileName\",\"HelpfulnessNumerator\",\"HelpfulnessDenominator\",\"Score\",\"Time\"],axis=1)\n ","execution_count":19,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Calling load data object"},{"metadata":{"trusted":true},"cell_type":"code","source":"load_data = LoadData()\ndata = load_data.data","execution_count":20,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.7.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4}
--------------------------------------------------------------------------------