├── LICENSE
├── README.md
├── char_cnn_model
│   ├── char_cnn.py
│   └── cnn_model.py
├── rnn_model
│   ├── RnnModelGenerator.py
│   ├── RnnModelGeneratorGpu.py
│   └── RnnModelMain.py
├── tfidf_ordering
│   ├── idf_score_calculator.py
│   └── tfidf_ordering.py
└── use_with_mlp.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 SeerLabs
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sbdsubjectclassifier: Scholarly Big Data Subject Category Classifier
2 |
3 | In this project, we classify research papers into their subject areas. By splitting abstracts into words and converting each word into an n-dimensional word embedding,
4 | we project the text data into a vector space with time steps. To select the words, we use TF-IDF weights to find the most important words in each abstract and sort them by their TF-IDF values.
5 | To classify the data, we use two flavors of recurrent neural networks (RNNs): LSTM and GRU. We tried several word embedding (WE) models, including
6 | GloVe, SciBERT and fastText. We also use the Universal Sentence Encoder (USE) with a multi-layer perceptron (MLP) and a character-level CNN to compare against the RNN models above.
7 |
8 | ### Requirements:
9 | 1. Keras
10 | 2. tensorflow (tensorflow_gpu recommended)
11 | 3. nltk
12 | 4. pandas
13 | 5. sklearn
14 | 6. tensorflow_hub (for USE)
15 |
16 | ### Models
17 | [RNNs with WE](https://github.com/SeerLabs/sbdsubjectclassifier/tree/master/rnn_model):
18 | This model takes a text data path (a .csv file with abstracts and labels) and a WE file path as inputs and classifies the data using RNNs. A minimal sketch of the expected CSV layout is shown below.
19 |
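The CSV is assumed to have the columns 'abstract' and 'labels' used elsewhere in this repository; the rows and the output filename below are purely illustrative:
```
import pandas as pd

# two columns: raw abstract text and an integer subject-category label
data = pd.DataFrame({
    'abstract': ['We study recurrent neural networks for text classification ...',
                 'We analyse protein folding dynamics using molecular simulations ...'],
    'labels': [0, 1],
})
data.to_csv('abstracts.csv', index=False)
```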
20 | #### Step 1: Clean the data.
21 | To clean and order the data, use the [tf_idf sorting module](https://github.com/SeerLabs/sbdsubjectclassifier/blob/master/tfidf_ordering/tfidf_ordering.py).
22 | This module splits each abstract into words and removes stopwords, punctuation and numbers. The remaining words are lemmatized and then sorted by their TF-IDF values (an illustrative IDF computation is sketched after the usage example below).
23 | This module takes 3 arguments:
24 | 1. data_path : path to the text data.
25 | 2. max_len (optional) : maximum number of words to retain from each text sequence (in our case, an abstract). Default: 80.
26 | 3. tfidf_sorting (optional) : boolean. Set to True to sort the words by their TF-IDF values; False retains the original word order instead of sorting. Default: True.
27 | ```
28 | from tfidf_ordering import tfidf_ordering
29 | tfidf_ordering(data_path,tfidf_sorting=True,max_len=80)
30 | ```
31 | The cleaned data will be saved in a csv file named 'final_tfidf_ordered_data.csv'.
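As a rough illustration of the IDF weighting behind the ordering (a toy sketch, not the project's exact pipeline; the real computation lives in idf_score_calculator.py):
```
import math

# toy corpus of three tokenized "abstracts"
docs = [
    ['neural', 'network', 'classification'],
    ['graph', 'neural', 'model'],
    ['protein', 'structure', 'model'],
]

# document frequency of each word
df = {}
for doc in docs:
    for word in set(doc):
        df[word] = df.get(word, 0) + 1

# idf(word) = log10(N / df(word)), as in idf_score_calculator.py
idf = {word: math.log10(len(docs) / count) for word, count in df.items()}

# words of the first abstract, rarest (highest IDF) first
print(sorted(docs[0], key=lambda w: idf[w], reverse=True))
```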
32 |
33 | #### Step 2: Run the model.
34 | After cleaning the data, build and run the [model](https://github.com/SeerLabs/sbdsubjectclassifier/tree/master/rnn_model) to classify the data. Arguments for the model are:
35 | 1. abstracts_path : path to the cleaned data, i.e. 'final_tfidf_ordered_data.csv'.
36 | 2. WE_path : path to the WE file.
37 | 3. max_len (optional) : maximum number of words to retain from each text sequence (in our case, an abstract). Default: 80.
38 | 4. nodes (optional) : number of RNN cells in each layer. Default: 128.
39 | 5. layers (optional) : number of RNN layers. Default: 2.
40 | 6. loss (optional) : loss function. Default: 'categorical_crossentropy'.
41 | 7. optimizer (optional) : Default: 'Adam'.
42 | 8. activation (optional) : Default: 'tanh'.
43 | 9. dropout (optional) : dropout fraction. Default: 0.2.
44 | 10. batch_size (optional) : size of each mini-batch for stochastic gradient descent. Default: 1000.
45 | 11. epochs (optional) : number of training epochs. Default: 50.
46 | 12. gpus (optional) : number of GPUs for multi-GPU training. Default: None. If None, the CPU model is used.
47 |
48 | Run the following to create a model object from abstracts_path and WE_path (add optional arguments if required); it prints the accuracy, precision, recall and F1-score of the classification. An example with optional arguments set explicitly follows the snippet below.
49 | ```
50 | from RnnModelMain import Model
51 | Model(abstracts_path, WE_path)
52 | ```
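A run with some of the optional arguments set explicitly might look like the following sketch (the embedding file name and the argument values are illustrative, not part of the repository):
```
from RnnModelMain import Model

Model('final_tfidf_ordered_data.csv', 'glove.6B.300d.txt',
      max_len=80, nodes=128, layers=2, dropout=0.2,
      batch_size=1000, epochs=50, gpus=2)
```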
53 | [character-level CNN](https://github.com/SeerLabs/sbdsubjectclassifier/tree/master/char_cnn_model):
54 | Similarly, to run the character-level CNN model, use the following:
55 |
56 | ```
57 | from char_cnn import char_cnn
58 | char_cnn(abstracts_path)
59 | ```
60 | Optional arguments are: batch_size, epochs and gpus.
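For example, with the optional arguments set explicitly (the values are illustrative):
```
from char_cnn import char_cnn
char_cnn(abstracts_path, batch_size=500, epochs=20, gpus=2)
```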
61 |
62 |
63 | [USE with MLP](https://github.com/SeerLabs/sbdsubjectclassifier/blob/master/use_with_mlp.py):
64 | To run the USE with MLP model, use the following:
65 |
66 | ```
67 | from use_with_mlp import MlpModelWithUSE
68 | MlpModelWithUSE(abstracts_path)
69 | ```
70 | Optional arguments are: nodes, layers, loss, optimizer, activation, dropout, batch_size, epochs and gpus.
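For example, with a few of the optional arguments set explicitly (the values are illustrative):
```
from use_with_mlp import MlpModelWithUSE
MlpModelWithUSE(abstracts_path, nodes=256, layers=4, dropout=0.2, epochs=30)
```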
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/char_cnn_model/char_cnn.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 | import string
4 | from sklearn.model_selection import train_test_split
5 |
6 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
7 | import numpy as np
8 | import pandas as pd
9 | from cnn_model import cnn_model
10 |
11 |
12 | class char_cnn:
13 |
14 | def __init__(self, data_path, batch_size=1000,epochs=50, gpus= None):
15 | self.abstracts_path = data_path
16 | self.batch_size = batch_size
17 | self.nb_filter = 256
18 | self.dense_outputs = 1024
19 | self.filter_kernels = [7, 7, 3, 3, 3, 3]
20 | self.gpus = gpus
21 | self.epochs = epochs
22 | self.main()
23 |
24 | def main(self):
25 |         abstracts_file = pd.read_csv(self.abstracts_path, usecols=['abstract', 'labels'])
26 |         abstracts = abstracts_file['abstract']
27 |         labels = np.array(abstracts_file['labels'], dtype=np.int16)
28 | classes = len(set(labels))
29 | alphabet = set(list(string.ascii_lowercase) + list(string.digits) +
30 | list(string.punctuation) + ['\n'])
31 | vocab_size = len(alphabet)
32 | vocab = dict()
33 | for ix, t in enumerate(alphabet):
34 | vocab[t] = ix
35 |
36 | X_train, X_test, y_train, y_test = train_test_split(abstracts, labels, stratify=labels, test_size=0.1,
37 | random_state=42)
38 |
39 | model = cnn_model(self.filter_kernels, self.dense_outputs,vocab_size,
40 | self.nb_filter, classes,self.batch_size,vocab,gpus=self.gpus)
41 |
42 | model_dnn = model.create_model()
43 | model_dnn = model.train(self.batch_size, self.epochs, X_train, y_train, classes,
44 | model_dnn)
45 | y_pred = model.test(X_test, self.batch_size, model_dnn)
46 | print(f1_score(y_test, y_pred, average='micro'))
47 | print(recall_score(y_test, y_pred, average='micro'))
48 | print(precision_score(y_test, y_pred, average='micro'))
49 | print(accuracy_score(y_test, y_pred))
50 |
51 | if __name__ == '__main__':
52 |     import sys
53 |     char_cnn(sys.argv[1])  # path to the abstracts CSV passed on the command line
54 |
--------------------------------------------------------------------------------
/char_cnn_model/cnn_model.py:
--------------------------------------------------------------------------------
1 | from keras.models import Model
2 | from keras.optimizers import SGD, Adam
3 | from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding
4 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
5 | from keras.initializers import RandomNormal
6 | import tensorflow as tf
7 | from keras.utils import multi_gpu_model, to_categorical
8 | import numpy as np
9 |
10 |
11 | class cnn_model:
12 |
13 | def __init__(self, filter_kernels, dense_outputs,vocab_size, nb_filter, classes, batch_size,vocab, max_len= 1014,gpus=None):
14 | self.filter_kernels = filter_kernels
15 | self.dense_outputs = dense_outputs
16 | self.max_len = max_len
17 | self.vocab_size = vocab_size
18 | self.nb_filter = nb_filter
19 | self.classes = classes
20 | self.gpus = gpus
21 | self.batch_size = batch_size
22 | self.vocab = vocab
23 |
24 | def one_hot(self, x):
25 | return tf.one_hot(x, self.vocab_size, on_value=1.0, off_value=0.0, axis=-1, dtype=tf.float32)
26 |
27 | def one_hot_outshape(self, in_shape):
28 | return in_shape[0], in_shape[1], self.vocab_size
29 |
30 | def create_model(self):
31 | initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None)
32 |
33 | # Define what the input shape looks like
34 | inputs = Input(shape=(self.max_len,), dtype='int64')
35 |
36 | embedded = Lambda(self.one_hot, output_shape=self.one_hot_outshape)(inputs)
37 |
38 | # All the convolutional layers...
39 | conv = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[0], kernel_initializer=initializer,
40 | padding='valid', activation='relu',
41 | input_shape=(self.max_len, self.vocab_size))(embedded)
42 | conv = MaxPooling1D(pool_size=3)(conv)
43 |
44 | conv1 = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[1],
45 | kernel_initializer=initializer,
46 | padding='valid', activation='relu')(conv)
47 | conv1 = MaxPooling1D(pool_size=3)(conv1)
48 |
49 | conv2 = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[2],
50 | kernel_initializer=initializer,
51 | padding='valid', activation='relu')(conv1)
52 |
53 | conv3 = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[3],
54 | kernel_initializer=initializer,
55 | padding='valid', activation='relu')(conv2)
56 |
57 | conv4 = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[4],
58 | kernel_initializer=initializer,
59 | padding='valid', activation='relu')(conv3)
60 |
61 | conv5 = Convolution1D(filters=self.nb_filter, kernel_size=self.filter_kernels[5],
62 | kernel_initializer=initializer,
63 | padding='valid', activation='relu')(conv4)
64 | conv5 = MaxPooling1D(pool_size=3)(conv5)
65 | conv5 = Flatten()(conv5)
66 |
67 | # Two dense layers with dropout of .5
68 | z = Dropout(0.5)(Dense(self.dense_outputs, activation='relu')(conv5))
69 | z = Dropout(0.5)(Dense(self.dense_outputs, activation='relu')(z))
70 |
71 | # Output dense layer with softmax activation
72 | pred = Dense(self.classes, activation='softmax', name='output')(z)
73 |
74 | model = Model(inputs=inputs, outputs=pred)
75 |
76 | adam = Adam(lr=0.001)
77 |         if self.gpus is not None:
78 | model = multi_gpu_model(model, gpus=self.gpus)
79 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
80 | return model
81 |
82 | def encode_data(self, x, maxlen, vocab):
83 | # Iterate over the loaded data and create a matrix of size (len(x), maxlen)
84 | # Each character is encoded into a one-hot array later at the lambda layer.
85 | # Chars not in the vocab are encoded as -1, into an all zero vector.
86 |
87 |         input_data = np.zeros((len(x), maxlen), dtype=np.int64)
88 |         for dix, sent in enumerate(x):
89 |             counter = 0
90 |             for c in sent:
91 |                 if c == ' ':
92 |                     continue  # spaces are skipped entirely
93 |                 if counter >= maxlen:
94 |                     break
95 |                 ix = vocab.get(c, -1)  # index from the vocab dictionary; -1 becomes an all-zero one-hot vector
96 |                 input_data[dix, counter] = ix
97 |                 counter += 1
98 |         return input_data
102 |
103 | def train(self,batch_size,epochs,X_train, y_train,classes,model):
104 | for n_epoch in range(epochs):
105 | batch_length = int(len(X_train) / batch_size)
106 | for batch in range(batch_length + 1):
107 | if batch == batch_length:
108 | x1 = batch * batch_size
109 | train_data = X_train[x1:]
110 | label_data = y_train[x1:]
111 | else:
112 | x1 = batch * batch_size
113 | x2 = (batch + 1) * batch_size
114 | train_data = X_train[x1:x2]
115 | label_data = y_train[x1:x2]
116 | model.train_on_batch(self.encode_data(train_data, self.max_len, self.vocab), to_categorical(label_data, classes))
117 | return model
118 |
119 | def test(self, X_test,batch_size,model):
120 | y_pred = np.array([])
121 | batch_length = int(len(X_test) / batch_size)
122 | for batch in range(batch_length + 1):
123 | if batch == batch_length:
124 | x1 = batch * batch_size
125 | test_data = X_test[x1:]
126 | else:
127 | x1 = batch * batch_size
128 | x2 = (batch + 1) * batch_size
129 | test_data = X_test[x1:x2]
130 |             y_pred = np.append(y_pred, model.predict(self.encode_data(test_data, self.max_len, self.vocab)).argmax(axis=-1))
131 | return y_pred
132 |
--------------------------------------------------------------------------------
/rnn_model/RnnModelGenerator.py:
--------------------------------------------------------------------------------
1 | from keras import Sequential
2 | from keras.layers import Bidirectional, GRU, Dropout, Dense, LSTM
3 | from keras.utils import to_categorical
4 | from keras_preprocessing.sequence import pad_sequences
5 | import numpy as np
6 | import nltk as nk
7 |
8 |
9 | class RnnModels:
10 |
11 | def __init__(self,nodes, layers, classes,loss,optimizer,activation,input_shape,dropout):
12 | self.nodes = nodes
13 | self.layers = layers
14 | self.classes = classes
15 | self.loss = loss
16 | self.optimizer = optimizer
17 | self.activation = activation
18 | self.input_shape = input_shape
19 | self.dropout = dropout
20 |
21 |
22 | def get_bigru_model(self):
23 | model_dnn = Sequential()
24 | model_dnn.add(Bidirectional(GRU(self.nodes, return_sequences=True,activation=self.activation),
25 | input_shape=self.input_shape))
26 | model_dnn.add(Dropout(self.dropout))
27 |
28 | for i in range(int(self.layers-2)):
29 | model_dnn.add(Bidirectional(GRU(self.nodes, return_sequences=True,activation=self.activation)))
30 | model_dnn.add(Dropout(self.dropout))
31 | model_dnn.add(Bidirectional(GRU(self.nodes, return_sequences=False,activation=self.activation)))
32 | model_dnn.add(Dropout(self.dropout))
33 |         model_dnn.add(Dense(self.classes, activation='softmax'))  # softmax to match the categorical_crossentropy loss
34 | model_dnn.compile(loss= self.loss, optimizer=self.optimizer, metrics=['accuracy'])
35 | return model_dnn
36 |
37 | def get_bilstm_model(self):
38 | model_dnn = Sequential()
39 | model_dnn.add(Bidirectional(LSTM(self.nodes, return_sequences=True,activation=self.activation),
40 | input_shape=self.input_shape))
41 | model_dnn.add(Dropout(self.dropout))
42 | for i in range(int(self.layers - 2)):
43 | model_dnn.add(Bidirectional(LSTM(self.nodes, return_sequences=True,activation=self.activation)))
44 | model_dnn.add(Dropout(self.dropout))
45 | model_dnn.add(Bidirectional(LSTM(self.nodes, return_sequences=False,activation=self.activation)))
46 | model_dnn.add(Dropout(self.dropout))
47 |         model_dnn.add(Dense(self.classes, activation='softmax'))
48 | model_dnn.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
49 | return model_dnn
50 |
51 | def get_gru_model(self):
52 | model_dnn = Sequential()
53 | model_dnn.add(GRU(self.nodes, return_sequences=True, input_shape=self.input_shape,
54 | activation=self.activation))
55 | model_dnn.add(Dropout(self.dropout))
56 |
57 | for i in range(int(self.layers - 2)):
58 | model_dnn.add(GRU(self.nodes, return_sequences=True, activation=self.activation))
59 | model_dnn.add(Dropout(self.dropout))
60 | model_dnn.add(GRU(self.nodes, return_sequences=False, activation=self.activation))
61 | model_dnn.add(Dropout(self.dropout))
62 |         model_dnn.add(Dense(self.classes, activation='softmax'))
63 | model_dnn.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
64 | return model_dnn
65 |
66 | def get_lstm_model(self):
67 | model_dnn = Sequential()
68 | model_dnn.add(LSTM(self.nodes, return_sequences=True, input_shape=self.input_shape,
69 | activation=self.activation))
70 | model_dnn.add(Dropout(self.dropout))
71 |
72 | for i in range(int(self.layers - 2)):
73 | model_dnn.add(LSTM(self.nodes, return_sequences=True,activation=self.activation))
74 | model_dnn.add(Dropout(self.dropout))
75 | model_dnn.add(LSTM(self.nodes, return_sequences=False,activation=self.activation))
76 | model_dnn.add(Dropout(self.dropout))
77 |         model_dnn.add(Dense(self.classes, activation='softmax'))
78 | model_dnn.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
79 | return model_dnn
80 |
81 | def mini_batch_generator(self, X_mini, max_len,WE_model):
82 | data = []
83 | count_new = 0
84 | count_in = 0
85 | # X_mini = list(map(lambda x: list(tf_idf_ordering(x)), X_mini))
86 | for abstract in X_mini:
87 | abstract = nk.word_tokenize(abstract)
88 | feature = []
89 | count = 0
90 | for word in abstract:
91 | if count >= max_len:
92 | break
93 | word_new = word.lower()
94 | if word_new in WE_model:
95 | count = count + 1
96 | feature.append(WE_model[word_new])
97 | else:
98 | count_new = count_new + 1
99 | data.append(list(feature))
100 | count_in = count_in + count
101 | # if count_new > 0:
102 | # print(count_in, count_new)
103 | data1 = pad_sequences(data, padding='post', maxlen=max_len)
104 | return np.array(data1, dtype=np.float16)
105 |
106 | def train(self,batch_size,epochs,X_train, y_train,max_len,WE_model,classes,model):
107 | for n_epoch in range(epochs):
108 | batch_length = int(len(X_train) / batch_size)
109 | for batch in range(batch_length + 1):
110 | if batch == batch_length:
111 | x1 = batch * batch_size
112 | train_data = X_train[x1:]
113 | label_data = y_train[x1:]
114 | else:
115 | x1 = batch * batch_size
116 | x2 = (batch + 1) * batch_size
117 | train_data = X_train[x1:x2]
118 | label_data = y_train[x1:x2]
119 | model.train_on_batch(self.mini_batch_generator(train_data,max_len, WE_model),
120 | to_categorical(label_data, classes))
121 | return model
122 |
123 | def test(self, X_test,batch_size,model,max_len,WE_model):
124 | y_pred = np.array([])
125 | batch_length = int(len(X_test) / batch_size)
126 | for batch in range(batch_length + 1):
127 | if batch == batch_length:
128 | x1 = batch * batch_size
129 | test_data = X_test[x1:]
130 | else:
131 | x1 = batch * batch_size
132 | x2 = (batch + 1) * batch_size
133 | test_data = X_test[x1:x2]
134 | y_pred = np.append(y_pred, model.predict_classes(self.mini_batch_generator(test_data,max_len, WE_model)))
135 | return y_pred
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/rnn_model/RnnModelGeneratorGpu.py:
--------------------------------------------------------------------------------
1 | from keras import Sequential
2 | from keras.layers import Bidirectional, CuDNNGRU, Dropout, Dense, CuDNNLSTM, Activation
3 | from keras.utils import to_categorical
4 | from keras_preprocessing.sequence import pad_sequences
5 | import numpy as np
6 | import nltk as nk
7 | from keras import backend as K
8 | import tensorflow as tf
9 |
10 | K.tensorflow_backend._get_available_gpus()
11 | from keras.utils import multi_gpu_model
12 |
13 |
14 | class RnnModelsGpu:
15 |
16 | def __init__(self, nodes, layers, classes, loss, optimizer, activation, input_shape, dropout, gpus):
17 | self.nodes = nodes
18 | self.layers = layers
19 | self.classes = classes
20 | self.loss = loss
21 | self.optimizer = optimizer
22 | self.activation = activation
23 | self.input_shape = input_shape
24 | self.dropout = dropout
25 | self.gpus = gpus
26 |
27 | def get_bigru_model(self):
28 | model_dnn = Sequential()
29 | model_dnn.add(Bidirectional(CuDNNGRU(self.nodes, return_sequences=True),
30 | input_shape=self.input_shape))
31 | model_dnn.add(Dropout(self.dropout))
32 |
33 | for i in range(int(self.layers - 2)):
34 | model_dnn.add(Bidirectional(CuDNNGRU(self.nodes, return_sequences=True)))
35 | model_dnn.add(Dropout(self.dropout))
36 | model_dnn.add(Bidirectional(CuDNNGRU(self.nodes, return_sequences=False)))
37 | model_dnn.add(Dropout(self.dropout))
38 | model_dnn.add(Dense(self.classes))
39 | model_dnn.add(Activation(tf.nn.softmax))
40 | print(model_dnn.summary())
41 | model_gpu = multi_gpu_model(model_dnn, gpus=self.gpus)
42 | model_gpu.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
43 | return model_gpu
44 |
45 | def get_bilstm_model(self):
46 | model_dnn = Sequential()
47 | model_dnn.add(Bidirectional(CuDNNLSTM(self.nodes, return_sequences=True),
48 | input_shape=self.input_shape))
49 | model_dnn.add(Dropout(self.dropout))
50 | for i in range(int(self.layers - 2)):
51 | model_dnn.add(Bidirectional(CuDNNLSTM(self.nodes, return_sequences=True)))
52 | model_dnn.add(Dropout(self.dropout))
53 | model_dnn.add(Bidirectional(CuDNNLSTM(self.nodes, return_sequences=False)))
54 | model_dnn.add(Dropout(self.dropout))
55 | model_dnn.add(Dense(self.classes))
56 | model_dnn.add(Activation(tf.nn.softmax))
57 | model_gpu = multi_gpu_model(model_dnn, gpus=self.gpus)
58 | model_gpu.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
59 | return model_gpu
60 |
61 | def get_gru_model(self):
62 | model_dnn = Sequential()
63 | model_dnn.add(CuDNNGRU(self.nodes, return_sequences=True, input_shape=self.input_shape))
64 | model_dnn.add(Dropout(self.dropout))
65 |
66 | for i in range(int(self.layers - 2)):
67 | model_dnn.add(CuDNNGRU(self.nodes, return_sequences=True))
68 | model_dnn.add(Dropout(self.dropout))
69 | model_dnn.add(CuDNNGRU(self.nodes, return_sequences=False))
70 | model_dnn.add(Dropout(self.dropout))
71 | model_dnn.add(Dense(self.classes))
72 | model_dnn.add(Activation(tf.nn.softmax))
73 | model_gpu = multi_gpu_model(model_dnn, gpus=self.gpus)
74 | model_gpu.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
75 | return model_gpu
76 |
77 | def get_lstm_model(self):
78 | model_dnn = Sequential()
79 | model_dnn.add(CuDNNLSTM(self.nodes, return_sequences=True, input_shape=self.input_shape))
80 | model_dnn.add(Dropout(self.dropout))
81 |
82 | for i in range(int(self.layers - 2)):
83 | model_dnn.add(CuDNNLSTM(self.nodes, return_sequences=True))
84 | model_dnn.add(Dropout(self.dropout))
85 | model_dnn.add(CuDNNLSTM(self.nodes, return_sequences=False))
86 | model_dnn.add(Dropout(self.dropout))
87 | model_dnn.add(Dense(self.classes))
88 | model_dnn.add(Activation(tf.nn.softmax))
89 | model_gpu = multi_gpu_model(model_dnn, gpus=self.gpus)
90 | model_gpu.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
91 | return model_gpu
92 |
93 | def mini_batch_generator(self, X_mini, max_len, WE_model):
94 | data = []
95 | count_new = 0
96 | count_in = 0
97 | # X_mini = list(map(lambda x: list(tf_idf_ordering(x)), X_mini))
98 | for abstract in X_mini:
99 | abstract = nk.word_tokenize(abstract)
100 | feature = []
101 | count = 0
102 | for word in abstract:
103 | if count >= max_len:
104 | break
105 | word_new = word.lower()
106 | if word_new in WE_model:
107 | count = count + 1
108 | feature.append(WE_model[word_new])
109 | else:
110 | count_new = count_new + 1
111 | data.append(list(feature))
112 | count_in = count_in + count
113 | # if count_new > 0:
114 | # print(count_in, count_new)
115 | data1 = pad_sequences(data, padding='post', maxlen=max_len)
116 | return np.array(data1, dtype=np.float16)
117 |
118 | def train(self, batch_size, epochs, X_train, y_train, max_len, WE_model, classes, model):
119 | for n_epoch in range(epochs):
120 | batch_length = int(len(X_train) / batch_size)
121 | for batch in range(batch_length + 1):
122 | if batch == batch_length:
123 | x1 = batch * batch_size
124 | train_data = X_train[x1:]
125 | label_data = y_train[x1:]
126 | else:
127 | x1 = batch * batch_size
128 | x2 = (batch + 1) * batch_size
129 | train_data = X_train[x1:x2]
130 | label_data = y_train[x1:x2]
131 | model.train_on_batch(self.mini_batch_generator(train_data, max_len, WE_model),
132 | to_categorical(label_data, classes))
133 | return model
134 |
135 | def test(self, X_test, batch_size, model, max_len, WE_model):
136 | y_pred = np.array([])
137 | batch_length = int(len(X_test) / batch_size)
138 | for batch in range(batch_length + 1):
139 | if batch == batch_length:
140 | x1 = batch * batch_size
141 | test_data = X_test[x1:]
142 | else:
143 | x1 = batch * batch_size
144 | x2 = (batch + 1) * batch_size
145 | test_data = X_test[x1:x2]
146 |             # take the argmax per batch; appending raw probabilities would flatten them into one long vector
147 |             y_pred = np.append(y_pred, model.predict(self.mini_batch_generator(test_data, max_len, WE_model)).argmax(axis=-1))
148 |         return y_pred
150 |
--------------------------------------------------------------------------------
/rnn_model/RnnModelMain.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
5 | from sklearn.model_selection import train_test_split
6 | from rnn_model.RnnModelGenerator import RnnModels
7 |
8 | from rnn_model.RnnModelGeneratorGpu import RnnModelsGpu
9 |
10 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # hard-coded GPU selection; adjust or remove for your machine
11 |
12 |
13 | class Model:
14 |
15 | def __init__(self, abstracts_path, WE_path, max_len=80, nodes=128, layers=2, loss='categorical_crossentropy',
16 |                  optimizer='Adam', activation='tanh', dropout=0.2, batch_size=1000, epochs=50, tf_idf_sorting=True,
17 | gpus=None):
18 | self.abstracts_path = abstracts_path
19 | self.tf_idf_sorting = tf_idf_sorting
20 | self.WE_path = WE_path
21 | self.max_len = max_len
22 | self.nodes = nodes
23 | self.layers = layers
24 | self.loss = loss
25 | self.optimizer = optimizer
26 | self.activation = activation
27 | self.dropout = dropout
28 | self.batch_size = batch_size
29 | self.epochs = epochs
30 | self.gpus = gpus
31 | self.main()
32 |
33 | def number(self, word):
34 | try:
35 | float(word)
36 | return True
37 | except:
38 | return False
39 |
40 | def get_we_model(self, WE_path):
41 | file = open(WE_path)
42 | WE_model = dict()
43 | for line in file:
44 | splitLine = line.split()
45 | word = splitLine[0]
46 | embedding = np.array([float(val) for val in splitLine[1:]], dtype=np.float32)
47 | WE_model[word] = embedding
48 | return WE_model
49 |
50 |
51 | def main(self):
52 |         abstracts_file = pd.read_csv(self.abstracts_path, usecols=['abstract', 'labels'])
53 |         abstracts = abstracts_file['abstract']
54 |         labels = np.array(abstracts_file['labels'], dtype=np.int16)
55 | classes = len(set(labels))
56 | WE_model = self.get_we_model(self.WE_path)
57 | X_train, X_test, y_train, y_test = train_test_split(abstracts, labels, stratify=labels, test_size=0.1,
58 | random_state=42)
59 | input_shape = (self.max_len, len(WE_model[next(iter(WE_model))]))
60 |
61 | if self.gpus is None:
62 | model = RnnModels(self.nodes, self.layers, classes, self.loss, self.optimizer, self.activation, input_shape,
63 | self.dropout)
64 | else:
65 | model = RnnModelsGpu(self.nodes, self.layers, classes, self.loss, self.optimizer, self.activation,
66 | input_shape,self.dropout, self.gpus)
67 |
68 | model_dnn = model.get_bigru_model()
69 | model_dnn = model.train(self.batch_size, self.epochs, X_train, y_train, self.max_len, WE_model, classes,
70 | model_dnn)
71 | y_pred = model.test(X_test, self.batch_size, model_dnn, self.max_len, WE_model)
72 | print(f1_score(y_test, y_pred, average='micro'))
73 | print(recall_score(y_test, y_pred, average='micro'))
74 | print(precision_score(y_test, y_pred, average='micro'))
75 | print(accuracy_score(y_test, y_pred))
76 |
77 | if __name__ == '__main__':
78 |     import sys
79 |     Model(sys.argv[1], sys.argv[2])  # cleaned abstracts CSV and word-embedding file, passed on the command line
--------------------------------------------------------------------------------
/tfidf_ordering/idf_score_calculator.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import string
3 | from collections import Counter
4 | import pandas as pd
5 | from nltk.corpus import stopwords
6 | import math
7 | from nltk.stem import WordNetLemmatizer
8 | from sklearn.externals import joblib
9 | import nltk as nk
10 | from nltk.corpus import wordnet
11 |
12 |
13 | class IDFScoreCalculator:
14 |
15 | def __init__(self,data_path):
16 | self.data_path = data_path
17 |
18 | def number(self,word):
19 | try:
20 | float(word)
21 | return True
22 | except:
23 | return False
24 |
25 | def get_wordnet_pos(self,word):
26 | tag = nk.pos_tag([word])[0][1][0].upper()
27 | tag_dict = {"J": wordnet.ADJ,
28 | "N": wordnet.NOUN,
29 | "V": wordnet.VERB,
30 | "R": wordnet.ADV}
31 |
32 | return tag_dict.get(tag, wordnet.NOUN)
33 |
34 |
35 | def main(self):
36 | data = pd.read_csv(self.data_path)
37 | data.columns = ['abstract', 'labels']
38 | all_abstracts = data['abstract']
39 | file_count = len(all_abstracts)
40 | trivial_words = stopwords.words('english') + list(string.printable)
41 | lemmatizer = WordNetLemmatizer()
42 | final_list = list(map(lambda x: list(set([lemmatizer.lemmatize(word.lower(),
43 | self.get_wordnet_pos(word.lower())) for word in nk.word_tokenize(x) if
44 | word.lower() not in trivial_words and not self.number(word)])), all_abstracts))
45 |
46 | flatten = [item for sublist in final_list for item in sublist]
47 | idf_z = dict(Counter(flatten))
48 | print(len(idf_z))
49 | print(file_count)
50 | for key in idf_z:
51 | idf_z[key] = math.log10(file_count / idf_z[key])
52 |
53 | print('model completed')
54 |
55 |         try:
56 |             joblib.dump(idf_z, 'idf_weights.pkl')  # cache the IDF weights for later reuse
57 |         except:
58 |             pass
59 |         return idf_z  # return the weights even if caching fails
60 |
61 |
62 | if __name__ == '__main__':
63 |     import sys
64 |     IDFScoreCalculator(sys.argv[1]).main()  # path to the raw abstracts CSV passed on the command line
--------------------------------------------------------------------------------
/tfidf_ordering/tfidf_ordering.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import string
3 | from collections import OrderedDict
4 | import pandas as pd
5 | from nltk.corpus import stopwords
6 | from nltk.stem import WordNetLemmatizer
7 | import nltk as nk
8 | from nltk.corpus import wordnet
9 | from idf_score_calculator import IDFScoreCalculator
10 |
11 |
12 | class tfidf_ordering:
13 |
14 | def __init__(self,data_path,tfidf_sorting=True,max_len=80):
15 | self.tfidf_sorting = tfidf_sorting
16 | self.data_path = data_path
17 | self.max_len = max_len
18 |         self.idf_weights = IDFScoreCalculator(data_path).main()  # dict of word -> IDF weight
19 |         self.main()
20 | def number(self,word):
21 | try:
22 | float(word)
23 | return True
24 | except:
25 | return False
26 |
27 | def get_wordnet_pos(self,word):
28 | tag = nk.pos_tag([word])[0][1][0].upper()
29 | tag_dict = {"J": wordnet.ADJ,
30 | "N": wordnet.NOUN,
31 | "V": wordnet.VERB,
32 | "R": wordnet.ADV}
33 |
34 | return tag_dict.get(tag, wordnet.NOUN)
35 |
36 |
37 | def tf_idf_ordering(self, abstract):
38 | lemmatizer = WordNetLemmatizer()
39 | trivial_words = stopwords.words('english') + list(string.printable)
40 |         # lemmatize with POS (matching the IDF calculator) and de-duplicate while keeping the original word order
41 |         words = list(OrderedDict.fromkeys(lemmatizer.lemmatize(word.lower(), self.get_wordnet_pos(word.lower()))
42 |                      for word in nk.word_tokenize(abstract) if word.lower() not in trivial_words and not self.number(word)))
43 | tf_idf_list = dict()
44 | for word in words:
45 | try:
46 | tf_idf_list[word] = self.idf_weights[word]
47 |             except KeyError:
48 |                 tf_idf_list[word] = 0  # word not present in the IDF dictionary
50 |
51 | if self.tfidf_sorting:
52 |             final_dict = OrderedDict(sorted(tf_idf_list.items(), key=lambda x: x[1], reverse=True)[:self.max_len])
53 | else:
54 | position_list = dict()
55 | pos = 0
56 | for word in words:
57 | position_list[word] = pos
58 | pos = pos + 1
59 |             first_dict = OrderedDict(sorted(tf_idf_list.items(), key=lambda x: x[1], reverse=True))
60 |             # keep the max_len highest-weighted words, then restore their original positions below
61 |             unordered_abstract = list(first_dict)[:self.max_len]
62 | final_dict = dict()
63 | for word in unordered_abstract:
64 | final_dict[word] = position_list[word]
65 | final_dict = OrderedDict(sorted(final_dict.items(), key=lambda x: x[1], reverse=False))
66 | return list(final_dict)
67 |
68 | def main(self):
69 | data = pd.read_csv(self.data_path)
70 | data.columns = ['abstract', 'labels']
71 | final_list = list(map(lambda x: list(self.tf_idf_ordering(x)), data['abstract']))
72 | ordered_list = []
73 | for abstract in final_list:
74 | ordered_list.append(" ".join(abstract))
75 | data['abstract'] = ordered_list
76 | data.to_csv('final_tfidf_ordered_data.csv')
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/use_with_mlp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from keras import Sequential
4 | from keras.layers import Dense, Dropout, Activation
5 | from keras.utils import multi_gpu_model, to_categorical
6 | from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
7 | from sklearn.model_selection import train_test_split
8 | import tensorflow as tf
9 | import tensorflow_hub as hub
10 |
11 | embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
12 |
13 |
14 | class MlpModelWithUSE:
15 |
16 | def __init__(self, abstracts_path, nodes=128, layers=4, loss='categorical_crossentropy',
17 |                  optimizer='Adam', activation='relu', dropout=0.2, batch_size=1000, epochs=50,
18 | gpus=None):
19 | self.abstracts_path = abstracts_path
20 | self.nodes = nodes
21 | self.layers = layers
22 | self.loss = loss
23 | self.optimizer = optimizer
24 | self.activation = activation
25 | self.dropout = dropout
26 | self.batch_size = batch_size
27 | self.epochs = epochs
28 | self.gpus = gpus
29 | self.main()
30 |
31 | def MlpModel(self, nodes, layers, classes, loss, optimizer, activation, input_shape, dropout, gpus):
32 | model_mlp = Sequential()
33 | model_mlp.add(Dense(nodes, input_dim=input_shape, activation=activation))
34 | model_mlp.add(Dropout(dropout))
35 | for i in range(layers - 2):
36 | model_mlp.add(Dense(nodes, activation=activation))
37 | model_mlp.add(Dropout(dropout))
38 | model_mlp.add(Dense(classes))
39 | model_mlp.add(Activation(tf.nn.softmax))
40 | if gpus is None:
41 | model_mlp.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
42 | return model_mlp
43 | model_gpu = multi_gpu_model(model_mlp, gpus=self.gpus)
44 | model_gpu.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
45 | return model_gpu
46 |
47 | def mini_batch_generator(self, train_data):
48 | with tf.Session() as session:
49 | session.run([tf.global_variables_initializer(), tf.tables_initializer()])
50 | return session.run(embed(train_data))
51 |
52 | def train(self, batch_size, epochs, X_train, y_train, classes, model):
53 | for n_epoch in range(epochs):
54 | batch_length = int(len(X_train) / batch_size)
55 | for batch in range(batch_length + 1):
56 | if batch == batch_length:
57 | x1 = batch * batch_size
58 | train_data = X_train[x1:]
59 | label_data = y_train[x1:]
60 | else:
61 | x1 = batch * batch_size
62 | x2 = (batch + 1) * batch_size
63 | train_data = X_train[x1:x2]
64 | label_data = y_train[x1:x2]
65 | model.train_on_batch(self.mini_batch_generator(train_data),
66 | to_categorical(label_data, classes))
67 | return model
68 |
69 | def test(self, X_test, batch_size, model):
70 | y_pred = np.array([])
71 | batch_length = int(len(X_test) / batch_size)
72 | for batch in range(batch_length + 1):
73 | if batch == batch_length:
74 | x1 = batch * batch_size
75 | test_data = X_test[x1:]
76 | else:
77 | x1 = batch * batch_size
78 | x2 = (batch + 1) * batch_size
79 | test_data = X_test[x1:x2]
80 |             # take the argmax per batch; appending raw probabilities would flatten them into one long vector
81 |             y_pred = np.append(y_pred, model.predict(self.mini_batch_generator(test_data)).argmax(axis=-1))
82 |         return y_pred
84 |
85 | def main(self):
86 |         abstracts_file = pd.read_csv(self.abstracts_path, usecols=['abstract', 'labels'])
87 |         abstracts = abstracts_file['abstract']
88 |         labels = np.array(abstracts_file['labels'], dtype=np.int16)
89 | classes = len(set(labels))
90 | X_train, X_test, y_train, y_test = train_test_split(abstracts, labels, stratify=labels, test_size=0.1,
91 | random_state=42)
92 | model = self.MlpModel(self.nodes, self.layers, classes, self.loss, self.optimizer, self.activation, 512,
93 | self.dropout, self.gpus)
94 | model_dnn = self.train(self.batch_size, self.epochs, X_train, y_train, classes, model)
95 | y_pred = self.test(X_test, self.batch_size, model_dnn)
96 | print(f1_score(y_test, y_pred, average='micro'))
97 | print(recall_score(y_test, y_pred, average='micro'))
98 | print(precision_score(y_test, y_pred, average='micro'))
99 | print(accuracy_score(y_test, y_pred))
100 |
101 | if __name__ == '__main__':
102 |     import sys
103 |     MlpModelWithUSE(sys.argv[1])  # path to the abstracts CSV passed on the command line
--------------------------------------------------------------------------------