├── .idea
│   ├── Text-Classification.iml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── models
│   ├── RMDL.py
│   ├── adversarial_abblstm.py
│   ├── attn_bi_lstm.py
│   ├── attn_lstm_hierarchical.py
│   ├── cnn.py
│   ├── ind_rnn_tc.py
│   ├── modules
│   │   ├── attention.py
│   │   ├── indRNN.py
│   │   └── multihead.py
│   └── multi_head.py
└── utils
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   ├── model_helper.cpython-36.pyc
    │   └── prepare_data.cpython-36.pyc
    ├── model_helper.py
    └── prepare_data.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text-Classification
2 | Implementations of some state-of-the-art text classification models in TensorFlow.
3 |
4 | ## Requirement
5 |
6 | - Python3
7 | - TensorFlow >= 1.4
8 |
9 | Note: The original code was written for TensorFlow 1.4. Since `VocabularyProcessor` is deprecated, the updated code uses `tf.keras.preprocessing.text` for preprocessing instead. The **new** preprocessing function is named `data_preprocessing_v2`.
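
A minimal sketch of how these helpers fit together, mirroring the way the scripts under `models/` call them (the CSV paths and `max_len=32` are simply the values those scripts use, and the explicit import assumes `utils/prepare_data.py` exports these names, as the scripts' wildcard imports suggest):

```python
from utils.prepare_data import load_data, data_preprocessing_v2, split_dataset

# load a 1% sample of the DBpedia training CSV and the full test CSV
x_train, y_train = load_data("dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1e-2, one_hot=False)
x_test, y_test = load_data("dbpedia_data/dbpedia_csv/test.csv", one_hot=False)

# tokenize with tf.keras.preprocessing.text and pad/truncate every sentence to max_len
x_train, x_test, vocab_size = data_preprocessing_v2(x_train, x_test, max_len=32)

# carve a 10% dev split out of the test set
x_test, x_dev, y_test, y_dev, dev_size, test_size = split_dataset(x_test, y_test, 0.1)
```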
10 |
11 | ## Dataset
12 |
13 | You can load the data with
14 |
15 | ```python
16 | dbpedia = tf.contrib.learn.datasets.load_dataset('dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
17 | ```
18 |
19 | Or download it from [Baidu Yun](https://pan.baidu.com/s/1hz4Oh9A4udLzFjudyVDazw).
20 |
21 | ## Attention Is All You Need
22 |
23 | Paper: [Attention Is All You Need](http://arxiv.org/abs/1706.03762)
24 |
25 | See multi_head.py
26 |
27 | Use self-attention where **Query = Key = Value = sentence after word embedding**
28 |
29 | Multihead Attention module is implemented by [Kyubyong](https://github.com/Kyubyong/transformer)
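
A minimal usage sketch of the self-attention setup described above. It assumes the module is imported as `from modules.multihead import multihead_attention` (the same style the other scripts use for `modules.attention`); the sizes are illustrative rather than the exact ones in `multi_head.py`:

```python
import tensorflow as tf
from modules.multihead import multihead_attention

# embedded sentence: (batch_size, max_len, embedding_size)
batch_embedded = tf.placeholder(tf.float32, [None, 32, 128])

# self-attention: Query = Key = Value = the embedded sentence
outputs = multihead_attention(queries=batch_embedded,
                              keys=batch_embedded,
                              num_units=128,   # must be divisible by num_heads
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True)
```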
30 |
31 | ## IndRNN for Text Classification
32 |
33 | Paper: [Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN](https://arxiv.org/abs/1803.04831)
34 |
35 | IndRNNCell is implemented by [batzener](https://github.com/batzner/indrnn)
36 |
37 | ## Attention-Based Bidirectional LSTM for Text Classification
38 |
39 | Paper: [Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification](http://www.aclweb.org/anthology/P16-2034)
40 |
41 | See attn_bi_lstm.py
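
The attention layer in `attn_bi_lstm.py` follows the paper: with $H$ the element-wise sum of the forward and backward LSTM outputs and $w$ a learned vector,

$$M = \tanh(H), \qquad \alpha = \mathrm{softmax}(w^{\top} M), \qquad r = H \alpha^{\top}, \qquad h^{*} = \tanh(r),$$

and $h^{*}$ is passed through dropout and a dense layer to produce the logits.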
42 |
43 | ## Hierarchical Attention Networks for Text Classification
44 |
45 | Paper: [Hierarchical Attention Networks for Document Classification](http://aclweb.org/anthology/N16-1174)
46 |
47 | See attn_lstm_hierarchical.py
48 |
49 | Attention module is implemented by [ilivans/tf-rnn-attention](https://github.com/ilivans/tf-rnn-attention).
50 |
51 | ## Adversarial Training Methods For Supervised Text Classification
52 |
53 | Paper: [Adversarial Training Methods For Semi-Supervised Text Classification](http://arxiv.org/abs/1605.07725)
54 |
55 | See: adversarial_abblstm.py
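
In `adversarial_abblstm.py` the adversarial example is built by perturbing the normalized word embeddings $e$ along the gradient of the classification loss (see `scale_l2` and `_add_perturbation`):

$$g = \nabla_{e} L(e, y), \qquad r_{adv} = \epsilon \, \frac{g}{\lVert g \rVert_2},$$

and the training objective is the clean loss $L(e, y)$ plus the adversarial loss $L(e + r_{adv}, y)$.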
56 |
57 |
58 | ## Convolutional Neural Networks for Sentence Classification
59 |
60 | Paper: [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882)
61 |
62 | See: cnn.py
63 |
64 |
65 | ## RMDL: Random Multimodel Deep Learning for Classification
66 |
67 | Paper: [RMDL: Random Multimodel Deep Learning for Classification](https://arxiv.org/abs/1805.01890)
68 |
69 | See: RMDL.py
70 | See: [RMDL Github](https://github.com/kk7nc/RMDL)
71 |
72 |
73 |
74 |
75 | **Note**: The parameters are not fine-tuned; feel free to adjust them or modify the models as you wish.
76 | ## Performance
77 |
78 | | Model | Test Accuracy | Notes |
79 | | ----------------------------------- | ------------- | ----------------------- |
80 | | Attention-based Bi-LSTM | 98.23 % | |
81 | | HAN | 89.15% | 1080Ti 10 epochs 12 min |
82 | | Adversarial Attention-based Bi-LSTM | 98.5% | AWS p2 2 hours |
83 | | IndRNN | 98.39% | 1080Ti 10 epochs 10 min |
84 | | Attention Is All You Need | 97.81% | 1080Ti 15 epochs 8 min |
85 | | RMDL | 98.91% | 2X Tesla Xp (3 RDLs) |
86 | | CNN | 98.37% | |
87 |
88 | ## Welcome To Contribute
89 |
90 | If you have any models implemented with great performance, you're welcome to contribute. Also, I'm glad to help if you have any problems with the project; feel free to raise an issue.
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/models/RMDL.py:
--------------------------------------------------------------------------------
1 | '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
2 | RMDL: Random Multimodel Deep Learning for Classification
3 |
4 | * Copyright (C) 2018 Kamran Kowsari
5 | * Last Update: May 3rd, 2018
6 | * This file is part of RMDL project, University of Virginia.
7 | * Free to use, change, share and distribute source code of RMDL
8 | * Referenced paper : RMDL: Random Multimodel Deep Learning for Classification
9 | * Referenced paper : An Improvement of Data Classification using Random Multimodel Deep Learning (RMDL)
10 | * Comments and Error: email: kk7nc@virginia.edu
11 | '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
12 |
13 |
14 | import os
15 | from RMDL import text_feature_extraction as txt
16 | from sklearn.model_selection import train_test_split
17 | from RMDL.Download import Download_WOS as WOS
18 | import numpy as np
19 | from RMDL import RMDL_Text as RMDL
20 | import tensorflow as tf
22 | import pandas as pd
24 | from tensorflow.keras.preprocessing.sequence import pad_sequences
25 | from sklearn.utils import shuffle
26 |
27 | def load_data(file_name, sample_ratio=1, n_class=15, one_hot=True):
28 | '''load data from .csv file'''
29 | names = ["class", "title", "content"]
30 | csv_file = pd.read_csv(file_name, names=names)
31 | shuffle_csv = csv_file.sample(frac=sample_ratio)
32 | x = pd.Series(shuffle_csv["content"])
33 | y = pd.Series(shuffle_csv["class"])
34 | if one_hot:
35 | y = to_one_hot(y, n_class)
36 | return x, y
37 |
38 |
39 |
40 | if __name__ == "__main__":
41 | dbpedia = tf.contrib.learn.datasets.load_dataset('dbpedia')
42 |
43 |
44 | X_train, y_train = load_data("dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1e-2, one_hot=False)
45 | X_test, y_test = load_data("dbpedia_data/dbpedia_csv/test.csv", one_hot=False)
46 |
47 |
48 | batch_size = 100
49 | sparse_categorical = 0
50 | n_epochs = [500, 500, 500] ## DNN--RNN-CNN
51 | Random_Deep = [3, 3, 3] ## DNN--RNN-CNN
52 |
53 | RMDL.Text_Classification(X_train, y_train, X_test, y_test,
54 | batch_size=batch_size,
55 | sparse_categorical=True,
56 | random_deep=Random_Deep,
57 | epochs=n_epochs)
58 |
--------------------------------------------------------------------------------
/models/adversarial_abblstm.py:
--------------------------------------------------------------------------------
1 | from tensorflow.contrib.rnn import BasicLSTMCell
2 | from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
3 | import time
4 | from utils.prepare_data import *
5 | from utils.model_helper import *
6 |
7 |
8 | def scale_l2(x, norm_length):
9 | # shape(x) = (batch, num_timesteps, d)
10 | # Divide x by max(abs(x)) for a numerically stable L2 norm.
11 | # 2norm(x) = a * 2norm(x/a)
12 | # Scale over the full sequence, dims (1, 2)
13 | alpha = tf.reduce_max(tf.abs(x), (1, 2), keepdims=True) + 1e-12
14 | l2_norm = alpha * tf.sqrt(
15 | tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keepdims=True) + 1e-6)
16 | x_unit = x / l2_norm
17 | return norm_length * x_unit
18 |
19 |
20 | def normalize(emb, weights):
21 |     # weights = vocab_freqs / tf.reduce_sum(vocab_freqs) -- is this implementation correct?
22 | print("Weights: ", weights)
23 | mean = tf.reduce_sum(weights * emb, 0, keep_dims=True)
24 | var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keep_dims=True)
25 | stddev = tf.sqrt(1e-6 + var)
26 | return (emb - mean) / stddev
27 |
28 |
29 | class AdversarialClassifier(object):
30 | def __init__(self, config):
31 | self.max_len = config["max_len"]
32 | self.hidden_size = config["hidden_size"]
33 | self.vocab_size = config["vocab_size"]
34 | self.embedding_size = config["embedding_size"]
35 | self.n_class = config["n_class"]
36 | self.learning_rate = config["learning_rate"]
37 | self.epsilon = config["epsilon"]
38 |
39 | # placeholder
40 | self.x = tf.placeholder(tf.int32, [None, self.max_len])
41 | self.label = tf.placeholder(tf.int32, [None])
42 | self.keep_prob = tf.placeholder(tf.float32)
43 |
44 | def _add_perturbation(self, embedded, loss):
45 | """Adds gradient to embedding and recomputes classification loss."""
46 | grad, = tf.gradients(
47 | loss,
48 | embedded,
49 | aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
50 | grad = tf.stop_gradient(grad)
51 | perturb = scale_l2(grad, self.epsilon)
52 | return embedded + perturb
53 |
54 | def _get_freq(self, vocab_freq, word2idx):
55 | """get a frequency dict format as {word_idx: word_freq}"""
56 | words = vocab_freq.keys()
57 | freq = [0] * self.vocab_size
58 | for word in words:
59 | word_idx = word2idx.get(word)
60 | word_freq = vocab_freq[word]
61 | freq[word_idx] = word_freq
62 | return freq
63 |
64 | def build_graph(self, vocab_freq, word2idx):
65 | vocab_freqs = tf.constant(self._get_freq(vocab_freq, word2idx),
66 | dtype=tf.float32, shape=(self.vocab_size, 1))
67 | weights = vocab_freqs / tf.reduce_sum(vocab_freqs)
68 | embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
69 | trainable=True, name="embedding_var")
70 | embedding_norm = normalize(embeddings_var, weights)
71 | batch_embedded = tf.nn.embedding_lookup(embedding_norm, self.x)
72 |
73 | W = tf.Variable(tf.random_normal([self.hidden_size], stddev=0.1))
74 | W_fc = tf.Variable(tf.truncated_normal([self.hidden_size, self.n_class], stddev=0.1))
75 | b_fc = tf.Variable(tf.constant(0., shape=[self.n_class]))
76 |
77 | def cal_loss_logit(embedded, keep_prob, reuse=True, scope="loss"):
78 | with tf.variable_scope(scope, reuse=reuse) as scope:
79 | rnn_outputs, _ = bi_rnn(BasicLSTMCell(self.hidden_size),
80 | BasicLSTMCell(self.hidden_size),
81 | inputs=embedded, dtype=tf.float32)
82 |
83 | # Attention
84 | H = tf.add(rnn_outputs[0], rnn_outputs[1]) # fw + bw
85 | M = tf.tanh(H) # M = tanh(H) (batch_size, seq_len, HIDDEN_SIZE)
86 |                 # attention weights over time steps: (batch_size, max_len)
87 |                 alpha = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
88 |                                                            tf.reshape(W, [-1, 1])),
89 |                                                  [-1, self.max_len]))
90 |                 r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(alpha, [-1, self.max_len, 1]))  # (batch_size, hidden_size, 1)
91 | r = tf.squeeze(r)
92 | h_star = tf.tanh(r)
93 | drop = tf.nn.dropout(h_star, keep_prob)
94 |
95 | # Fully connected layer(dense layer)
96 | y_hat = tf.nn.xw_plus_b(drop, W_fc, b_fc)
97 |
98 | return y_hat, tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat, labels=self.label))
99 |
100 | logits, self.cls_loss = cal_loss_logit(batch_embedded, self.keep_prob, reuse=False)
101 | embedding_perturbated = self._add_perturbation(batch_embedded, self.cls_loss)
102 | adv_logits, self.adv_loss = cal_loss_logit(embedding_perturbated, self.keep_prob, reuse=True)
103 | self.loss = self.cls_loss + self.adv_loss
104 |
105 | # optimization
106 | loss_to_minimize = self.loss
107 | tvars = tf.trainable_variables()
108 | gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
109 | grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)
110 |
111 | self.global_step = tf.Variable(0, name="global_step", trainable=False)
112 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
113 | self.train_op = self.optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step,
114 | name='train_step')
115 | self.prediction = tf.argmax(tf.nn.softmax(logits), 1)
116 |
117 | print("graph built successfully!")
118 |
119 |
120 | if __name__ == '__main__':
121 | # load data
122 | x_train, y_train = load_data("../dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1e-2, one_hot=False)
123 | x_test, y_test = load_data("../dbpedia_data/dbpedia_csv/test.csv", one_hot=False)
124 |
125 | # data preprocessing
126 | x_train, x_test, vocab_freq, word2idx, vocab_size = \
127 | data_preprocessing_with_dict(x_train, x_test, max_len=32)
128 | print("train size: ", len(x_train))
129 | print("vocab size: ", vocab_size)
130 |
131 | # split dataset to test and dev
132 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
133 | split_dataset(x_test, y_test, 0.1)
134 | print("Validation Size: ", dev_size)
135 |
136 | config = {
137 | "max_len": 32,
138 | "hidden_size": 64,
139 | "vocab_size": vocab_size,
140 | "embedding_size": 128,
141 | "n_class": 15,
142 | "learning_rate": 1e-3,
143 | "batch_size": 32,
144 | "train_epoch": 10,
145 | "epsilon": 5,
146 | }
147 |
148 | classifier = AdversarialClassifier(config)
149 | classifier.build_graph(vocab_freq, word2idx)
150 |
151 | # auto GPU growth, avoid occupy all GPU memory
152 | tf_config = tf.ConfigProto()
153 | tf_config.gpu_options.allow_growth = True
154 | sess = tf.Session(config=tf_config)
155 |
156 | sess.run(tf.global_variables_initializer())
157 | dev_batch = (x_dev, y_dev)
158 | start = time.time()
159 | for e in range(config["train_epoch"]):
160 |
161 | t0 = time.time()
162 | print("Epoch %d start !" % (e + 1))
163 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, config["batch_size"]):
164 | return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
165 |
166 | t1 = time.time()
167 |
168 | print("Train Epoch time: %.3f s" % (t1 - t0))
169 | dev_acc = run_eval_step(classifier, sess, dev_batch)
170 | print("validation accuracy: %.3f " % dev_acc)
171 |
172 | print("Training finished, time consumed : ", time.time() - start, " s")
173 | print("Start evaluating: \n")
174 | cnt = 0
175 | test_acc = 0
176 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, config["batch_size"]):
177 | acc = run_eval_step(classifier, sess, (x_batch, y_batch))
178 | test_acc += acc
179 | cnt += 1
180 |
181 | print("Test accuracy : %f %%" % (test_acc / cnt * 100))
182 |
--------------------------------------------------------------------------------
/models/attn_bi_lstm.py:
--------------------------------------------------------------------------------
1 | from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
2 | from tensorflow.contrib.rnn import BasicLSTMCell
3 | from utils.prepare_data import *
4 | import time
5 | from utils.model_helper import *
6 |
7 |
8 | class ABLSTM(object):
9 | def __init__(self, config):
10 | self.max_len = config["max_len"]
11 | self.hidden_size = config["hidden_size"]
12 | self.vocab_size = config["vocab_size"]
13 | self.embedding_size = config["embedding_size"]
14 | self.n_class = config["n_class"]
15 | self.learning_rate = config["learning_rate"]
16 |
17 | # placeholder
18 | self.x = tf.placeholder(tf.int32, [None, self.max_len])
19 | self.label = tf.placeholder(tf.int32, [None])
20 | self.keep_prob = tf.placeholder(tf.float32)
21 |
22 | def build_graph(self):
23 | print("building graph")
24 | # Word embedding
25 | embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
26 | trainable=True)
27 | batch_embedded = tf.nn.embedding_lookup(embeddings_var, self.x)
28 |
29 | rnn_outputs, _ = bi_rnn(BasicLSTMCell(self.hidden_size),
30 | BasicLSTMCell(self.hidden_size),
31 | inputs=batch_embedded, dtype=tf.float32)
32 |
33 | fw_outputs, bw_outputs = rnn_outputs
34 |
35 | W = tf.Variable(tf.random_normal([self.hidden_size], stddev=0.1))
36 | H = fw_outputs + bw_outputs # (batch_size, seq_len, HIDDEN_SIZE)
37 | M = tf.tanh(H) # M = tanh(H) (batch_size, seq_len, HIDDEN_SIZE)
38 |
39 | self.alpha = tf.nn.softmax(tf.reshape(tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
40 | tf.reshape(W, [-1, 1])),
41 | (-1, self.max_len))) # batch_size x seq_len
42 | r = tf.matmul(tf.transpose(H, [0, 2, 1]),
43 | tf.reshape(self.alpha, [-1, self.max_len, 1]))
44 | r = tf.squeeze(r)
45 |         h_star = tf.tanh(r)  # (batch_size, HIDDEN_SIZE)
46 |
47 | h_drop = tf.nn.dropout(h_star, self.keep_prob)
48 |
49 | # Fully connected layer(dense layer)
50 | FC_W = tf.Variable(tf.truncated_normal([self.hidden_size, self.n_class], stddev=0.1))
51 | FC_b = tf.Variable(tf.constant(0., shape=[self.n_class]))
52 | y_hat = tf.nn.xw_plus_b(h_drop, FC_W, FC_b)
53 |
54 | self.loss = tf.reduce_mean(
55 | tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat, labels=self.label))
56 |
57 | # prediction
58 | self.prediction = tf.argmax(tf.nn.softmax(y_hat), 1)
59 |
60 | # optimization
61 | loss_to_minimize = self.loss
62 | tvars = tf.trainable_variables()
63 | gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
64 | grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)
65 |
66 | self.global_step = tf.Variable(0, name="global_step", trainable=False)
67 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
68 | self.train_op = self.optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step,
69 | name='train_step')
70 | print("graph built successfully!")
71 |
72 |
73 | if __name__ == '__main__':
74 | # load data
75 | x_train, y_train = load_data("../dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1e-2, one_hot=False)
76 | x_test, y_test = load_data("../dbpedia_data/dbpedia_csv/test.csv", one_hot=False)
77 |
78 | # data preprocessing
79 | x_train, x_test, vocab_size = \
80 | data_preprocessing_v2(x_train, x_test, max_len=32)
81 | print("train size: ", len(x_train))
82 | print("vocab size: ", vocab_size)
83 |
84 | # split dataset to test and dev
85 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
86 | split_dataset(x_test, y_test, 0.1)
87 | print("Validation Size: ", dev_size)
88 |
89 | config = {
90 | "max_len": 32,
91 | "hidden_size": 64,
92 | "vocab_size": vocab_size,
93 | "embedding_size": 128,
94 | "n_class": 15,
95 | "learning_rate": 1e-3,
96 | "batch_size": 4,
97 | "train_epoch": 20
98 | }
99 |
100 | classifier = ABLSTM(config)
101 | classifier.build_graph()
102 |
103 | sess = tf.Session()
104 | sess.run(tf.global_variables_initializer())
105 | dev_batch = (x_dev, y_dev)
106 | start = time.time()
107 | for e in range(config["train_epoch"]):
108 |
109 | t0 = time.time()
110 | print("Epoch %d start !" % (e + 1))
111 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, config["batch_size"]):
112 | return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
113 | attn = get_attn_weight(classifier, sess, (x_batch, y_batch))
114 | # plot the attention weight
115 | # print(np.reshape(attn, (config["batch_size"], config["max_len"])))
116 | t1 = time.time()
117 |
118 | print("Train Epoch time: %.3f s" % (t1 - t0))
119 | dev_acc = run_eval_step(classifier, sess, dev_batch)
120 | print("validation accuracy: %.3f " % dev_acc)
121 |
122 | print("Training finished, time consumed : ", time.time() - start, " s")
123 | print("Start evaluating: \n")
124 | cnt = 0
125 | test_acc = 0
126 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, config["batch_size"]):
127 | acc = run_eval_step(classifier, sess, (x_batch, y_batch))
128 | test_acc += acc
129 | cnt += 1
130 |
131 | print("Test accuracy : %f %%" % (test_acc / cnt * 100))
132 |
--------------------------------------------------------------------------------
/models/attn_lstm_hierarchical.py:
--------------------------------------------------------------------------------
1 | from modules.attention import attention
2 | from tensorflow.contrib.rnn import BasicLSTMCell
3 | from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
4 | import time
5 | from utils.prepare_data import *
6 |
7 | # Hyperparameter
8 | MAX_DOCUMENT_LENGTH = 256
9 | EMBEDDING_SIZE = 128
10 | HIDDEN_SIZE = 64
11 | ATTENTION_SIZE = 64
12 | lr = 1e-3
13 | BATCH_SIZE = 1024
14 | KEEP_PROB = 0.5
15 | LAMBDA = 0.0001
16 | MAX_LABEL = 15
17 | epochs = 10
18 |
19 |
20 | # load data
21 | x_train, y_train = load_data("../dbpedia_csv/train.csv", sample_ratio=1)
22 | x_test, y_test = load_data("../dbpedia_csv/test.csv", sample_ratio=1)
23 |
24 | # data preprocessing
25 | x_train, x_test, vocab_size = \
26 | data_preprocessing_v2(x_train, x_test, MAX_DOCUMENT_LENGTH)
27 | print(vocab_size)
28 |
29 | # split dataset to test and dev
30 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
31 | split_dataset(x_test, y_test, 0.1)
32 | print("Validation size: ", dev_size)
33 |
34 | graph = tf.Graph()
35 | with graph.as_default():
36 |
37 | batch_x = tf.placeholder(tf.int32, [None, MAX_DOCUMENT_LENGTH])
38 | batch_y = tf.placeholder(tf.float32, [None, MAX_LABEL])
39 | keep_prob = tf.placeholder(tf.float32)
40 |
41 | embeddings_var = tf.Variable(tf.random_uniform([vocab_size, EMBEDDING_SIZE], -1.0, 1.0), trainable=True)
42 | batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_x)
43 |     # print(batch_embedded.shape)  # (?, 256, 128)
44 | rnn_outputs, _ = tf.nn.dynamic_rnn(BasicLSTMCell(HIDDEN_SIZE), batch_embedded, dtype=tf.float32)
45 |
46 | # Attention
47 | attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
48 | drop = tf.nn.dropout(attention_output, keep_prob)
49 | shape = drop.get_shape()
50 |
51 | # Fully connected layer(dense layer)
52 | W = tf.Variable(tf.truncated_normal([shape[1].value, MAX_LABEL], stddev=0.1))
53 | b = tf.Variable(tf.constant(0., shape=[MAX_LABEL]))
54 | y_hat = tf.nn.xw_plus_b(drop, W, b)
55 |
56 |
57 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
58 | optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
59 |
60 | # Accuracy metric
61 | prediction = tf.argmax(tf.nn.softmax(y_hat), 1)
62 | accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, tf.argmax(batch_y, 1)), tf.float32))
63 |
64 | with tf.Session(graph=graph) as sess:
65 | sess.run(tf.global_variables_initializer())
66 | print("Initialized! ")
67 |
68 |     print("Start training")
69 | start = time.time()
70 | for e in range(epochs):
71 |
72 | epoch_start = time.time()
73 | print("Epoch %d start !" % (e + 1))
74 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, BATCH_SIZE):
75 | fd = {batch_x: x_batch, batch_y: y_batch, keep_prob: KEEP_PROB}
76 | l, _, acc = sess.run([loss, optimizer, accuracy], feed_dict=fd)
77 |
78 | epoch_finish = time.time()
79 | print("Validation accuracy: ", sess.run([accuracy, loss], feed_dict={
80 | batch_x: x_dev,
81 | batch_y: y_dev,
82 | keep_prob: 1.0
83 | }))
84 | print("epoch finished, time consumed : ", time.time() - epoch_start, " s")
85 |
86 | print("Training finished, time consumed : ", time.time() - start, " s")
87 | print("Start evaluating: \n")
88 | cnt = 0
89 | test_acc = 0
90 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, BATCH_SIZE):
91 | fd = {batch_x: x_batch, batch_y: y_batch, keep_prob: 1.0}
92 | acc = sess.run(accuracy, feed_dict=fd)
93 | test_acc += acc
94 | cnt += 1
95 |
96 | print("Test accuracy : %f %%" % ( test_acc / cnt * 100))
97 |
98 |
99 |
100 |
--------------------------------------------------------------------------------
/models/cnn.py:
--------------------------------------------------------------------------------
1 | from utils.prepare_data import *
2 | import time
3 | from utils.model_helper import *
4 |
5 |
6 | def linear(input_, output_size, scope=None):
7 | """
8 | Linear map: output[k] = sum_i(Matrix[k, i] * input_[i] ) + Bias[k]
9 | Args:
10 | input_: a tensor or a list of 2D, batch x n, Tensors.
11 | output_size: int, second dimension of W[i].
12 | scope: VariableScope for the created subgraph; defaults to "Linear".
13 | Returns:
14 | A 2D Tensor with shape [batch x output_size] equal to
15 | sum_i(input_[i] * W[i]), where W[i]s are newly created matrices.
16 | Raises:
17 | ValueError: if some of the arguments has unspecified or wrong shape.
18 | """
19 |
20 | shape = input_.get_shape().as_list()
21 | if len(shape) != 2:
22 | raise ValueError("Linear is expecting 2D arguments: %s" % str(shape))
23 | if not shape[1]:
24 | raise ValueError("Linear expects shape[1] of arguments: %s" % str(shape))
25 | input_size = shape[1]
26 |
27 | # Now the computation.
28 | with tf.variable_scope(scope or "SimpleLinear"):
29 | matrix = tf.get_variable("Matrix", [output_size, input_size], dtype=input_.dtype)
30 | bias_term = tf.get_variable("Bias", [output_size], dtype=input_.dtype)
31 |
32 | return tf.matmul(input_, tf.transpose(matrix)) + bias_term
33 |
34 |
35 | def highway(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu, scope='Highway'):
36 | """Highway Network (cf. http://arxiv.org/abs/1505.00387).
37 | t = sigmoid(Wy + b)
38 | z = t * g(Wy + b) + (1 - t) * y
39 | where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
40 | """
41 |
42 | with tf.variable_scope(scope):
43 | for idx in range(num_layers):
44 | g = f(linear(input_, size, scope='highway_lin_%d' % idx))
45 |
46 | t = tf.sigmoid(linear(input_, size, scope='highway_gate_%d' % idx) + bias)
47 |
48 | output = t * g + (1. - t) * input_
49 | input_ = output
50 |
51 | return output
52 |
53 |
54 | class CNNClassfier(object):
55 | def __init__(self, config):
56 | # configuration
57 | self.max_len = config["max_len"]
58 | # topic nums + 1
59 | self.num_classes = config["n_class"]
60 | self.vocab_size = config["vocab_size"]
61 | self.embedding_size = config["embedding_size"]
62 | self.filter_sizes = config["filter_sizes"]
63 | self.num_filters = config["num_filters"]
64 | self.l2_reg_lambda = config["l2_reg_lambda"]
65 | self.learning_rate = config["learning_rate"]
66 |
67 | # placeholder
68 | self.x = tf.placeholder(tf.int32, [None, self.max_len], name="input_x")
69 | self.label = tf.placeholder(tf.int32, [None], name="input_y")
70 | self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")
71 |
72 | def build_graph(self):
73 | print("building graph")
74 | l2_loss = tf.constant(0.0)
75 | with tf.variable_scope("discriminator"):
76 | # Embedding:
77 | with tf.device('/cpu:0'), tf.name_scope("embedding"):
78 | self.W = tf.Variable(
79 | tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
80 | name="W")
81 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.x) # batch_size * seq * embedding_size
82 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) # expand dims for conv operation
83 | pooled_outputs = list()
84 | # Create a convolution + max-pool layer for each filter size
85 | for filter_size, filter_num in zip(self.filter_sizes, self.num_filters):
86 | with tf.name_scope("cov2d-maxpool%s" % filter_size):
87 | filter_shape = [filter_size, self.embedding_size, 1, filter_num]
88 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
89 | b = tf.Variable(tf.constant(0.1, shape=[filter_num]), name="b")
90 | conv = tf.nn.conv2d(
91 | self.embedded_chars_expanded,
92 | W,
93 | strides=[1, 1, 1, 1],
94 | padding="VALID",
95 | name="conv")
96 |                     # shape of `conv`: (batch, seq_len - filter_size + 1, 1, filter_num)
98 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
99 | pooled = tf.nn.max_pool(
100 | h,
101 | ksize=[1, self.max_len - filter_size + 1, 1, 1],
102 | strides=[1, 1, 1, 1],
103 | padding='VALID',
104 |                         name="pool")  # pool each feature map down to 1x1
105 | # print(conv.name, ": ", conv.shape , "----", pooled.name, " : " ,pooled.shape)
106 | pooled_outputs.append(pooled)
107 | total_filters_num = sum(self.num_filters)
108 |
109 | self.h_pool = tf.concat(pooled_outputs, 3)
110 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, total_filters_num]) # batch * total_num
111 |
112 | # highway network
113 | with tf.name_scope("highway"):
114 | self.h_highway = highway(self.h_pool_flat, self.h_pool_flat.get_shape()[1], 1, 0)
115 |
116 | # add droppout
117 | with tf.name_scope("dropout"):
118 | self.h_drop = tf.nn.dropout(self.h_highway, self.keep_prob)
119 |
120 | with tf.name_scope("output"):
121 | W = tf.Variable(tf.truncated_normal([total_filters_num, self.num_classes], stddev=0.1), name="W")
122 | b = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="b")
123 | l2_loss += tf.nn.l2_loss(W)
124 | l2_loss += tf.nn.l2_loss(b)
125 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
126 | self.ypred_for_auc = tf.nn.softmax(self.scores)
127 | self.prediction = tf.cast(tf.argmax(self.ypred_for_auc, 1), dtype=tf.int32)
128 |
129 | with tf.name_scope("loss"):
130 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.scores, labels=self.label)
131 |                 self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * l2_loss
132 | with tf.name_scope("accuracy"):
133 | self.accuracy = tf.reduce_mean(
134 | tf.cast(tf.equal(self.prediction, self.label), tf.float32))
135 |
136 | self.params = [param for param in tf.trainable_variables() if 'discriminator' in param.name]
137 | d_optimizer = tf.train.AdamOptimizer(self.learning_rate)
138 |         # aggregation_method=2 helps reduce memory usage
139 | self.global_step = tf.Variable(0, name="global_step", trainable=False)
140 | grads_and_vars = d_optimizer.compute_gradients(self.loss, self.params, aggregation_method=2)
141 | self.train_op = d_optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
142 | print("graph built successfully!")
143 |
144 |
145 | if __name__ == '__main__':
146 | # load data
147 | x_train, y_train = load_data("../dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1, one_hot=False)
148 | x_test, y_test = load_data("../dbpedia_data/dbpedia_csv/test.csv", one_hot=False)
149 |
150 | # data preprocessing
151 | x_train, x_test, vocab_size = \
152 | data_preprocessing_v2(x_train, x_test, max_len=120)
153 | print("train size: ", len(x_train))
154 | print("vocab size: ", vocab_size)
155 |
156 | # split dataset to test and dev
157 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
158 | split_dataset(x_test, y_test, 0.1)
159 | print("Validation Size: ", dev_size)
160 |
161 | config = {
162 | "max_len": 120,
163 | "vocab_size": vocab_size,
164 | "embedding_size": 32,
165 | "learning_rate": 1e-3,
166 | "l2_reg_lambda": 1e-3,
167 | "batch_size": 256,
168 | "n_class": 15,
169 |
170 | # random setting, may need fine-tune
171 | "filter_sizes": [1, 2, 3, 4, 5, 10, 20, 50, 100, 120],
172 | "num_filters": [128, 256, 256, 256, 256, 128, 128, 128, 128, 256],
173 | "train_epoch": 10,
174 | }
175 |
176 | classifier = CNNClassfier(config)
177 | classifier.build_graph()
178 |
179 | # auto GPU growth, avoid occupy all GPU memory
180 | tf_config = tf.ConfigProto()
181 | tf_config.gpu_options.allow_growth = True
182 | sess = tf.Session(config=tf_config)
183 |
184 | sess.run(tf.global_variables_initializer())
185 | dev_batch = (x_dev, y_dev)
186 | start = time.time()
187 | for e in range(config["train_epoch"]):
188 |
189 | t0 = time.time()
190 | print("Epoch %d start !" % (e + 1))
191 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, config["batch_size"]):
192 | return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
193 |
194 | t1 = time.time()
195 |
196 | print("Train Epoch time: %.3f s" % (t1 - t0))
197 | dev_acc = run_eval_step(classifier, sess, dev_batch)
198 | print("validation accuracy: %.3f " % dev_acc)
199 |
200 | print("Training finished, time consumed : ", time.time() - start, " s")
201 | print("Start evaluating: \n")
202 | cnt = 0
203 | test_acc = 0
204 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, config["batch_size"]):
205 | acc = run_eval_step(classifier, sess, (x_batch, y_batch))
206 | test_acc += acc
207 | cnt += 1
208 |
209 | print("Test accuracy : %f %%" % (test_acc / cnt * 100))
210 |
--------------------------------------------------------------------------------
/models/ind_rnn_tc.py:
--------------------------------------------------------------------------------
1 | from modules.indRNN import IndRNNCell
2 | from modules.attention import attention
3 | import time
4 | from utils.prepare_data import *
5 |
6 | # Hyperparameter
7 | MAX_DOCUMENT_LENGTH = 256
8 | EMBEDDING_SIZE = 128
9 | HIDDEN_SIZE = 64
10 | ATTENTION_SIZE = 64
11 | lr = 1e-3
12 | BATCH_SIZE = 1024
13 | KEEP_PROB = 0.5
14 | LAMBDA = 1e-3
15 | MAX_LABEL = 15
16 | epochs = 10
17 |
18 | # load data
19 | x_train, y_train = load_data("../dbpedia_csv/train.csv", sample_ratio=1)
20 | x_test, y_test = load_data("../dbpedia_csv/test.csv", sample_ratio=1)
21 |
22 | # data preprocessing
23 | x_train, x_test, vocab_size = \
24 | data_preprocessing_v2(x_train, x_test, MAX_DOCUMENT_LENGTH)
25 | print(vocab_size)
26 |
27 | # split dataset to test and dev
28 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
29 | split_dataset(x_test, y_test, 0.1)
30 | print("Validation size: ", dev_size)
31 |
32 | graph = tf.Graph()
33 | with graph.as_default():
34 |
35 | batch_x = tf.placeholder(tf.int32, [None, MAX_DOCUMENT_LENGTH])
36 | batch_y = tf.placeholder(tf.float32, [None, MAX_LABEL])
37 | keep_prob = tf.placeholder(tf.float32)
38 |
39 | embeddings_var = tf.Variable(tf.random_uniform([vocab_size, EMBEDDING_SIZE], -1.0, 1.0), trainable=True)
40 | batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_x)
41 |     print(batch_embedded.shape)  # (?, 256, 128)
42 |
43 | cell = IndRNNCell(HIDDEN_SIZE)
44 | rnn_outputs, _ = tf.nn.dynamic_rnn(cell, batch_embedded, dtype=tf.float32)
45 |
46 | # Attention
47 | attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
48 | drop = tf.nn.dropout(attention_output, keep_prob)
49 | shape = drop.get_shape()
50 |
51 | # Fully connected layer(dense layer)
52 | W = tf.Variable(tf.truncated_normal([shape[1].value, MAX_LABEL], stddev=0.1))
53 | b = tf.Variable(tf.constant(0., shape=[MAX_LABEL]))
54 | y_hat = tf.nn.xw_plus_b(drop, W, b)
55 |
56 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
57 | optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
58 |
59 | # Accuracy metric
60 | prediction = tf.argmax(tf.nn.softmax(y_hat), 1)
61 | accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, tf.argmax(batch_y, 1)), tf.float32))
62 |
63 |
64 | with tf.Session(graph=graph) as sess:
65 | sess.run(tf.global_variables_initializer())
66 | print("Initialized! ")
67 |
68 |     print("Start training")
69 | start = time.time()
70 | for e in range(epochs):
71 |
72 | epoch_start = time.time()
73 | print("Epoch %d start !" % (e + 1))
74 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, BATCH_SIZE):
75 | fd = {batch_x: x_batch, batch_y: y_batch, keep_prob: KEEP_PROB}
76 | l, _, acc = sess.run([loss, optimizer, accuracy], feed_dict=fd)
77 |
78 | epoch_finish = time.time()
79 | print("Validation accuracy: ", sess.run([accuracy, loss], feed_dict={
80 | batch_x: x_dev,
81 | batch_y: y_dev,
82 | keep_prob: 1.0
83 | }))
84 | print("Epoch time: ", time.time() - epoch_start, "s")
85 |
86 | print("Training finished, time consumed : ", time.time() - start, " s")
87 | print("Start evaluating: \n")
88 | cnt = 0
89 | test_acc = 0
90 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, BATCH_SIZE):
91 | fd = {batch_x: x_batch, batch_y: y_batch, keep_prob: 1.0}
92 | acc = sess.run(accuracy, feed_dict=fd)
93 | test_acc += acc
94 | cnt += 1
95 |
96 | print("Test accuracy : %f %%" % ( test_acc / cnt * 100))
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/models/modules/attention.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def attention(inputs, attention_size, time_major=False, return_alphas=False):
5 | """
6 | Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector.
7 | The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
8 | for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
9 | Variables notation is also inherited from the article
10 |
11 | Args:
12 | inputs: The Attention inputs.
13 | Matches outputs of RNN/Bi-RNN layer (not final state):
14 | In case of RNN, this must be RNN outputs `Tensor`:
15 | If time_major == False (default), this must be a tensor of shape:
16 | `[batch_size, max_time, cell.output_size]`.
17 | If time_major == True, this must be a tensor of shape:
18 | `[max_time, batch_size, cell.output_size]`.
19 | In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and
20 | the backward RNN outputs `Tensor`.
21 | If time_major == False (default),
22 | outputs_fw is a `Tensor` shaped:
23 | `[batch_size, max_time, cell_fw.output_size]`
24 | and outputs_bw is a `Tensor` shaped:
25 | `[batch_size, max_time, cell_bw.output_size]`.
26 | If time_major == True,
27 | outputs_fw is a `Tensor` shaped:
28 | `[max_time, batch_size, cell_fw.output_size]`
29 | and outputs_bw is a `Tensor` shaped:
30 | `[max_time, batch_size, cell_bw.output_size]`.
31 | attention_size: Linear size of the Attention weights.
32 | time_major: The shape format of the `inputs` Tensors.
33 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
34 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
35 | Using `time_major = True` is a bit more efficient because it avoids
36 | transposes at the beginning and end of the RNN calculation. However,
37 | most TensorFlow data is batch-major, so by default this function
38 | accepts input and emits output in batch-major form.
39 | return_alphas: Whether to return attention coefficients variable along with layer's output.
40 | Used for visualization purpose.
41 | Returns:
42 | The Attention output `Tensor`.
43 | In case of RNN, this will be a `Tensor` shaped:
44 | `[batch_size, cell.output_size]`.
45 | In case of Bidirectional RNN, this will be a `Tensor` shaped:
46 | `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
47 | """
48 |
49 | if isinstance(inputs, tuple):
50 | # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
51 | inputs = tf.concat(inputs, 2)
52 |
53 | if time_major:
54 | # (T,B,D) => (B,T,D)
55 |         inputs = tf.transpose(inputs, [1, 0, 2])
56 |
57 | hidden_size = inputs.shape[2].value # D value - hidden size of the RNN layer
58 |
59 | # Trainable parameters
60 | w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
61 | b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
62 | u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
63 |
64 | with tf.name_scope('v'):
65 | # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
66 | # the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
67 | v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
68 |
69 | # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
70 | vu = tf.tensordot(v, u_omega, axes=1, name='vu') # (B,T) shape
71 | alphas = tf.nn.softmax(vu, name='alphas') # (B,T) shape
72 |
73 | # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
74 | output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
75 |
76 | if not return_alphas:
77 | return output
78 | else:
79 | return output, alphas
--------------------------------------------------------------------------------
/models/modules/indRNN.py:
--------------------------------------------------------------------------------
1 | from tensorflow.python.ops import math_ops
2 | from tensorflow.python.ops import init_ops
3 | from tensorflow.python.ops import nn_ops
4 | from tensorflow.python.ops import clip_ops
5 | from tensorflow.python.layers import base as base_layer
6 | from tensorflow.python.ops.rnn_cell_impl import LayerRNNCell
7 |
8 |
9 | class IndRNNCell(LayerRNNCell):  # inherits from LayerRNNCell
10 |
11 | def __init__(self,
12 | num_units,
13 | recurrent_min_abs=0,
14 | recurrent_max_abs=None,
15 | recurrent_kernel_initializer=None,
16 | input_kernel_initializer=None,
17 | activation=None,
18 | reuse=None,
19 | name=None):
20 | super(IndRNNCell, self).__init__(_reuse=reuse, name=name)
21 |
22 | self.input_spec = base_layer.InputSpec(ndim=2)
23 |
24 | # initialization
25 | self._num_units = num_units
26 | self._recurrent_min_abs = recurrent_min_abs
27 |
28 | self._recurrent_max_abs = recurrent_max_abs
29 | self._recurrent_recurrent_kernel_initializer = recurrent_kernel_initializer
30 | self._input_kernel_initializer = input_kernel_initializer
31 | self._activation = activation or nn_ops.relu
32 |
33 |
34 | @property
35 | def state_size(self):
36 | return self._num_units
37 |
38 | @property
39 | def output_size(self):
40 | return self._num_units
41 |
42 | def build(self, inputs_shape):
43 | '''construct the IndRNN Cell'''
44 | if inputs_shape[1].value is None:
45 |             raise ValueError("Expected inputs.shape[1] to be known")
46 |
47 | input_depth = inputs_shape[1]
48 | if self._input_kernel_initializer is None:
49 | self._input_kernel_initializer = init_ops.random_normal_initializer(mean=0,
50 | stddev=1e-3)
51 | # matrix W
52 | self._input_kernel = self.add_variable(
53 | "input_kernel",
54 | shape=[input_depth, self._num_units],
55 | initializer=self._input_kernel_initializer
56 | )
57 |
58 | if self._recurrent_kernel_initializer is None:
59 | self._recurrent_kernel_initializer = init_ops.constant_initializer(1.)
60 |
61 | # matrix U
62 | self._recurrent_kernel = self.add_variable(
63 | "recurrent_kernel",
64 | shape=[self._num_units],
65 | initializer=self._recurrent_kernel_initializer
66 | )
67 |
68 | # Clip |u| into [recurrent_min_abs, recurrent_max_abs]
69 | if self._recurrent_min_abs:
70 | abs_kernel = math_ops.abs(self._recurrent_kernel)
71 | min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs)
72 | self._recurrent_kernel = math_ops.multiply(
73 | math_ops.sign(self._recurrent_kernel),
74 | min_abs_kernel
75 | )
76 | if self._recurrent_max_abs:
77 | self._recurrent_kernel = clip_ops.clip_by_value(
78 | self._recurrent_kernel,
79 | -self._recurrent_max_abs,
80 | self._recurrent_max_abs
81 | )
82 |
83 | self._bias = self.add_variable(
84 | "bias",
85 | shape=[self._num_units],
86 | initializer=init_ops.zeros_initializer(dtype=self.dtype)
87 | )
88 | # mark the layer as built
89 | self.built = True
90 |
91 |
92 | def call(self, inputs, state):
93 | '''output = new state = activation(W * x + U (*) h_t-1 + b)'''
94 |
95 | gate_inputs = math_ops.matmul(inputs, self._input_kernel)
96 | # (*)
97 | state_update = math_ops.multiply(state, self._recurrent_kernel)
98 | gate_inputs = math_ops.add(gate_inputs, state_update)
99 | gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
100 | output = self._activation(gate_inputs)
101 | return output, output
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
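Usage sketch for the IndRNNCell above, assuming TF 1.x and hypothetical shapes (not taken from the repo; ind_rnn_tc.py is the real driver). The 2**(1/T) clipping bound follows the IndRNN paper's recommendation for ReLU cells:

    import tensorflow as tf
    from modules.indRNN import IndRNNCell      # assuming the script lives under models/, like ind_rnn_tc.py

    TIME_STEPS, NUM_UNITS = 32, 128            # hypothetical hyperparameters
    inputs = tf.placeholder(tf.float32, [None, TIME_STEPS, 100])

    # Bound |u| by 2**(1/T) so the ReLU recurrence cannot blow up over T steps.
    recurrent_max = pow(2, 1.0 / TIME_STEPS)

    cell = IndRNNCell(NUM_UNITS, recurrent_max_abs=recurrent_max)
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    # outputs: (B, TIME_STEPS, NUM_UNITS); final_state: (B, NUM_UNITS)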
/models/modules/multihead.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def layer_normalization(inputs,
5 | epsilon=1e-8,
6 | scope="ln",
7 | reuse=None):
8 | with tf.variable_scope(scope, reuse=reuse):
9 | inputs_shape = inputs.get_shape()
10 | params_shape = inputs_shape[-1:]
11 |
12 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
13 | beta = tf.Variable(tf.zeros(params_shape))
14 | gamma = tf.Variable(tf.ones(params_shape))
15 | normalized = (inputs - mean) / ((variance + epsilon) ** .5)
16 | outputs = gamma * normalized + beta
17 |
18 | return outputs
19 |
20 |
21 | def multihead_attention(queries,
22 | keys,
23 | num_units=None,
24 | num_heads=8,
25 | dropout_rate=0,
26 | is_training=True,
27 | causality=False,
28 | scope="multihead_attention",
29 | reuse=None):
30 | with tf.variable_scope(scope, reuse=reuse):
31 | if num_units is None: # set default size for attention size C
32 | num_units = queries.get_shape().as_list()[-1]
33 |
34 | # Linear Projections
35 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # [N, T_q, C]
36 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # [N, T_k, C]
37 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # [N, T_k, C]
38 |
39 | # Split and concat
40 | Q_ = tf.concat(tf.split(Q, num_heads, axis=-1), axis=0) # [num_heads * N, T_q, C/num_heads]
41 | K_ = tf.concat(tf.split(K, num_heads, axis=-1), axis=0) # [num_heads * N, T_k, C/num_heads]
42 | V_ = tf.concat(tf.split(V, num_heads, axis=-1), axis=0) # [num_heads * N, T_k, C/num_heads]
43 |
44 | # Attention
45 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (num_heads * N, T_q, T_k)
46 |
47 | # Scale : outputs = outputs / sqrt( d_k)
48 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
49 |
50 | # Key Masking
51 | # see : https://github.com/Kyubyong/transformer/issues/3
52 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k)
53 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
54 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
55 |
56 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) # -infinity
57 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)
58 |
59 | # Causality = Future blinding
60 | if causality:
61 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
62 | tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k)
63 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
64 |
65 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
66 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)
67 |
68 | # Activation: outputs is a weight matrix
69 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
70 |
71 | # Query Masking
72 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q)
73 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
74 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
75 | outputs *= query_masks # broadcasting, result shape (h*N, T_q, T_k)
76 |
77 | # dropouts
78 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
79 |
80 | # weighted sum
81 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
82 |
83 | # reshape
84 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C)
85 |
86 | # residual connection
87 | outputs += queries
88 |
89 | # layer normalization
90 | outputs = layer_normalization(outputs)
91 | return outputs
92 |
93 |
94 | def feedforward(inputs,
95 | num_units=[2048, 512],
96 | scope="feedforward",
97 | reuse=None):
98 | with tf.variable_scope(scope, reuse=reuse):
99 | # Inner layer
100 | params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
101 | "activation": tf.nn.relu, "use_bias": True}
102 | outputs = tf.layers.conv1d(**params)
103 |
104 | # Readout layer
105 | params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
106 | "activation": None, "use_bias": True}
107 | outputs = tf.layers.conv1d(**params)
108 |
109 | print("Conv ret:", outputs.shape)
110 | # Residual connection
111 | outputs += inputs
112 |
113 | # Normalize
114 | outputs = layer_normalization(outputs)
115 |
116 | return outputs
--------------------------------------------------------------------------------
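Usage sketch wiring the two functions above into one Transformer-style encoder block; the names and shapes (`batch_embedded`, `EMB_SIZE`, `MAX_LEN`) are assumptions for illustration, and multi_head.py below shows the repo's actual usage. Self-attention is obtained by passing the same tensor as both queries and keys:

    import tensorflow as tf
    from modules.multihead import multihead_attention, feedforward   # assuming the script lives under models/

    EMB_SIZE, MAX_LEN = 128, 32                                       # hypothetical sizes
    batch_embedded = tf.placeholder(tf.float32, [None, MAX_LEN, EMB_SIZE])

    # Self-attention: queries == keys; the output keeps the (B, MAX_LEN, EMB_SIZE) shape.
    enc = multihead_attention(queries=batch_embedded,
                              keys=batch_embedded,
                              num_heads=8,
                              dropout_rate=0.1,
                              is_training=True)

    # Position-wise feed-forward + residual + layer norm, back to EMB_SIZE channels
    # (the second entry of num_units must equal the input depth for the residual add).
    enc = feedforward(enc, num_units=[4 * EMB_SIZE, EMB_SIZE])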
/models/multi_head.py:
--------------------------------------------------------------------------------
1 | from modules.multihead import *
2 | from utils.model_helper import *
3 | import time
4 | from utils.prepare_data import *
5 |
6 |
7 | class AttentionClassifier(object):
8 | def __init__(self, config):
9 | self.max_len = config["max_len"]
10 | self.hidden_size = config["hidden_size"]
11 | self.vocab_size = config["vocab_size"]
12 | self.embedding_size = config["embedding_size"]
13 | self.n_class = config["n_class"]
14 | self.learning_rate = config["learning_rate"]
15 |
16 | # placeholder
17 | self.x = tf.placeholder(tf.int32, [None, self.max_len])
18 | self.label = tf.placeholder(tf.int32, [None])
19 | self.keep_prob = tf.placeholder(tf.float32)
20 |
21 | def build_graph(self):
22 | print("building graph...")
23 | embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
24 | trainable=True)
25 | batch_embedded = tf.nn.embedding_lookup(embeddings_var, self.x)
26 | # multi-head attention
27 | ma = multihead_attention(queries=batch_embedded, keys=batch_embedded)
28 | # FFN(x) = LN(x + position-wise feed-forward(x))
29 | outputs = feedforward(ma, [self.hidden_size, self.embedding_size])
30 | outputs = tf.reshape(outputs, [-1, self.max_len * self.embedding_size])
31 | logits = tf.layers.dense(outputs, units=self.n_class)
32 |
33 | self.loss = tf.reduce_mean(
34 | tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label))
35 | self.prediction = tf.argmax(tf.nn.softmax(logits), 1)
36 |
37 | # optimization
38 | loss_to_minimize = self.loss
39 | tvars = tf.trainable_variables()
40 | gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
41 | grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)
42 |
43 | self.global_step = tf.Variable(0, name="global_step", trainable=False)
44 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
45 | self.train_op = self.optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step,
46 | name='train_step')
47 | print("graph built successfully!")
48 |
49 |
50 | if __name__ == '__main__':
51 | # load data
52 | x_train, y_train = load_data("../dbpedia_data/dbpedia_csv/train.csv", sample_ratio=1e-2, one_hot=False)
53 | x_test, y_test = load_data("../dbpedia_data/dbpedia_csv/test.csv", one_hot=False)
54 |
55 | # data preprocessing
56 | x_train, x_test, vocab_size = \
57 | data_preprocessing_v2(x_train, x_test, max_len=32)
58 | print("train size: ", len(x_train))
59 | print("vocab size: ", vocab_size)
60 |
61 | # split dataset to test and dev
62 | x_test, x_dev, y_test, y_dev, dev_size, test_size = \
63 | split_dataset(x_test, y_test, 0.1)
64 | print("Validation Size: ", dev_size)
65 |
66 | config = {
67 | "max_len": 32,
68 | "hidden_size": 64,
69 | "vocab_size": vocab_size,
70 | "embedding_size": 128,
71 | "n_class": 15,
72 | "learning_rate": 1e-3,
73 | "batch_size": 32,
74 | "train_epoch": 20
75 | }
76 |
77 | classifier = AttentionClassifier(config)
78 | classifier.build_graph()
79 |
80 | sess = tf.Session()
81 | sess.run(tf.global_variables_initializer())
82 | dev_batch = (x_dev, y_dev)
83 | start = time.time()
84 | for e in range(config["train_epoch"]):
85 |
86 | t0 = time.time()
87 | print("Epoch %d start !" % (e + 1))
88 | for x_batch, y_batch in fill_feed_dict(x_train, y_train, config["batch_size"]):
89 | return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
90 |
91 | t1 = time.time()
92 |
93 | print("Train Epoch time: %.3f s" % (t1 - t0))
94 | dev_acc = run_eval_step(classifier, sess, dev_batch)
95 | print("validation accuracy: %.3f " % dev_acc)
96 |
97 | print("Training finished, time consumed : ", time.time() - start, " s")
98 | print("Start evaluating: \n")
99 | cnt = 0
100 | test_acc = 0
101 | for x_batch, y_batch in fill_feed_dict(x_test, y_test, config["batch_size"]):
102 | acc = run_eval_step(classifier, sess, (x_batch, y_batch))
103 | test_acc += acc
104 | cnt += 1
105 |
106 | print("Test accuracy : %f %%" % (test_acc / cnt * 100))
107 |
108 |
--------------------------------------------------------------------------------
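Once the training loop in multi_head.py above has finished, the same session and graph can be reused for inference; a minimal sketch, where `x_new` is an assumed integer matrix of shape (batch, max_len) padded with the same tokenizer used for training:

    # `x_new` is assumed to come from the training Tokenizer / pad_sequences pipeline.
    feed = {classifier.x: x_new, classifier.keep_prob: 1.0}
    predicted_classes = sess.run(classifier.prediction, feed_dict=feed)
    print(predicted_classes)   # one predicted class id per row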
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TobiasLee/Text-Classification/21229709953b6c1c8f3bbb923883092b217ef023/utils/__init__.py
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TobiasLee/Text-Classification/21229709953b6c1c8f3bbb923883092b217ef023/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/model_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TobiasLee/Text-Classification/21229709953b6c1c8f3bbb923883092b217ef023/utils/__pycache__/model_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/prepare_data.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TobiasLee/Text-Classification/21229709953b6c1c8f3bbb923883092b217ef023/utils/__pycache__/prepare_data.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/model_helper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def make_train_feed_dict(model, batch):
5 | """Build the feed dict for a training step (dropout enabled via keep_prob=0.5)."""
6 | feed_dict = {model.x: batch[0],
7 | model.label: batch[1],
8 | model.keep_prob: .5}
9 | return feed_dict
10 |
11 |
12 | def make_test_feed_dict(model, batch):
13 | feed_dict = {model.x: batch[0],
14 | model.label: batch[1],
15 | model.keep_prob: 1.0}
16 | return feed_dict
17 |
18 |
19 | def run_train_step(model, sess, batch):
20 | feed_dict = make_train_feed_dict(model, batch)
21 | to_return = {
22 | 'train_op': model.train_op,
23 | 'loss': model.loss,
24 | 'global_step': model.global_step,
25 | }
26 | return sess.run(to_return, feed_dict)
27 |
28 |
29 | def run_eval_step(model, sess, batch):
30 | feed_dict = make_test_feed_dict(model, batch)
31 | prediction = sess.run(model.prediction, feed_dict)
32 | acc = np.sum(np.equal(prediction, batch[1])) / len(prediction)
33 | return acc
34 |
35 |
36 | def get_attn_weight(model, sess, batch):
37 | feed_dict = make_train_feed_dict(model, batch)
38 | return sess.run(model.alpha, feed_dict)
39 |
--------------------------------------------------------------------------------
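A small, hedged sketch of `get_attn_weight` above: it assumes `model` exposes an `alpha` tensor of shape (batch, max_len), which holds for the attention-based models in this repo, and that `sess` and a batch from `fill_feed_dict` already exist:

    import numpy as np

    # (x_batch, y_batch) assumed to come from prepare_data.fill_feed_dict.
    alphas = get_attn_weight(model, sess, (x_batch, y_batch))
    top_positions = np.argsort(alphas, axis=1)[:, -5:]   # 5 most-attended token positions per example
    print(top_positions)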
/utils/prepare_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tensorflow as tf
4 | from tensorflow.keras.preprocessing.sequence import pad_sequences
5 | from sklearn.utils import shuffle
6 | names = ["class", "title", "content"]
7 |
8 |
9 | def to_one_hot(y, n_class):
10 | return np.eye(n_class)[y.astype(int)]
11 |
12 |
13 | def load_data(file_name, sample_ratio=1, n_class=15, names=names, one_hot=True):
14 | '''load data from .csv file'''
15 | csv_file = pd.read_csv(file_name, names=names)
16 | shuffle_csv = csv_file.sample(frac=sample_ratio)
17 | x = pd.Series(shuffle_csv["content"])
18 | y = pd.Series(shuffle_csv["class"])
19 | if one_hot:
20 | y = to_one_hot(y, n_class)
21 | return x, y
22 |
23 |
24 | def data_preprocessing(train, test, max_len):
25 | """Transform texts to index sequences with VocabularyProcessor.
26 | Note: VocabularyProcessor is deprecated; prefer data_preprocessing_v2."""
27 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_len)
28 | x_transform_train = vocab_processor.fit_transform(train)
29 | x_transform_test = vocab_processor.transform(test)
30 | vocab = vocab_processor.vocabulary_
31 | vocab_size = len(vocab)
32 | x_train_list = list(x_transform_train)
33 | x_test_list = list(x_transform_test)
34 | x_train = np.array(x_train_list)
35 | x_test = np.array(x_test_list)
36 |
37 | return x_train, x_test, vocab, vocab_size
38 |
39 |
40 | def data_preprocessing_v2(train, test, max_len, max_words=50000):
41 | tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
42 | tokenizer.fit_on_texts(train)
43 | train_idx = tokenizer.texts_to_sequences(train)
44 | test_idx = tokenizer.texts_to_sequences(test)
45 | train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
46 | test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
47 | # vocab size = max_words + 2 (two extra ids reserved for <PAD> and <UNK>)
48 | return train_padded, test_padded, max_words + 2
49 |
50 |
51 | def data_preprocessing_with_dict(train, test, max_len):
52 | tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
53 | tokenizer.fit_on_texts(train)
54 | train_idx = tokenizer.texts_to_sequences(train)
55 | test_idx = tokenizer.texts_to_sequences(test)
56 | train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
57 | test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
58 | # vocab size = len(word_docs) + 2 (extra ids for <PAD> and <UNK>)
59 | return train_padded, test_padded, tokenizer.word_docs, tokenizer.word_index, len(tokenizer.word_docs) + 2
60 |
61 |
62 | def split_dataset(x_test, y_test, dev_ratio):
63 | """split test dataset to test and dev set with ratio """
64 | test_size = len(x_test)
65 | print("test size: ", test_size)
66 | dev_size = int(test_size * dev_ratio)
67 | print("dev size: ", dev_size)
68 | x_dev = x_test[:dev_size]
69 | x_test = x_test[dev_size:]
70 | y_dev = y_test[:dev_size]
71 | y_test = y_test[dev_size:]
72 | return x_test, x_dev, y_test, y_dev, dev_size, test_size - dev_size
73 |
74 |
75 | def fill_feed_dict(data_X, data_Y, batch_size):
76 | """Generator to yield batches"""
77 | # Shuffle data first.
78 | shuffled_X, shuffled_Y = shuffle(data_X, data_Y)
79 | # print("before shuffle: ", data_Y[:10])
80 | # print(data_X.shape[0])
81 | # perm = np.random.permutation(data_X.shape[0])
82 | # data_X = data_X[perm]
83 | # shuffled_Y = data_Y[perm]
84 | # print("after shuffle: ", shuffled_Y[:10])
85 | for idx in range(data_X.shape[0] // batch_size):
86 | x_batch = shuffled_X[batch_size * idx: batch_size * (idx + 1)]
87 | y_batch = shuffled_Y[batch_size * idx: batch_size * (idx + 1)]
88 | yield x_batch, y_batch
89 |
--------------------------------------------------------------------------------
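A short usage sketch for `data_preprocessing_with_dict` above, the one preprocessing variant not exercised by the model scripts; `x_train`/`x_test` are assumed to be the raw text Series returned by `load_data`:

    x_train_pad, x_test_pad, word_docs, word_index, vocab_size = \
        data_preprocessing_with_dict(x_train, x_test, max_len=32)

    # Invert the word -> id mapping so a padded row can be decoded for a sanity check.
    id_to_word = {idx: word for word, idx in word_index.items()}
    decoded = [id_to_word.get(i, '?') for i in x_train_pad[0] if i != 0]
    print(decoded[:10])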