├── .idea
│   ├── dictionaries
│   │   └── rongshunlin.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── textCNN.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── __pycache__
│   ├── data.cpython-36.pyc
│   └── text_cnn.cpython-36.pyc
├── data.py
├── data
│   └── model
│       └── vocab
├── data_set
│   ├── polarity.neg
│   └── polarity.pos
├── log
│   ├── events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local
│   ├── events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local
│   └── events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local
├── model.py
├── textCNN_paddle.py
├── text_cnn.py
└── train-eval.sh
/README.md:
--------------------------------------------------------------------------------
+ Edit the CODE_DIR path in train-eval.sh
+ Run `sh train-eval.sh` to start training
--------------------------------------------------------------------------------
/__pycache__/data.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/__pycache__/data.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/text_cnn.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/__pycache__/text_cnn.cpython-36.pyc
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/8/11 8:21 AM
# @Author : ModyfiAI
# @Email : rongshunlin@126.com
# @File : data.py
# @description : for learning only, please do not use for commercial purposes
import re
import numpy as np


class DataSet(object):
    def __init__(self, positive_data_file, negative_data_file):
        self.x_text, self.y = self.load_data_and_labels(positive_data_file, negative_data_file)

    def load_data_and_labels(self, positive_data_file, negative_data_file):
        # load data from files
        positive_data = list(open(positive_data_file, "r", encoding='utf-8').readlines())
        positive_data = [s.strip() for s in positive_data]
        negative_data = list(open(negative_data_file, "r", encoding='utf-8').readlines())
        negative_data = [s.strip() for s in negative_data]

        # split by words
        x_text = positive_data + negative_data
        x_text = [self.clean_str(sent) for sent in x_text]

        # generate labels
        positive_labels = [[0, 1] for _ in positive_data]
        negative_labels = [[1, 0] for _ in negative_data]
        y = np.concatenate([positive_labels, negative_labels], 0)
        return [x_text, y]

    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    @staticmethod
    def batch_iter(data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                # print(shuffled_data[start_index])  # debug output; too noisy during training
                yield shuffled_data[start_index:end_index]
--------------------------------------------------------------------------------
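
A minimal usage sketch for data.py (not part of the repo): it assumes the default data_set/ paths that model.py and train-eval.sh point at. batch_iter takes no self, so it is called through the class, exactly as model.py does.

# Hypothetical quick check of DataSet and batch_iter; the file paths are
# the defaults assumed elsewhere in this repo.
from data import DataSet

ds = DataSet("./data_set/polarity.pos", "./data_set/polarity.neg")
print(len(ds.x_text), "examples; one-hot label array shape:", ds.y.shape)  # (N, 2)

batches = DataSet.batch_iter(list(zip(ds.x_text, ds.y)), batch_size=4, num_epochs=1)
print("first batch size:", len(next(batches)))
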
/data/model/vocab:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/data/model/vocab
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/8/10 10:20 PM
# @Author : ModyfiAI
# @Email : rongshunlin@126.com
# @File : model.py
# @description : for learning only, please do not use for commercial purposes

import os
import tensorflow as tf
import numpy as np
import data
import datetime
from text_cnn import ModelConfig, TextCNNModel
from tensorflow.contrib import learn

flags = tf.flags
FLAGS = flags.FLAGS

# data paths
flags.DEFINE_string("positive_data_file", "./data_set/polarity.pos", "File with one positive example per line")
flags.DEFINE_string("negative_data_file", "./data_set/polarity.neg", "File with one negative example per line")
flags.DEFINE_string("pred_data", "None", "File with one example per line to predict")
flags.DEFINE_string("model_dir", "./data/model/", "output model dir")
flags.DEFINE_string("output_dir", "./data/model/", "evaluate output dir")
flags.DEFINE_string("vocab", None, "vocab file")

# CNN hyper-parameters
flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
flags.DEFINE_float("drop_rate", 0.5, "Dropout rate (default: 0.5)")
flags.DEFINE_integer("max_seq_length", 64, "Maximum sequence length")
flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# training parameters
flags.DEFINE_bool("is_train", True, "Whether to run training.")
flags.DEFINE_bool("is_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_bool("is_predict", False, "Whether to run prediction.")
flags.DEFINE_integer("batch_size", 128, "Batch size.")
flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
flags.DEFINE_integer("num_train_steps", 100000, "Train steps")
flags.DEFINE_integer("keep_checkpoint_max", 20, "Max keep checkpoints")
flags.DEFINE_integer("save_summary_steps", 1000, "Step intervals to save summary")
flags.DEFINE_integer("log_step_count_steps", 1000, "Step intervals to log step info")
flags.DEFINE_integer("save_checkpoints_steps", 500, "Step intervals to save checkpoints")
flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")


def preprocess():
    data_info = data.DataSet(FLAGS.positive_data_file, FLAGS.negative_data_file)
    x_text, y = data_info.x_text, data_info.y

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    tf.logging.info("Shape of X :{}".format(str(x.shape)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    # Init model config
    model_config = ModelConfig(
        embedding_dim=FLAGS.embedding_dim,
        filter_sizes=FLAGS.filter_sizes,
        num_filters=FLAGS.num_filters,
        dropout_rate=FLAGS.drop_rate,
        l2_reg_lambda=FLAGS.l2_reg_lambda,
        max_seq_length=max_document_length,
        vocab_size=len(vocab_processor.vocabulary_),
        label_size=2
    )
    tf.logging.info("Vocabulary size: {:d}".format(len(vocab_processor.vocabulary_)))
    tf.logging.info("Train/dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    tf.logging.info("*******Init Model CONFIG*************")
    tf.logging.info(model_config.to_string())
    return x_train, y_train, vocab_processor, x_dev, y_dev, model_config


def train(x_train, y_train, vocab_processor, x_dev, y_dev, model_config):
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            cnn = TextCNNModel(
                config=model_config,
                is_training=FLAGS.is_train
            )
            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(FLAGS.output_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.keep_checkpoint_max)

            # Write vocabulary
            vocab_processor.save(os.path.join(FLAGS.output_dir, "vocab"))

            # Initialize all variables
            summary_writer = tf.summary.FileWriter('./log/', sess.graph)
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                :param x_batch:
                :param y_batch:
                :return:
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates the model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch
                }
                step, loss, accuracy = sess.run(
                    [global_step, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            # Generate batches
            batches = data.DataSet.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)

                if current_step % FLAGS.save_checkpoints_steps == 0:
                    tf.logging.info("\nEvaluation:")
                    dev_step(x_dev, y_dev)
                if current_step % FLAGS.save_checkpoints_steps == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    tf.logging.info("Saved model checkpoint to {}\n".format(path))


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    x_train, y_train, vocab_processor, x_dev, y_dev, config = preprocess()
    train(x_train, y_train, vocab_processor, x_dev, y_dev, config)


if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------
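
For reference, a standalone sketch of the vocabulary step that preprocess() relies on. It assumes TensorFlow 1.x, where tf.contrib.learn's VocabularyProcessor is still available; the toy sentences are placeholders, not repo data.

# Sketch of the id-mapping done by VocabularyProcessor in preprocess().
import numpy as np
from tensorflow.contrib import learn

texts = ["read the book", "this is a great movie"]
max_document_length = max(len(t.split(" ")) for t in texts)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(texts)))
print(x.shape)                           # (2, 5): word ids, zero-padded to max length
print(len(vocab_processor.vocabulary_))  # vocabulary size; id 0 is reserved for padding/unknown
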
/textCNN_paddle.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 8:59 AM
# @Author : ModyfiAI
# @Email : rongshunlin@126.com
# @File : textCNN_paddle
# @description : for learning only, please do not use for commercial purposes

from __future__ import print_function

import paddle
import paddle.fluid as fluid
import numpy as np
import sys
import math
import argparse

CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
BATCH_SIZE = 128


def parse_args():
    parser = argparse.ArgumentParser("conv")
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help="If set, run the task with continuous evaluation logs.")
    parser.add_argument(
        '--use_gpu', type=int, default=0, help="Whether to use GPU or not.")
    parser.add_argument(
        '--num_epochs', type=int, default=1, help="number of epochs.")
    args = parser.parse_args()
    return args


def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
    emb = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=3,
        act="tanh",
        pool_type="sqrt")
    conv_4 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=4,
        act="tanh",
        pool_type="sqrt")
    conv_5 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=5,
        act="tanh",
        pool_type="sqrt")
    prediction = fluid.layers.fc(
        input=[conv_3, conv_4, conv_5], size=class_dim, act="softmax")
    return prediction


def inference_program(word_dict):
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
    return net


def train_program(prediction):
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(cost)
    accuracy = fluid.layers.accuracy(input=prediction, label=label)
    return [avg_cost, accuracy]


def optimizer_func():
    return fluid.optimizer.Adagrad(learning_rate=0.002)


def train(use_cuda, params_dirname):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    print("Loading IMDB word dict....")
    word_dict = paddle.dataset.imdb.word_dict()

    print("Reading training data....")
    if args.enable_ce:
        train_reader = paddle.batch(
            paddle.dataset.imdb.train(word_dict), batch_size=BATCH_SIZE)
    else:
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.imdb.train(word_dict), buf_size=25000),
            batch_size=BATCH_SIZE)

    print("Reading testing data....")
    test_reader = paddle.batch(
        paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)

    feed_order = ['words', 'label']
    pass_num = args.num_epochs

    main_program = fluid.default_main_program()
    star_program = fluid.default_startup_program()

    if args.enable_ce:
        main_program.random_seed = 90
        star_program.random_seed = 90

    prediction = inference_program(word_dict)
    train_func_outputs = train_program(prediction)
    avg_cost = train_func_outputs[0]

    test_program = main_program.clone(for_test=True)

    # [avg_cost, accuracy] = train_program(prediction)
    sgd_optimizer = optimizer_func()
    sgd_optimizer.minimize(avg_cost)
    exe = fluid.Executor(place)

    def train_test(program, reader):
        count = 0
        feed_var_list = [
            program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder_test = fluid.DataFeeder(feed_list=feed_var_list, place=place)
        test_exe = fluid.Executor(place)
        accumulated = len(train_func_outputs) * [0]
        for test_data in reader():
            avg_cost_np = test_exe.run(
                program=program,
                feed=feeder_test.feed(test_data),
                fetch_list=train_func_outputs)
            accumulated = [
                x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
            ]
            count += 1
        return [x / count for x in accumulated]

    def train_loop():

        feed_var_list_loop = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list=feed_var_list_loop, place=place)
        exe.run(star_program)

        for epoch_id in range(pass_num):
            for step_id, data in enumerate(train_reader()):
                metrics = exe.run(
                    main_program,
                    feed=feeder.feed(data),
                    fetch_list=[var.name for var in train_func_outputs])
                print("step: {0}, Metrics {1}".format(
                    step_id, list(map(np.array, metrics))))
                if (step_id + 1) % 10 == 0:
                    avg_cost_test, acc_test = train_test(test_program,
                                                         test_reader)
                    print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
                        step_id, avg_cost_test, acc_test))

                    print("Step {0}, Epoch {1} Metrics {2}".format(
                        step_id, epoch_id, list(map(np.array, metrics))))
                if math.isnan(float(metrics[0])):
                    sys.exit("got NaN loss, training failed.")
            if params_dirname is not None:
                fluid.io.save_inference_model(params_dirname, ["words"],
                                              prediction, exe)
            if args.enable_ce and epoch_id == pass_num - 1:
                print("kpis\tconv_train_cost\t%f" % metrics[0])
                print("kpis\tconv_train_acc\t%f" % metrics[1])
                print("kpis\tconv_test_cost\t%f" % avg_cost_test)
                print("kpis\tconv_test_acc\t%f" % acc_test)

    train_loop()


def infer(use_cuda, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    word_dict = paddle.dataset.imdb.word_dict()

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [inferencer, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)

        # Set up the input by creating an LoDTensor to represent a sequence of words.
        # Here each word is the basic element of the LoDTensor and the shape of
        # each word (base_shape) should be [1] since it is simply an index to
        # look up the corresponding word vector.
        # Suppose the length-based level of detail (lod) info is set to [[3, 4, 2]],
        # which has only one lod level. Then the created LoDTensor will have only
        # one higher level structure (sequence of words, or sentence) than the basic
        # element (word). Hence the LoDTensor will hold data for three sentences of
        # length 3, 4 and 2, respectively.
        # Note that lod info should be a list of lists.
        reviews_str = [
            'read the book forget the movie', 'this is a great movie',
            'this is very bad'
        ]
        reviews = [c.split() for c in reviews_str]

        UNK = word_dict['<unk>']
        lod = []
        for c in reviews:
            lod.append([np.int64(word_dict.get(word, UNK)) for word in c])

        base_shape = [[len(c) for c in lod]]

        tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
        assert feed_target_names[0] == "words"
        results = exe.run(
            inferencer,
            feed={feed_target_names[0]: tensor_words},
            fetch_list=fetch_targets,
            return_numpy=False)
        np_data = np.array(results[0])
        for i, r in enumerate(np_data):
            print("Predict probability of ", r[0], " to be positive and ", r[1],
                  " to be negative for review \'", reviews_str[i], "\'")


def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    params_dirname = "understand_sentiment_conv.inference.model"
    train(use_cuda, params_dirname)
    infer(use_cuda, params_dirname)


if __name__ == '__main__':
    args = parse_args()
    use_cuda = args.use_gpu  # set to True if training with GPU
    main(use_cuda)
--------------------------------------------------------------------------------
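
The trickiest part of infer() above is turning raw reviews into the LoD input. A pure-Python sketch of just that mapping, with a toy word_dict standing in for the real paddle.dataset.imdb.word_dict():

import numpy as np

# toy stand-in; the real dict comes from paddle.dataset.imdb.word_dict()
word_dict = {"this": 0, "is": 1, "great": 2, "movie": 3, "<unk>": 4}
UNK = word_dict["<unk>"]

reviews = [c.split() for c in ["this is a great movie", "this is very bad"]]
lod = [[np.int64(word_dict.get(word, UNK)) for word in c] for c in reviews]
base_shape = [[len(c) for c in lod]]  # sentence lengths: [[5, 4]]
print(lod)         # out-of-vocabulary words ("a", "very", "bad") map to the <unk> id
print(base_shape)
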
/text_cnn.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time : 2019/8/10 7:33 PM
# @Author : ModyfiAI
# @Email : rongshunlin@126.com
# @File : text_cnn.py
# @description : for learning only, please do not use for commercial purposes

import tensorflow as tf
import numpy as np

class ModelConfig(object):
    """
    textcnn model configuration
    """

    def __init__(self, embedding_dim=128, filter_sizes="3,4,5", num_filters=128, dropout_rate=0.5,
                 l2_reg_lambda=0.0, max_seq_length=128, vocab_size=8192, label_size=64):
        self.embedding_dim = embedding_dim
        # "3,4,5" => [3, 4, 5]
        self.filter_sizes = list(map(lambda x: int(x), filter_sizes.split(",")))
        self.num_filters = num_filters
        self.dropout_rate = dropout_rate
        self.l2_reg_lambda = l2_reg_lambda
        self.max_seq_length = max_seq_length
        self.vocab_size = vocab_size
        self.label_size = label_size

    def to_string(self):
        lines = [
            "embedding_dim = {:d}".format(self.embedding_dim),
            "filter_sizes = {}".format(self.filter_sizes),
            "num_filters = {:d}".format(self.num_filters),
            "dropout_rate = {:g}".format(self.dropout_rate),
            "l2_reg_lambda = {:g}".format(self.l2_reg_lambda),
            "max_seq_length = {:d}".format(self.max_seq_length),
            "vocab_size = {:d}".format(self.vocab_size),
            "label_size = {:d}".format(self.label_size)
        ]
        return "\n".join(lines)


class TextCNNModel(object):
    def __init__(self,
                 config,
                 is_training):
        self._config = config
        tf.logging.info("\n ******TextCNN MODEL CONFIG*******")
        tf.logging.info(self._config.to_string())

        tf.logging.info("\n ******Shape of MODEL VARS********")
        self.input_x = tf.placeholder(tf.int32, [None, self._config.max_seq_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, self._config.label_size], name="input_y")
        tf.logging.info("num_class {}".format(str(self.input_y.shape)))
        tf.logging.info("is_training :{}".format(str(is_training)))
        l2_loss = tf.constant(0.0)

        # embedding layer
        with tf.name_scope("embedding"):
            self.W = tf.Variable(tf.random_uniform([self._config.vocab_size, self._config.embedding_dim], -1.0, 1.0),
                                 name="W")
            self.char_emb = tf.nn.embedding_lookup(self.W, self.input_x)
            self.char_emb_expanded = tf.expand_dims(self.char_emb, -1)
            tf.logging.info("Shape of embedding_chars:{}".format(str(self.char_emb_expanded.shape)))

        # convolution + pooling layer
        pooled_outputs = []
        for i, filter_size in enumerate(self._config.filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # convolution layer
                filter_width = self._config.embedding_dim
                input_channel_num = 1
                output_channel_num = self._config.num_filters
                filter_shape = [filter_size, filter_width, input_channel_num, output_channel_num]

                n = filter_size * filter_width * input_channel_num
                kernel = tf.get_variable(name="kernel",
                                         shape=filter_shape,
                                         dtype=tf.float32,
                                         initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))
                bias = tf.get_variable(name="bias",
                                       shape=[output_channel_num],
                                       dtype=tf.float32,
                                       initializer=tf.zeros_initializer)
                # apply the convolution
                # conv shape: [batch_size, max_seq_len - filter_size + 1, 1, output_channel_num]
                conv = tf.nn.conv2d(
                    input=self.char_emb_expanded,
                    filter=kernel,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                tf.logging.info("Shape of Conv:{}".format(str(conv.shape)))

                # apply non-linearity
                h = tf.nn.relu(tf.nn.bias_add(conv, bias), name="relu")
                tf.logging.info("Shape of h:{}".format(str(h.shape)))

                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(
                    value=h,
                    ksize=[1, self._config.max_seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool"
                )
                tf.logging.info("Shape of pooled:{}".format(str(pooled.shape)))
                pooled_outputs.append(pooled)
        tf.logging.info("Number of pooled outputs:{}".format(len(pooled_outputs)))

        # concatenate all filters' outputs
        total_filter_num = self._config.num_filters * len(self._config.filter_sizes)
        all_features = tf.reshape(tf.concat(pooled_outputs, axis=-1), [-1, total_filter_num])
        tf.logging.info("Shape of all_features:{}".format(str(all_features.shape)))

        # apply dropout during training
        if is_training:
            all_features = tf.nn.dropout(all_features, rate=self._config.dropout_rate)

        with tf.name_scope("output"):
            # output_dense_layer = tf.layers.Dense(self._config.label_size, use_bias=True, name="output_layer")
            # logits = output_dense_layer(all_features)
            # tf.logging.info("Shape of logits:{}".format(str(logits.shape)))
            # self.predictions = tf.nn.softmax(logits, name="predictions")
            # tf.logging.info("Shape of predictions:{}".format(str(self.predictions.shape)))
            W = tf.get_variable(
                name="W",
                shape=[total_filter_num, self._config.label_size],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self._config.label_size]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(all_features, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # compute loss
        with tf.name_scope("loss"):
            # losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + self._config.l2_reg_lambda * l2_loss

        # # compute accuracy metric
        # with tf.name_scope("accuracy"):
        #     self.accuracy = self._accuracy_op(self.predictions, self.input_y)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    # def _accuracy_op(self, predictions, labels):
    #     return tf.metrics.accuracy(labels=tf.argmax(self.input_y, axis=-1),
    #                                predictions=tf.argmax(predictions, axis=-1))
--------------------------------------------------------------------------------
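
A hedged smoke test for the model class above (not from the repo): it builds the graph once to confirm tensor shapes, assumes TensorFlow 1.x (tf.placeholder, tf.contrib), and uses illustrative config values.

import tensorflow as tf
from text_cnn import ModelConfig, TextCNNModel

config = ModelConfig(max_seq_length=64, vocab_size=10000, label_size=2)
with tf.Graph().as_default():
    model = TextCNNModel(config=config, is_training=False)
    # 3 filter sizes x 128 filters each = 384 pooled features per example,
    # projected down to label_size scores.
    print(model.scores.shape)       # (?, 2)
    print(model.predictions.shape)  # (?,)
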
/train-eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
# change CODE_DIR to your own path before running
CODE_DIR="/home/work/work/modifyAI/textCNN"
MODEL_DIR=$CODE_DIR/model
TRAIN_DATA_DIR=$CODE_DIR/data_set

nohup python3 $CODE_DIR/model.py \
    --is_train=true \
    --num_epochs=200 \
    --save_checkpoints_steps=100 \
    --keep_checkpoint_max=50 \
    --batch_size=64 \
    --positive_data_file=$TRAIN_DATA_DIR/polarity.pos \
    --negative_data_file=$TRAIN_DATA_DIR/polarity.neg \
    --model_dir=$MODEL_DIR > $CODE_DIR/train_log.txt 2>&1 &
--------------------------------------------------------------------------------