├── .idea
│   ├── dictionaries
│   │   └── rongshunlin.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── textCNN.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── __pycache__
│   ├── data.cpython-36.pyc
│   └── text_cnn.cpython-36.pyc
├── data.py
├── data
│   └── model
│       └── vocab
├── data_set
│   ├── polarity.neg
│   └── polarity.pos
├── log
│   ├── events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local
│   ├── events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local
│   └── events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local
├── model.py
├── textCNN_paddle.py
├── text_cnn.py
└── train-eval.sh

--------------------------------------------------------------------------------
/.idea/ (dictionaries/rongshunlin.xml, misc.xml, modules.xml, textCNN.iml, vcs.xml, workspace.xml):
--------------------------------------------------------------------------------
PyCharm project settings; the XML content of these files was stripped in this export, so only the file names remain.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
+ Edit the CODE_DIR path in train-eval.sh to point at your local copy of this repository
+ Run `sh train-eval.sh` to start training
--------------------------------------------------------------------------------
/__pycache__/data.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/__pycache__/data.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/text_cnn.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/__pycache__/text_cnn.cpython-36.pyc
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2019/8/11 8:21 AM
# @Author      : ModyfiAI
# @Email       : rongshunlin@126.com
# @File        : data.py
# @description : For learning purposes only; please do not use commercially.
import re
import numpy as np


class DataSet(object):
    def __init__(self, positive_data_file, negative_data_file):
        self.x_text, self.y = self.load_data_and_labels(positive_data_file, negative_data_file)

    def load_data_and_labels(self, positive_data_file, negative_data_file):
        # Load data from files, one example per line
        positive_data = list(open(positive_data_file, "r", encoding='utf-8').readlines())
        positive_data = [s.strip() for s in positive_data]
        negative_data = list(open(negative_data_file, "r", encoding='utf-8').readlines())
        negative_data = [s.strip() for s in negative_data]

        # Split by words
        x_text = positive_data + negative_data
        x_text = [self.clean_str(sent) for sent in x_text]

        # Generate one-hot labels: positive -> [0, 1], negative -> [1, 0]
        positive_labels = [[0, 1] for _ in positive_data]
        negative_labels = [[1, 0] for _ in negative_data]
        y = np.concatenate([positive_labels, negative_labels], 0)
        return [x_text, y]

    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    @staticmethod
    def batch_iter(data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]
--------------------------------------------------------------------------------
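A minimal sketch of how DataSet and batch_iter above fit together (model.py below drives them through command-line flags; the file paths and batch settings here are illustrative):

# Illustrative usage of data.py; mirrors what model.py does via its flags.
from data import DataSet

ds = DataSet("./data_set/polarity.pos", "./data_set/polarity.neg")
# clean_str normalizes each line, e.g. "Don't stop!" -> "do n't stop !"
print(len(ds.x_text), ds.y.shape)

# batch_iter yields shuffled batches of (text, label) pairs for each epoch
for batch in DataSet.batch_iter(list(zip(ds.x_text, ds.y)), batch_size=64, num_epochs=1):
    x_batch, y_batch = zip(*batch)
    break  # one batch is enough for the sketch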
/data/model/vocab:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/data/model/vocab
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344814.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344905.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/log/events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rongshunlin/ModifyAI/da3845c84d40f860af7c75ebfcc9ce22a2f56b49/log/events.out.tfevents.1566344919.rongshunlindeMacBook-Air.local
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2019/8/10 10:20 PM
# @Author      : ModyfiAI
# @Email       : rongshunlin@126.com
# @File        : model.py
# @description : For learning purposes only; please do not use commercially.

import os
import tensorflow as tf
import numpy as np
import data
import datetime
from text_cnn import ModelConfig, TextCNNModel
from tensorflow.contrib import learn

flags = tf.flags
FLAGS = flags.FLAGS

# Data paths
flags.DEFINE_string("positive_data_file", "./data_set/polarity.pos", "Positive examples, one per line")
flags.DEFINE_string("negative_data_file", "./data_set/polarity.neg", "Negative examples, one per line")
flags.DEFINE_string("pred_data", "None", "Prediction input, one example per line")
flags.DEFINE_string("model_dir", "./data/model/", "output model dir")
flags.DEFINE_string("output_dir", "./data/model/", "evaluate output dir")
flags.DEFINE_string("vocab", None, "vocab file")

# CNN hyper-parameters
flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
flags.DEFINE_float("drop_rate", 0.5, "Dropout rate, i.e. fraction of units dropped (default: 0.5)")
flags.DEFINE_integer("max_seq_length", 64, "Maximum sequence length")
flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
flags.DEFINE_bool("is_train", True, "Whether to run training.")
flags.DEFINE_bool("is_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_bool("is_predict", False, "Whether to run prediction.")
flags.DEFINE_integer("batch_size", 128, "Batch size.")
flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
flags.DEFINE_integer("num_train_steps", 100000, "Train steps")
flags.DEFINE_integer("keep_checkpoint_max", 20, "Maximum number of checkpoints to keep")
flags.DEFINE_integer("save_summary_steps", 1000, "Step interval for saving summaries")
flags.DEFINE_integer("log_step_count_steps", 1000, "Step interval for logging step info")
flags.DEFINE_integer("save_checkpoints_steps", 500, "Step interval for saving checkpoints")
flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")


def preprocess():
    data_info = data.DataSet(FLAGS.positive_data_file, FLAGS.negative_data_file)
    x_text, y = data_info.x_text, data_info.y

    # Build vocabulary: VocabularyProcessor maps each sentence to a fixed-length
    # sequence of word ids, padded with 0 up to max_document_length
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    tf.logging.info("Shape of X :{}".format(str(x.shape)))

    # Randomly shuffle the data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set (last 10% as dev)
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    # Init model config
    model_config = ModelConfig(
        embedding_dim=FLAGS.embedding_dim,
        filter_sizes=FLAGS.filter_sizes,
        num_filters=FLAGS.num_filters,
        dropout_rate=FLAGS.drop_rate,
        l2_reg_lambda=FLAGS.l2_reg_lambda,
        max_seq_length=max_document_length,
        vocab_size=len(vocab_processor.vocabulary_),
        label_size=2
    )
    tf.logging.info("Vocabulary size: {:d}".format(len(vocab_processor.vocabulary_)))
    tf.logging.info("Train/dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    tf.logging.info("*******Init Model CONFIG*************")
    tf.logging.info(model_config.to_string())
    return x_train, y_train, vocab_processor, x_dev, y_dev, model_config

def train(x_train, y_train, vocab_processor, x_dev, y_dev, model_config):
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            cnn = TextCNNModel(
                config=model_config,
                is_training=FLAGS.is_train
            )
            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so create it if needed
            checkpoint_dir = os.path.abspath(os.path.join(FLAGS.output_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.keep_checkpoint_max)

            # Write vocabulary
            vocab_processor.save(os.path.join(FLAGS.output_dir, "vocab"))

            # Initialize all variables
            summary_writer = tf.summary.FileWriter('./log/', sess.graph)
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates the model on the dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch
                }
                step, loss, accuracy = sess.run(
                    [global_step, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            # Generate batches
            batches = data.DataSet.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)

                if current_step % FLAGS.save_checkpoints_steps == 0:
                    tf.logging.info("\nEvaluation:")
                    dev_step(x_dev, y_dev)
                if current_step % FLAGS.save_checkpoints_steps == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    tf.logging.info("Saved model checkpoint to {}\n".format(path))


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    x_train, y_train, vocab_processor, x_dev, y_dev, config = preprocess()
    train(x_train, y_train, vocab_processor, x_dev, y_dev, config)


if __name__ == "__main__":
    tf.app.run()
--------------------------------------------------------------------------------
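model.py declares is_predict and pred_data but stops at training; summaries and the graph are written to ./log/, which matches the event files checked into log/ above. A hedged sketch of what a prediction path could look like, reusing the vocab file and checkpoints that train() saves. The predict() helper, its argument names, and the default output_dir are hypothetical, not part of the repository:

# Hypothetical prediction sketch -- not shipped with this repo.
import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn
from text_cnn import ModelConfig, TextCNNModel


def predict(sentences, output_dir="./data/model/"):
    # Reuse the vocabulary written by train() so word ids match the trained embedding
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(os.path.join(output_dir, "vocab"))
    x = np.array(list(vocab_processor.transform(sentences)))
    config = ModelConfig(max_seq_length=vocab_processor.max_document_length,
                         vocab_size=len(vocab_processor.vocabulary_),
                         label_size=2)  # other hyper-parameters must match training
    with tf.Graph().as_default(), tf.Session() as sess:
        cnn = TextCNNModel(config=config, is_training=False)
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(os.path.join(output_dir, "checkpoints")))
        return sess.run(cnn.predictions, feed_dict={cnn.input_x: x})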
/textCNN_paddle.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2019/8/23 8:59 AM
# @Author      : ModyfiAI
# @Email       : rongshunlin@126.com
# @File        : textCNN_paddle
# @description : For learning purposes only; please do not use commercially.

from __future__ import print_function

import paddle
import paddle.fluid as fluid
import numpy as np
import sys
import math
import argparse

CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
BATCH_SIZE = 128


def parse_args():
    parser = argparse.ArgumentParser("conv")
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help="If set, run the task with continuous evaluation logs.")
    parser.add_argument(
        '--use_gpu', type=int, default=0, help="Whether to use GPU or not.")
    parser.add_argument(
        '--num_epochs', type=int, default=1, help="number of epochs.")
    args = parser.parse_args()
    return args


def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
    # Three parallel convolution + pooling branches with filter sizes 3, 4 and 5
    emb = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=3,
        act="tanh",
        pool_type="sqrt")
    conv_4 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=4,
        act="tanh",
        pool_type="sqrt")
    conv_5 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=5,
        act="tanh",
        pool_type="sqrt")
    prediction = fluid.layers.fc(
        input=[conv_3, conv_4, conv_5], size=class_dim, act="softmax")
    return prediction


def inference_program(word_dict):
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
    return net


def train_program(prediction):
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(cost)
    accuracy = fluid.layers.accuracy(input=prediction, label=label)
    return [avg_cost, accuracy]


def optimizer_func():
    return fluid.optimizer.Adagrad(learning_rate=0.002)


def train(use_cuda, params_dirname):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    print("Loading IMDB word dict....")
    word_dict = paddle.dataset.imdb.word_dict()

    print("Reading training data....")
    if args.enable_ce:
        train_reader = paddle.batch(
            paddle.dataset.imdb.train(word_dict), batch_size=BATCH_SIZE)
    else:
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.imdb.train(word_dict), buf_size=25000),
            batch_size=BATCH_SIZE)

    print("Reading testing data....")
    test_reader = paddle.batch(
        paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)

    feed_order = ['words', 'label']
    pass_num = args.num_epochs

    main_program = fluid.default_main_program()
    star_program = fluid.default_startup_program()

    if args.enable_ce:
        main_program.random_seed = 90
        star_program.random_seed = 90

    prediction = inference_program(word_dict)
    train_func_outputs = train_program(prediction)
    avg_cost = train_func_outputs[0]

    test_program = main_program.clone(for_test=True)

    # [avg_cost, accuracy] = train_program(prediction)
    sgd_optimizer = optimizer_func()
    sgd_optimizer.minimize(avg_cost)
    exe = fluid.Executor(place)

    def train_test(program, reader):
        count = 0
        feed_var_list = [
            program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder_test = fluid.DataFeeder(feed_list=feed_var_list, place=place)
        test_exe = fluid.Executor(place)
        accumulated = len(train_func_outputs) * [0]
        for test_data in reader():
            avg_cost_np = test_exe.run(
                program=program,
                feed=feeder_test.feed(test_data),
                fetch_list=train_func_outputs)
            accumulated = [
                x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
            ]
            count += 1
        return [x / count for x in accumulated]

    def train_loop():

        feed_var_list_loop = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list=feed_var_list_loop, place=place)
        exe.run(star_program)

        for epoch_id in range(pass_num):
            for step_id, data in enumerate(train_reader()):
                metrics = exe.run(
                    main_program,
                    feed=feeder.feed(data),
                    fetch_list=[var.name for var in train_func_outputs])
                print("step: {0}, Metrics {1}".format(
                    step_id, list(map(np.array, metrics))))
                if (step_id + 1) % 10 == 0:
                    avg_cost_test, acc_test = train_test(test_program,
                                                         test_reader)
                    print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
                        step_id, avg_cost_test, acc_test))

                    print("Step {0}, Epoch {1} Metrics {2}".format(
                        step_id, epoch_id, list(map(np.array, metrics))))
                if math.isnan(float(metrics[0])):
                    sys.exit("got NaN loss, training failed.")
            if params_dirname is not None:
                fluid.io.save_inference_model(params_dirname, ["words"],
                                              prediction, exe)
            if args.enable_ce and epoch_id == pass_num - 1:
                print("kpis\tconv_train_cost\t%f" % metrics[0])
                print("kpis\tconv_train_acc\t%f" % metrics[1])
                print("kpis\tconv_test_cost\t%f" % avg_cost_test)
                print("kpis\tconv_test_acc\t%f" % acc_test)

    train_loop()


def infer(use_cuda, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    word_dict = paddle.dataset.imdb.word_dict()

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [inferencer, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)

        # Set up the input by creating an LoDTensor that represents a sequence of words.
        # Here each word is the basic element of the LoDTensor, and the shape of
        # each word (base_shape) should be [1], since a word is simply an index used
        # to look up the corresponding word vector.
        # Suppose the length-based level-of-detail (LoD) info is set to [[3, 4, 2]],
        # which has only one LoD level. Then the created LoDTensor will have only
        # one higher-level structure (sequence of words, or sentence) above the basic
        # element (word). Hence the LoDTensor will hold data for three sentences of
        # length 3, 4 and 2, respectively.
        # Note that LoD info should be a list of lists.
        reviews_str = [
            'read the book forget the movie', 'this is a great movie',
            'this is very bad'
        ]
        reviews = [c.split() for c in reviews_str]

        UNK = word_dict['<unk>']
        lod = []
        for c in reviews:
            lod.append([np.int64(word_dict.get(words, UNK)) for words in c])

        base_shape = [[len(c) for c in lod]]

        tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
        assert feed_target_names[0] == "words"
        results = exe.run(
            inferencer,
            feed={feed_target_names[0]: tensor_words},
            fetch_list=fetch_targets,
            return_numpy=False)
        np_data = np.array(results[0])
        for i, r in enumerate(np_data):
            print("Predict probability of ", r[0], " to be positive and ", r[1],
                  " to be negative for review \'", reviews_str[i], "\'")


def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    params_dirname = "understand_sentiment_conv.inference.model"
    train(use_cuda, params_dirname)
    infer(use_cuda, params_dirname)


if __name__ == '__main__':
    args = parse_args()
    use_cuda = args.use_gpu  # set to True if training with GPU
    main(use_cuda)
--------------------------------------------------------------------------------
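As a concrete instance of the LoD layout described in infer() above: the three sample reviews contain 6, 5 and 4 words, so the feed built from them looks roughly like this (the word ids are made up; real ids come from the IMDB word_dict):

# Worked example of the LoD input assembled in infer(); ids are illustrative.
reviews = [['read', 'the', 'book', 'forget', 'the', 'movie'],   # 6 words
           ['this', 'is', 'a', 'great', 'movie'],               # 5 words
           ['this', 'is', 'very', 'bad']]                       # 4 words
lod = [[101, 7, 225, 980, 7, 16],
       [10, 8, 4, 353, 16],
       [10, 8, 92, 75]]
base_shape = [[len(c) for c in lod]]   # [[6, 5, 4]] -- one length per sentence
# fluid.create_lod_tensor(lod, base_shape, place) packs all 15 ids into one
# 1-D LoDTensor whose length-based LoD marks the three sentence boundaries.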
/text_cnn.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2019/8/10 7:33 PM
# @Author      : ModyfiAI
# @Email       : rongshunlin@126.com
# @File        : text_cnn.py
# @description : For learning purposes only; please do not use commercially.

import tensorflow as tf
import numpy as np


class ModelConfig(object):
    """
    Configuration for the TextCNN model.
    """

    def __init__(self, embedding_dim=128, filter_sizes="3,4,5", num_filters=128, dropout_rate=0.5,
                 l2_reg_lambda=0.0, max_seq_length=128, vocab_size=8192, label_size=64):
        self.embedding_dim = embedding_dim
        # "3,4,5" => [3, 4, 5]
        self.filter_sizes = list(map(lambda x: int(x), filter_sizes.split(",")))
        self.num_filters = num_filters
        self.dropout_rate = dropout_rate
        self.l2_reg_lambda = l2_reg_lambda
        self.max_seq_length = max_seq_length
        self.vocab_size = vocab_size
        self.label_size = label_size

    def to_string(self):
        lines = [
            "embedding_dim = {:d}".format(self.embedding_dim),
            "filter_sizes = {}".format(self.filter_sizes),
            "num_filters = {:d}".format(self.num_filters),
            "dropout_rate = {:g}".format(self.dropout_rate),
            "l2_reg_lambda = {:g}".format(self.l2_reg_lambda),
            "max_seq_length = {:d}".format(self.max_seq_length),
            "vocab_size = {:d}".format(self.vocab_size),
            "label_size = {:d}".format(self.label_size)
        ]
        return "\n".join(lines)


class TextCNNModel(object):
    def __init__(self,
                 config,
                 is_training):
        self._config = config
        tf.logging.info("\n ******TextCNN MODEL CONFIG*******")
        tf.logging.info(self._config.to_string())

        tf.logging.info("\n ******Shape of MODEL VARS********")
        self.input_x = tf.placeholder(tf.int32, [None, self._config.max_seq_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, self._config.label_size], name="input_y")
        tf.logging.info("num_class {}".format(str(self.input_y.shape)))
        tf.logging.info("is_training :{}".format(str(is_training)))
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.name_scope("embedding"):
            self.W = tf.Variable(tf.random_uniform([self._config.vocab_size, self._config.embedding_dim], -1.0, 1.0),
                                 name="W")
            self.char_emb = tf.nn.embedding_lookup(self.W, self.input_x)
            self.char_emb_expanded = tf.expand_dims(self.char_emb, -1)
            tf.logging.info("Shape of embedding_chars:{}".format(str(self.char_emb_expanded.shape)))

        # Convolution + max-pooling layer, one branch per filter size
        pooled_outputs = []
        for i, filter_size in enumerate(self._config.filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_width = self._config.embedding_dim
                input_channel_num = 1
                output_channel_num = self._config.num_filters
                filter_shape = [filter_size, filter_width, input_channel_num, output_channel_num]

                n = filter_size * filter_width * input_channel_num
                kernel = tf.get_variable(name="kernel",
                                         shape=filter_shape,
                                         dtype=tf.float32,
                                         initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))
                bias = tf.get_variable(name="bias",
                                       shape=[output_channel_num],
                                       dtype=tf.float32,
                                       initializer=tf.zeros_initializer)
                # Apply the convolution
                # conv shape: [batch_size, max_seq_len - filter_size + 1, 1, output_channel_num]
                conv = tf.nn.conv2d(
                    input=self.char_emb_expanded,
                    filter=kernel,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                tf.logging.info("Shape of Conv:{}".format(str(conv.shape)))

                # Apply non-linearity
                h = tf.nn.relu(tf.nn.bias_add(conv, bias), name="relu")
                tf.logging.info("Shape of h:{}".format(str(h)))

                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(
                    value=h,
                    ksize=[1, self._config.max_seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool"
                )
                tf.logging.info("Shape of pooled:{}".format(str(pooled.shape)))
                pooled_outputs.append(pooled)
        tf.logging.info("Shape of pooled_outputs:{}".format(str(np.array(pooled_outputs).shape)))

        # Concatenate the outputs of all filter branches
        total_filter_num = self._config.num_filters * len(self._config.filter_sizes)
        all_features = tf.reshape(tf.concat(pooled_outputs, axis=-1), [-1, total_filter_num])
        tf.logging.info("Shape of all_features:{}".format(str(all_features.shape)))

        # Apply dropout during training
        if is_training:
            all_features = tf.nn.dropout(all_features, rate=self._config.dropout_rate)

        with tf.name_scope("output"):
            # output_dense_layer = tf.layers.Dense(self._config.label_size, use_bias=True, name="output_layer")
            # logits = output_dense_layer(all_features)
            # tf.logging.info("Shape of logits:{}".format(str(logits.shape)))
            # self.predictions = tf.nn.softmax(logits, name="predictions")
            # tf.logging.info("Shape of predictions:{}".format(str(self.predictions.shape)))
            W = tf.get_variable(
                name="W",
                shape=[total_filter_num, self._config.label_size],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self._config.label_size]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(all_features, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Compute loss
        with tf.name_scope("loss"):
            # losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + self._config.l2_reg_lambda * l2_loss

        # # Compute accuracy metric
        # with tf.name_scope("accuracy"):
        #     self.accuracy = self._accuracy_op(self.predictions, self.input_y)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    # def _accuracy_op(self, predictions, labels):
    #     return tf.metrics.accuracy(labels=tf.argmax(self.input_y, axis=-1),
    #                                predictions=tf.argmax(predictions, axis=-1))
--------------------------------------------------------------------------------
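A minimal instantiation sketch with the default hyper-parameters from model.py; the vocab_size here is an arbitrary placeholder (in training it comes from the fitted VocabularyProcessor), and the shape comments are simply the arithmetic implied by the code above, not measured output:

# Illustrative only -- shape walk-through for the graph built above.
from text_cnn import ModelConfig, TextCNNModel

config = ModelConfig(embedding_dim=128, filter_sizes="3,4,5", num_filters=128,
                     dropout_rate=0.5, max_seq_length=64, vocab_size=20000, label_size=2)
cnn = TextCNNModel(config=config, is_training=False)
# For a batch of size B this yields:
#   input_x:           [B, 64]                    word ids
#   char_emb_expanded: [B, 64, 128, 1]            embeddings plus channel dim
#   conv (size 3):     [B, 64 - 3 + 1, 1, 128] = [B, 62, 1, 128]   (61/60 for sizes 4/5)
#   pooled:            [B, 1, 1, 128]             max over all positions
#   all_features:      [B, 3 * 128] = [B, 384]
#   scores:            [B, 2]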
/train-eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
# Change CODE_DIR to your local checkout before running
CODE_DIR="/home/work/work/modifyAI/textCNN"
MODEL_DIR=$CODE_DIR/model
TRAIN_DATA_DIR=$CODE_DIR/data_set

# NOTE: model.py reads --output_dir (not --model_dir) for checkpoints and vocab;
# both default to ./data/model/ relative to the working directory.
nohup python3 $CODE_DIR/model.py \
    --is_train=true \
    --num_epochs=200 \
    --save_checkpoints_steps=100 \
    --keep_checkpoint_max=50 \
    --batch_size=64 \
    --positive_data_file=$TRAIN_DATA_DIR/polarity.pos \
    --negative_data_file=$TRAIN_DATA_DIR/polarity.neg \
    --model_dir=$MODEL_DIR > $CODE_DIR/train_log.txt 2>&1 &
--------------------------------------------------------------------------------