├── BiLstmTextRelation ├── p9_BiLstmTextRelation_model.py └── p9_BiLstmTextRelation_train.py ├── CNN ├── LICENSE ├── README.md ├── binary_class_data_loader.py ├── char_data_processor.py ├── data_helpers.py ├── eval.py ├── multi_class_data_loader.py ├── text_cnn.py ├── train.py └── word_data_processor.py ├── CNNSentenceClassificationTflearn ├── p4_cnn_sentence_classification.py ├── p4_cnn_sentence_classification_zhihu.py ├── p4_cnn_sentence_classification_zhihu2.py ├── p4_cnn_sentence_classification_zhihu2_predict.py └── p4_conv_classification_tflearn.py ├── DynamicMemoryNet ├── a8_dynamic_memory_network.py ├── a8_predict.py └── a8_train.py ├── GraphCNN ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── __init__.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils │ ├── NYT_utils.py │ ├── lshtc_utils.py │ ├── lshtc_utils2.py │ ├── read │ ├── tmp.py │ └── utils.py ├── HLSTM └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── HierarchicalAttentionNetwork ├── HAN_model.py ├── p1_HierarchicalAttention_model.py ├── p1_HierarchicalAttention_model_transformer.py ├── p1_HierarchicalAttention_predict.py ├── p1_HierarchicalAttention_train.py └── p1_seq2seq.py ├── Keras_Version ├── main.py ├── model2.py ├── test2matrix_process.py └── words_index.json ├── NewGraphCNNs ├── Pytorch_GraphCNNs ├── make_graphs.py ├── make_heiring.py ├── rcv1_processer.py ├── test.py ├── test_extra.py ├── train.py └── unzip.py ├── RCNN └── v-cpp │ ├── ecnn-noada.cpp │ └── fileutil.hpp ├── README.md ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── Seq2seqWithAttention ├── a1_seq2seq.py ├── a1_seq2seq_attention_model.py ├── a1_seq2seq_attention_predict.py └── a1_seq2seq_attention_train.py ├── Text2Graph ├── Text2Graph-master │ └── src │ │ └── main │ │ └── java │ │ └── ecs │ │ ├── CoreNLPService.java │ │ └── TestCoreNLP.java └── src │ └── main │ └── java │ └── ecs │ ├── CoreNLPService.java │ └── TestCoreNLP.java ├── TextCNN ├── __pycache__ │ ├── data_util.cpython-36.pyc │ └── p7_TextCNN_model.cpython-36.pyc ├── data_util.py ├── other_experiement │ ├── data_util_zhihu.py │ ├── p7_TextCNN_predict_ensemble.py │ ├── p7_TextCNN_predict_exp.py │ ├── p7_TextCNN_predict_exp512.py │ ├── p7_TextCNN_predict_exp512_0609.py │ ├── p7_TextCNN_predict_exp512_simple.py │ ├── p7_TextCNN_train_exp.py │ ├── p7_TextCNN_train_exp512.py │ ├── p7_TextCNN_train_exp_512_0609.py │ └── p8_TextCNN_predict_exp.py ├── p7_TextCNN_model.py ├── p7_TextCNN_model_multilayers.py ├── p7_TextCNN_predict.py └── p7_TextCNN_train.py ├── TextRCNN ├── p71_TextRCNN_mode2.py ├── p71_TextRCNN_model.py ├── p71_TextRCNN_predict.py └── p71_TextRCNN_train.py ├── TextRNN ├── p8_TextRNN_model.py ├── p8_TextRNN_model_multi_layers.py ├── p8_TextRNN_predict.py └── p8_TextRNN_train.py ├── __init__.py ├── __pycache__ ├── graphcnn.cpython-34.pyc ├── graphcnn_GPU.cpython-34.pyc ├── graphcnn_generate_data.cpython-34.pyc ├── graphcnn_input.cpython-34.pyc └── 
graphcnn_option.cpython-34.pyc ├── boosting └── a08_boosting.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils ├── NYT_utils.py ├── lshtc_utils.py ├── lshtc_utils2.py ├── read ├── tmp.py └── utils.py /CNN/README.md: -------------------------------------------------------------------------------- 1 | **[This code belongs to the "Implementing a CNN for Text Classification in Tensorflow" blog post.](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)** 2 | 3 | It is slightly simplified implementation of Kim's [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) paper in Tensorflow. 4 | 5 | ## Requirements 6 | 7 | - Python 3 8 | - Tensorflow > 0.8 9 | - Numpy 10 | 11 | ## Training 12 | 13 | Print parameters: 14 | 15 | ```bash 16 | ./train.py --help 17 | ``` 18 | 19 | ``` 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --embedding_dim EMBEDDING_DIM 23 | Dimensionality of character embedding (default: 128) 24 | --filter_sizes FILTER_SIZES 25 | Comma-separated filter sizes (default: '3,4,5') 26 | --num_filters NUM_FILTERS 27 | Number of filters per filter size (default: 128) 28 | --l2_reg_lambda L2_REG_LAMBDA 29 | L2 regularizaion lambda (default: 0.0) 30 | --dropout_keep_prob DROPOUT_KEEP_PROB 31 | Dropout keep probability (default: 0.5) 32 | --batch_size BATCH_SIZE 33 | Batch Size (default: 64) 34 | --num_epochs NUM_EPOCHS 35 | Number of training epochs (default: 100) 36 | --evaluate_every EVALUATE_EVERY 37 | Evaluate model on dev set after this many steps 38 | (default: 100) 39 | --checkpoint_every CHECKPOINT_EVERY 40 | Save model after this many steps (default: 100) 41 | --allow_soft_placement ALLOW_SOFT_PLACEMENT 42 | Allow device soft device placement 43 | --noallow_soft_placement 44 | --log_device_placement LOG_DEVICE_PLACEMENT 45 | Log placement of ops on devices 46 | --nolog_device_placement 47 | 48 | ``` 49 | 50 | Train: 51 | 52 | ```bash 53 | ./train.py 54 | ``` 55 | 56 | ## Evaluating 57 | 58 | ```bash 59 | ./eval.py --eval_train --checkpoint_dir="./runs/1459637919/checkpoints/" 60 | ``` 61 | 62 | Replace the checkpoint dir with the output from the training. To use your own data, change the `eval.py` script to load your data. 63 | 64 | 65 | ## References 66 | 67 | - [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) 68 | - [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1510.03820) -------------------------------------------------------------------------------- /CNN/binary_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tensorflow.contrib import learn 4 | 5 | class BinaryClassDataLoader(object): 6 | """ 7 | Load binary classification data from two files (positive and negative) and 8 | split data into train and dev. 
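    Positive examples are labelled [0, 1] and negative examples [1, 0]; the last
    `dev_sample_percentage` fraction of the shuffled data is held out as the dev set.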
9 | """ 10 | def __init__(self, flags, data_processor, clean_data=None, classes=None): 11 | self.__flags = flags 12 | self.__data_processor = data_processor 13 | self.__clean_data = clean_data 14 | self.__classes = classes 15 | 16 | def define_flags(self): 17 | self.__flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 18 | self.__flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 19 | self.__flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.") 20 | 21 | def prepare_data(self): 22 | self.__resolve_params() 23 | 24 | x_text, y = self.load_data_and_labels() 25 | 26 | # Build vocabulary 27 | self.vocab_processor = self.__data_processor.vocab_processor(x_text) 28 | x = np.array(list(self.vocab_processor.fit_transform(x_text))) 29 | 30 | # Randomly shuffle data 31 | np.random.seed(10) 32 | shuffle_indices = np.random.permutation(np.arange(len(y))) 33 | x_shuffled = x[shuffle_indices] 34 | y_shuffled = y[shuffle_indices] 35 | 36 | # Split train/test set 37 | # TODO: This is very crude, should use cross-validation 38 | dev_sample_index = -1 * int(self.__dev_sample_percentage * float(len(y))) 39 | x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 40 | y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | self.vocab_processor = self.__data_processor.restore_vocab_processor(vocab_path) 45 | return self.vocab_processor 46 | 47 | def class_labels(self, class_indexes): 48 | if self.__classes is None: 49 | result = class_indexes 50 | else: 51 | result = [ self.__classes[idx] for idx in class_indexes ] 52 | return result 53 | 54 | def load_data_and_labels(self): 55 | """ 56 | Loads MR polarity data from files, splits the data into words and generates labels. 57 | Returns split sentences and labels. 
58 | """ 59 | self.__resolve_params() 60 | 61 | # Load data from files 62 | positive_examples = list(open(self.__positive_data_file, "r").readlines()) 63 | negative_examples = list(open(self.__negative_data_file, "r").readlines()) 64 | # Split by words 65 | x_text = positive_examples + negative_examples 66 | x_text = [self.__data_processor.clean_data(sent) for sent in x_text] 67 | # Generate labels 68 | positive_labels = [[0, 1] for _ in positive_examples] 69 | negative_labels = [[1, 0] for _ in negative_examples] 70 | y = np.concatenate([positive_labels, negative_labels], 0) 71 | return [x_text, y] 72 | 73 | def __resolve_params(self): 74 | self.__dev_sample_percentage = self.__flags.FLAGS.dev_sample_percentage 75 | self.__positive_data_file = self.__flags.FLAGS.positive_data_file 76 | self.__negative_data_file = self.__flags.FLAGS.negative_data_file 77 | -------------------------------------------------------------------------------- /CNN/char_data_processor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | 4 | class CharDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.decode("utf-8")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | return string 18 | 19 | class VocabularyProcessor(object): 20 | def __init__(self, max_document_length, min_frequency=0, vocabulary=None, 21 | tokenizer_fn=None): 22 | # init a class. index maxdocument length and a vocabulabrary 23 | if vocabulary == None: 24 | self.vocabulary_ = {"":0} # padding 25 | else: 26 | self.vocabulary_ = vocabulary 27 | 28 | self.index = 1 29 | self.max_document_length = max_document_length 30 | def fit_transform(self, raw_documents, unused_y=None, fit=True): 31 | result = [] 32 | for raw_document in raw_documents: 33 | # mark for this, we can find it is a [[I am a student]] 34 | result.append([self.__vocab_id(char, fit) for char in raw_document.decode("utf-8")]) 35 | 36 | if self.max_document_length == None: 37 | max_document_length = max([len(vocab_ids) for vocab_ids in result]) 38 | else: 39 | max_document_length = self.max_document_length 40 | 41 | result = self.__smooth_lengths(result, max_document_length) 42 | 43 | return result 44 | 45 | def transform(self, raw_documents): 46 | return self.fit_transform(raw_documents, None, False) 47 | 48 | def save(self, file): 49 | with codecs.open(file, 'w', 'utf-8') as f: 50 | data = {"vocabulary_": self.vocabulary_, "index": self.index, 51 | "max_document_length": self.max_document_length} 52 | f.write(json.dumps(data, ensure_ascii=False)) 53 | 54 | @classmethod 55 | def restore(cls, file): 56 | with codecs.open(file, "r", "utf-8") as f: 57 | data = json.loads(f.readline()) 58 | vp = cls(data["max_document_length"], 0, data["vocabulary_"]) 59 | vp.index = data["index"] 60 | return vp 61 | 62 | @staticmethod 63 | def __smooth_lengths(documents, length): 64 | result = [] 65 | for document in documents: 66 | if len(document) > length: 67 | doccument = document[:length] 68 | elif len(document) < length: 69 | document = document + [0] * (length - len(document)) 70 | result.append(document) 71 | return result 72 | 73 | def __vocab_id(self, char, fit = True): 74 | # every word has a 
id 75 | if char not in self.vocabulary_: 76 | if fit: 77 | self.vocabulary_[char] = self.index 78 | self.index += 1 79 | else: 80 | char = "" 81 | return self.vocabulary_[char] 82 | 83 | -------------------------------------------------------------------------------- /CNN/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 7 | """ 8 | Generates a batch iterator for a dataset. 9 | """ 10 | data = np.array(data) 11 | data_size = len(data) 12 | num_batches_per_epoch = int(len(data)/batch_size) + 1 13 | for epoch in range(num_epochs): 14 | # Shuffle the data at each epoch 15 | if shuffle: 16 | shuffle_indices = np.random.permutation(np.arange(data_size)) 17 | shuffled_data = data[shuffle_indices] 18 | else: 19 | shuffled_data = data 20 | for batch_num in range(num_batches_per_epoch): 21 | start_index = batch_num * batch_size 22 | end_index = min((batch_num + 1) * batch_size, data_size) 23 | yield shuffled_data[start_index:end_index] 24 | -------------------------------------------------------------------------------- /CNN/eval.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | #from binary_class_data_loader import BinaryClassDataLoader 11 | from multi_class_data_loader import MultiClassDataLoader 12 | #from word_data_processor import WordDataProcessor 13 | from char_data_processor import CharDataProcessor 14 | import csv 15 | 16 | # Parameters 17 | # ================================================== 18 | 19 | # Eval Parameters 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 22 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 23 | 24 | # Misc Parameters 25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 27 | 28 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 29 | data_loader.define_flags() 30 | 31 | FLAGS = tf.flags.FLAGS 32 | FLAGS._parse_flags() 33 | print("\nParameters:") 34 | for attr, value in sorted(FLAGS.__flags.items()): 35 | print("{}={}".format(attr.upper(), value)) 36 | print("") 37 | 38 | # CHANGE THIS: Load data. 
Load your own data here 39 | if FLAGS.eval_train: 40 | x_raw, y_test = data_loader.load_data_and_labels() 41 | y_test = np.argmax(y_test, axis=1) 42 | else: 43 | x_raw = ["a masterpiece four years in the making", "everything is off."] 44 | y_test = [1, 0] 45 | 46 | # Map data into vocabulary 47 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 48 | vocab_processor = data_loader.restore_vocab_processor(vocab_path) 49 | x_test = np.array(list(vocab_processor.transform(x_raw))) 50 | 51 | print("\nEvaluating...\n") 52 | 53 | # Evaluation 54 | # ================================================== 55 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | session_conf = tf.ConfigProto( 59 | allow_soft_placement=FLAGS.allow_soft_placement, 60 | log_device_placement=FLAGS.log_device_placement) 61 | sess = tf.Session(config=session_conf) 62 | with sess.as_default(): 63 | # Load the saved meta graph and restore variables 64 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 65 | saver.restore(sess, checkpoint_file) 66 | 67 | # Get the placeholders from the graph by name 68 | input_x = graph.get_operation_by_name("input_x").outputs[0] 69 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | # Tensors we want to evaluate 73 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 74 | 75 | # Generate batches for one epoch 76 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 77 | 78 | # Collect the predictions here 79 | all_predictions = [] 80 | 81 | for x_test_batch in batches: 82 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 83 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 84 | 85 | # Print accuracy if y_test is defined 86 | if y_test is not None: 87 | correct_predictions = float(sum(all_predictions == y_test)) 88 | print("Total number of test examples: {}".format(len(y_test))) 89 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 90 | 91 | # Save the evaluation to a csv 92 | all_predictions = data_loader.class_labels(all_predictions.astype(int)) 93 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 94 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 95 | print("Saving evaluation to {0}".format(out_path)) 96 | with open(out_path, 'w') as f: 97 | csv.writer(f).writerows(predictions_human_readable) 98 | -------------------------------------------------------------------------------- /CNN/multi_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | 4 | class MultiClassDataLoader(object): 5 | """ 6 | Handles multi-class training data. It takes predefined sets of "train_data_file" and "dev_data_file" 7 | of the following record format. 8 | \t 9 | ex. "what a masterpiece! Positive" 10 | 11 | Class labels are given as "class_data_file", which is a list of class labels. 
12 | """ 13 | def __init__(self, flags, data_processor): 14 | self.__flags = flags 15 | self.__data_processor = data_processor 16 | self.__train_data_file = None 17 | self.__dev_data_file = None 18 | self.__class_data_file = None 19 | self.__classes_cache = None 20 | 21 | 22 | def define_flags(self): 23 | self.__flags.DEFINE_string("train_data_file", "./data/rt-polaritydata/train.txt", "Data source for the training data.") 24 | self.__flags.DEFINE_string("dev_data_file", "./data/rt-polaritydata/test.txt", "Data source for the cross validation data.") 25 | self.__flags.DEFINE_string("class_data_file", "./data/rt-polaritydata/lable.txt", "Data source for the class list.") 26 | 27 | def prepare_data(self): 28 | self.__resolve_params() 29 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 30 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 31 | 32 | max_doc_len = max([len(doc.decode("utf-8")) for doc in x_train]) 33 | max_doc_len_dev = max([len(doc.decode("utf-8")) for doc in x_dev]) 34 | if max_doc_len_dev > max_doc_len: 35 | max_doc_len = max_doc_len_dev 36 | # Build vocabulary 37 | self.vocab_processor = self.__data_processor.vocab_processor(x_train, x_dev) 38 | x_train = np.array(list(self.vocab_processor.fit_transform(x_train))) 39 | # Build vocabulary 40 | x_dev = np.array(list(self.vocab_processor.fit_transform(x_dev))) 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | return self.__data_processor.restore_vocab_processor(vocab_path) 45 | 46 | def class_labels(self, class_indexes): 47 | return [ self.__classes()[idx] for idx in class_indexes ] 48 | 49 | def load_data_and_labels(self): 50 | self.__resolve_params() 51 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 52 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 53 | x_all = x_train + x_dev 54 | y_all = np.concatenate([y_train, y_dev], 0) 55 | return [x_all, y_all] 56 | 57 | def __load_data_and_labels(self, data_file): 58 | x_text = [] 59 | y = [] 60 | with open(data_file, 'r') as tsvin: 61 | classes = self.__classes() 62 | one_hot_vectors = np.eye(len(classes), dtype=int) 63 | class_vectors = {} 64 | for i, cls in enumerate(classes): 65 | class_vectors[cls] = one_hot_vectors[i] 66 | #edit for the first to the code. 
67 | all_lines = tsvin.readlines() 68 | for line in all_lines: 69 | temp = line.split(' ',1) 70 | data = self.__data_processor.clean_data(temp[1]) 71 | x_text.append(data) 72 | y.append(class_vectors[temp[0]]) 73 | #edit 74 | # tsvin = csv.reader(tsvin, delimiter='\t') 75 | # for row in tsvin: 76 | # data = self.__data_processor.clean_data(row[0]) 77 | # x_text.append(data) 78 | # y.append(class_vectors[row[1]]) 79 | return [x_text, np.array(y)] 80 | 81 | def __classes(self): 82 | self.__resolve_params() 83 | if self.__classes_cache is None: 84 | with open(self.__class_data_file, 'r') as catin: 85 | classes = list(catin.readlines()) 86 | self.__classes_cache = [s.strip() for s in classes] 87 | return self.__classes_cache 88 | 89 | def __resolve_params(self): 90 | if self.__class_data_file is None: 91 | self.__train_data_file = self.__flags.FLAGS.train_data_file 92 | self.__dev_data_file = self.__flags.FLAGS.dev_data_file 93 | self.__class_data_file = self.__flags.FLAGS.class_data_file 94 | -------------------------------------------------------------------------------- /CNN/text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class TextCNN(object): 6 | """ 7 | A CNN for text classification. 8 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 9 | """ 10 | def __init__( 11 | self, sequence_length, num_classes, vocab_size, 12 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 13 | 14 | # Placeholders for input, output and dropout 15 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 16 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 17 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 18 | 19 | # Keeping track of l2 regularization loss (optional) 20 | l2_loss = tf.constant(0.0) 21 | 22 | # Embedding layer 23 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 24 | self.W = tf.Variable( 25 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 26 | trainable = False, 27 | name="W") 28 | 29 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 30 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 31 | 32 | # Create a convolution + maxpool layer for each filter size 33 | pooled_outputs = [] 34 | for i, filter_size in enumerate(filter_sizes): 35 | with tf.name_scope("conv-maxpool-%s" % filter_size): 36 | # Convolution Layer 37 | filter_shape = [filter_size, embedding_size, 1, num_filters] 38 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 39 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 40 | conv = tf.nn.conv2d( 41 | self.embedded_chars_expanded, 42 | W, 43 | strides=[1, 1, 1, 1], 44 | padding="VALID", 45 | name="conv") 46 | # Apply nonlinearity 47 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 48 | # Maxpooling over the outputs 49 | pooled = tf.nn.max_pool( 50 | h, 51 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 52 | strides=[1, 1, 1, 1], 53 | padding='VALID', 54 | name="pool") 55 | pooled_outputs.append(pooled) 56 | 57 | # Combine all the pooled features 58 | num_filters_total = num_filters * len(filter_sizes) 59 | self.h_pool = tf.concat(3, pooled_outputs) 60 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 61 | 62 | # Add dropout 63 | with tf.name_scope("dropout"): 64 | self.h_drop = tf.nn.dropout(self.h_pool_flat, 
self.dropout_keep_prob) 65 | 66 | # Final (unnormalized) scores and predictions 67 | with tf.name_scope("output"): 68 | W = tf.get_variable( 69 | "W", 70 | shape=[num_filters_total, num_classes], 71 | initializer=tf.contrib.layers.xavier_initializer()) 72 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 73 | l2_loss += tf.nn.l2_loss(W) 74 | l2_loss += tf.nn.l2_loss(b) 75 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 76 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 77 | 78 | # CalculateMean cross-entropy loss 79 | with tf.name_scope("loss"): 80 | losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y) 81 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 82 | 83 | # Accuracy 84 | with tf.name_scope("accuracy"): 85 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 86 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 87 | -------------------------------------------------------------------------------- /CNN/word_data_processor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tensorflow.contrib import learn 3 | 4 | class WordDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.split(" ")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return learn.preprocessing.VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return learn.preprocessing.VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | """ 18 | Tokenization/string cleaning for all datasets except for SST. 19 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 20 | """ 21 | string = string.strip() 22 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 23 | string = re.sub(r"\'s", " \'s", string) 24 | string = re.sub(r"\'ve", " \'ve", string) 25 | string = re.sub(r"n\'t", " n\'t", string) 26 | string = re.sub(r"\'re", " \'re", string) 27 | string = re.sub(r"\'d", " \'d", string) 28 | string = re.sub(r"\'ll", " \'ll", string) 29 | string = re.sub(r",", " , ", string) 30 | string = re.sub(r"!", " ! ", string) 31 | string = re.sub(r"\(", " \( ", string) 32 | string = re.sub(r"\)", " \) ", string) 33 | string = re.sub(r"\?", " \? ", string) 34 | string = re.sub(r"\s{2,}", " ", string) 35 | return string.strip().lower() 36 | -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 
14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | from tflearn.datasets import imdb 26 | import numpy as np 27 | 28 | print("started...") 29 | # 1.IMDB Dataset loading 30 | train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,valid_portion=0.1) 31 | trainX, trainY = train 32 | testX, testY = test 33 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 34 | print("testY.shape:",np.array(testY).shape) #2500个label 35 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 36 | print("testY[0]:",testY[0]) #0 37 | 38 | # 2.Data preprocessing 39 | # Sequence padding 40 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 41 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 42 | # Converting labels to binary vectors 43 | trainY = to_categorical(trainY, nb_classes=2) #y as one hot 44 | testY = to_categorical(testY, nb_classes=2) #y as one hot 45 | 46 | # 3.Building convolutional network 47 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 48 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 49 | network = tflearn.embedding(network, input_dim=10000, output_dim=128) #[None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, oput_dim:embedding size 50 | #conv_1d(incoming,nb_filter,filter_size) 51 | branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters]. 
padding:"VALID",only ever drops the right-most columns 52 | branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 53 | branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 54 | network = merge([branch1, branch2, branch3], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 55 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 56 | network = global_max_pool(network) #[batch_size, pooled dim] 57 | network = dropout(network, 0.5) #[batch_size, pooled dim] 58 | network = fully_connected(network, 2, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,2] 59 | network = regression(network, optimizer='adam', learning_rate=0.001, 60 | loss='categorical_crossentropy', name='target') 61 | # Training 62 | model = tflearn.DNN(network, tensorboard_verbose=0) 63 | model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32) 64 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | 30 | print("started...") 31 | f_cache='data_zhihu.pik' 32 | # 1. 
loading dataset
33 | with open(f_cache, 'r') as f:
34 |     trainX,trainY,testX,testY=pickle.load(f)
35 | if trainX is None or trainY is None:  # if the cached training data does not exist
36 |     print("training data does not exist ==> load data, and dump it to the file system")
37 |     vocabulary_word2index, vocabulary_index2word = create_voabulary()
38 |     vocabulary_word2index_label = create_voabulary_label()
39 |     train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label)
40 |     trainX, trainY = train
41 |     testX, testY = test
42 |     nb_classes=1999
43 |     print("testX.shape:",np.array(testX).shape) # 2500 lists; each list represents one sentence
44 |     print("testY.shape:",np.array(testY).shape) # 2500 labels
45 |     print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
46 |     print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0
47 | 
48 |     # 2.Data preprocessing
49 |     # Sequence padding
50 |     print("start padding & transform to one hot...")
51 |     trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length
52 |     testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length
53 |     # Converting labels to binary vectors
54 |     trainY = to_categorical(trainY, nb_classes=nb_classes) #y as one hot
55 |     testY = to_categorical(testY, nb_classes=nb_classes) #y as one hot
56 |     print("end padding & transform to one hot...")
57 |     # cache trainX,trainY,testX,testY for next time use.
58 |     pickle.dump((trainX,trainY,testX,testY), open(f_cache, 'w'))
59 | else:
60 |     print("training data exists in cache. going to use it.")
61 | 
62 | # 3.Building convolutional network
63 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC##############################################################################################
64 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
65 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be fed with data when training
66 | network = tflearn.embedding(network, input_dim=10000, output_dim=128) #[None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, output_dim: embedding size
67 | #conv_1d(incoming,nb_filter,filter_size)
68 | branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters].
padding:"VALID",only ever drops the right-most columns 69 | branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 70 | branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 71 | network = merge([branch1, branch2, branch3], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 72 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 73 | network = global_max_pool(network) #[batch_size, pooled dim] 74 | network = dropout(network, 0.5) #[batch_size, pooled dim] 75 | network = fully_connected(network, nb_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,2] 76 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') 77 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 78 | # 4.Training 79 | model = tflearn.DNN(network, tensorboard_verbose=0) 80 | model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 81 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | import os 30 | #import tflearn.metrics.Top_k as Top_k 31 | 32 | print("started...") 33 | f_cache='data_zhihu.pik' 34 | # 1. 
loading dataset 35 | trainX,trainY,testX,testY=None,None,None,None 36 | number_classes=1999 37 | #if os.path.exists(f_cache): 38 | # with open(f_cache, 'r') as f: 39 | # trainX,trainY,testX,testY,vocab_size=pickle.load(f) 40 | #if trainX is None or trainY is None: #如果训练数据,不存在 41 | #------------------------------------------------------------------------------------------------- 42 | print("training data not exist==>load data, and dump it to file system") 43 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 44 | vocab_size=len(vocabulary_word2index) 45 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 46 | train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label) 47 | trainX, trainY = train 48 | testX, testY = test 49 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 50 | print("testY.shape:",np.array(testY).shape) #2500个label 51 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 52 | print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0 53 | 54 | # 2.Data preprocessing 55 | # Sequence padding 56 | print("start padding & transform to one hot...") 57 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 58 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 59 | # Converting labels to binary vectors 60 | trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot 61 | testY = to_categorical(testY, nb_classes=number_classes) #y as one hot 62 | print("end padding & transform to one hot...") 63 | #-------------------------------------------------------------------------------------------------- 64 | # cache trainX,trainY,testX,testY for next time use. 65 | # with open(f_cache, 'w') as f: 66 | # pickle.dump((trainX,trainY,testX,testY,vocab_size),f) 67 | #else: 68 | # print("traning data exists in cache. going to use it.") 69 | 70 | # 3.Building convolutional network 71 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC############################################################################################## 72 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 73 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 74 | network = tflearn.embedding(network, input_dim=vocab_size, output_dim=256) #TODO 128 [None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, oput_dim:embedding size 75 | #conv_1d(incoming,nb_filter,filter_size) 76 | branch1 = conv_1d(network, 256, 1, padding='valid', activation='relu', regularizer="L2") #128 77 | branch2 = conv_1d(network, 256, 2, padding='valid', activation='relu', regularizer="L2") #128 78 | branch3 = conv_1d(network, 256, 3, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps1, nb_filters]. 
padding:"VALID",only ever drops the right-most columns 79 | branch4 = conv_1d(network, 256, 4, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps2, nb_filters] 80 | branch5 = conv_1d(network, 256, 5, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] 81 | branch6 = conv_1d(network, 256, 6, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 82 | branch7 = conv_1d(network, 256, 7, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 83 | branch8 = conv_1d(network, 256, 7, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 84 | branch9 = conv_1d(network, 256, 8, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 85 | branch10 = conv_1d(network,256, 9, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 86 | network = merge([branch1, branch2, branch3,branch4,branch5,branch6, branch7, branch8,branch9,branch10], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 87 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 88 | network = global_max_pool(network) #input: 4-D tensors,[batch_size,height,width,in_channels]; output:2-D Tensor,[batch_size, pooled dim] 89 | network = dropout(network, 0.5) #[batch_size, pooled dim] 90 | network = fully_connected(network, number_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,number_classes] 91 | #top5 = tflearn.metrics.Top_k(k=5) 92 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') #,metric=top5 93 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 94 | # 4.Training 95 | model = tflearn.DNN(network, tensorboard_verbose=0) 96 | model.fit(trainX, trainY, n_epoch = 10, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 97 | model.save('model_zhihu_cnn12345') 98 | 99 | print("going to make a prediction...") 100 | model.predict(testX[0:1000]) 101 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 
14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | import os 30 | #import tflearn.metrics.Metric.Top_k as Top_k 31 | 32 | print("started...") 33 | f_cache='data_zhihu.pik' 34 | # 1. loading dataset 35 | trainX,trainY,testX,testY=None,None,None,None 36 | number_classes=1999 37 | #if os.path.exists(f_cache): 38 | # with open(f_cache, 'r') as f: 39 | # trainX,trainY,testX,testY,vocab_size=pickle.load(f) 40 | #if trainX is None or trainY is None: #如果训练数据,不存在 41 | #------------------------------------------------------------------------------------------------- 42 | print("training data not exist==>load data, and dump it to file system") 43 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 44 | vocab_size=len(vocabulary_word2index) 45 | vocabulary_word2index_label = create_voabulary_label() 46 | train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label) 47 | trainX, trainY = train 48 | testX, testY = test 49 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 50 | print("testY.shape:",np.array(testY).shape) #2500个label 51 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 52 | print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0 53 | 54 | # 2.Data preprocessing 55 | # Sequence padding 56 | print("start padding & transform to one hot...") 57 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 58 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 59 | # Converting labels to binary vectors 60 | trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot 61 | testY = to_categorical(testY, nb_classes=number_classes) #y as one hot 62 | print("end padding & transform to one hot...") 63 | #-------------------------------------------------------------------------------------------------- 64 | # cache trainX,trainY,testX,testY for next time use. 65 | # with open(f_cache, 'w') as f: 66 | # pickle.dump((trainX,trainY,testX,testY,vocab_size),f) 67 | #else: 68 | # print("traning data exists in cache. going to use it.") 69 | 70 | # 3.Building convolutional network 71 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC############################################################################################## 72 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 73 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 74 | network = tflearn.embedding(network, input_dim=vocab_size, output_dim=128) #TODO [None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. 
input_dim: vocabulary size, oput_dim:embedding size 75 | #conv_1d(incoming,nb_filter,filter_size) 76 | branch1 = conv_1d(network, 128, 1, padding='valid', activation='relu', regularizer="L2") 77 | branch2 = conv_1d(network, 128, 2, padding='valid', activation='relu', regularizer="L2") 78 | branch3 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters]. padding:"VALID",only ever drops the right-most columns 79 | branch4 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 80 | branch5 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 81 | network = merge([branch1, branch2, branch3,branch4,branch5], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 82 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 83 | network = global_max_pool(network) #[batch_size, pooled dim] 84 | network = dropout(network, 0.5) #[batch_size, pooled dim] 85 | network = fully_connected(network, number_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,number_classes] 86 | top5 = tflearn.metrics.Top_k(k=5) 87 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') #metric=top5 88 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 89 | # 4.Training 90 | model = tflearn.DNN(network, tensorboard_verbose=0) 91 | #model.fit(trainX, trainY, n_epoch = 10, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 92 | #model.save('model_zhihu_cnn12345') 93 | model.load('model_zhihu_cnn12345') 94 | print("going to make a prediction...") 95 | predict_result=model.predict(testX[0:1000]) 96 | print("predict_result:",predict_result) 97 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_conv_classification_tflearn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | 3 | import tensorflow as tf 4 | 5 | # -*- coding: utf-8 -*- 6 | 7 | """ Convolutional network applied to CIFAR-10 dataset classification task. 8 | References: 9 | Learning Multiple Layers of Features from Tiny Images, A. Krizhevsky, 2009. 
10 | Links: 11 | [CIFAR-10 Dataset](https://www.cs.toronto.edu/~kriz/cifar.html) 12 | """ 13 | 14 | import tflearn 15 | from tflearn.data_utils import shuffle, to_categorical 16 | from tflearn.layers.core import input_data, dropout, fully_connected 17 | from tflearn.layers.conv import conv_2d, max_pool_2d 18 | from tflearn.layers.estimator import regression 19 | from tflearn.data_preprocessing import ImagePreprocessing 20 | from tflearn.data_augmentation import ImageAugmentation 21 | 22 | print("started...") 23 | # Data loading and preprocessing 24 | from tflearn.datasets import cifar10 25 | (X, Y), (X_test, Y_test) = cifar10.load_data() 26 | X, Y = shuffle(X, Y) 27 | Y = to_categorical(Y, 10) 28 | Y_test = to_categorical(Y_test, 10) 29 | 30 | # Real-time data preprocessing 31 | img_prep = ImagePreprocessing() 32 | img_prep.add_featurewise_zero_center() 33 | img_prep.add_featurewise_stdnorm() 34 | 35 | # Real-time data augmentation 36 | img_aug = ImageAugmentation() 37 | img_aug.add_random_flip_leftright() 38 | img_aug.add_random_rotation(max_angle=25.) 39 | 40 | # Convolutional network building 41 | #------------------------------------------------------------------------------------------- 42 | network = input_data(shape=[None, 32, 32, 3], 43 | data_preprocessing=img_prep, 44 | data_augmentation=img_aug) 45 | network = conv_2d(network, 32, 3, activation='relu') 46 | network = max_pool_2d(network, 2) 47 | network = conv_2d(network, 64, 3, activation='relu') 48 | network = conv_2d(network, 64, 3, activation='relu') 49 | network = max_pool_2d(network, 2) 50 | network = fully_connected(network, 512, activation='relu') 51 | network = dropout(network, 0.5) 52 | network = fully_connected(network, 10, activation='softmax') 53 | network = regression(network, optimizer='adam', 54 | loss='categorical_crossentropy', 55 | learning_rate=0.001) 56 | #----------------------------------------------------------------------------------------- 57 | # Train using classifier 58 | model = tflearn.DNN(network, tensorboard_verbose=0) 59 | model.fit(X, Y, n_epoch=50, shuffle=True, validation_set=(X_test, Y_test), 60 | show_metric=True, batch_size=96, run_id='cifar10_cnn') 61 | print("end...") -------------------------------------------------------------------------------- /GraphCNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 
| total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value & (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 
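# Micro-F1 and Macro-F1 above summarise label-based performance differently: Micro-F1 pools
# TP/FP/FN across all labels before computing precision/recall, so frequent labels dominate,
# while Macro-F1 averages the per-label F1 scores (built from _P_t, _R_t), weighting every
# label equally. For example, for two labels with (TP, FP, FN) = (90, 10, 10) and (1, 9, 9):
#   micro: P = R = 91/110 ~= 0.83, so F1 ~= 0.83
#   macro: per-label F1 = 0.90 and 0.10, so mean = 0.50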
115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
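# The saver built above (next to the '?????' comment) restores the
# exponential-moving-average shadow values in place of the raw weights, which
# is the usual way to evaluate a model trained with MOVING_AVERAGE_DECAY.
# A minimal sketch of that pattern on its own (TF 1.x graph mode; the variable
# below is illustrative, not one of this model's weights):
import tensorflow as tf

weights = tf.get_variable('weights', shape=[3], initializer=tf.zeros_initializer())
ema = tf.train.ExponentialMovingAverage(decay=0.999)
maintain_op = ema.apply([weights])            # run once per training step
restore_map = ema.variables_to_restore()      # {'weights/ExponentialMovingAverage': weights}
eval_saver = tf.train.Saver(restore_map)      # restore() then loads the shadow values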
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | 
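# generate_eval_index() above keeps every example whose label in the parent
# round matches one of HIER_ROOT_CODE, deduplicating with a list membership
# test that is quadratic in the number of examples. A standalone sketch of the
# same selection using a set (it assumes the hier_eval_root files hold one
# "example_id label_id" pair per line, as they are read above):
import numpy as np

def collect_examples_for_roots(pair_files, root_codes):
    chosen, seen = [], set()
    for path in pair_files:
        pairs = np.loadtxt(path, dtype=int, ndmin=2)   # columns: example_id, label_id
        for example_id, label_id in pairs:
            if label_id in root_codes and example_id not in seen:
                seen.add(int(example_id))
                chosen.append(int(example_id))
    return chosen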
total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # rootstr_tmp = [] 98 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 99 | fr = open(detail_filename, 'w') 100 | for i in range(0, np.size(total_predicted_value, axis=0)): 101 | labels = np.where(total_predicted_value[i] == 1)[0] 102 | if len(labels) > 0: 103 | labels_remap = remap[labels, 0] 104 | for elem in labels_remap: 105 | print(elem, end=' ', file=fr) 106 | if elem in total_remap[:,0]: # leaf 107 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 108 | else: 109 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 110 | # for j in range(0,len(rootlist)): 111 | # if elem in rootlist[j]: 112 | # if rootstr[j] not in rootstr_tmp: 113 | # rootstr_tmp.append(rootstr[j]) 114 | print('', file=fr) 115 | else: 116 | # labels_remap = remap[:, 0] 117 | labels = total_predicted_value_argmax[i] 118 | labels_value = total_predicted_value_max[i] 119 | labels_remap = remap[labels, 0] 120 | # for elem in labels_remap: 121 | elem = labels_remap 122 | print(elem, file=fr) 123 | if elem in total_remap[:, 0]: # leaf 124 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 125 | else: 126 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 127 | # if labels_value < 0.5: 128 | # labels_remap = remap[:, 0] 129 | # for elem in labels_remap: 130 | # if elem not in total_remap[:, 0]: 131 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 132 | 133 | fr.close() 134 | fr_leaf.close() 135 | fr_root.close() 136 | fr_leaf_exp.close() 137 | 138 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 139 | # fr = open(filename, 'w') 140 | # for one in rootstr_tmp: 141 | # print(one) 142 | # print(one,file=fr) 143 | # fr.close() 144 | 145 | 146 | 147 | 148 | def main(argv=None): # pylint: disable=unused-argument 149 | global evalDataSet 150 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
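# The writing loop above applies the fixed threshold (0.9 here) and, whenever
# no class clears it, falls back to the single argmax class so that every
# example still receives at least one label. The same decoding rule in
# isolation (probs is the vector of per-class sigmoid outputs for one example):
import numpy as np

def decode_multilabel(probs, threshold=0.9):
    labels = np.where(probs >= threshold)[0]
    if labels.size == 0:                 # nothing passed the threshold
        labels = np.array([int(np.argmax(probs))])
    return labels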
151 | 152 | # test_index_array = np.array(range(0, 81262)) 153 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 154 | test_index_array = np.array(range(0,81262)) 155 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 156 | else: 157 | test_index_array = generate_eval_index() 158 | if test_index_array is None or len(test_index_array)==0: 159 | print('no hier_data need eval') 160 | return 161 | else: 162 | print('choosing for evaluation...') 163 | print('choosed number:%d' % len(test_index_array)) 164 | 165 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 166 | checkpoint = '0' 167 | 168 | # print('choosing for evaluation...') 169 | evaluate(checkpoint,test_index_array) 170 | 171 | 172 | if __name__ == '__main__': 173 | tf.app.run() 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /GraphCNN/utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /GraphCNN/utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
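# lr_decay_value / lr_decay_ecophs in graphcnn_option.py above describe a
# piecewise-constant learning-rate schedule (the two lists have equal length).
# The training script that consumes them is not shown here, so the helper below
# is only one plausible reading of that schedule -- the i-th rate applying up to
# the i-th epoch boundary -- not the repository's own implementation:
def learning_rate_for_epoch(epoch, values, boundaries):
    for rate, boundary in zip(values, boundaries):
        if epoch <= boundary:
            return rate
    return values[-1]

# e.g. learning_rate_for_epoch(500, [0.01, 0.001, 0.0001], [1, 300, 600]) -> 0.0001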
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /GraphCNN/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /HLSTM/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) #[-1]是加在最前面 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # here i need more label. 
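# genBatch/genwordmask above pad every sentence in a batch to the longest
# sentence with the id -1 and then build a float mask that zeroes the padded
# positions, while __init__ (continued just below) turns the space-separated
# label field into a multi-hot matrix. A compact NumPy sketch of both steps,
# written for Python 3 (the original file is Python 2 Theano code):
import numpy as np

def pad_and_mask(sentences, pad_id=-1):
    # sentences: list of word-id lists for one document
    max_len = max(len(s) for s in sentences)
    batch = np.full((len(sentences), max_len), pad_id, dtype=np.int32)
    for i, s in enumerate(sentences):
        batch[i, :len(s)] = s
    mask = (batch != pad_id).astype(np.float32)
    return batch, mask

def multi_hot(label_ids, num_classes):
    y = np.zeros(num_classes, dtype=np.int32)
    y[list(label_ids)] = 1
    return y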
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | # self.docs = [] 63 | # self.label = [] 64 | # self.wordmask = [] 65 | # self.sentencemask = [] 66 | # self.maxsentencenum = [] 67 | 68 | # for i in xrange(self.epoch): 69 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | # self.docs.append(docsbatch) 72 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | # self.wordmask.append(genwordmask(docsbatch)) 74 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | self.docs = [] 76 | self.label = [] 77 | self.length = [] 78 | self.sentencenum = [] 79 | self.wordmask = [] 80 | self.sentencemask = [] 81 | self.maxsentencenum = [] 82 | 83 | for i in xrange(self.epoch): 84 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | self.docs.append(docsbatch) 88 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | self.wordmask.append(genwordmask(docsbatch)) 91 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | 94 | class Wordlist(object): 95 | def __init__(self, filename, maxn = 100000): 96 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 97 | self.size = len(lines) 98 | 99 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 100 | self.voc = dict(self.voc) 101 | 102 | def getID(self, word): 103 | try: 104 | return self.voc[word] 105 | except: 106 | return -1 107 | 108 | -------------------------------------------------------------------------------- /HLSTM/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HLSTM/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HLSTM/src/PoolLayer.py: 
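# LSTMLayer.step above is the standard LSTM cell, with the mask rows zeroing
# out padded time steps before the cell state and hidden state are carried
# forward. The same single step restated in plain NumPy (the weight dicts are
# illustrative placeholders, not the shared Theano variables):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, mask, W, U, b):
    i = sigmoid(x @ W['i'] + h_prev @ U['i'] + b['i'])    # input gate
    o = sigmoid(x @ W['o'] + h_prev @ U['o'] + b['o'])    # output gate
    f = sigmoid(x @ W['f'] + h_prev @ U['f'] + b['f'])    # forget gate
    c_tilde = np.tanh(x @ W['c'] + h_prev @ U['c'] + b['c'])
    c = (c_prev * f + c_tilde * i) * mask[:, None]        # zero out padded rows
    h = np.tanh(c) * o * mask[:, None]
    return h, c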
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. / (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HLSTM/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | 
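# softmask/SimpleAttentionLayer above compute a masked softmax over positions
# and use it to take a weighted sum of the hidden states. The same pooling in
# plain NumPy (a sketch: h is [timesteps, batch, dim], mask is
# [timesteps, batch] with 1.0 on real positions, and W, v, b play the roles of
# the layer's parameters):
import numpy as np

def masked_attention_pool(h, mask, W, v, b):
    scores = np.tanh(h @ W + b) @ v                   # [timesteps, batch]
    weights = np.exp(scores) * mask                   # padded positions get weight 0
    weights = weights / np.sum(weights, axis=0, keepdims=True)
    return np.sum(weights[:, :, None] * h, axis=0)    # [batch, dim]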
self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HLSTM/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HLSTM/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HLSTM/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
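# AdaUpdates above implements the Adadelta rule: running averages of squared
# gradients and squared updates stand in for a hand-tuned learning rate. One
# parameter update restated in NumPy, with the two running averages carried
# explicitly (a sketch, not the Theano version used by the model):
import numpy as np

def adadelta_step(param, grad, g_sq, d_sq, rho=0.95, eps=1e-6):
    g_sq = rho * g_sq + (1.0 - rho) * grad * grad
    delta = np.sqrt(d_sq + eps) / np.sqrt(g_sq + eps) * grad
    d_sq = rho * d_sq + (1.0 - rho) * delta * delta
    return param - delta, g_sq, d_sq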
33 | 34 | -------------------------------------------------------------------------------- /HierarchicalAttentionNetwork/p1_seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | # 【该方法测试的时候使用】返回一个方法。这个方法根据输入的值,得到对应的索引,再得到这个词的embedding. 5 | def extract_argmax_and_embed(embedding, output_projection=None): 6 | """ 7 | Get a loop_function that extracts the previous symbol and embeds it. Used by decoder. 8 | :param embedding: embedding tensor for symbol 9 | :param output_projection: None or a pair (W, B). If provided, each fed previous output will 10 | first be multiplied by W and added B. 11 | :return: A loop function 12 | """ 13 | def loop_function(prev, _): 14 | if output_projection is not None: 15 | prev = tf.matmul(prev, output_projection[0]) + output_projection[1] 16 | prev_symbol = tf.argmax(prev, 1) #得到对应的INDEX 17 | emb_prev = tf.gather(embedding, prev_symbol) #得到这个INDEX对应的embedding 18 | return emb_prev 19 | return loop_function 20 | 21 | # RNN的解码部分。 22 | # 如果是训练,使用训练数据的输入;如果是test,将t时刻的输出作为t+1时刻的s输入 23 | def rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function,attention_states,scope=None):#3D Tensor [batch_size x attn_length x attn_size] 24 | """RNN decoder for the sequence-to-sequence model. 25 | Args: 26 | decoder_inputs: A list of 2D Tensors [batch_size x input_size].it is target Y, but shift by one. 27 | initial_state: 2D Tensor with shape [batch_size x cell.state_size].it is the encoded vector of input sentences, which represent 'thought vector' 28 | cell: core_rnn_cell.RNNCell defining the cell function and size. 29 | loop_function: If not None, this function will be applied to the i-th output 30 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 31 | except for the first element ("GO" symbol). This can be used for decoding, 32 | but also for training to emulate http://arxiv.org/abs/1506.03099. 33 | Signature -- loop_function(prev, i) = next 34 | * prev is a 2D Tensor of shape [batch_size x output_size], 35 | * i is an integer, the step number (when advanced control is needed), 36 | * next is a 2D Tensor of shape [batch_size x input_size]. 37 | attention_states: 3D Tensor [batch_size x attn_length x attn_size].it is a input X. 38 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 39 | Returns: 40 | A tuple of the form (outputs, state), where: 41 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 42 | shape [batch_size x output_size] containing generated outputs. 43 | state: The state of each cell at the final time-step. 44 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 45 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 46 | states can be the same. They are different for LSTM cells though.) 
47 | """ 48 | with tf.variable_scope(scope or "rnn_decoder"): 49 | print("rnn_decoder_with_attention started...") 50 | state = initial_state #[batch_size x cell.state_size] 51 | _, hidden_size = state.get_shape().as_list() 52 | batch_size,sequence_length,embed_size=attention_states.get_shape().as_list() 53 | outputs = [] 54 | prev = None 55 | W_a = tf.get_variable("W_a", shape=[embed_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 56 | attention_states=tf.reshape(attention_states,shape=(-1,embed_size)) #attention_states:[batch_size*sequence_length,embed_size] 57 | attention_states = tf.nn.tanh(tf.matmul(attention_states, W_a)) #attention_states:[batch_size*sequence_length,hidden_size] 58 | attention_states=tf.reshape(attention_states,shape=(-1,sequence_length,hidden_size)) #attention_states:[batch_size,sequence_length,hidden_size] 59 | for i, inp in enumerate(decoder_inputs):#循环解码部分的输入。如sentence_length个[batch_size x input_size] 60 | # 如果是训练,使用训练数据的输入;如果是test, 将t时刻的输出作为t + 1 时刻的s输入 61 | if loop_function is not None and prev is not None:#测试的时候:如果loop_function不为空且前一个词的值不为空,那么使用前一个的值作为RNN的输入 62 | with tf.variable_scope("loop_function", reuse=True): 63 | inp = loop_function(prev, i) 64 | if i > 0: 65 | tf.get_variable_scope().reuse_variables() 66 | ##ATTENTION################################################################################################################################################# 67 | # 1.use Full connected layer to match dimension for two parts of attention. 68 | W_s = tf.get_variable("W_s_attention", shape=[hidden_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1)) 69 | 70 | state_transfered=tf.nn.tanh(tf.matmul(state,W_s)) 71 | # 2.get possibility attention for each encoder input. attention_states:[batch_size x attn_length x attn_size]; query=state:[batch_size x cell.state_size] 72 | query=tf.expand_dims(state_transfered,axis=1) #[batch_size x 1 x cell.state_size] 73 | # get logits using attention_states and query 74 | attention_logits=tf.multiply(attention_states,query) #TODO [batch_size x attn_length x attn_size]. notice: cell.state_size=atten_size=embedding_size 75 | attention_logits=tf.reduce_sum(attention_logits,2) #[batch_size x attn_length] 76 | attention_logits_max=tf.reduce_max(attention_logits,axis=1,keep_dims=True) #[batch_size x 1] 77 | # possibility distribution for each encoder input.it means how much attention or focus for each encoder input 78 | p_attention=tf.nn.softmax(attention_logits-attention_logits_max)#[batch_size x attn_length] 79 | 80 | # 3.get weighted sum of hidden state for each encoder input as attention state 81 | p_attention=tf.expand_dims(p_attention,axis=2) #[batch_size x attn_length x 1] 82 | # attention_states:[batch_size x attn_length x attn_size]; p_attention:[batch_size x attn_length]; 83 | # final attention 84 | attention_final=tf.multiply(attention_states,p_attention) #[batch_size x attn_length x attn_size] 85 | attention_final=tf.reduce_sum(attention_final,axis=1) #[batch_size x attn_size] 86 | ############################################################################################################################################################ 87 | output, state = cell(inp+attention_final, state) #使用RNN走一步 #TODO SHOULD WE ADD OR CONCAT THESE TWO PARTS. 
88 | outputs.append(output) # 将输出添加到结果列表中 89 | if loop_function is not None: 90 | prev = output 91 | print("rnn_decoder_with_attention ended...") 92 | return outputs, state -------------------------------------------------------------------------------- /Keras_Version/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | #from keras.utils.vis_utils import plot_model 5 | from model2 import gcnn 6 | from keras.optimizers import Adam 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint 8 | import pickle 9 | from keras import backend as K 10 | from keras.utils import to_categorical 11 | import gc 12 | import keras.backend.tensorflow_backend as KTF 13 | import tensorflow as tf 14 | import h5py 15 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 16 | 17 | 18 | config = tf.ConfigProto() 19 | config.gpu_options.allow_growth=True 20 | sess = tf.Session(config=config) 21 | KTF.set_session(sess) 22 | 23 | 24 | def accu(y_true, y_pred): 25 | a = K.argmax(y_true,1) 26 | b = K.argmax(y_pred,1) 27 | c = K.equal(a,b) 28 | accuracy = (K.cast(c, K.float32)) 29 | return accuracy 30 | 31 | 32 | batch_size = 128 33 | depth = 3 34 | mkenerls = [64,64,32] 35 | conv_conf = [2,1] 36 | pooling_conf = ["max",2,2] 37 | bn = False 38 | dropout = True 39 | rate = 0.8 40 | activation = "relu" 41 | conf = [50,300,10] #input size 42 | output_dim = 20 43 | 44 | lr = 0.0008 45 | epoch = 200 46 | epoch_cont = 300 47 | data_dic = os.path.dirname(os.path.abspath(__file__)) 48 | filepath = os.path.join(os.path.dirname(os.path.realpath(__file__)),'cache','Words2Matrix_{}_{}_{}.h5'.format(conf[0], conf[1], conf[2])) 49 | 50 | path_result = 'RET' 51 | path_model = 'MODEL' 52 | if os.path.isdir(path_result) is False: 53 | os.mkdir(path_result) 54 | if os.path.isdir(path_model) is False: 55 | os.mkdir(path_model) 56 | 57 | #build model 58 | def build_model(): 59 | model = gcnn(depth, mkenerls, conv_conf, pooling_conf, bn, dropout, rate, activation, conf, output_dim) 60 | adam = Adam(lr=lr) 61 | model.compile(loss='categorical_crossentropy', optimizer=adam,metrics = ["categorical_accuracy"]) 62 | #plot_model(model, to_file='model.png', show_shapes=True) 63 | return model 64 | 65 | 66 | def getdata(path): 67 | print(path) 68 | h5 = h5py.File(path, 'r') 69 | datax = h5['datax'].value 70 | datay = h5['datay'].value 71 | h5.close() 72 | return datax,datay 73 | 74 | #read data 75 | def read_data(): 76 | X_train,Y_train = getdata(os.path.join(data_dic, "data", "train.h5")) 77 | X_valid,Y_valid = getdata(os.path.join(data_dic, "data", "valid.h5")) 78 | X_test,Y_test = getdata(os.path.join(data_dic, "data", "test.h5")) 79 | print(X_train.shape) 80 | print(X_valid.shape) 81 | print(X_test.shape) 82 | print(Y_train.shape) 83 | print(Y_valid.shape) 84 | print(Y_test.shape) 85 | return X_train,X_valid,X_test,Y_train,Y_valid,Y_test 86 | 87 | def cache(path,X_train,X_valid,X_test,Y_train,Y_valid,Y_test): 88 | h5 = h5py.File(path, 'w') 89 | h5.create_dataset('X_train', data=X_train) 90 | h5.create_dataset('X_valid', data=X_valid) 91 | h5.create_dataset('X_test', data=X_test) 92 | h5.create_dataset('Y_train', data=Y_train) 93 | h5.create_dataset('Y_valid', data=Y_valid) 94 | h5.create_dataset('Y_test', data=Y_test) 95 | h5.close() 96 | 97 | def read_cache(path): 98 | h5 = h5py.File(path, 'r') 99 | X_train = h5['X_train'].value 100 | X_valid = h5['X_valid'].value 101 | X_test = h5['X_test'].value 102 | Y_train = h5['Y_train'].value 103 | 
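# Two small notes on main.py above: K.cast(c, K.float32) in accu() would raise
# an AttributeError, because the Keras backend takes dtypes as strings rather
# than exposing a float32 attribute (accu() is unused as written, since
# build_model() compiles with the built-in "categorical_accuracy" metric), and
# h5py's Dataset.value accessor is deprecated in newer h5py releases in favour
# of indexing. A corrected sketch of both, using the same imports as above:
from keras import backend as K
import h5py

def accu(y_true, y_pred):
    # fraction of examples whose argmax prediction matches the argmax target
    return K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'float32')

def getdata(path):
    with h5py.File(path, 'r') as h5:
        return h5['datax'][()], h5['datay'][()]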
Y_valid = h5['Y_valid'].value 104 | Y_test = h5['Y_test'].value 105 | h5.close() 106 | return X_train,X_valid,X_test,Y_train,Y_valid,Y_test 107 | 108 | def main(): 109 | if os.path.exists(filepath): 110 | print("read data from file") 111 | X_train,X_valid,X_test,Y_train,Y_valid,Y_test = read_cache(filepath) 112 | else: 113 | print("read and store data") 114 | X_train,X_valid,X_test,Y_train,Y_valid,Y_test = read_data() 115 | cache(filepath,X_train,X_valid,X_test,Y_train,Y_valid,Y_test) 116 | model = build_model() 117 | 118 | 119 | 120 | 121 | fname_param = os.path.join(data_dic,'MODEL', 'best2.h5') 122 | ''' 123 | early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=5, mode='max') 124 | model_checkpoint = ModelCheckpoint(fname_param, monitor='val_categorical_accuracy', verbose=0, save_best_only=True, mode='max') 125 | print('=' * 10) 126 | print("training model...") 127 | history = model.fit(X_train, Y_train, 128 | nb_epoch=epoch, 129 | batch_size=batch_size, 130 | validation_data=(X_valid, Y_valid), 131 | callbacks=[early_stopping, model_checkpoint], 132 | verbose=1) 133 | #保存训练最好模型训练细节,此时测试集为验证集 134 | model.save_weights(fname_param, overwrite=True) 135 | pickle.dump((history.history), open(os.path.join(path_result, 'history.pkl'), 'wb')) 136 | 137 | model.load_weights(fname_param) 138 | score = model.evaluate(X_train, Y_train, batch_size=X_train.shape[0],verbose=0) 139 | print('训练集最好模型进行预测') 140 | print('Train score: %s' % str(score)) 141 | score = model.evaluate(X_test,Y_test,batch_size=X_test.shape[0],verbose=0) 142 | print('Test score: %s' % str(score)) 143 | ''' 144 | 145 | 146 | #fname_param = os.path.join(data_dic, 'MODEL', 'cont.best2.h5') 147 | model.load_weights(fname_param) 148 | print('=' * 10) 149 | print("training model (cont)...") 150 | fname_param = os.path.join(data_dic,'MODEL', 'cont2.best2.h5') 151 | X_train2 = np.concatenate((X_train,X_valid),axis = 0) 152 | y_train2 = np.concatenate((Y_train,Y_valid),axis = 0) 153 | print(X_train2.shape,y_train2.shape) 154 | # early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min') 155 | model_checkpoint = ModelCheckpoint(fname_param, monitor='val_categorical_accuracy', verbose=0, save_best_only=True, mode='max') 156 | #保存训练最好模型训练细节,此时训练集+验证集为新的训练集,测试集为测试集 157 | history = model.fit(X_train2, y_train2, 158 | nb_epoch=epoch_cont, 159 | verbose=1, 160 | batch_size=batch_size, 161 | callbacks=[model_checkpoint],#early_stopping,model_checkpoint], 162 | validation_data=(X_test, Y_test)) 163 | pickle.dump((history.history), open(os.path.join(path_result, 'cont.history.pkl'), 'wb')) 164 | model.save_weights(fname_param, overwrite=True) 165 | print('=' * 10) 166 | print('The best model to predict') 167 | 168 | model.load_weights(fname_param) 169 | score = model.evaluate(X_train2, y_train2, batch_size=X_train2.shape[0],verbose=0) 170 | print('Train score: %s' % str(score)) 171 | score = model.evaluate(X_test,Y_test,batch_size=X_test.shape[0],verbose=0) 172 | print('Test score: %s' % str(score)) 173 | 174 | print('=' * 10) 175 | print('Done') 176 | 177 | gc.collect() 178 | if __name__ == "__main__": 179 | main() 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /Keras_Version/model2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.layers import ( 3 | Input, 4 | Activation, 5 | Dropout, 6 | Flatten, 7 | 
Dense, 8 | Reshape) 9 | from keras.layers.convolutional import Convolution2D 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.models import Model 12 | from keras.layers.pooling import MaxPool2D, AveragePooling2D 13 | import time 14 | 15 | 16 | def mpool(type, input, size, stride): 17 | if type == "max": 18 | return MaxPool2D(pool_size=(size, size), strides=stride, padding='same')(input) 19 | elif type == "avg": 20 | return AveragePooling2D(pool_size=(size, size), strides=stride, padding='same')(input) 21 | else: 22 | raise ValueError("pooling type invalid") 23 | 24 | 25 | def active(type, input): 26 | if type == "relu": 27 | return Activation("relu")(input) 28 | elif type == "sigmoid": 29 | return Activation("sigmoid")(input) 30 | elif type == "tanh": 31 | return Activation("tanh")(input) 32 | elif type == "softmax": 33 | return Activation("softmax")(input) 34 | else: 35 | raise ValueError("activation type invalid") 36 | 37 | 38 | def gcnn(depth=4, mkenerls=[64, 64, 64, 32], conv_conf=[2, 1], pooling_conf=["max", 2, 2], bn=False, dropout=True, 39 | rate=0.8, activation="relu", conf=[50, 300, 10], output_dim=20): 40 | assert depth == len(mkenerls) 41 | mchannel, mheight, mwidth = conf 42 | conv_size, conv_stride = conv_conf 43 | pooling_type, pooling_size, pooling_stride = pooling_conf 44 | input = Input(shape=(mchannel, mheight, mwidth)) 45 | 46 | conv1 = Convolution2D(filters=mkenerls[0], kernel_size=(1, mwidth), strides=(1, 1), padding="valid")(input) 47 | # bn1 = BatchNormalization(axis=1)(conv1) 48 | activation1 = Activation("relu")(conv1) 49 | pool1 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation1) 50 | _k1, _n1 = map(int, pool1.shape[1:3]) 51 | reshape_pool1 = Reshape((1, _k1, _n1))(pool1) 52 | 53 | conv2 = Convolution2D(filters=mkenerls[1], kernel_size=(1, _n1), strides=(1, 1), padding="valid")(reshape_pool1) 54 | # bn2 = BatchNormalization(axis=1)(conv2) 55 | activation2 = Activation("relu")(conv2) 56 | pool2 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation2) 57 | _k2, _n2 = map(int, pool2.shape[1:3]) 58 | reshape_pool2 = Reshape((1, _k2, _n2))(pool2) 59 | 60 | conv3 = Convolution2D(filters=mkenerls[1], kernel_size=(1, _n2), strides=(1, 1), padding="valid")(reshape_pool2) 61 | # bn2 = BatchNormalization(axis=1)(conv2) 62 | activation3 = Activation("relu")(conv3) 63 | pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation3) 64 | _k3, _n3 = map(int, pool2.shape[1:3]) 65 | reshape_pool3 = Reshape((1, _k2, _n2))(pool3) 66 | 67 | conv4 = Convolution2D(filters=mkenerls[2], kernel_size=(1, _n3), strides=(1, 1), padding="valid")(reshape_pool2) 68 | # bn3 = BatchNormalization(axis=1)(conv3) 69 | activation4 = Activation("relu")(conv4) 70 | pool4 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation4) 71 | 72 | # step_results = [input] 73 | # for i in range(depth - 1): 74 | # mconv = Convolution2D( 75 | # nb_filter=mkenerls[i], nb_row=conv_size, nb_col=conv_size, strides=(conv_stride, conv_stride), 76 | # border_mode="same")(step_results[-1]) 77 | # if bn: 78 | # mbn = BatchNormalization(axis=1)(mconv) 79 | # else: 80 | # mbn = mconv 81 | # mactivation = active(activation, mbn) 82 | # mpooling = mpool(pooling_type, mactivation, pooling_size, pooling_stride) 83 | # if dropout: 84 | # mdropout = Dropout(rate=rate, seed=time.time())(mpooling) 85 | # else: 86 | # mdropout = mpooling 87 | # step_results.append(mdropout) 88 | 89 | # last_conv = Convolution2D( 90 | # 
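# Note on the third and fourth blocks above: _k3, _n3 are taken from
# pool2.shape (pool3.shape was probably intended), reshape_pool3 is built with
# _k2/_n2 and then never used, and conv4 consumes reshape_pool2 rather than
# reshape_pool3 -- so, as written, the third conv/pool stage does not feed the
# fourth. If a four-stage chain is intended, conv4 would take reshape_pool3 and
# the reshape sizes would come from pool3.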
nb_filter=mkenerls[-1], nb_row=conv_size, nb_col=conv_size, border_mode="same")(step_results[-1]) 91 | # last_pooling = mpool(pooling_type, last_conv, pooling_size, pooling_stride) 92 | mFlatten = Flatten()(pool4) 93 | ms_output = Dense(output_dim=128)(mFlatten) 94 | msinput = active("sigmoid", ms_output) 95 | moutput = Dense(output_dim=output_dim)(msinput) 96 | output = active("softmax", moutput) 97 | model = Model(input=input, output=output) 98 | return model 99 | 100 | 101 | if __name__ == '__main__': 102 | model = gcnn() 103 | model.summary() 104 | 105 | -------------------------------------------------------------------------------- /NewGraphCNNs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_heiring.py: -------------------------------------------------------------------------------- 1 | f = open("rcv1.topics.hier.orig.txt",'r') 2 | lines = f.readlines() 3 | nodes = [] 4 | for line in lines: 5 | keys = line.split(' ') 6 | while '' in keys: 7 | keys.remove("") 8 | node ={} 9 | node['parent'] = keys[1] 10 | node['child'] =keys[3] 11 | nodes.append(node) 12 | 13 | f.close() 14 | 15 | relation = {} 16 | for node in nodes: 17 | parent = node['parent'] 18 | child = node['child'] 19 | if parent not in relation: 20 | relation[parent] = [] 21 | relation[parent].append(child) 22 | 23 | 24 | import json 25 | result = [] 26 | with open('classes.json','r') as f: 27 | classes = json.load(f) 28 | for key in relation: 29 | if len(relation[key]) <2: 30 | continue 31 | new = [] 32 | for index,values in enumerate(relation[key]): 33 | new.append(classes[values]) 34 | result.append(new) 35 | 36 | final = [] 37 | for single in result: 38 | length = len(single) 39 | for i in range(length-1): 40 | for j in range(i+1,length): 41 | temp = [] 42 | temp.append(single[i]) 43 | temp.append(single[j]) 44 | final.append(temp) 45 | for v in final: 46 | print(str(v)) 47 | with open ('heiring.json','w') as f: 48 | j = json.dump(final,f) 49 | #print(j) 50 | 51 | 52 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/rcv1_processer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import zipfile 4 | from multiprocessing import Pool 5 | import xml.etree.ElementTree as ET 6 | import re 7 | import json 8 | import numpy as np 9 | import gensim 10 | import h5py 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import nltk 14 | 15 | PATH = "/home/penghao/mars/rcv2" 16 | original_path = r'/home/penghao/mars/rcv2/reuters/training' 17 | targetpath = r'/data/LJ/LJ/own/RCV1/target_files' 18 | # targetpath = os.path.join(PATH,"target_files") 19 | all = 0 20 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*','”','“','’',"‘","'",'"'] 21 | wordEngStop = nltk.corpus.stopwords.words('english') 22 | lemmatizer = WordNetLemmatizer() 23 | 24 | def unzip(file,name): 25 | global all 26 | zip_file = zipfile.ZipFile(file) 27 | path = os.path.join(targetpath,name) 28 | print(path) 29 | if not os.path.exists(path): 30 | os.mkdir(path) 31 | for name in zip_file.namelist(): 32 | zip_file.extract(name,path) 33 | all += 1 34 | print(all) 35 | 36 | def zipp(): 37 | flist = os.listdir(original_path) 38 | flist.sort() 39 | for f in flist: 40 | fname = f.split('.')[0] 41 | 
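        # fname is the archive's file name without its extension; unzip() above extracts
        # each archive into its own sub-directory targetpath/<fname>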
print(fname) 42 | fpath = os.path.join(original_path,f) 43 | print(fpath) 44 | unzip(fpath,fname) 45 | 46 | def readfile(path): 47 | f = open(path,'r') 48 | s = f.readlines() 49 | 50 | topics = [] 51 | 52 | 53 | 54 | 55 | finalwords = [] 56 | for line in s: 57 | line = line.lower().strip().decode(errors="ignore") 58 | line = re.split('[-_\.:/ \"\'(),.;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', line) 59 | for word in line: 60 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 61 | finalwords.append(word) 62 | 63 | # mtext = re.split('[-_:/ \"\'(),;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', mtext) 64 | 65 | # while "" in mtext: 66 | # mtext.remove("") 67 | # print(mtext) 68 | # print(topics) 69 | #print finalwords 70 | return finalwords,topics 71 | 72 | def haha1(): 73 | # xxxx = 0 74 | all_words = {} 75 | opath = os.listdir('reuters/test') 76 | for ff in opath: 77 | simpath = os.path.join('reuters/test',ff) 78 | mcontent,_ = readfile(simpath) 79 | for word in mcontent: 80 | if word not in all_words.keys(): 81 | all_words[word] = True 82 | pp = os.path.join('data',"test.json") 83 | print(pp) 84 | with open(pp,"w") as fp: 85 | json.dump(all_words, fp) 86 | 87 | def haha2(): 88 | # xxxx = 0 89 | all_words = {} 90 | opath = os.listdir('reuters/training') 91 | for ff in opath: 92 | simpath = os.path.join('reuters/training',ff) 93 | mcontent,_ = readfile(simpath) 94 | for word in mcontent: 95 | if word not in all_words.keys(): 96 | all_words[word] = True 97 | pp = os.path.join('data',"training.json") 98 | print(pp) 99 | with open(pp,"w") as fp: 100 | json.dump(all_words, fp) 101 | 102 | def findwords(): 103 | #lnums = [(i*1000,(i+1)*1000) for i in range(15,21)]+[(14826,15000),(21000,21576)] #test 104 | lnums = [(i*1000,(i+1)*1000) for i in range(0,14)]+[(14000,14818)] 105 | print(lnums) 106 | #lnums = [(0,1)] 107 | #tpath = r'E:\RCV1\words' 108 | tpath = os.path.join(PATH,"data") 109 | p = Pool(30) 110 | results = [] 111 | for i in range(len(lnums)): 112 | start,end = lnums[i] 113 | print("process{0} start. 
Range({1},{2})".format(i,start,end)) 114 | results.append(p.apply_async(haha,args=(start,end,tpath))) 115 | print("process{0} end".format(i)) 116 | p.close() 117 | p.join() 118 | for r in results: 119 | print(r.get()) 120 | 121 | def isnumber(str): 122 | if str.count('.') == 1: 123 | left = str.split('.')[0] 124 | right = str.split('.')[1] 125 | lright = '' 126 | if str.count('-') == 1 and str[0] == '-': 127 | lright = left.split('-')[1] 128 | elif str.count('-') == 0: 129 | lright = left 130 | else: 131 | return False 132 | if right.isdigit() and lright.isdigit(): 133 | return True 134 | else: 135 | return False 136 | elif str.count('.') == 0: 137 | if str[0] == "-": 138 | str2 = str[1:] 139 | else: 140 | str2 = str 141 | if str2.isdigit(): 142 | return True 143 | return False 144 | else: 145 | return False 146 | 147 | def allwords(): 148 | tpath = os.path.join(PATH,"data") 149 | words = {} 150 | ind = 0 151 | flist = os.listdir(tpath) 152 | flist.sort() 153 | for f in flist: 154 | ppath = os.path.join(tpath,f) 155 | with open(ppath, "r") as f1: 156 | simjson = json.load(f1) 157 | for i in simjson.keys(): 158 | if i not in words.keys(): 159 | words[i] = ind 160 | ind += 1 161 | print(len(list(words.keys()))) 162 | #print("1190" in words) 163 | #893198 164 | lens = len(list(words.keys())) 165 | #print(list(words.keys())) 166 | #assert lens == 364830 167 | wembeddingwords = np.random.uniform(-1.0, 1.0, (lens, 50)) 168 | word2vec_model = gensim.models.Word2Vec.load(r'/home/penghao/lj/Google_w2v/wiki.en.text.model') 169 | xx = 0 170 | for key in words.keys(): 171 | # if isnumber(key): 172 | # xx += 1 173 | if key in word2vec_model: 174 | #print(key) 175 | xx += 1 176 | index = words[key] 177 | wembeddingwords[index, :] = word2vec_model[key] 178 | print(xx) 179 | with open(os.path.join(PATH,r"words.json"), "w") as f: 180 | json.dump(words, f) 181 | f = h5py.File(os.path.join(PATH,"matrix_rcv1.h5"), "w") 182 | f.create_dataset("data", data=wembeddingwords) 183 | f.close() 184 | 185 | def classpro(): 186 | tpath = r'/home/user/LJ/own/RCV1/topic_codes.txt' 187 | haha = {} 188 | with open(tpath,"r") as f: 189 | lines = f.readlines() 190 | print(len(lines)) 191 | for index,line in enumerate(lines[2:]): 192 | if line != '\n' and '\t' in line: 193 | haha[line.strip().split('\t')[0]] = index 194 | for k,v in haha.items(): 195 | print(k,v) 196 | print(len(list(haha.keys()))) 197 | with open(r'/home/user/LJ/own/RCV1/classes.json','w') as f: 198 | json.dump(haha,f) 199 | 200 | 201 | if __name__ == "__main__": 202 | findwords() 203 | haha1() 204 | haha2() 205 | allwords() 206 | classpro() 207 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/unzip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | 4 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD1/" 5 | list = os.listdir(path) 6 | 7 | for z in list: 8 | file_path = os.path.join(path,z) 9 | zipf = zipfile.ZipFile(file_path) 10 | zipf.extractall('xml2') 11 | zipf.close() 12 | 13 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD2/" 14 | list = os.listdir(path) 15 | 16 | for z in list: 17 | file_path = os.path.join(path,z) 18 | zipf = zipfile.ZipFile(file_path) 19 | zipf.extractall('xml2') 20 | zipf.close() -------------------------------------------------------------------------------- /RCNN/v-cpp/ecnn-noada.cpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/RCNN/v-cpp/ecnn-noada.cpp -------------------------------------------------------------------------------- /RCNN/v-cpp/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/RCNN/v-cpp/fileutil.hpp -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepgraphcnn 2 | 3 | The code for our WWW2018 paper "Large-Scale Hierarchical Text Classification with Recursively Regularized Deep Graph-CNN" 4 | 5 | Readers are welcome to fork this repository to reproduce the experiments and follow our work. Just remember to cite our paper: 6 | 7 | @inproceedings{peng2018deepgraphcnn, 8 | title={Large-Scale Hierarchical Text Classification with Recursively Regularized Deep Graph-CNN}, 9 | author={Peng, Hao and Li, Jianxin and He, Yu and Liu, Yaopeng and Bao, Mengjiao and Song, Yangqiu and Yang, Qiang}, 10 | booktitle={WWW}, 11 | year={2018} 12 | } 13 | 14 | 15 | ## Requirements 16 | - Python 3 17 | - Tensorflow > 0.8 18 | - Numpy 19 | 20 | 21 | 22 | 23 | Train: 24 | graphcnn_train.py 25 | 26 | -------------------------------------------------------------------------------- /Seq2seqWithAttention/a1_seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | # [Used at test time] Returns a loop function which, given the previous decoder output, takes its argmax index and looks up that word's embedding. 5 | def extract_argmax_and_embed(embedding, output_projection=None): 6 | """ 7 | Get a loop_function that extracts the previous symbol and embeds it. Used by decoder. 8 | :param embedding: embedding tensor for symbol 9 | :param output_projection: None or a pair (W, B). If provided, each fed previous output will 10 | first be multiplied by W and added B. 11 | :return: A loop function 12 | """ 13 | def loop_function(prev, _): 14 | if output_projection is not None: 15 | prev = tf.matmul(prev, output_projection[0]) + output_projection[1] 16 | prev_symbol = tf.argmax(prev, 1) # get the index of the predicted word 17 | emb_prev = tf.gather(embedding, prev_symbol) # look up the embedding for that index 18 | return emb_prev 19 | return loop_function 20 | 21 | # Decoder part of the RNN. 22 | # During training the ground-truth decoder inputs are fed; at test time the output at step t is fed back as the input at step t+1. 23 | def rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function,attention_states,scope=None):#3D Tensor [batch_size x attn_length x attn_size] 24 | """RNN decoder for the sequence-to-sequence model. 25 | Args: 26 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. It is the decoder input. 27 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. It is the encoded vector of input sentences, which represents the 'thought vector' 28 | cell: core_rnn_cell.RNNCell defining the cell function and size. 29 | loop_function: If not None, this function will be applied to the i-th output 30 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 31 | except for the first element ("GO" symbol). This can be used for decoding, 32 | but also for training to emulate http://arxiv.org/abs/1506.03099.
33 | Signature -- loop_function(prev, i) = next 34 | * prev is a 2D Tensor of shape [batch_size x output_size], 35 | * i is an integer, the step number (when advanced control is needed), 36 | * next is a 2D Tensor of shape [batch_size x input_size]. 37 | attention_states: 3D Tensor [batch_size x attn_length x attn_size].it is represent input X. 38 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 39 | Returns: 40 | A tuple of the form (outputs, state), where: 41 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 42 | shape [batch_size x output_size] containing generated outputs. 43 | state: The state of each cell at the final time-step. 44 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 45 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 46 | states can be the same. They are different for LSTM cells though.) 47 | """ 48 | with tf.variable_scope(scope or "rnn_decoder"): 49 | print("rnn_decoder_with_attention started...") 50 | state = initial_state #[batch_size x cell.state_size]. 51 | _, hidden_size = state.get_shape().as_list() #200 52 | attention_states_original=attention_states 53 | batch_size,sequence_length,_=attention_states.get_shape().as_list() 54 | outputs = [] 55 | prev = None 56 | ################################################# 57 | for i, inp in enumerate(decoder_inputs):#循环解码部分的输入。如sentence_length个[batch_size x input_size] 58 | # 如果是训练,使用训练数据的输入;如果是test, 将t时刻的输出作为t + 1 时刻的s输入 59 | if loop_function is not None and prev is not None:#测试的时候:如果loop_function不为空且前一个词的值不为空,那么使用前一个的值作为RNN的输入 60 | with tf.variable_scope("loop_function", reuse=True): 61 | inp = loop_function(prev, i) 62 | if i > 0: 63 | tf.get_variable_scope().reuse_variables() 64 | ##ATTENTION################################################################################################################################################# 65 | # 1.get logits of attention for each encoder input. attention_states:[batch_size x attn_length x attn_size]; query=state:[batch_size x cell.state_size] 66 | query=state 67 | W_a = tf.get_variable("W_a", shape=[hidden_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 68 | query=tf.matmul(query, W_a) #[batch_size,hidden_size] 69 | query=tf.expand_dims(query,axis=1) #[batch_size, 1, hidden_size] 70 | U_a = tf.get_variable("U_a", shape=[hidden_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 71 | U_aa = tf.get_variable("U_aa", shape=[ hidden_size]) 72 | attention_states=tf.reshape(attention_states,shape=(-1,hidden_size)) #[batch_size*sentence_length,hidden_size] 73 | attention_states=tf.matmul(attention_states, U_a) #[batch_size*sentence_length,hidden_size] 74 | #print("batch_size",batch_size," ;sequence_length:",sequence_length," ;hidden_size:",hidden_size) #print("attention_states:", attention_states) #(?, 200) 75 | attention_states=tf.reshape(attention_states,shape=(-1,sequence_length,hidden_size)) # TODO [batch_size,sentence_length,hidden_size] 76 | #query_expanded: [batch_size,1, hidden_size] 77 | #attention_states_reshaped: [batch_size,sentence_length,hidden_size] 78 | attention_logits=tf.nn.tanh(query+attention_states+U_aa) #[batch_size,sentence_length,hidden_size]. 
additive style 79 | 80 | # 2.get possibility of attention 81 | attention_logits=tf.reshape(attention_logits,shape=(-1,hidden_size)) #batch_size*sequence_length [batch_size*sentence_length,hidden_size] 82 | V_a = tf.get_variable("V_a", shape=[hidden_size,1],initializer=tf.random_normal_initializer(stddev=0.1)) #[hidden_size,1] 83 | attention_logits=tf.matmul(attention_logits,V_a) #最终需要的是[batch_size*sentence_length,1]<-----[batch_size*sentence_length,hidden_size],[hidden_size,1] 84 | attention_logits=tf.reshape(attention_logits,shape=(-1,sequence_length)) #attention_logits:[batch_size,sequence_length] 85 | ########################################################################################################################################################## 86 | #attention_logits=tf.reduce_sum(attention_logits,2) #[batch_size x attn_length] 87 | attention_logits_max=tf.reduce_max(attention_logits,axis=1,keep_dims=True) #[batch_size x 1] 88 | # possibility distribution for each encoder input.it means how much attention or focus for each encoder input 89 | p_attention=tf.nn.softmax(attention_logits-attention_logits_max)#[batch_size x attn_length] 90 | 91 | # 3.get weighted sum of hidden state for each encoder input as attention state 92 | p_attention=tf.expand_dims(p_attention,axis=2) #[batch_size x attn_length x 1] 93 | # attention_states:[batch_size x attn_length x attn_size]; p_attention:[batch_size x attn_length]; 94 | attention_final=tf.multiply(attention_states_original,p_attention) #[batch_size x attn_length x attn_size] 95 | context_vector=tf.reduce_sum(attention_final,axis=1) #[batch_size x attn_size] 96 | ############################################################################################################################################################ 97 | #inp:[batch_size x input_size].it is decoder input; attention_final:[batch_size x attn_size] 98 | output, state = cell(inp, state,context_vector) #attention_final TODO 使用RNN走一步 99 | outputs.append(output) # 将输出添加到结果列表中 100 | if loop_function is not None: 101 | prev = output 102 | print("rnn_decoder_with_attention ended...") 103 | return outputs, state -------------------------------------------------------------------------------- /Text2Graph/Text2Graph-master/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
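 * Driver that fans the CoreNLP preprocessing out over worker processes: it submits
 * (threadEnd - threadSta) tasks to a pool of threadNum threads, and each task launches a
 * separate "mvn exec:java -Dexec.mainClass=ecs.TestCoreNLP" child process with its slice
 * index (-i), the shared input directory (-c pathPatch) and -t 5; a JVM shutdown hook
 * destroys the child processes on exit.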
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Text2Graph/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /TextCNN/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/TextCNN/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /TextCNN/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import random 4 | import numpy as np 5 | from tflearn.data_utils import pad_sequences 6 | from collections import Counter 7 | import os 8 | import pickle 9 | 10 | PAD_ID = 0 11 | UNK_ID=1 12 | _PAD="_PAD" 13 | _UNK="UNK" 14 | 15 | 16 | def load_data_multilabel(traning_data_path,vocab_word2index, vocab_label2index,sentence_len,training_portion=0.95): 17 | """ 18 | convert data as indexes using word2index dicts. 
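    Each line of the training file is expected to look like "w1 w2 w3 __label__l1 __label__l2",
    i.e. space-separated tokens followed by one or more __label__-prefixed labels. Tokens are
    mapped to indexes and padded/truncated to sentence_len; labels become a multi-hot vector of
    size len(vocab_label2index). The first training_portion of the shuffled lines is returned
    as the train split, and up to 1,000 of the remaining lines as the dev split.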
19 | :param traning_data_path: 20 | :param vocab_word2index: 21 | :param vocab_label2index: 22 | :return: 23 | """ 24 | file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8') 25 | lines = file_object.readlines() 26 | random.shuffle(lines) 27 | label_size=len(vocab_label2index) 28 | X = [] 29 | Y = [] 30 | for i,line in enumerate(lines): 31 | raw_list = line.strip().split("__label__") 32 | input_list = raw_list[0].strip().split(" ") 33 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 34 | x=[vocab_word2index.get(x,UNK_ID) for x in input_list] 35 | label_list = raw_list[1:] 36 | label_list=[l.strip().replace(" ", "") for l in label_list if l != ''] 37 | label_list=[vocab_label2index[label] for label in label_list] 38 | y=transform_multilabel_as_multihot(label_list,label_size) 39 | X.append(x) 40 | Y.append(y) 41 | X = pad_sequences(X, maxlen=sentence_len, value=0.) # padding to max length 42 | number_examples = len(lines) 43 | training_number=int(training_portion* number_examples) 44 | train = (X[0:training_number], Y[0:training_number]) 45 | valid_number=min(1000,number_examples-training_number) 46 | test = (X[training_number+ 1:training_number+valid_number+1], Y[training_number + 1:training_number+valid_number+1]) 47 | return train,test 48 | 49 | 50 | def transform_multilabel_as_multihot(label_list,label_size): 51 | """ 52 | convert to multi-hot style 53 | :param label_list: e.g.[0,1,4], here 4 means in the 4th position it is true value(as indicate by'1') 54 | :param label_size: e.g.199 55 | :return:e.g.[1,1,0,1,0,0,........] 56 | """ 57 | result=np.zeros(label_size) 58 | #set those location as 1, all else place as 0. 59 | result[label_list] = 1 60 | return result 61 | 62 | #use pretrained word embedding to get word vocabulary and labels, and its relationship with index 63 | def create_vocabulary(training_data_path,vocab_size,name_scope='cnn'): 64 | """ 65 | create vocabulary 66 | :param training_data_path: 67 | :param vocab_size: 68 | :param name_scope: 69 | :return: 70 | """ 71 | 72 | cache_vocabulary_label_pik='cache'+"_"+name_scope # path to save cache 73 | if not os.path.isdir(cache_vocabulary_label_pik): # create folder if not exists. 74 | os.makedirs(cache_vocabulary_label_pik) 75 | 76 | # if cache exists. load it; otherwise create it. 
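    # (the cache file holds a pickled 4-tuple: word2index, index2word, label2index, index2label)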
77 | cache_path =cache_vocabulary_label_pik+"/"+'vocab_label.pik' 78 | print("cache_path:",cache_path,"file_exists:",os.path.exists(cache_path)) 79 | if os.path.exists(cache_path): 80 | with open(cache_path, 'rb') as data_f: 81 | return pickle.load(data_f) 82 | else: 83 | vocabulary_word2index={} 84 | vocabulary_index2word={} 85 | vocabulary_word2index[_PAD]=PAD_ID 86 | vocabulary_index2word[PAD_ID]=_PAD 87 | vocabulary_word2index[_UNK]=UNK_ID 88 | vocabulary_index2word[UNK_ID]=_UNK 89 | 90 | vocabulary_label2index={} 91 | vocabulary_index2label={} 92 | 93 | #1.load raw data 94 | file_object = codecs.open(training_data_path, mode='r', encoding='utf-8') 95 | lines=file_object.readlines() 96 | #2.loop each line,put to counter 97 | c_inputs=Counter() 98 | c_labels=Counter() 99 | for line in lines: 100 | raw_list=line.strip().split("__label__") 101 | 102 | input_list = raw_list[0].strip().split(" ") 103 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 104 | label_list=[l.strip().replace(" ","") for l in raw_list[1:] if l!=''] 105 | c_inputs.update(input_list) 106 | c_labels.update(label_list) 107 | #return most frequency words 108 | vocab_list=c_inputs.most_common(vocab_size) 109 | label_list=c_labels.most_common() 110 | #put those words to dict 111 | for i,tuplee in enumerate(vocab_list): 112 | word,_=tuplee 113 | vocabulary_word2index[word]=i+2 114 | vocabulary_index2word[i+2]=word 115 | 116 | for i,tuplee in enumerate(label_list): 117 | label,_=tuplee;label=str(label) 118 | vocabulary_label2index[label]=i 119 | vocabulary_index2label[i]=label 120 | 121 | #save to file system if vocabulary of words not exists. 122 | if not os.path.exists(cache_path): 123 | with open(cache_path, 'ab') as data_f: 124 | pickle.dump((vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label), data_f) 125 | return vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label 126 | 127 | #training_data_path='../data/sample_multiple_label3.txt' 128 | #vocab_size=100 129 | #create_voabulary(training_data_path,vocab_size) 130 | -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_ensemble.py: -------------------------------------------------------------------------------- 1 | from p7_TextCNN_predict import get_logits_with_value_by_input 2 | from p7_TextCNN_predict_exp import get_logits_with_value_by_input_exp 3 | import tensorflow as tf 4 | def main(_): 5 | for start in range(217360): 6 | end=start+1 7 | label_list,p_list=get_logits_with_value_by_input(start,end) 8 | label_list_exp, p_list_exp=get_logits_with_value_by_input_exp(start,end) 9 | 10 | if start<5: 11 | print("----------------------------------------------------") 12 | print(start,"label_list0:",label_list,"p_list0:",p_list) 13 | print(start,"label_list1:", label_list_exp, "p_list1:", p_list_exp) 14 | else: 15 | break 16 | 17 | 18 | 19 | if __name__ == "__main__": 20 | tf.app.run() -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/zhihu_result_cnn_multilabel_v7_exp512_20170616.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 600, "number of filters") #128-->512 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
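    # (Steps 3-7 below: create a session with allow_growth so GPU memory is allocated on demand,
    #  rebuild the TextCNN graph with the filter sizes and flags defined above, restore the latest
    #  checkpoint from ckpt_dir, run the logits op one example at a time (batch_size is 1 here),
    #  and append the top-5 predicted labels for each question id to predict_target_file.)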
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512_0609.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp512_0609/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp512_0609/zhihu_result_cnn_multilabel_exp512_0609.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[2,3,5,6,7,8] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
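    # (Same inference procedure as p7_TextCNN_predict_exp512.py; this variant appears to differ
    #  only in its filter sizes [2,3,5,6,7,8], num_filters=256, and the checkpoint/output paths
    #  defined above.)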
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512_simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/zhihu_result_cnn_multilabel_exp512_simple.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[7] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) 
# padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) 116 | -------------------------------------------------------------------------------- /TextRCNN/p71_TextRCNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 10 | from tflearn.data_utils import pad_sequences #to_categorical 11 | import os 12 | import codecs 13 | from p71_TextRCNN_mode2 import TextRCNN 14 | 15 | #configuration 16 | FLAGS=tf.app.flags.FLAGS 17 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 18 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 19 | tf.app.flags.DEFINE_integer("batch_size", 80, "Batch size for training/evaluating.") #批处理的大小 32-->128 20 | tf.app.flags.DEFINE_integer("decay_steps", 6000, "how many steps before decay learning rate.") #6000批处理的大小 32-->128 21 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.65一次衰减多少 22 | tf.app.flags.DEFINE_string("ckpt_dir","text_rcnn_title_desc_checkpoint2/","checkpoint location for the model") 23 | tf.app.flags.DEFINE_integer("sentence_length",100,"max sentence length") 24 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 25 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 26 | tf.app.flags.DEFINE_string("predict_target_file","text_rcnn_title_desc_checkpoint2/zhihu_result_rcnn_multilabel.csv","target file path for final prediction") 27 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 28 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 29 | tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 30 | 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 32 | # 1.load data with vocabulary of words and labels 33 | 34 | 35 | def main(_): 36 | # 1.load data with vocabulary of words and labels 37 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="rcnn") 38 | vocab_size = len(vocabulary_word2index) 39 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rcnn") 40 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 41 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 42 | testX=[] 43 | question_id_list=[] 44 | for tuple in test: 45 | question_id,question_string_list=tuple 46 | question_id_list.append(question_id) 47 | testX.append(question_string_list) 48 | # 2.Data preprocessing: Sequence padding 49 | print("start padding....") 50 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_length, value=0.) # padding to max length 51 | print("end padding...") 52 | # 3.create session. 
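    # (Unlike the TextCNN predict scripts, inference here runs in batches of FLAGS.batch_size=80;
    #  get_label_using_logits_batch() below writes the top-5 labels for every question id in each
    #  batch.)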
53 | config=tf.ConfigProto() 54 | config.gpu_options.allow_growth=True 55 | with tf.Session(config=config) as sess: 56 | # 4.Instantiate Model 57 | textRCNN=TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_length, 58 | vocab_size,FLAGS.embed_size,FLAGS.is_training,multi_label_flag=FLAGS.multi_label_flag) 59 | saver=tf.train.Saver() 60 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 61 | print("Restoring Variables from Checkpoint") 62 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) #TODO 63 | else: 64 | print("Can't find the checkpoint.going to stop") 65 | return 66 | # 5.feed data, to get logits 67 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 68 | index=0 69 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 70 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 71 | logits=sess.run(textRCNN.logits,feed_dict={textRCNN.input_x:testX2[start:end],textRCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 72 | # 6. get lable using logtis 73 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 74 | # 7. write question id and labels to file system. 75 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 76 | 77 | question_id_sublist=question_id_list[start:end] 78 | get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f) 79 | 80 | index=index+1 81 | predict_target_file_f.close() 82 | 83 | # get label using logits 84 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 85 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 86 | index_list=index_list[::-1] 87 | label_list=[] 88 | for index in index_list: 89 | label=vocabulary_index2word_label[index] 90 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 91 | return label_list 92 | 93 | # get label using logits 94 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 95 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 96 | index_list=index_list[::-1] 97 | value_list=[] 98 | label_list=[] 99 | for index in index_list: 100 | label=vocabulary_index2word_label[index] 101 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 102 | value_list.append(logits[index]) 103 | return label_list,value_list 104 | 105 | # write question id and labels to file system. 
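# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: get_label_using_logits,
# get_label_using_logits_with_value and get_label_using_logits_batch all rank labels by
# sorting the raw logits. The same top-k step written once, assuming `logits` is a 1-D
# numpy array and `vocabulary_index2word_label` maps index -> label string:
def top_k_labels_example(logits, vocabulary_index2word_label, top_number=5):
    index_list = np.argsort(logits)[-top_number:][::-1]  # indices of the k largest logits, best first
    return [vocabulary_index2word_label[i] for i in index_list]
# ---------------------------------------------------------------------------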
106 | def write_question_id_with_labels(question_id,labels_list,f): 107 | labels_string=",".join(labels_list) 108 | f.write(question_id+","+labels_string+"\n") 109 | 110 | # get label using logits 111 | def get_label_using_logits_batch(question_id_sublist,logits_batch,vocabulary_index2word_label,f,top_number=5): 112 | #print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 113 | for i,logits in enumerate(logits_batch): 114 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 115 | index_list=index_list[::-1] 116 | label_list=[] 117 | for index in index_list: 118 | label=vocabulary_index2word_label[index] 119 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 120 | #print("get_label_using_logits.label_list",label_list) 121 | write_question_id_with_labels(question_id_sublist[i], label_list, f) 122 | f.flush() 123 | #return label_list 124 | # write question id and labels to file system. 125 | def write_question_id_with_labels(question_id,labels_list,f): 126 | labels_string=",".join(labels_list) 127 | f.write(question_id+","+labels_string+"\n") 128 | 129 | if __name__ == "__main__": 130 | tf.app.run() -------------------------------------------------------------------------------- /TextRNN/p8_TextRNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from p8_TextRNN_model import TextRNN 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 80, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 12000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_rnn_checkpoint/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sequence_length",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_string("traning_data_path","train-zhihu4-only-title-all.txt","path of traning data.") #train-zhihu4-only-title-all.txt.training-data/test-zhihu4-only-title.txt--->'training-data/train-zhihu5-only-title-multilabel.txt' 28 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec.bin-100","word2vec's vocabulary and vectors") 29 | tf.app.flags.DEFINE_string("predict_target_file","text_rnn_checkpoint/zhihu_result_rnn5.csv","target file path for final prediction") 30 | 
tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 32 | def main(_): 33 | # 1.load data with vocabulary of words and labels 34 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="rnn") 35 | vocab_size = len(vocabulary_word2index) 36 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rnn") 37 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 38 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 39 | testX=[] 40 | question_id_list=[] 41 | for tuple in test: 42 | question_id,question_string_list=tuple 43 | question_id_list.append(question_id) 44 | testX.append(question_string_list) 45 | # 2.Data preprocessing: Sequence padding 46 | print("start padding....") 47 | testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length 48 | print("end padding...") 49 | # 3.create session. 50 | config=tf.ConfigProto() 51 | config.gpu_options.allow_growth=True 52 | with tf.Session(config=config) as sess: 53 | # 4.Instantiate Model 54 | textRNN=TextRNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, 55 | vocab_size, FLAGS.embed_size, FLAGS.is_training) 56 | saver=tf.train.Saver() 57 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 58 | print("Restoring Variables from Checkpoint for TextRNN") 59 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 60 | else: 61 | print("Can't find the checkpoint.going to stop") 62 | return 63 | # 5.feed data, to get logits 64 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 65 | index=0 66 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 67 | #for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 68 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 69 | logits=sess.run(textRNN.logits,feed_dict={textRNN.input_x:testX2[start:end],textRNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 70 | # 6. get lable using logtis 71 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) #logits[0] 72 | # 7. write question id and labels to file system. 
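# Note on the batching loop above: zip(range(0, N, batch_size),
# range(batch_size, N + 1, batch_size)) only yields full batches, so when
# number_of_training_data is not a multiple of FLAGS.batch_size the trailing
# examples are silently skipped. A hedged sketch of one way to also cover the
# tail, assuming the input placeholder accepts a variable batch dimension
# (kept as a comment so the original behaviour is unchanged):
#   for start in range(0, number_of_training_data, FLAGS.batch_size):
#       end = min(start + FLAGS.batch_size, number_of_training_data)
#       logits = sess.run(textRNN.logits,
#                         feed_dict={textRNN.input_x: testX2[start:end],
#                                    textRNN.dropout_keep_prob: 1})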
73 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 74 | ############################################################################################################# 75 | print("start:",start,";end:",end) 76 | question_id_sublist=question_id_list[start:end] 77 | get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f) 78 | ######################################################################################################## 79 | index=index+1 80 | predict_target_file_f.close() 81 | 82 | # get label using logits 83 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 84 | #print("get_label_using_logits:",logits) 85 | print("get_label_using_logits.shape:", logits.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | print("get_label_using_logits.label_list",label_list) 93 | return label_list 94 | 95 | # get label using logits 96 | def get_label_using_logits_batch(question_id_sublist,logits_batch,vocabulary_index2word_label,f,top_number=5): 97 | #print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 98 | for i,logits in enumerate(logits_batch): 99 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 100 | index_list=index_list[::-1] 101 | label_list=[] 102 | for index in index_list: 103 | label=vocabulary_index2word_label[index] 104 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 105 | #print("get_label_using_logits.label_list",label_list) 106 | write_question_id_with_labels(question_id_sublist[i], label_list, f) 107 | f.flush() 108 | #return label_list 109 | # write question id and labels to file system. 
110 | def write_question_id_with_labels(question_id,labels_list,f): 111 | labels_string=",".join(labels_list) 112 | f.write(question_id+","+labels_string+"\n") 113 | 114 | if __name__ == "__main__": 115 | tf.app.run() -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 | total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value 
& (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /__pycache__/graphcnn.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_GPU.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_GPU.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_generate_data.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_generate_data.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_input.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_input.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_option.cpython-34.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_option.cpython-34.pyc -------------------------------------------------------------------------------- /boosting/a08_boosting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf8') 5 | import tensorflow as tf 6 | 7 | #main process for boosting: 8 | #1.compute label weight after each epoch using validation data. 9 | #2.get weights for each batch during traininig process 10 | #3.compute loss using cross entropy with weights 11 | 12 | #1.compute label weight after each epoch using validation data. 13 | def compute_labels_weights(weights_label,logits,labels): 14 | """ 15 | compute weights for labels in current batch, and update weights_label(a dict) 16 | :param weights_label:a dict 17 | :param logit: [None,Vocabulary_size] 18 | :param label: [None,] 19 | :return: 20 | """ 21 | labels_predict=np.argmax(logits,axis=1) # logits:(256,108,754) 22 | for i in range(len(labels)): 23 | label=labels[i] 24 | label_predict=labels_predict[i] 25 | weight=weights_label.get(label,None) 26 | if weight==None: 27 | if label_predict == label: 28 | weights_label[label]=(1,1) 29 | else: 30 | weights_label[label]=(1,0) 31 | else: 32 | number=weight[0] 33 | correct=weight[1] 34 | number=number+1 35 | if label_predict==label: 36 | correct=correct+1 37 | weights_label[label]=(number,correct) 38 | return weights_label 39 | 40 | #2.get weights for each batch during traininig process 41 | def get_weights_for_current_batch(answer_list,weights_dict): 42 | """ 43 | get weights for current batch 44 | :param answer_list: a numpy array contain labels for a batch 45 | :param weights_dict: a dict that contain weights for all labels 46 | :return: a list. length is label size. 
47 | """ 48 | weights_list_batch=list(np.ones((len(answer_list)))) 49 | answer_list=list(answer_list) 50 | for i,label in enumerate(answer_list): 51 | acc=weights_dict[label] 52 | weights_list_batch[i]=min(1.5,1.0/(acc+0.001)) 53 | #if np.random.choice(200)==0: #print something from time to time 54 | # print("weights_list_batch:",weights_list_batch) 55 | return weights_list_batch 56 | 57 | #3.compute loss using cross entropy with weights 58 | def loss(logits,labels,weights): 59 | loss= tf.losses.sparse_softmax_cross_entropy(labels, logits,weights=weights) 60 | return loss 61 | 62 | ####################################################################### 63 | #util function 64 | def get_weights_label_as_standard_dict(weights_label): 65 | weights_dict = {} 66 | for k,v in weights_label.items(): 67 | count,correct=v 68 | weights_dict[k]=float(correct)/float(count) 69 | return weights_dict 70 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
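# On the ExponentialMovingAverage block a few lines above: evaluation restores
# the moving-average ("shadow") copies of the weights rather than the raw
# training variables, which typically generalises slightly better. A minimal
# standalone sketch of the same restore pattern (the names and the checkpoint
# path are illustrative only, not from this repo):
#   ema = tf.train.ExponentialMovingAverage(decay=0.999)
#   saver = tf.train.Saver(ema.variables_to_restore())
#   saver.restore(sess, checkpoint_path)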
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
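# Note on evaluate() above: the raw logits are squashed with a sigmoid, so each
# class gets an independent score in [0, 1] (multi-label, not softmax).
# num_iter is computed with floor, so up to EVAL_BATCH_SIZE - 1 trailing
# examples are never scored, and the all-zero row used to seed the
# concatenation is dropped again via total_predicted_value[1:].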
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | def generate_eval_index(): 39 | test_index_array = [] 40 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 41 | filepath = '../hier_eval_root' 42 | pathDir = os.listdir(filepath) 43 | for allDir in pathDir: 44 | child = os.path.join(filepath, allDir) 45 | if os.path.getsize(child): 46 | example_label_array = np.loadtxt(child,dtype=int) 47 | examlpe_array = example_label_array[:,0] 48 | label_array = example_label_array[:, 1] 49 | for root in graphcnn_option.HIER_ROOT_CODE: 50 | index = np.where(label_array==root)[0] 51 | for one in examlpe_array[index]: 52 | if one not in test_index_array: 53 | test_index_array.append(one) 54 | 55 | # for allDir in pathDir: 56 | # child = os.path.join(filepath, allDir) 57 | # os.remove(child) 58 | 59 | 60 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 61 | np.savetxt(filename,test_index_array,fmt='%d') 62 | 63 | return test_index_array 64 | 65 | 66 | def evaluate(checkpoint,test_index_array): 67 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 68 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 69 | total_predicted_value = total_predicted_value[test_index_array] 70 | 71 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 72 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 
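# Decision rule used below: every label whose sigmoid score reaches
# EVALUTION_THRESHOLD_FOR_MULTI_LABEL (0.9 in this script) is kept; the
# per-example max / argmax computed just above serve as a fallback so that an
# example with no score over the threshold still receives its single
# best-scoring label.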
73 | total_predicted_value = ( 74 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 75 | 76 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 77 | if os.path.exists(detail_filename): 78 | os.remove(detail_filename) 79 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 80 | 81 | 82 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 83 | total_remap = np.loadtxt(filename, dtype=int) 84 | 85 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 86 | graphcnn_option.HIER_labels_remap_file) 87 | remap = np.loadtxt(detail_filename, dtype=int) 88 | 89 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 90 | fr_leaf = open(filename,'a') 91 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 92 | fr_leaf_exp = open(filename, 'a') 93 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 94 | fr_root = open(filename, 'w') 95 | 96 | # rootstr_tmp = [] 97 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 98 | fr = open(detail_filename, 'w') 99 | for i in range(0, np.size(total_predicted_value, axis=0)): 100 | labels = np.where(total_predicted_value[i] == 1)[0] 101 | if len(labels) > 0: 102 | labels_remap = remap[labels, 0] 103 | for elem in labels_remap: 104 | print(elem, end=' ', file=fr) 105 | if elem in total_remap[:,0]: # leaf 106 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 107 | else: 108 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 109 | # for j in range(0,len(rootlist)): 110 | # if elem in rootlist[j]: 111 | # if rootstr[j] not in rootstr_tmp: 112 | # rootstr_tmp.append(rootstr[j]) 113 | print('', file=fr) 114 | else: 115 | # labels_remap = remap[:, 0] 116 | labels = total_predicted_value_argmax[i] 117 | labels_value = total_predicted_value_max[i] 118 | labels_remap = remap[labels, 0] 119 | # for elem in labels_remap: 120 | elem = labels_remap 121 | print(elem, file=fr) 122 | if elem in total_remap[:, 0]: # leaf 123 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 124 | else: 125 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 126 | # if labels_value < 0.5: 127 | # labels_remap = remap[:, 0] 128 | # for elem in labels_remap: 129 | # if elem not in total_remap[:, 0]: 130 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 131 | 132 | fr.close() 133 | fr_leaf.close() 134 | fr_root.close() 135 | fr_leaf_exp.close() 136 | 137 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 138 | # fr = open(filename, 'w') 139 | # for one in rootstr_tmp: 140 | # print(one) 141 | # print(one,file=fr) 142 | # fr.close() 143 | 144 | 145 | 146 | 147 | def main(argv=None): # pylint: disable=unused-argument 148 | global evalDataSet 149 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
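# main() below either post-processes the pre-computed score file for every
# test example (0..81261, when HIER_ROOT_CODE is the global hierarchy root,
# code 2143406) or only for the examples that an earlier level routed to this
# root: generate_eval_index() reads the files under ../hier_eval_root and
# keeps every example whose predicted label matches one of the codes in
# graphcnn_option.HIER_ROOT_CODE.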
150 | 151 | # test_index_array = np.array(range(0, 81262)) 152 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 153 | test_index_array = np.array(range(0,81262)) 154 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 155 | else: 156 | test_index_array = generate_eval_index() 157 | if test_index_array is None or len(test_index_array)==0: 158 | print('no hier_data need eval') 159 | return 160 | else: 161 | print('choosing for evaluation...') 162 | print('choosed number:%d' % len(test_index_array)) 163 | 164 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 165 | checkpoint = '0' 166 | 167 | # print('choosing for evaluation...') 168 | evaluate(checkpoint,test_index_array) 169 | 170 | 171 | if __name__ == '__main__': 172 | tf.app.run() 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
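# rootstr / HIER_ROOT_CODE above appear to pin this run to one node of the
# LSHTC label hierarchy (the '# ????' marks flag the values that are edited
# per node); every hier_* file name in this block is derived from rootstr, so
# runs for different nodes can coexist in the same data directory.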
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
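# xx() copies graphcnn_hier_eval_without_labels.py into every entry under '.'
# whose name starts with 'LSHTC', replacing any existing copy.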
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | --------------------------------------------------------------------------------
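# utils/utils.py above hard-codes two absolute paths and repeats the same
# count-items-per-line loop twice. A small parameterised sketch of that loop,
# kept as a comment (the paths below are placeholders, not files in this repo):
#
#   import os
#
#   def write_line_counts(src_path, dst_path):
#       # For each line of src_path, write the number of space-separated items.
#       with open(src_path) as src, open(dst_path, 'w') as dst:
#           for line in src:
#               print(len(line.strip().split(' ')), file=dst)
#
#   for name in ('label_groups', 'example_groups'):
#       write_line_counts(os.path.join('./data', name),
#                         os.path.join('./data', name + '_info'))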