├── BiLstmTextRelation ├── p9_BiLstmTextRelation_model.py └── p9_BiLstmTextRelation_train.py ├── CNN ├── LICENSE ├── README.md ├── binary_class_data_loader.py ├── char_data_processor.py ├── data_helpers.py ├── eval.py ├── multi_class_data_loader.py ├── text_cnn.py ├── train.py └── word_data_processor.py ├── CNNSentenceClassificationTflearn ├── p4_cnn_sentence_classification.py ├── p4_cnn_sentence_classification_zhihu.py ├── p4_cnn_sentence_classification_zhihu2.py ├── p4_cnn_sentence_classification_zhihu2_predict.py └── p4_conv_classification_tflearn.py ├── DynamicMemoryNet ├── a8_dynamic_memory_network.py ├── a8_predict.py └── a8_train.py ├── GraphCNN ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── __init__.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils │ ├── NYT_utils.py │ ├── lshtc_utils.py │ ├── lshtc_utils2.py │ ├── read │ ├── tmp.py │ └── utils.py ├── HLSTM └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── HierarchicalAttentionNetwork ├── HAN_model.py ├── p1_HierarchicalAttention_model.py ├── p1_HierarchicalAttention_model_transformer.py ├── p1_HierarchicalAttention_predict.py ├── p1_HierarchicalAttention_train.py └── p1_seq2seq.py ├── Keras_Version ├── main.py ├── model2.py ├── test2matrix_process.py └── words_index.json ├── NewGraphCNNs ├── Pytorch_GraphCNNs ├── make_graphs.py ├── make_heiring.py ├── rcv1_processer.py ├── test.py ├── test_extra.py ├── train.py └── unzip.py ├── RCNN └── v-cpp │ ├── ecnn-noada.cpp │ └── fileutil.hpp ├── README.md ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── Seq2seqWithAttention ├── a1_seq2seq.py ├── a1_seq2seq_attention_model.py ├── a1_seq2seq_attention_predict.py └── a1_seq2seq_attention_train.py ├── Text2Graph ├── Text2Graph-master │ └── src │ │ └── main │ │ └── java │ │ └── ecs │ │ ├── CoreNLPService.java │ │ └── TestCoreNLP.java └── src │ └── main │ └── java │ └── ecs │ ├── CoreNLPService.java │ └── TestCoreNLP.java ├── TextCNN ├── __pycache__ │ ├── data_util.cpython-36.pyc │ └── p7_TextCNN_model.cpython-36.pyc ├── data_util.py ├── other_experiement │ ├── data_util_zhihu.py │ ├── p7_TextCNN_predict_ensemble.py │ ├── p7_TextCNN_predict_exp.py │ ├── p7_TextCNN_predict_exp512.py │ ├── p7_TextCNN_predict_exp512_0609.py │ ├── p7_TextCNN_predict_exp512_simple.py │ ├── p7_TextCNN_train_exp.py │ ├── p7_TextCNN_train_exp512.py │ ├── p7_TextCNN_train_exp_512_0609.py │ └── p8_TextCNN_predict_exp.py ├── p7_TextCNN_model.py ├── p7_TextCNN_model_multilayers.py ├── p7_TextCNN_predict.py └── p7_TextCNN_train.py ├── TextRCNN ├── p71_TextRCNN_mode2.py ├── p71_TextRCNN_model.py ├── p71_TextRCNN_predict.py └── p71_TextRCNN_train.py ├── TextRNN ├── p8_TextRNN_model.py ├── p8_TextRNN_model_multi_layers.py ├── p8_TextRNN_predict.py └── p8_TextRNN_train.py ├── __init__.py ├── __pycache__ ├── graphcnn.cpython-34.pyc ├── graphcnn_GPU.cpython-34.pyc ├── graphcnn_generate_data.cpython-34.pyc ├── graphcnn_input.cpython-34.pyc └── 
graphcnn_option.cpython-34.pyc ├── boosting └── a08_boosting.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils ├── NYT_utils.py ├── lshtc_utils.py ├── lshtc_utils2.py ├── read ├── tmp.py └── utils.py /CNN/README.md: -------------------------------------------------------------------------------- 1 | **[This code belongs to the "Implementing a CNN for Text Classification in Tensorflow" blog post.](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)** 2 | 3 | It is slightly simplified implementation of Kim's [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) paper in Tensorflow. 4 | 5 | ## Requirements 6 | 7 | - Python 3 8 | - Tensorflow > 0.8 9 | - Numpy 10 | 11 | ## Training 12 | 13 | Print parameters: 14 | 15 | ```bash 16 | ./train.py --help 17 | ``` 18 | 19 | ``` 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --embedding_dim EMBEDDING_DIM 23 | Dimensionality of character embedding (default: 128) 24 | --filter_sizes FILTER_SIZES 25 | Comma-separated filter sizes (default: '3,4,5') 26 | --num_filters NUM_FILTERS 27 | Number of filters per filter size (default: 128) 28 | --l2_reg_lambda L2_REG_LAMBDA 29 | L2 regularizaion lambda (default: 0.0) 30 | --dropout_keep_prob DROPOUT_KEEP_PROB 31 | Dropout keep probability (default: 0.5) 32 | --batch_size BATCH_SIZE 33 | Batch Size (default: 64) 34 | --num_epochs NUM_EPOCHS 35 | Number of training epochs (default: 100) 36 | --evaluate_every EVALUATE_EVERY 37 | Evaluate model on dev set after this many steps 38 | (default: 100) 39 | --checkpoint_every CHECKPOINT_EVERY 40 | Save model after this many steps (default: 100) 41 | --allow_soft_placement ALLOW_SOFT_PLACEMENT 42 | Allow device soft device placement 43 | --noallow_soft_placement 44 | --log_device_placement LOG_DEVICE_PLACEMENT 45 | Log placement of ops on devices 46 | --nolog_device_placement 47 | 48 | ``` 49 | 50 | Train: 51 | 52 | ```bash 53 | ./train.py 54 | ``` 55 | 56 | ## Evaluating 57 | 58 | ```bash 59 | ./eval.py --eval_train --checkpoint_dir="./runs/1459637919/checkpoints/" 60 | ``` 61 | 62 | Replace the checkpoint dir with the output from the training. To use your own data, change the `eval.py` script to load your data. 63 | 64 | 65 | ## References 66 | 67 | - [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) 68 | - [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1510.03820) -------------------------------------------------------------------------------- /CNN/binary_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tensorflow.contrib import learn 4 | 5 | class BinaryClassDataLoader(object): 6 | """ 7 | Load binary classification data from two files (positive and negative) and 8 | split data into train and dev. 
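    Positive examples are labelled [0, 1] and negative examples [1, 0]; the last
    `dev_sample_percentage` fraction of the shuffled data is held out as the dev set.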
9 | """ 10 | def __init__(self, flags, data_processor, clean_data=None, classes=None): 11 | self.__flags = flags 12 | self.__data_processor = data_processor 13 | self.__clean_data = clean_data 14 | self.__classes = classes 15 | 16 | def define_flags(self): 17 | self.__flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 18 | self.__flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 19 | self.__flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.") 20 | 21 | def prepare_data(self): 22 | self.__resolve_params() 23 | 24 | x_text, y = self.load_data_and_labels() 25 | 26 | # Build vocabulary 27 | self.vocab_processor = self.__data_processor.vocab_processor(x_text) 28 | x = np.array(list(self.vocab_processor.fit_transform(x_text))) 29 | 30 | # Randomly shuffle data 31 | np.random.seed(10) 32 | shuffle_indices = np.random.permutation(np.arange(len(y))) 33 | x_shuffled = x[shuffle_indices] 34 | y_shuffled = y[shuffle_indices] 35 | 36 | # Split train/test set 37 | # TODO: This is very crude, should use cross-validation 38 | dev_sample_index = -1 * int(self.__dev_sample_percentage * float(len(y))) 39 | x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 40 | y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | self.vocab_processor = self.__data_processor.restore_vocab_processor(vocab_path) 45 | return self.vocab_processor 46 | 47 | def class_labels(self, class_indexes): 48 | if self.__classes is None: 49 | result = class_indexes 50 | else: 51 | result = [ self.__classes[idx] for idx in class_indexes ] 52 | return result 53 | 54 | def load_data_and_labels(self): 55 | """ 56 | Loads MR polarity data from files, splits the data into words and generates labels. 57 | Returns split sentences and labels. 
58 | """ 59 | self.__resolve_params() 60 | 61 | # Load data from files 62 | positive_examples = list(open(self.__positive_data_file, "r").readlines()) 63 | negative_examples = list(open(self.__negative_data_file, "r").readlines()) 64 | # Split by words 65 | x_text = positive_examples + negative_examples 66 | x_text = [self.__data_processor.clean_data(sent) for sent in x_text] 67 | # Generate labels 68 | positive_labels = [[0, 1] for _ in positive_examples] 69 | negative_labels = [[1, 0] for _ in negative_examples] 70 | y = np.concatenate([positive_labels, negative_labels], 0) 71 | return [x_text, y] 72 | 73 | def __resolve_params(self): 74 | self.__dev_sample_percentage = self.__flags.FLAGS.dev_sample_percentage 75 | self.__positive_data_file = self.__flags.FLAGS.positive_data_file 76 | self.__negative_data_file = self.__flags.FLAGS.negative_data_file 77 | -------------------------------------------------------------------------------- /CNN/char_data_processor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | 4 | class CharDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.decode("utf-8")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | return string 18 | 19 | class VocabularyProcessor(object): 20 | def __init__(self, max_document_length, min_frequency=0, vocabulary=None, 21 | tokenizer_fn=None): 22 | # init a class. index maxdocument length and a vocabulabrary 23 | if vocabulary == None: 24 | self.vocabulary_ = {"":0} # padding 25 | else: 26 | self.vocabulary_ = vocabulary 27 | 28 | self.index = 1 29 | self.max_document_length = max_document_length 30 | def fit_transform(self, raw_documents, unused_y=None, fit=True): 31 | result = [] 32 | for raw_document in raw_documents: 33 | # mark for this, we can find it is a [[I am a student]] 34 | result.append([self.__vocab_id(char, fit) for char in raw_document.decode("utf-8")]) 35 | 36 | if self.max_document_length == None: 37 | max_document_length = max([len(vocab_ids) for vocab_ids in result]) 38 | else: 39 | max_document_length = self.max_document_length 40 | 41 | result = self.__smooth_lengths(result, max_document_length) 42 | 43 | return result 44 | 45 | def transform(self, raw_documents): 46 | return self.fit_transform(raw_documents, None, False) 47 | 48 | def save(self, file): 49 | with codecs.open(file, 'w', 'utf-8') as f: 50 | data = {"vocabulary_": self.vocabulary_, "index": self.index, 51 | "max_document_length": self.max_document_length} 52 | f.write(json.dumps(data, ensure_ascii=False)) 53 | 54 | @classmethod 55 | def restore(cls, file): 56 | with codecs.open(file, "r", "utf-8") as f: 57 | data = json.loads(f.readline()) 58 | vp = cls(data["max_document_length"], 0, data["vocabulary_"]) 59 | vp.index = data["index"] 60 | return vp 61 | 62 | @staticmethod 63 | def __smooth_lengths(documents, length): 64 | result = [] 65 | for document in documents: 66 | if len(document) > length: 67 | doccument = document[:length] 68 | elif len(document) < length: 69 | document = document + [0] * (length - len(document)) 70 | result.append(document) 71 | return result 72 | 73 | def __vocab_id(self, char, fit = True): 74 | # every word has a 
id 75 | if char not in self.vocabulary_: 76 | if fit: 77 | self.vocabulary_[char] = self.index 78 | self.index += 1 79 | else: 80 | char = "" 81 | return self.vocabulary_[char] 82 | 83 | -------------------------------------------------------------------------------- /CNN/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 7 | """ 8 | Generates a batch iterator for a dataset. 9 | """ 10 | data = np.array(data) 11 | data_size = len(data) 12 | num_batches_per_epoch = int(len(data)/batch_size) + 1 13 | for epoch in range(num_epochs): 14 | # Shuffle the data at each epoch 15 | if shuffle: 16 | shuffle_indices = np.random.permutation(np.arange(data_size)) 17 | shuffled_data = data[shuffle_indices] 18 | else: 19 | shuffled_data = data 20 | for batch_num in range(num_batches_per_epoch): 21 | start_index = batch_num * batch_size 22 | end_index = min((batch_num + 1) * batch_size, data_size) 23 | yield shuffled_data[start_index:end_index] 24 | -------------------------------------------------------------------------------- /CNN/eval.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | #from binary_class_data_loader import BinaryClassDataLoader 11 | from multi_class_data_loader import MultiClassDataLoader 12 | #from word_data_processor import WordDataProcessor 13 | from char_data_processor import CharDataProcessor 14 | import csv 15 | 16 | # Parameters 17 | # ================================================== 18 | 19 | # Eval Parameters 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 22 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 23 | 24 | # Misc Parameters 25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 27 | 28 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 29 | data_loader.define_flags() 30 | 31 | FLAGS = tf.flags.FLAGS 32 | FLAGS._parse_flags() 33 | print("\nParameters:") 34 | for attr, value in sorted(FLAGS.__flags.items()): 35 | print("{}={}".format(attr.upper(), value)) 36 | print("") 37 | 38 | # CHANGE THIS: Load data. 
Load your own data here 39 | if FLAGS.eval_train: 40 | x_raw, y_test = data_loader.load_data_and_labels() 41 | y_test = np.argmax(y_test, axis=1) 42 | else: 43 | x_raw = ["a masterpiece four years in the making", "everything is off."] 44 | y_test = [1, 0] 45 | 46 | # Map data into vocabulary 47 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 48 | vocab_processor = data_loader.restore_vocab_processor(vocab_path) 49 | x_test = np.array(list(vocab_processor.transform(x_raw))) 50 | 51 | print("\nEvaluating...\n") 52 | 53 | # Evaluation 54 | # ================================================== 55 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | session_conf = tf.ConfigProto( 59 | allow_soft_placement=FLAGS.allow_soft_placement, 60 | log_device_placement=FLAGS.log_device_placement) 61 | sess = tf.Session(config=session_conf) 62 | with sess.as_default(): 63 | # Load the saved meta graph and restore variables 64 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 65 | saver.restore(sess, checkpoint_file) 66 | 67 | # Get the placeholders from the graph by name 68 | input_x = graph.get_operation_by_name("input_x").outputs[0] 69 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | # Tensors we want to evaluate 73 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 74 | 75 | # Generate batches for one epoch 76 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 77 | 78 | # Collect the predictions here 79 | all_predictions = [] 80 | 81 | for x_test_batch in batches: 82 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 83 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 84 | 85 | # Print accuracy if y_test is defined 86 | if y_test is not None: 87 | correct_predictions = float(sum(all_predictions == y_test)) 88 | print("Total number of test examples: {}".format(len(y_test))) 89 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 90 | 91 | # Save the evaluation to a csv 92 | all_predictions = data_loader.class_labels(all_predictions.astype(int)) 93 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 94 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 95 | print("Saving evaluation to {0}".format(out_path)) 96 | with open(out_path, 'w') as f: 97 | csv.writer(f).writerows(predictions_human_readable) 98 | -------------------------------------------------------------------------------- /CNN/multi_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | 4 | class MultiClassDataLoader(object): 5 | """ 6 | Handles multi-class training data. It takes predefined sets of "train_data_file" and "dev_data_file" 7 | of the following record format. 8 | \t 9 | ex. "what a masterpiece! Positive" 10 | 11 | Class labels are given as "class_data_file", which is a list of class labels. 
12 | """ 13 | def __init__(self, flags, data_processor): 14 | self.__flags = flags 15 | self.__data_processor = data_processor 16 | self.__train_data_file = None 17 | self.__dev_data_file = None 18 | self.__class_data_file = None 19 | self.__classes_cache = None 20 | 21 | 22 | def define_flags(self): 23 | self.__flags.DEFINE_string("train_data_file", "./data/rt-polaritydata/train.txt", "Data source for the training data.") 24 | self.__flags.DEFINE_string("dev_data_file", "./data/rt-polaritydata/test.txt", "Data source for the cross validation data.") 25 | self.__flags.DEFINE_string("class_data_file", "./data/rt-polaritydata/lable.txt", "Data source for the class list.") 26 | 27 | def prepare_data(self): 28 | self.__resolve_params() 29 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 30 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 31 | 32 | max_doc_len = max([len(doc.decode("utf-8")) for doc in x_train]) 33 | max_doc_len_dev = max([len(doc.decode("utf-8")) for doc in x_dev]) 34 | if max_doc_len_dev > max_doc_len: 35 | max_doc_len = max_doc_len_dev 36 | # Build vocabulary 37 | self.vocab_processor = self.__data_processor.vocab_processor(x_train, x_dev) 38 | x_train = np.array(list(self.vocab_processor.fit_transform(x_train))) 39 | # Build vocabulary 40 | x_dev = np.array(list(self.vocab_processor.fit_transform(x_dev))) 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | return self.__data_processor.restore_vocab_processor(vocab_path) 45 | 46 | def class_labels(self, class_indexes): 47 | return [ self.__classes()[idx] for idx in class_indexes ] 48 | 49 | def load_data_and_labels(self): 50 | self.__resolve_params() 51 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 52 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 53 | x_all = x_train + x_dev 54 | y_all = np.concatenate([y_train, y_dev], 0) 55 | return [x_all, y_all] 56 | 57 | def __load_data_and_labels(self, data_file): 58 | x_text = [] 59 | y = [] 60 | with open(data_file, 'r') as tsvin: 61 | classes = self.__classes() 62 | one_hot_vectors = np.eye(len(classes), dtype=int) 63 | class_vectors = {} 64 | for i, cls in enumerate(classes): 65 | class_vectors[cls] = one_hot_vectors[i] 66 | #edit for the first to the code. 
67 | all_lines = tsvin.readlines() 68 | for line in all_lines: 69 | temp = line.split(' ',1) 70 | data = self.__data_processor.clean_data(temp[1]) 71 | x_text.append(data) 72 | y.append(class_vectors[temp[0]]) 73 | #edit 74 | # tsvin = csv.reader(tsvin, delimiter='\t') 75 | # for row in tsvin: 76 | # data = self.__data_processor.clean_data(row[0]) 77 | # x_text.append(data) 78 | # y.append(class_vectors[row[1]]) 79 | return [x_text, np.array(y)] 80 | 81 | def __classes(self): 82 | self.__resolve_params() 83 | if self.__classes_cache is None: 84 | with open(self.__class_data_file, 'r') as catin: 85 | classes = list(catin.readlines()) 86 | self.__classes_cache = [s.strip() for s in classes] 87 | return self.__classes_cache 88 | 89 | def __resolve_params(self): 90 | if self.__class_data_file is None: 91 | self.__train_data_file = self.__flags.FLAGS.train_data_file 92 | self.__dev_data_file = self.__flags.FLAGS.dev_data_file 93 | self.__class_data_file = self.__flags.FLAGS.class_data_file 94 | -------------------------------------------------------------------------------- /CNN/text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class TextCNN(object): 6 | """ 7 | A CNN for text classification. 8 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 9 | """ 10 | def __init__( 11 | self, sequence_length, num_classes, vocab_size, 12 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 13 | 14 | # Placeholders for input, output and dropout 15 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 16 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 17 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 18 | 19 | # Keeping track of l2 regularization loss (optional) 20 | l2_loss = tf.constant(0.0) 21 | 22 | # Embedding layer 23 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 24 | self.W = tf.Variable( 25 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 26 | trainable = False, 27 | name="W") 28 | 29 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 30 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 31 | 32 | # Create a convolution + maxpool layer for each filter size 33 | pooled_outputs = [] 34 | for i, filter_size in enumerate(filter_sizes): 35 | with tf.name_scope("conv-maxpool-%s" % filter_size): 36 | # Convolution Layer 37 | filter_shape = [filter_size, embedding_size, 1, num_filters] 38 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 39 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 40 | conv = tf.nn.conv2d( 41 | self.embedded_chars_expanded, 42 | W, 43 | strides=[1, 1, 1, 1], 44 | padding="VALID", 45 | name="conv") 46 | # Apply nonlinearity 47 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 48 | # Maxpooling over the outputs 49 | pooled = tf.nn.max_pool( 50 | h, 51 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 52 | strides=[1, 1, 1, 1], 53 | padding='VALID', 54 | name="pool") 55 | pooled_outputs.append(pooled) 56 | 57 | # Combine all the pooled features 58 | num_filters_total = num_filters * len(filter_sizes) 59 | self.h_pool = tf.concat(3, pooled_outputs) 60 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 61 | 62 | # Add dropout 63 | with tf.name_scope("dropout"): 64 | self.h_drop = tf.nn.dropout(self.h_pool_flat, 
self.dropout_keep_prob) 65 | 66 | # Final (unnormalized) scores and predictions 67 | with tf.name_scope("output"): 68 | W = tf.get_variable( 69 | "W", 70 | shape=[num_filters_total, num_classes], 71 | initializer=tf.contrib.layers.xavier_initializer()) 72 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 73 | l2_loss += tf.nn.l2_loss(W) 74 | l2_loss += tf.nn.l2_loss(b) 75 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 76 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 77 | 78 | # CalculateMean cross-entropy loss 79 | with tf.name_scope("loss"): 80 | losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y) 81 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 82 | 83 | # Accuracy 84 | with tf.name_scope("accuracy"): 85 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 86 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 87 | -------------------------------------------------------------------------------- /CNN/word_data_processor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tensorflow.contrib import learn 3 | 4 | class WordDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.split(" ")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return learn.preprocessing.VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return learn.preprocessing.VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | """ 18 | Tokenization/string cleaning for all datasets except for SST. 19 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 20 | """ 21 | string = string.strip() 22 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 23 | string = re.sub(r"\'s", " \'s", string) 24 | string = re.sub(r"\'ve", " \'ve", string) 25 | string = re.sub(r"n\'t", " n\'t", string) 26 | string = re.sub(r"\'re", " \'re", string) 27 | string = re.sub(r"\'d", " \'d", string) 28 | string = re.sub(r"\'ll", " \'ll", string) 29 | string = re.sub(r",", " , ", string) 30 | string = re.sub(r"!", " ! ", string) 31 | string = re.sub(r"\(", " \( ", string) 32 | string = re.sub(r"\)", " \) ", string) 33 | string = re.sub(r"\?", " \? ", string) 34 | string = re.sub(r"\s{2,}", " ", string) 35 | return string.strip().lower() 36 | -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 
14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | from tflearn.datasets import imdb 26 | import numpy as np 27 | 28 | print("started...") 29 | # 1.IMDB Dataset loading 30 | train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,valid_portion=0.1) 31 | trainX, trainY = train 32 | testX, testY = test 33 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 34 | print("testY.shape:",np.array(testY).shape) #2500个label 35 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 36 | print("testY[0]:",testY[0]) #0 37 | 38 | # 2.Data preprocessing 39 | # Sequence padding 40 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 41 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 42 | # Converting labels to binary vectors 43 | trainY = to_categorical(trainY, nb_classes=2) #y as one hot 44 | testY = to_categorical(testY, nb_classes=2) #y as one hot 45 | 46 | # 3.Building convolutional network 47 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 48 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 49 | network = tflearn.embedding(network, input_dim=10000, output_dim=128) #[None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, oput_dim:embedding size 50 | #conv_1d(incoming,nb_filter,filter_size) 51 | branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters]. 
padding:"VALID",only ever drops the right-most columns 52 | branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 53 | branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 54 | network = merge([branch1, branch2, branch3], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 55 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 56 | network = global_max_pool(network) #[batch_size, pooled dim] 57 | network = dropout(network, 0.5) #[batch_size, pooled dim] 58 | network = fully_connected(network, 2, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,2] 59 | network = regression(network, optimizer='adam', learning_rate=0.001, 60 | loss='categorical_crossentropy', name='target') 61 | # Training 62 | model = tflearn.DNN(network, tensorboard_verbose=0) 63 | model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32) 64 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | 30 | print("started...") 31 | f_cache='data_zhihu.pik' 32 | # 1. 
loading dataset
33 | with open(f_cache, 'r') as f:
34 |     trainX,trainY,testX,testY=pickle.load(f)
35 | if trainX is None or trainY is None:  # if the cached training data does not exist
36 |     print("training data does not exist ==> load data, and dump it to the file system")
37 |     vocabulary_word2index, vocabulary_index2word = create_voabulary()
38 |     vocabulary_word2index_label = create_voabulary_label()
39 |     train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label)
40 |     trainX, trainY = train
41 |     testX, testY = test
42 |     nb_classes=1999
43 |     print("testX.shape:",np.array(testX).shape) # 2500 lists; each list represents one sentence
44 |     print("testY.shape:",np.array(testY).shape) # 2500 labels
45 |     print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
46 |     print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0
47 | 
48 |     # 2.Data preprocessing
49 |     # Sequence padding
50 |     print("start padding & transform to one hot...")
51 |     trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length
52 |     testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length
53 |     # Converting labels to binary vectors
54 |     trainY = to_categorical(trainY, nb_classes=nb_classes) #y as one hot
55 |     testY = to_categorical(testY, nb_classes=nb_classes) #y as one hot
56 |     print("end padding & transform to one hot...")
57 |     # cache trainX,trainY,testX,testY for next time use.
58 |     pickle.dump((trainX,trainY,testX,testY), open(f_cache, 'w'))
59 | else:
60 |     print("training data exists in cache. going to use it.")
61 | 
62 | # 3.Building convolutional network
63 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC##############################################################################################
64 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
65 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be fed with data when training
66 | network = tflearn.embedding(network, input_dim=10000, output_dim=128) #[None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, output_dim: embedding size
67 | #conv_1d(incoming,nb_filter,filter_size)
68 | branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters].
padding:"VALID",only ever drops the right-most columns 69 | branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 70 | branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 71 | network = merge([branch1, branch2, branch3], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 72 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 73 | network = global_max_pool(network) #[batch_size, pooled dim] 74 | network = dropout(network, 0.5) #[batch_size, pooled dim] 75 | network = fully_connected(network, nb_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,2] 76 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') 77 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 78 | # 4.Training 79 | model = tflearn.DNN(network, tensorboard_verbose=0) 80 | model.fit(trainX, trainY, n_epoch = 5, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 81 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | import os 30 | #import tflearn.metrics.Top_k as Top_k 31 | 32 | print("started...") 33 | f_cache='data_zhihu.pik' 34 | # 1. 
loading dataset 35 | trainX,trainY,testX,testY=None,None,None,None 36 | number_classes=1999 37 | #if os.path.exists(f_cache): 38 | # with open(f_cache, 'r') as f: 39 | # trainX,trainY,testX,testY,vocab_size=pickle.load(f) 40 | #if trainX is None or trainY is None: #如果训练数据,不存在 41 | #------------------------------------------------------------------------------------------------- 42 | print("training data not exist==>load data, and dump it to file system") 43 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 44 | vocab_size=len(vocabulary_word2index) 45 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 46 | train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label) 47 | trainX, trainY = train 48 | testX, testY = test 49 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 50 | print("testY.shape:",np.array(testY).shape) #2500个label 51 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 52 | print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0 53 | 54 | # 2.Data preprocessing 55 | # Sequence padding 56 | print("start padding & transform to one hot...") 57 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 58 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 59 | # Converting labels to binary vectors 60 | trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot 61 | testY = to_categorical(testY, nb_classes=number_classes) #y as one hot 62 | print("end padding & transform to one hot...") 63 | #-------------------------------------------------------------------------------------------------- 64 | # cache trainX,trainY,testX,testY for next time use. 65 | # with open(f_cache, 'w') as f: 66 | # pickle.dump((trainX,trainY,testX,testY,vocab_size),f) 67 | #else: 68 | # print("traning data exists in cache. going to use it.") 69 | 70 | # 3.Building convolutional network 71 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC############################################################################################## 72 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 73 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 74 | network = tflearn.embedding(network, input_dim=vocab_size, output_dim=256) #TODO 128 [None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. input_dim: vocabulary size, oput_dim:embedding size 75 | #conv_1d(incoming,nb_filter,filter_size) 76 | branch1 = conv_1d(network, 256, 1, padding='valid', activation='relu', regularizer="L2") #128 77 | branch2 = conv_1d(network, 256, 2, padding='valid', activation='relu', regularizer="L2") #128 78 | branch3 = conv_1d(network, 256, 3, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps1, nb_filters]. 
padding:"VALID",only ever drops the right-most columns 79 | branch4 = conv_1d(network, 256, 4, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps2, nb_filters] 80 | branch5 = conv_1d(network, 256, 5, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] 81 | branch6 = conv_1d(network, 256, 6, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 82 | branch7 = conv_1d(network, 256, 7, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 83 | branch8 = conv_1d(network, 256, 7, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 84 | branch9 = conv_1d(network, 256, 8, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 85 | branch10 = conv_1d(network,256, 9, padding='valid', activation='relu', regularizer="L2") #128 [batch_size, new steps3, nb_filters] #ADD 86 | network = merge([branch1, branch2, branch3,branch4,branch5,branch6, branch7, branch8,branch9,branch10], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 87 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 88 | network = global_max_pool(network) #input: 4-D tensors,[batch_size,height,width,in_channels]; output:2-D Tensor,[batch_size, pooled dim] 89 | network = dropout(network, 0.5) #[batch_size, pooled dim] 90 | network = fully_connected(network, number_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,number_classes] 91 | #top5 = tflearn.metrics.Top_k(k=5) 92 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') #,metric=top5 93 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 94 | # 4.Training 95 | model = tflearn.DNN(network, tensorboard_verbose=0) 96 | model.fit(trainX, trainY, n_epoch = 10, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 97 | model.save('model_zhihu_cnn12345') 98 | 99 | print("going to make a prediction...") 100 | model.predict(testX[0:1000]) 101 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_cnn_sentence_classification_zhihu2_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division, print_function, absolute_import 3 | 4 | """ 5 | Simple example using convolutional neural network to classify IMDB 6 | sentiment dataset. 7 | References: 8 | - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, 9 | and Christopher Potts. (2011). Learning Word Vectors for Sentiment 10 | Analysis. The 49th Annual Meeting of the Association for Computational 11 | Linguistics (ACL 2011). 12 | - Kim Y. Convolutional Neural Networks for Sentence Classification[C]. 13 | Empirical Methods in Natural Language Processing, 2014. 
14 | Links: 15 | - http://ai.stanford.edu/~amaas/data/sentiment/ 16 | - http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf 17 | """ 18 | import tensorflow as tf 19 | import tflearn 20 | from tflearn.layers.core import input_data, dropout, fully_connected 21 | from tflearn.layers.conv import conv_1d, global_max_pool 22 | from tflearn.layers.merge_ops import merge 23 | from tflearn.layers.estimator import regression 24 | from tflearn.data_utils import to_categorical, pad_sequences 25 | #from tflearn.datasets import imdb 26 | from p4_zhihu_load_data import load_data,create_voabulary,create_voabulary_label 27 | import numpy as np 28 | import pickle 29 | import os 30 | #import tflearn.metrics.Metric.Top_k as Top_k 31 | 32 | print("started...") 33 | f_cache='data_zhihu.pik' 34 | # 1. loading dataset 35 | trainX,trainY,testX,testY=None,None,None,None 36 | number_classes=1999 37 | #if os.path.exists(f_cache): 38 | # with open(f_cache, 'r') as f: 39 | # trainX,trainY,testX,testY,vocab_size=pickle.load(f) 40 | #if trainX is None or trainY is None: #如果训练数据,不存在 41 | #------------------------------------------------------------------------------------------------- 42 | print("training data not exist==>load data, and dump it to file system") 43 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 44 | vocab_size=len(vocabulary_word2index) 45 | vocabulary_word2index_label = create_voabulary_label() 46 | train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label) 47 | trainX, trainY = train 48 | testX, testY = test 49 | print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话 50 | print("testY.shape:",np.array(testY).shape) #2500个label 51 | print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 52 | print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0 53 | 54 | # 2.Data preprocessing 55 | # Sequence padding 56 | print("start padding & transform to one hot...") 57 | trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length 58 | testX = pad_sequences(testX, maxlen=100, value=0.) #padding to max length 59 | # Converting labels to binary vectors 60 | trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot 61 | testY = to_categorical(testY, nb_classes=number_classes) #y as one hot 62 | print("end padding & transform to one hot...") 63 | #-------------------------------------------------------------------------------------------------- 64 | # cache trainX,trainY,testX,testY for next time use. 65 | # with open(f_cache, 'w') as f: 66 | # pickle.dump((trainX,trainY,testX,testY,vocab_size),f) 67 | #else: 68 | # print("traning data exists in cache. going to use it.") 69 | 70 | # 3.Building convolutional network 71 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC############################################################################################## 72 | #(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData") 73 | network = input_data(shape=[None, 100], name='input') #[None, 100] `input_data` is used as a data entry (placeholder) of a network. This placeholder will be feeded with data when training 74 | network = tflearn.embedding(network, input_dim=vocab_size, output_dim=128) #TODO [None, 100,128].embedding layer for a sequence of ids. network: Incoming 2-D Tensor. 
input_dim: vocabulary size, oput_dim:embedding size 75 | #conv_1d(incoming,nb_filter,filter_size) 76 | branch1 = conv_1d(network, 128, 1, padding='valid', activation='relu', regularizer="L2") 77 | branch2 = conv_1d(network, 128, 2, padding='valid', activation='relu', regularizer="L2") 78 | branch3 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps1, nb_filters]. padding:"VALID",only ever drops the right-most columns 79 | branch4 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps2, nb_filters] 80 | branch5 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2") # [batch_size, new steps3, nb_filters] 81 | network = merge([branch1, branch2, branch3,branch4,branch5], mode='concat', axis=1) # merge a list of `Tensor` into a single one.===>[batch_size, new steps1+new step2+new step3, nb_filters] 82 | network = tf.expand_dims(network, 2) #[batch_size, new steps1+new step2+new step3,1, nb_filters] Inserts a dimension of 1 into a tensor's shape 83 | network = global_max_pool(network) #[batch_size, pooled dim] 84 | network = dropout(network, 0.5) #[batch_size, pooled dim] 85 | network = fully_connected(network, number_classes, activation='softmax') #matmul([batch_size, pooled_dim],[pooled_dim,2])---->[batch_size,number_classes] 86 | top5 = tflearn.metrics.Top_k(k=5) 87 | network = regression(network, optimizer='adam', learning_rate=0.001,loss='categorical_crossentropy', name='target') #metric=top5 88 | ######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC################################################################################################ 89 | # 4.Training 90 | model = tflearn.DNN(network, tensorboard_verbose=0) 91 | #model.fit(trainX, trainY, n_epoch = 10, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=256) #32 92 | #model.save('model_zhihu_cnn12345') 93 | model.load('model_zhihu_cnn12345') 94 | print("going to make a prediction...") 95 | predict_result=model.predict(testX[0:1000]) 96 | print("predict_result:",predict_result) 97 | print("ended...") -------------------------------------------------------------------------------- /CNNSentenceClassificationTflearn/p4_conv_classification_tflearn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, absolute_import 2 | 3 | import tensorflow as tf 4 | 5 | # -*- coding: utf-8 -*- 6 | 7 | """ Convolutional network applied to CIFAR-10 dataset classification task. 8 | References: 9 | Learning Multiple Layers of Features from Tiny Images, A. Krizhevsky, 2009. 
10 | Links: 11 | [CIFAR-10 Dataset](https://www.cs.toronto.edu/~kriz/cifar.html) 12 | """ 13 | 14 | import tflearn 15 | from tflearn.data_utils import shuffle, to_categorical 16 | from tflearn.layers.core import input_data, dropout, fully_connected 17 | from tflearn.layers.conv import conv_2d, max_pool_2d 18 | from tflearn.layers.estimator import regression 19 | from tflearn.data_preprocessing import ImagePreprocessing 20 | from tflearn.data_augmentation import ImageAugmentation 21 | 22 | print("started...") 23 | # Data loading and preprocessing 24 | from tflearn.datasets import cifar10 25 | (X, Y), (X_test, Y_test) = cifar10.load_data() 26 | X, Y = shuffle(X, Y) 27 | Y = to_categorical(Y, 10) 28 | Y_test = to_categorical(Y_test, 10) 29 | 30 | # Real-time data preprocessing 31 | img_prep = ImagePreprocessing() 32 | img_prep.add_featurewise_zero_center() 33 | img_prep.add_featurewise_stdnorm() 34 | 35 | # Real-time data augmentation 36 | img_aug = ImageAugmentation() 37 | img_aug.add_random_flip_leftright() 38 | img_aug.add_random_rotation(max_angle=25.) 39 | 40 | # Convolutional network building 41 | #------------------------------------------------------------------------------------------- 42 | network = input_data(shape=[None, 32, 32, 3], 43 | data_preprocessing=img_prep, 44 | data_augmentation=img_aug) 45 | network = conv_2d(network, 32, 3, activation='relu') 46 | network = max_pool_2d(network, 2) 47 | network = conv_2d(network, 64, 3, activation='relu') 48 | network = conv_2d(network, 64, 3, activation='relu') 49 | network = max_pool_2d(network, 2) 50 | network = fully_connected(network, 512, activation='relu') 51 | network = dropout(network, 0.5) 52 | network = fully_connected(network, 10, activation='softmax') 53 | network = regression(network, optimizer='adam', 54 | loss='categorical_crossentropy', 55 | learning_rate=0.001) 56 | #----------------------------------------------------------------------------------------- 57 | # Train using classifier 58 | model = tflearn.DNN(network, tensorboard_verbose=0) 59 | model.fit(X, Y, n_epoch=50, shuffle=True, validation_set=(X_test, Y_test), 60 | show_metric=True, batch_size=96, run_id='cifar10_cnn') 61 | print("end...") -------------------------------------------------------------------------------- /GraphCNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 
| total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value & (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 
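# Micro-F1 and Macro-F1 above summarise label-based performance differently: Micro-F1 pools
# TP/FP/FN across all labels before computing precision/recall, so frequent labels dominate,
# while Macro-F1 averages the per-label F1 scores (built from _P_t, _R_t), weighting every
# label equally. For example, for two labels with (TP, FP, FN) = (90, 10, 10) and (1, 9, 9):
#   micro: P = R = 91/110 ~= 0.83, so F1 ~= 0.83
#   macro: per-label F1 = 0.90 and 0.10, so mean = 0.50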
115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
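# The saver built above (next to the '?????' comment) restores the
# exponential-moving-average shadow values in place of the raw weights, which
# is the usual way to evaluate a model trained with MOVING_AVERAGE_DECAY.
# A minimal sketch of that pattern on its own (TF 1.x graph mode; the variable
# below is illustrative, not one of this model's weights):
import tensorflow as tf

weights = tf.get_variable('weights', shape=[3], initializer=tf.zeros_initializer())
ema = tf.train.ExponentialMovingAverage(decay=0.999)
maintain_op = ema.apply([weights])            # run once per training step
restore_map = ema.variables_to_restore()      # {'weights/ExponentialMovingAverage': weights}
eval_saver = tf.train.Saver(restore_map)      # restore() then loads the shadow values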
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | 
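# generate_eval_index() above keeps every example whose label in the parent
# round matches one of HIER_ROOT_CODE, deduplicating with a list membership
# test that is quadratic in the number of examples. A standalone sketch of the
# same selection using a set (it assumes the hier_eval_root files hold one
# "example_id label_id" pair per line, as they are read above):
import numpy as np

def collect_examples_for_roots(pair_files, root_codes):
    chosen, seen = [], set()
    for path in pair_files:
        pairs = np.loadtxt(path, dtype=int, ndmin=2)   # columns: example_id, label_id
        for example_id, label_id in pairs:
            if label_id in root_codes and example_id not in seen:
                seen.add(int(example_id))
                chosen.append(int(example_id))
    return chosen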
total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # rootstr_tmp = [] 98 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 99 | fr = open(detail_filename, 'w') 100 | for i in range(0, np.size(total_predicted_value, axis=0)): 101 | labels = np.where(total_predicted_value[i] == 1)[0] 102 | if len(labels) > 0: 103 | labels_remap = remap[labels, 0] 104 | for elem in labels_remap: 105 | print(elem, end=' ', file=fr) 106 | if elem in total_remap[:,0]: # leaf 107 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 108 | else: 109 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 110 | # for j in range(0,len(rootlist)): 111 | # if elem in rootlist[j]: 112 | # if rootstr[j] not in rootstr_tmp: 113 | # rootstr_tmp.append(rootstr[j]) 114 | print('', file=fr) 115 | else: 116 | # labels_remap = remap[:, 0] 117 | labels = total_predicted_value_argmax[i] 118 | labels_value = total_predicted_value_max[i] 119 | labels_remap = remap[labels, 0] 120 | # for elem in labels_remap: 121 | elem = labels_remap 122 | print(elem, file=fr) 123 | if elem in total_remap[:, 0]: # leaf 124 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 125 | else: 126 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 127 | # if labels_value < 0.5: 128 | # labels_remap = remap[:, 0] 129 | # for elem in labels_remap: 130 | # if elem not in total_remap[:, 0]: 131 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 132 | 133 | fr.close() 134 | fr_leaf.close() 135 | fr_root.close() 136 | fr_leaf_exp.close() 137 | 138 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 139 | # fr = open(filename, 'w') 140 | # for one in rootstr_tmp: 141 | # print(one) 142 | # print(one,file=fr) 143 | # fr.close() 144 | 145 | 146 | 147 | 148 | def main(argv=None): # pylint: disable=unused-argument 149 | global evalDataSet 150 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
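# The writing loop above applies the fixed threshold (0.9 here) and, whenever
# no class clears it, falls back to the single argmax class so that every
# example still receives at least one label. The same decoding rule in
# isolation (probs is the vector of per-class sigmoid outputs for one example):
import numpy as np

def decode_multilabel(probs, threshold=0.9):
    labels = np.where(probs >= threshold)[0]
    if labels.size == 0:                 # nothing passed the threshold
        labels = np.array([int(np.argmax(probs))])
    return labels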
151 | 152 | # test_index_array = np.array(range(0, 81262)) 153 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 154 | test_index_array = np.array(range(0,81262)) 155 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 156 | else: 157 | test_index_array = generate_eval_index() 158 | if test_index_array is None or len(test_index_array)==0: 159 | print('no hier_data need eval') 160 | return 161 | else: 162 | print('choosing for evaluation...') 163 | print('choosed number:%d' % len(test_index_array)) 164 | 165 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 166 | checkpoint = '0' 167 | 168 | # print('choosing for evaluation...') 169 | evaluate(checkpoint,test_index_array) 170 | 171 | 172 | if __name__ == '__main__': 173 | tf.app.run() 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /GraphCNN/utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /GraphCNN/utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
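# lr_decay_value / lr_decay_ecophs in graphcnn_option.py above describe a
# piecewise-constant learning-rate schedule (the two lists have equal length).
# The training script that consumes them is not shown here, so the helper below
# is only one plausible reading of that schedule -- the i-th rate applying up to
# the i-th epoch boundary -- not the repository's own implementation:
def learning_rate_for_epoch(epoch, values, boundaries):
    for rate, boundary in zip(values, boundaries):
        if epoch <= boundary:
            return rate
    return values[-1]

# e.g. learning_rate_for_epoch(500, [0.01, 0.001, 0.0001], [1, 300, 600]) -> 0.0001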
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /GraphCNN/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /HLSTM/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) #[-1]是加在最前面 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # here i need more label. 
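# genBatch/genwordmask above pad every sentence in a batch to the longest
# sentence with the id -1 and then build a float mask that zeroes the padded
# positions, while __init__ (continued just below) turns the space-separated
# label field into a multi-hot matrix. A compact NumPy sketch of both steps,
# written for Python 3 (the original file is Python 2 Theano code):
import numpy as np

def pad_and_mask(sentences, pad_id=-1):
    # sentences: list of word-id lists for one document
    max_len = max(len(s) for s in sentences)
    batch = np.full((len(sentences), max_len), pad_id, dtype=np.int32)
    for i, s in enumerate(sentences):
        batch[i, :len(s)] = s
    mask = (batch != pad_id).astype(np.float32)
    return batch, mask

def multi_hot(label_ids, num_classes):
    y = np.zeros(num_classes, dtype=np.int32)
    y[list(label_ids)] = 1
    return y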
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | # self.docs = [] 63 | # self.label = [] 64 | # self.wordmask = [] 65 | # self.sentencemask = [] 66 | # self.maxsentencenum = [] 67 | 68 | # for i in xrange(self.epoch): 69 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | # self.docs.append(docsbatch) 72 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | # self.wordmask.append(genwordmask(docsbatch)) 74 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | self.docs = [] 76 | self.label = [] 77 | self.length = [] 78 | self.sentencenum = [] 79 | self.wordmask = [] 80 | self.sentencemask = [] 81 | self.maxsentencenum = [] 82 | 83 | for i in xrange(self.epoch): 84 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | self.docs.append(docsbatch) 88 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | self.wordmask.append(genwordmask(docsbatch)) 91 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | 94 | class Wordlist(object): 95 | def __init__(self, filename, maxn = 100000): 96 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 97 | self.size = len(lines) 98 | 99 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 100 | self.voc = dict(self.voc) 101 | 102 | def getID(self, word): 103 | try: 104 | return self.voc[word] 105 | except: 106 | return -1 107 | 108 | -------------------------------------------------------------------------------- /HLSTM/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HLSTM/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HLSTM/src/PoolLayer.py: 
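# LSTMLayer.step above is the standard LSTM cell, with the mask rows zeroing
# out padded time steps before the cell state and hidden state are carried
# forward. The same single step restated in plain NumPy (the weight dicts are
# illustrative placeholders, not the shared Theano variables):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, mask, W, U, b):
    i = sigmoid(x @ W['i'] + h_prev @ U['i'] + b['i'])    # input gate
    o = sigmoid(x @ W['o'] + h_prev @ U['o'] + b['o'])    # output gate
    f = sigmoid(x @ W['f'] + h_prev @ U['f'] + b['f'])    # forget gate
    c_tilde = np.tanh(x @ W['c'] + h_prev @ U['c'] + b['c'])
    c = (c_prev * f + c_tilde * i) * mask[:, None]        # zero out padded rows
    h = np.tanh(c) * o * mask[:, None]
    return h, c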
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. / (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HLSTM/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | 
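# softmask/SimpleAttentionLayer above compute a masked softmax over positions
# and use it to take a weighted sum of the hidden states. The same pooling in
# plain NumPy (a sketch: h is [timesteps, batch, dim], mask is
# [timesteps, batch] with 1.0 on real positions, and W, v, b play the roles of
# the layer's parameters):
import numpy as np

def masked_attention_pool(h, mask, W, v, b):
    scores = np.tanh(h @ W + b) @ v                   # [timesteps, batch]
    weights = np.exp(scores) * mask                   # padded positions get weight 0
    weights = weights / np.sum(weights, axis=0, keepdims=True)
    return np.sum(weights[:, :, None] * h, axis=0)    # [batch, dim]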
self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HLSTM/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HLSTM/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HLSTM/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
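# AdaUpdates above implements the Adadelta rule: running averages of squared
# gradients and squared updates stand in for a hand-tuned learning rate. One
# parameter update restated in NumPy, with the two running averages carried
# explicitly (a sketch, not the Theano version used by the model):
import numpy as np

def adadelta_step(param, grad, g_sq, d_sq, rho=0.95, eps=1e-6):
    g_sq = rho * g_sq + (1.0 - rho) * grad * grad
    delta = np.sqrt(d_sq + eps) / np.sqrt(g_sq + eps) * grad
    d_sq = rho * d_sq + (1.0 - rho) * delta * delta
    return param - delta, g_sq, d_sq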
33 | 34 | -------------------------------------------------------------------------------- /HierarchicalAttentionNetwork/p1_seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | # 【该方法测试的时候使用】返回一个方法。这个方法根据输入的值,得到对应的索引,再得到这个词的embedding. 5 | def extract_argmax_and_embed(embedding, output_projection=None): 6 | """ 7 | Get a loop_function that extracts the previous symbol and embeds it. Used by decoder. 8 | :param embedding: embedding tensor for symbol 9 | :param output_projection: None or a pair (W, B). If provided, each fed previous output will 10 | first be multiplied by W and added B. 11 | :return: A loop function 12 | """ 13 | def loop_function(prev, _): 14 | if output_projection is not None: 15 | prev = tf.matmul(prev, output_projection[0]) + output_projection[1] 16 | prev_symbol = tf.argmax(prev, 1) #得到对应的INDEX 17 | emb_prev = tf.gather(embedding, prev_symbol) #得到这个INDEX对应的embedding 18 | return emb_prev 19 | return loop_function 20 | 21 | # RNN的解码部分。 22 | # 如果是训练,使用训练数据的输入;如果是test,将t时刻的输出作为t+1时刻的s输入 23 | def rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function,attention_states,scope=None):#3D Tensor [batch_size x attn_length x attn_size] 24 | """RNN decoder for the sequence-to-sequence model. 25 | Args: 26 | decoder_inputs: A list of 2D Tensors [batch_size x input_size].it is target Y, but shift by one. 27 | initial_state: 2D Tensor with shape [batch_size x cell.state_size].it is the encoded vector of input sentences, which represent 'thought vector' 28 | cell: core_rnn_cell.RNNCell defining the cell function and size. 29 | loop_function: If not None, this function will be applied to the i-th output 30 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 31 | except for the first element ("GO" symbol). This can be used for decoding, 32 | but also for training to emulate http://arxiv.org/abs/1506.03099. 33 | Signature -- loop_function(prev, i) = next 34 | * prev is a 2D Tensor of shape [batch_size x output_size], 35 | * i is an integer, the step number (when advanced control is needed), 36 | * next is a 2D Tensor of shape [batch_size x input_size]. 37 | attention_states: 3D Tensor [batch_size x attn_length x attn_size].it is a input X. 38 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 39 | Returns: 40 | A tuple of the form (outputs, state), where: 41 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 42 | shape [batch_size x output_size] containing generated outputs. 43 | state: The state of each cell at the final time-step. 44 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 45 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 46 | states can be the same. They are different for LSTM cells though.) 
47 | """ 48 | with tf.variable_scope(scope or "rnn_decoder"): 49 | print("rnn_decoder_with_attention started...") 50 | state = initial_state #[batch_size x cell.state_size] 51 | _, hidden_size = state.get_shape().as_list() 52 | batch_size,sequence_length,embed_size=attention_states.get_shape().as_list() 53 | outputs = [] 54 | prev = None 55 | W_a = tf.get_variable("W_a", shape=[embed_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 56 | attention_states=tf.reshape(attention_states,shape=(-1,embed_size)) #attention_states:[batch_size*sequence_length,embed_size] 57 | attention_states = tf.nn.tanh(tf.matmul(attention_states, W_a)) #attention_states:[batch_size*sequence_length,hidden_size] 58 | attention_states=tf.reshape(attention_states,shape=(-1,sequence_length,hidden_size)) #attention_states:[batch_size,sequence_length,hidden_size] 59 | for i, inp in enumerate(decoder_inputs):#循环解码部分的输入。如sentence_length个[batch_size x input_size] 60 | # 如果是训练,使用训练数据的输入;如果是test, 将t时刻的输出作为t + 1 时刻的s输入 61 | if loop_function is not None and prev is not None:#测试的时候:如果loop_function不为空且前一个词的值不为空,那么使用前一个的值作为RNN的输入 62 | with tf.variable_scope("loop_function", reuse=True): 63 | inp = loop_function(prev, i) 64 | if i > 0: 65 | tf.get_variable_scope().reuse_variables() 66 | ##ATTENTION################################################################################################################################################# 67 | # 1.use Full connected layer to match dimension for two parts of attention. 68 | W_s = tf.get_variable("W_s_attention", shape=[hidden_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1)) 69 | 70 | state_transfered=tf.nn.tanh(tf.matmul(state,W_s)) 71 | # 2.get possibility attention for each encoder input. attention_states:[batch_size x attn_length x attn_size]; query=state:[batch_size x cell.state_size] 72 | query=tf.expand_dims(state_transfered,axis=1) #[batch_size x 1 x cell.state_size] 73 | # get logits using attention_states and query 74 | attention_logits=tf.multiply(attention_states,query) #TODO [batch_size x attn_length x attn_size]. notice: cell.state_size=atten_size=embedding_size 75 | attention_logits=tf.reduce_sum(attention_logits,2) #[batch_size x attn_length] 76 | attention_logits_max=tf.reduce_max(attention_logits,axis=1,keep_dims=True) #[batch_size x 1] 77 | # possibility distribution for each encoder input.it means how much attention or focus for each encoder input 78 | p_attention=tf.nn.softmax(attention_logits-attention_logits_max)#[batch_size x attn_length] 79 | 80 | # 3.get weighted sum of hidden state for each encoder input as attention state 81 | p_attention=tf.expand_dims(p_attention,axis=2) #[batch_size x attn_length x 1] 82 | # attention_states:[batch_size x attn_length x attn_size]; p_attention:[batch_size x attn_length]; 83 | # final attention 84 | attention_final=tf.multiply(attention_states,p_attention) #[batch_size x attn_length x attn_size] 85 | attention_final=tf.reduce_sum(attention_final,axis=1) #[batch_size x attn_size] 86 | ############################################################################################################################################################ 87 | output, state = cell(inp+attention_final, state) #使用RNN走一步 #TODO SHOULD WE ADD OR CONCAT THESE TWO PARTS. 
88 | outputs.append(output) # 将输出添加到结果列表中 89 | if loop_function is not None: 90 | prev = output 91 | print("rnn_decoder_with_attention ended...") 92 | return outputs, state -------------------------------------------------------------------------------- /Keras_Version/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | #from keras.utils.vis_utils import plot_model 5 | from model2 import gcnn 6 | from keras.optimizers import Adam 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint 8 | import pickle 9 | from keras import backend as K 10 | from keras.utils import to_categorical 11 | import gc 12 | import keras.backend.tensorflow_backend as KTF 13 | import tensorflow as tf 14 | import h5py 15 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 16 | 17 | 18 | config = tf.ConfigProto() 19 | config.gpu_options.allow_growth=True 20 | sess = tf.Session(config=config) 21 | KTF.set_session(sess) 22 | 23 | 24 | def accu(y_true, y_pred): 25 | a = K.argmax(y_true,1) 26 | b = K.argmax(y_pred,1) 27 | c = K.equal(a,b) 28 | accuracy = (K.cast(c, K.float32)) 29 | return accuracy 30 | 31 | 32 | batch_size = 128 33 | depth = 3 34 | mkenerls = [64,64,32] 35 | conv_conf = [2,1] 36 | pooling_conf = ["max",2,2] 37 | bn = False 38 | dropout = True 39 | rate = 0.8 40 | activation = "relu" 41 | conf = [50,300,10] #input size 42 | output_dim = 20 43 | 44 | lr = 0.0008 45 | epoch = 200 46 | epoch_cont = 300 47 | data_dic = os.path.dirname(os.path.abspath(__file__)) 48 | filepath = os.path.join(os.path.dirname(os.path.realpath(__file__)),'cache','Words2Matrix_{}_{}_{}.h5'.format(conf[0], conf[1], conf[2])) 49 | 50 | path_result = 'RET' 51 | path_model = 'MODEL' 52 | if os.path.isdir(path_result) is False: 53 | os.mkdir(path_result) 54 | if os.path.isdir(path_model) is False: 55 | os.mkdir(path_model) 56 | 57 | #build model 58 | def build_model(): 59 | model = gcnn(depth, mkenerls, conv_conf, pooling_conf, bn, dropout, rate, activation, conf, output_dim) 60 | adam = Adam(lr=lr) 61 | model.compile(loss='categorical_crossentropy', optimizer=adam,metrics = ["categorical_accuracy"]) 62 | #plot_model(model, to_file='model.png', show_shapes=True) 63 | return model 64 | 65 | 66 | def getdata(path): 67 | print(path) 68 | h5 = h5py.File(path, 'r') 69 | datax = h5['datax'].value 70 | datay = h5['datay'].value 71 | h5.close() 72 | return datax,datay 73 | 74 | #read data 75 | def read_data(): 76 | X_train,Y_train = getdata(os.path.join(data_dic, "data", "train.h5")) 77 | X_valid,Y_valid = getdata(os.path.join(data_dic, "data", "valid.h5")) 78 | X_test,Y_test = getdata(os.path.join(data_dic, "data", "test.h5")) 79 | print(X_train.shape) 80 | print(X_valid.shape) 81 | print(X_test.shape) 82 | print(Y_train.shape) 83 | print(Y_valid.shape) 84 | print(Y_test.shape) 85 | return X_train,X_valid,X_test,Y_train,Y_valid,Y_test 86 | 87 | def cache(path,X_train,X_valid,X_test,Y_train,Y_valid,Y_test): 88 | h5 = h5py.File(path, 'w') 89 | h5.create_dataset('X_train', data=X_train) 90 | h5.create_dataset('X_valid', data=X_valid) 91 | h5.create_dataset('X_test', data=X_test) 92 | h5.create_dataset('Y_train', data=Y_train) 93 | h5.create_dataset('Y_valid', data=Y_valid) 94 | h5.create_dataset('Y_test', data=Y_test) 95 | h5.close() 96 | 97 | def read_cache(path): 98 | h5 = h5py.File(path, 'r') 99 | X_train = h5['X_train'].value 100 | X_valid = h5['X_valid'].value 101 | X_test = h5['X_test'].value 102 | Y_train = h5['Y_train'].value 103 | 
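# Two small notes on main.py above: K.cast(c, K.float32) in accu() would raise
# an AttributeError, because the Keras backend takes dtypes as strings rather
# than exposing a float32 attribute (accu() is unused as written, since
# build_model() compiles with the built-in "categorical_accuracy" metric), and
# h5py's Dataset.value accessor is deprecated in newer h5py releases in favour
# of indexing. A corrected sketch of both, using the same imports as above:
from keras import backend as K
import h5py

def accu(y_true, y_pred):
    # fraction of examples whose argmax prediction matches the argmax target
    return K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'float32')

def getdata(path):
    with h5py.File(path, 'r') as h5:
        return h5['datax'][()], h5['datay'][()]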
Y_valid = h5['Y_valid'].value 104 | Y_test = h5['Y_test'].value 105 | h5.close() 106 | return X_train,X_valid,X_test,Y_train,Y_valid,Y_test 107 | 108 | def main(): 109 | if os.path.exists(filepath): 110 | print("read data from file") 111 | X_train,X_valid,X_test,Y_train,Y_valid,Y_test = read_cache(filepath) 112 | else: 113 | print("read and store data") 114 | X_train,X_valid,X_test,Y_train,Y_valid,Y_test = read_data() 115 | cache(filepath,X_train,X_valid,X_test,Y_train,Y_valid,Y_test) 116 | model = build_model() 117 | 118 | 119 | 120 | 121 | fname_param = os.path.join(data_dic,'MODEL', 'best2.h5') 122 | ''' 123 | early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=5, mode='max') 124 | model_checkpoint = ModelCheckpoint(fname_param, monitor='val_categorical_accuracy', verbose=0, save_best_only=True, mode='max') 125 | print('=' * 10) 126 | print("training model...") 127 | history = model.fit(X_train, Y_train, 128 | nb_epoch=epoch, 129 | batch_size=batch_size, 130 | validation_data=(X_valid, Y_valid), 131 | callbacks=[early_stopping, model_checkpoint], 132 | verbose=1) 133 | #保存训练最好模型训练细节,此时测试集为验证集 134 | model.save_weights(fname_param, overwrite=True) 135 | pickle.dump((history.history), open(os.path.join(path_result, 'history.pkl'), 'wb')) 136 | 137 | model.load_weights(fname_param) 138 | score = model.evaluate(X_train, Y_train, batch_size=X_train.shape[0],verbose=0) 139 | print('训练集最好模型进行预测') 140 | print('Train score: %s' % str(score)) 141 | score = model.evaluate(X_test,Y_test,batch_size=X_test.shape[0],verbose=0) 142 | print('Test score: %s' % str(score)) 143 | ''' 144 | 145 | 146 | #fname_param = os.path.join(data_dic, 'MODEL', 'cont.best2.h5') 147 | model.load_weights(fname_param) 148 | print('=' * 10) 149 | print("training model (cont)...") 150 | fname_param = os.path.join(data_dic,'MODEL', 'cont2.best2.h5') 151 | X_train2 = np.concatenate((X_train,X_valid),axis = 0) 152 | y_train2 = np.concatenate((Y_train,Y_valid),axis = 0) 153 | print(X_train2.shape,y_train2.shape) 154 | # early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min') 155 | model_checkpoint = ModelCheckpoint(fname_param, monitor='val_categorical_accuracy', verbose=0, save_best_only=True, mode='max') 156 | #保存训练最好模型训练细节,此时训练集+验证集为新的训练集,测试集为测试集 157 | history = model.fit(X_train2, y_train2, 158 | nb_epoch=epoch_cont, 159 | verbose=1, 160 | batch_size=batch_size, 161 | callbacks=[model_checkpoint],#early_stopping,model_checkpoint], 162 | validation_data=(X_test, Y_test)) 163 | pickle.dump((history.history), open(os.path.join(path_result, 'cont.history.pkl'), 'wb')) 164 | model.save_weights(fname_param, overwrite=True) 165 | print('=' * 10) 166 | print('The best model to predict') 167 | 168 | model.load_weights(fname_param) 169 | score = model.evaluate(X_train2, y_train2, batch_size=X_train2.shape[0],verbose=0) 170 | print('Train score: %s' % str(score)) 171 | score = model.evaluate(X_test,Y_test,batch_size=X_test.shape[0],verbose=0) 172 | print('Test score: %s' % str(score)) 173 | 174 | print('=' * 10) 175 | print('Done') 176 | 177 | gc.collect() 178 | if __name__ == "__main__": 179 | main() 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /Keras_Version/model2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from keras.layers import ( 3 | Input, 4 | Activation, 5 | Dropout, 6 | Flatten, 7 | 
Dense, 8 | Reshape) 9 | from keras.layers.convolutional import Convolution2D 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.models import Model 12 | from keras.layers.pooling import MaxPool2D, AveragePooling2D 13 | import time 14 | 15 | 16 | def mpool(type, input, size, stride): 17 | if type == "max": 18 | return MaxPool2D(pool_size=(size, size), strides=stride, padding='same')(input) 19 | elif type == "avg": 20 | return AveragePooling2D(pool_size=(size, size), strides=stride, padding='same')(input) 21 | else: 22 | raise ValueError("pooling type invalid") 23 | 24 | 25 | def active(type, input): 26 | if type == "relu": 27 | return Activation("relu")(input) 28 | elif type == "sigmoid": 29 | return Activation("sigmoid")(input) 30 | elif type == "tanh": 31 | return Activation("tanh")(input) 32 | elif type == "softmax": 33 | return Activation("softmax")(input) 34 | else: 35 | raise ValueError("activation type invalid") 36 | 37 | 38 | def gcnn(depth=4, mkenerls=[64, 64, 64, 32], conv_conf=[2, 1], pooling_conf=["max", 2, 2], bn=False, dropout=True, 39 | rate=0.8, activation="relu", conf=[50, 300, 10], output_dim=20): 40 | assert depth == len(mkenerls) 41 | mchannel, mheight, mwidth = conf 42 | conv_size, conv_stride = conv_conf 43 | pooling_type, pooling_size, pooling_stride = pooling_conf 44 | input = Input(shape=(mchannel, mheight, mwidth)) 45 | 46 | conv1 = Convolution2D(filters=mkenerls[0], kernel_size=(1, mwidth), strides=(1, 1), padding="valid")(input) 47 | # bn1 = BatchNormalization(axis=1)(conv1) 48 | activation1 = Activation("relu")(conv1) 49 | pool1 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation1) 50 | _k1, _n1 = map(int, pool1.shape[1:3]) 51 | reshape_pool1 = Reshape((1, _k1, _n1))(pool1) 52 | 53 | conv2 = Convolution2D(filters=mkenerls[1], kernel_size=(1, _n1), strides=(1, 1), padding="valid")(reshape_pool1) 54 | # bn2 = BatchNormalization(axis=1)(conv2) 55 | activation2 = Activation("relu")(conv2) 56 | pool2 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation2) 57 | _k2, _n2 = map(int, pool2.shape[1:3]) 58 | reshape_pool2 = Reshape((1, _k2, _n2))(pool2) 59 | 60 | conv3 = Convolution2D(filters=mkenerls[1], kernel_size=(1, _n2), strides=(1, 1), padding="valid")(reshape_pool2) 61 | # bn2 = BatchNormalization(axis=1)(conv2) 62 | activation3 = Activation("relu")(conv3) 63 | pool3 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation3) 64 | _k3, _n3 = map(int, pool2.shape[1:3]) 65 | reshape_pool3 = Reshape((1, _k2, _n2))(pool3) 66 | 67 | conv4 = Convolution2D(filters=mkenerls[2], kernel_size=(1, _n3), strides=(1, 1), padding="valid")(reshape_pool2) 68 | # bn3 = BatchNormalization(axis=1)(conv3) 69 | activation4 = Activation("relu")(conv4) 70 | pool4 = MaxPool2D(pool_size=(2, 1), strides=(2, 1), padding='same')(activation4) 71 | 72 | # step_results = [input] 73 | # for i in range(depth - 1): 74 | # mconv = Convolution2D( 75 | # nb_filter=mkenerls[i], nb_row=conv_size, nb_col=conv_size, strides=(conv_stride, conv_stride), 76 | # border_mode="same")(step_results[-1]) 77 | # if bn: 78 | # mbn = BatchNormalization(axis=1)(mconv) 79 | # else: 80 | # mbn = mconv 81 | # mactivation = active(activation, mbn) 82 | # mpooling = mpool(pooling_type, mactivation, pooling_size, pooling_stride) 83 | # if dropout: 84 | # mdropout = Dropout(rate=rate, seed=time.time())(mpooling) 85 | # else: 86 | # mdropout = mpooling 87 | # step_results.append(mdropout) 88 | 89 | # last_conv = Convolution2D( 90 | # 
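# Note on the third and fourth blocks above: _k3, _n3 are taken from
# pool2.shape (pool3.shape was probably intended), reshape_pool3 is built with
# _k2/_n2 and then never used, and conv4 consumes reshape_pool2 rather than
# reshape_pool3 -- so, as written, the third conv/pool stage does not feed the
# fourth. If a four-stage chain is intended, conv4 would take reshape_pool3 and
# the reshape sizes would come from pool3.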
nb_filter=mkenerls[-1], nb_row=conv_size, nb_col=conv_size, border_mode="same")(step_results[-1]) 91 | # last_pooling = mpool(pooling_type, last_conv, pooling_size, pooling_stride) 92 | mFlatten = Flatten()(pool4) 93 | ms_output = Dense(output_dim=128)(mFlatten) 94 | msinput = active("sigmoid", ms_output) 95 | moutput = Dense(output_dim=output_dim)(msinput) 96 | output = active("softmax", moutput) 97 | model = Model(input=input, output=output) 98 | return model 99 | 100 | 101 | if __name__ == '__main__': 102 | model = gcnn() 103 | model.summary() 104 | 105 | -------------------------------------------------------------------------------- /NewGraphCNNs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_heiring.py: -------------------------------------------------------------------------------- 1 | f = open("rcv1.topics.hier.orig.txt",'r') 2 | lines = f.readlines() 3 | nodes = [] 4 | for line in lines: 5 | keys = line.split(' ') 6 | while '' in keys: 7 | keys.remove("") 8 | node ={} 9 | node['parent'] = keys[1] 10 | node['child'] =keys[3] 11 | nodes.append(node) 12 | 13 | f.close() 14 | 15 | relation = {} 16 | for node in nodes: 17 | parent = node['parent'] 18 | child = node['child'] 19 | if parent not in relation: 20 | relation[parent] = [] 21 | relation[parent].append(child) 22 | 23 | 24 | import json 25 | result = [] 26 | with open('classes.json','r') as f: 27 | classes = json.load(f) 28 | for key in relation: 29 | if len(relation[key]) <2: 30 | continue 31 | new = [] 32 | for index,values in enumerate(relation[key]): 33 | new.append(classes[values]) 34 | result.append(new) 35 | 36 | final = [] 37 | for single in result: 38 | length = len(single) 39 | for i in range(length-1): 40 | for j in range(i+1,length): 41 | temp = [] 42 | temp.append(single[i]) 43 | temp.append(single[j]) 44 | final.append(temp) 45 | for v in final: 46 | print(str(v)) 47 | with open ('heiring.json','w') as f: 48 | j = json.dump(final,f) 49 | #print(j) 50 | 51 | 52 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/rcv1_processer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import zipfile 4 | from multiprocessing import Pool 5 | import xml.etree.ElementTree as ET 6 | import re 7 | import json 8 | import numpy as np 9 | import gensim 10 | import h5py 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import nltk 14 | 15 | PATH = "/home/penghao/mars/rcv2" 16 | original_path = r'/home/penghao/mars/rcv2/reuters/training' 17 | targetpath = r'/data/LJ/LJ/own/RCV1/target_files' 18 | # targetpath = os.path.join(PATH,"target_files") 19 | all = 0 20 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*','”','“','’',"‘","'",'"'] 21 | wordEngStop = nltk.corpus.stopwords.words('english') 22 | lemmatizer = WordNetLemmatizer() 23 | 24 | def unzip(file,name): 25 | global all 26 | zip_file = zipfile.ZipFile(file) 27 | path = os.path.join(targetpath,name) 28 | print(path) 29 | if not os.path.exists(path): 30 | os.mkdir(path) 31 | for name in zip_file.namelist(): 32 | zip_file.extract(name,path) 33 | all += 1 34 | print(all) 35 | 36 | def zipp(): 37 | flist = os.listdir(original_path) 38 | flist.sort() 39 | for f in flist: 40 | fname = f.split('.')[0] 41 | 
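        # fname is the archive's file name without its extension; unzip() above extracts
        # each archive into its own sub-directory targetpath/<fname>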
print(fname) 42 | fpath = os.path.join(original_path,f) 43 | print(fpath) 44 | unzip(fpath,fname) 45 | 46 | def readfile(path): 47 | f = open(path,'r') 48 | s = f.readlines() 49 | 50 | topics = [] 51 | 52 | 53 | 54 | 55 | finalwords = [] 56 | for line in s: 57 | line = line.lower().strip().decode(errors="ignore") 58 | line = re.split('[-_\.:/ \"\'(),.;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', line) 59 | for word in line: 60 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 61 | finalwords.append(word) 62 | 63 | # mtext = re.split('[-_:/ \"\'(),;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', mtext) 64 | 65 | # while "" in mtext: 66 | # mtext.remove("") 67 | # print(mtext) 68 | # print(topics) 69 | #print finalwords 70 | return finalwords,topics 71 | 72 | def haha1(): 73 | # xxxx = 0 74 | all_words = {} 75 | opath = os.listdir('reuters/test') 76 | for ff in opath: 77 | simpath = os.path.join('reuters/test',ff) 78 | mcontent,_ = readfile(simpath) 79 | for word in mcontent: 80 | if word not in all_words.keys(): 81 | all_words[word] = True 82 | pp = os.path.join('data',"test.json") 83 | print(pp) 84 | with open(pp,"w") as fp: 85 | json.dump(all_words, fp) 86 | 87 | def haha2(): 88 | # xxxx = 0 89 | all_words = {} 90 | opath = os.listdir('reuters/training') 91 | for ff in opath: 92 | simpath = os.path.join('reuters/training',ff) 93 | mcontent,_ = readfile(simpath) 94 | for word in mcontent: 95 | if word not in all_words.keys(): 96 | all_words[word] = True 97 | pp = os.path.join('data',"training.json") 98 | print(pp) 99 | with open(pp,"w") as fp: 100 | json.dump(all_words, fp) 101 | 102 | def findwords(): 103 | #lnums = [(i*1000,(i+1)*1000) for i in range(15,21)]+[(14826,15000),(21000,21576)] #test 104 | lnums = [(i*1000,(i+1)*1000) for i in range(0,14)]+[(14000,14818)] 105 | print(lnums) 106 | #lnums = [(0,1)] 107 | #tpath = r'E:\RCV1\words' 108 | tpath = os.path.join(PATH,"data") 109 | p = Pool(30) 110 | results = [] 111 | for i in range(len(lnums)): 112 | start,end = lnums[i] 113 | print("process{0} start. 
Range({1},{2})".format(i,start,end)) 114 | results.append(p.apply_async(haha,args=(start,end,tpath))) 115 | print("process{0} end".format(i)) 116 | p.close() 117 | p.join() 118 | for r in results: 119 | print(r.get()) 120 | 121 | def isnumber(str): 122 | if str.count('.') == 1: 123 | left = str.split('.')[0] 124 | right = str.split('.')[1] 125 | lright = '' 126 | if str.count('-') == 1 and str[0] == '-': 127 | lright = left.split('-')[1] 128 | elif str.count('-') == 0: 129 | lright = left 130 | else: 131 | return False 132 | if right.isdigit() and lright.isdigit(): 133 | return True 134 | else: 135 | return False 136 | elif str.count('.') == 0: 137 | if str[0] == "-": 138 | str2 = str[1:] 139 | else: 140 | str2 = str 141 | if str2.isdigit(): 142 | return True 143 | return False 144 | else: 145 | return False 146 | 147 | def allwords(): 148 | tpath = os.path.join(PATH,"data") 149 | words = {} 150 | ind = 0 151 | flist = os.listdir(tpath) 152 | flist.sort() 153 | for f in flist: 154 | ppath = os.path.join(tpath,f) 155 | with open(ppath, "r") as f1: 156 | simjson = json.load(f1) 157 | for i in simjson.keys(): 158 | if i not in words.keys(): 159 | words[i] = ind 160 | ind += 1 161 | print(len(list(words.keys()))) 162 | #print("1190" in words) 163 | #893198 164 | lens = len(list(words.keys())) 165 | #print(list(words.keys())) 166 | #assert lens == 364830 167 | wembeddingwords = np.random.uniform(-1.0, 1.0, (lens, 50)) 168 | word2vec_model = gensim.models.Word2Vec.load(r'/home/penghao/lj/Google_w2v/wiki.en.text.model') 169 | xx = 0 170 | for key in words.keys(): 171 | # if isnumber(key): 172 | # xx += 1 173 | if key in word2vec_model: 174 | #print(key) 175 | xx += 1 176 | index = words[key] 177 | wembeddingwords[index, :] = word2vec_model[key] 178 | print(xx) 179 | with open(os.path.join(PATH,r"words.json"), "w") as f: 180 | json.dump(words, f) 181 | f = h5py.File(os.path.join(PATH,"matrix_rcv1.h5"), "w") 182 | f.create_dataset("data", data=wembeddingwords) 183 | f.close() 184 | 185 | def classpro(): 186 | tpath = r'/home/user/LJ/own/RCV1/topic_codes.txt' 187 | haha = {} 188 | with open(tpath,"r") as f: 189 | lines = f.readlines() 190 | print(len(lines)) 191 | for index,line in enumerate(lines[2:]): 192 | if line != '\n' and '\t' in line: 193 | haha[line.strip().split('\t')[0]] = index 194 | for k,v in haha.items(): 195 | print(k,v) 196 | print(len(list(haha.keys()))) 197 | with open(r'/home/user/LJ/own/RCV1/classes.json','w') as f: 198 | json.dump(haha,f) 199 | 200 | 201 | if __name__ == "__main__": 202 | findwords() 203 | haha1() 204 | haha2() 205 | allwords() 206 | classpro() 207 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/unzip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | 4 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD1/" 5 | list = os.listdir(path) 6 | 7 | for z in list: 8 | file_path = os.path.join(path,z) 9 | zipf = zipfile.ZipFile(file_path) 10 | zipf.extractall('xml2') 11 | zipf.close() 12 | 13 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD2/" 14 | list = os.listdir(path) 15 | 16 | for z in list: 17 | file_path = os.path.join(path,z) 18 | zipf = zipfile.ZipFile(file_path) 19 | zipf.extractall('xml2') 20 | zipf.close() -------------------------------------------------------------------------------- /RCNN/v-cpp/ecnn-noada.cpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/RCNN/v-cpp/ecnn-noada.cpp -------------------------------------------------------------------------------- /RCNN/v-cpp/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/RCNN/v-cpp/fileutil.hpp -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepgraphcnn 2 | 3 | The code for our WWW2018 paper "Large-Scale Hierarchical Text Classification with Recursively Regularized Deep Graph-CNN" 4 | 5 | Readers are welcome to fork this repository to reproduce the experiments and follow our work. Just remember to cite our paper: 6 | 7 | @inproceedings{peng2018deepgraphcnn, 8 | title={Large-Scale Hierarchical Text Classification with Recursively Regularized Deep Graph-CNN}, 9 | author={Peng, Hao and Li, Jianxin and He, Yu and Liu, Yaopeng and Bao, Mengjiao and Song, Yangqiu and Yang, Qiang}, 10 | booktitle={WWW}, 11 | year={2018} 12 | } 13 | 14 | 15 | ## Requirements 16 | - Python 3 17 | - Tensorflow > 0.8 18 | - Numpy 19 | 20 | 21 | 22 | 23 | Train: 24 | graphcnn_train.py 25 | 26 | -------------------------------------------------------------------------------- /Seq2seqWithAttention/a1_seq2seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | # [Used at test time] Returns a loop function which, given the previous decoder output, takes its argmax index and looks up that word's embedding. 5 | def extract_argmax_and_embed(embedding, output_projection=None): 6 | """ 7 | Get a loop_function that extracts the previous symbol and embeds it. Used by decoder. 8 | :param embedding: embedding tensor for symbol 9 | :param output_projection: None or a pair (W, B). If provided, each fed previous output will 10 | first be multiplied by W and added B. 11 | :return: A loop function 12 | """ 13 | def loop_function(prev, _): 14 | if output_projection is not None: 15 | prev = tf.matmul(prev, output_projection[0]) + output_projection[1] 16 | prev_symbol = tf.argmax(prev, 1) # get the index of the predicted word 17 | emb_prev = tf.gather(embedding, prev_symbol) # look up the embedding for that index 18 | return emb_prev 19 | return loop_function 20 | 21 | # Decoder part of the RNN. 22 | # During training the ground-truth decoder inputs are fed; at test time the output at step t is fed back as the input at step t+1. 23 | def rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function,attention_states,scope=None):#3D Tensor [batch_size x attn_length x attn_size] 24 | """RNN decoder for the sequence-to-sequence model. 25 | Args: 26 | decoder_inputs: A list of 2D Tensors [batch_size x input_size]. It is the decoder input. 27 | initial_state: 2D Tensor with shape [batch_size x cell.state_size]. It is the encoded vector of input sentences, which represents the 'thought vector' 28 | cell: core_rnn_cell.RNNCell defining the cell function and size. 29 | loop_function: If not None, this function will be applied to the i-th output 30 | in order to generate the i+1-st input, and decoder_inputs will be ignored, 31 | except for the first element ("GO" symbol). This can be used for decoding, 32 | but also for training to emulate http://arxiv.org/abs/1506.03099.
33 | Signature -- loop_function(prev, i) = next 34 | * prev is a 2D Tensor of shape [batch_size x output_size], 35 | * i is an integer, the step number (when advanced control is needed), 36 | * next is a 2D Tensor of shape [batch_size x input_size]. 37 | attention_states: 3D Tensor [batch_size x attn_length x attn_size].it is represent input X. 38 | scope: VariableScope for the created subgraph; defaults to "rnn_decoder". 39 | Returns: 40 | A tuple of the form (outputs, state), where: 41 | outputs: A list of the same length as decoder_inputs of 2D Tensors with 42 | shape [batch_size x output_size] containing generated outputs. 43 | state: The state of each cell at the final time-step. 44 | It is a 2D Tensor of shape [batch_size x cell.state_size]. 45 | (Note that in some cases, like basic RNN cell or GRU cell, outputs and 46 | states can be the same. They are different for LSTM cells though.) 47 | """ 48 | with tf.variable_scope(scope or "rnn_decoder"): 49 | print("rnn_decoder_with_attention started...") 50 | state = initial_state #[batch_size x cell.state_size]. 51 | _, hidden_size = state.get_shape().as_list() #200 52 | attention_states_original=attention_states 53 | batch_size,sequence_length,_=attention_states.get_shape().as_list() 54 | outputs = [] 55 | prev = None 56 | ################################################# 57 | for i, inp in enumerate(decoder_inputs):#循环解码部分的输入。如sentence_length个[batch_size x input_size] 58 | # 如果是训练,使用训练数据的输入;如果是test, 将t时刻的输出作为t + 1 时刻的s输入 59 | if loop_function is not None and prev is not None:#测试的时候:如果loop_function不为空且前一个词的值不为空,那么使用前一个的值作为RNN的输入 60 | with tf.variable_scope("loop_function", reuse=True): 61 | inp = loop_function(prev, i) 62 | if i > 0: 63 | tf.get_variable_scope().reuse_variables() 64 | ##ATTENTION################################################################################################################################################# 65 | # 1.get logits of attention for each encoder input. attention_states:[batch_size x attn_length x attn_size]; query=state:[batch_size x cell.state_size] 66 | query=state 67 | W_a = tf.get_variable("W_a", shape=[hidden_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 68 | query=tf.matmul(query, W_a) #[batch_size,hidden_size] 69 | query=tf.expand_dims(query,axis=1) #[batch_size, 1, hidden_size] 70 | U_a = tf.get_variable("U_a", shape=[hidden_size, hidden_size],initializer=tf.random_normal_initializer(stddev=0.1)) 71 | U_aa = tf.get_variable("U_aa", shape=[ hidden_size]) 72 | attention_states=tf.reshape(attention_states,shape=(-1,hidden_size)) #[batch_size*sentence_length,hidden_size] 73 | attention_states=tf.matmul(attention_states, U_a) #[batch_size*sentence_length,hidden_size] 74 | #print("batch_size",batch_size," ;sequence_length:",sequence_length," ;hidden_size:",hidden_size) #print("attention_states:", attention_states) #(?, 200) 75 | attention_states=tf.reshape(attention_states,shape=(-1,sequence_length,hidden_size)) # TODO [batch_size,sentence_length,hidden_size] 76 | #query_expanded: [batch_size,1, hidden_size] 77 | #attention_states_reshaped: [batch_size,sentence_length,hidden_size] 78 | attention_logits=tf.nn.tanh(query+attention_states+U_aa) #[batch_size,sentence_length,hidden_size]. 
additive style 79 | 80 | # 2.get possibility of attention 81 | attention_logits=tf.reshape(attention_logits,shape=(-1,hidden_size)) #batch_size*sequence_length [batch_size*sentence_length,hidden_size] 82 | V_a = tf.get_variable("V_a", shape=[hidden_size,1],initializer=tf.random_normal_initializer(stddev=0.1)) #[hidden_size,1] 83 | attention_logits=tf.matmul(attention_logits,V_a) #最终需要的是[batch_size*sentence_length,1]<-----[batch_size*sentence_length,hidden_size],[hidden_size,1] 84 | attention_logits=tf.reshape(attention_logits,shape=(-1,sequence_length)) #attention_logits:[batch_size,sequence_length] 85 | ########################################################################################################################################################## 86 | #attention_logits=tf.reduce_sum(attention_logits,2) #[batch_size x attn_length] 87 | attention_logits_max=tf.reduce_max(attention_logits,axis=1,keep_dims=True) #[batch_size x 1] 88 | # possibility distribution for each encoder input.it means how much attention or focus for each encoder input 89 | p_attention=tf.nn.softmax(attention_logits-attention_logits_max)#[batch_size x attn_length] 90 | 91 | # 3.get weighted sum of hidden state for each encoder input as attention state 92 | p_attention=tf.expand_dims(p_attention,axis=2) #[batch_size x attn_length x 1] 93 | # attention_states:[batch_size x attn_length x attn_size]; p_attention:[batch_size x attn_length]; 94 | attention_final=tf.multiply(attention_states_original,p_attention) #[batch_size x attn_length x attn_size] 95 | context_vector=tf.reduce_sum(attention_final,axis=1) #[batch_size x attn_size] 96 | ############################################################################################################################################################ 97 | #inp:[batch_size x input_size].it is decoder input; attention_final:[batch_size x attn_size] 98 | output, state = cell(inp, state,context_vector) #attention_final TODO 使用RNN走一步 99 | outputs.append(output) # 将输出添加到结果列表中 100 | if loop_function is not None: 101 | prev = output 102 | print("rnn_decoder_with_attention ended...") 103 | return outputs, state -------------------------------------------------------------------------------- /Text2Graph/Text2Graph-master/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
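 * Driver that fans the CoreNLP preprocessing out over worker processes: it submits
 * (threadEnd - threadSta) tasks to a pool of threadNum threads, and each task launches a
 * separate "mvn exec:java -Dexec.mainClass=ecs.TestCoreNLP" child process with its slice
 * index (-i), the shared input directory (-c pathPatch) and -t 5; a JVM shutdown hook
 * destroys the child processes on exit.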
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Text2Graph/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /TextCNN/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/TextCNN/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /TextCNN/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import random 4 | import numpy as np 5 | from tflearn.data_utils import pad_sequences 6 | from collections import Counter 7 | import os 8 | import pickle 9 | 10 | PAD_ID = 0 11 | UNK_ID=1 12 | _PAD="_PAD" 13 | _UNK="UNK" 14 | 15 | 16 | def load_data_multilabel(traning_data_path,vocab_word2index, vocab_label2index,sentence_len,training_portion=0.95): 17 | """ 18 | convert data as indexes using word2index dicts. 
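    Each line of the training file is expected to look like "w1 w2 w3 __label__l1 __label__l2",
    i.e. space-separated tokens followed by one or more __label__-prefixed labels. Tokens are
    mapped to indexes and padded/truncated to sentence_len; labels become a multi-hot vector of
    size len(vocab_label2index). The first training_portion of the shuffled lines is returned
    as the train split, and up to 1,000 of the remaining lines as the dev split.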
19 | :param traning_data_path: 20 | :param vocab_word2index: 21 | :param vocab_label2index: 22 | :return: 23 | """ 24 | file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8') 25 | lines = file_object.readlines() 26 | random.shuffle(lines) 27 | label_size=len(vocab_label2index) 28 | X = [] 29 | Y = [] 30 | for i,line in enumerate(lines): 31 | raw_list = line.strip().split("__label__") 32 | input_list = raw_list[0].strip().split(" ") 33 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 34 | x=[vocab_word2index.get(x,UNK_ID) for x in input_list] 35 | label_list = raw_list[1:] 36 | label_list=[l.strip().replace(" ", "") for l in label_list if l != ''] 37 | label_list=[vocab_label2index[label] for label in label_list] 38 | y=transform_multilabel_as_multihot(label_list,label_size) 39 | X.append(x) 40 | Y.append(y) 41 | X = pad_sequences(X, maxlen=sentence_len, value=0.) # padding to max length 42 | number_examples = len(lines) 43 | training_number=int(training_portion* number_examples) 44 | train = (X[0:training_number], Y[0:training_number]) 45 | valid_number=min(1000,number_examples-training_number) 46 | test = (X[training_number+ 1:training_number+valid_number+1], Y[training_number + 1:training_number+valid_number+1]) 47 | return train,test 48 | 49 | 50 | def transform_multilabel_as_multihot(label_list,label_size): 51 | """ 52 | convert to multi-hot style 53 | :param label_list: e.g.[0,1,4], here 4 means in the 4th position it is true value(as indicate by'1') 54 | :param label_size: e.g.199 55 | :return:e.g.[1,1,0,1,0,0,........] 56 | """ 57 | result=np.zeros(label_size) 58 | #set those location as 1, all else place as 0. 59 | result[label_list] = 1 60 | return result 61 | 62 | #use pretrained word embedding to get word vocabulary and labels, and its relationship with index 63 | def create_vocabulary(training_data_path,vocab_size,name_scope='cnn'): 64 | """ 65 | create vocabulary 66 | :param training_data_path: 67 | :param vocab_size: 68 | :param name_scope: 69 | :return: 70 | """ 71 | 72 | cache_vocabulary_label_pik='cache'+"_"+name_scope # path to save cache 73 | if not os.path.isdir(cache_vocabulary_label_pik): # create folder if not exists. 74 | os.makedirs(cache_vocabulary_label_pik) 75 | 76 | # if cache exists. load it; otherwise create it. 
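    # (the cache file holds a pickled 4-tuple: word2index, index2word, label2index, index2label)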
77 | cache_path =cache_vocabulary_label_pik+"/"+'vocab_label.pik' 78 | print("cache_path:",cache_path,"file_exists:",os.path.exists(cache_path)) 79 | if os.path.exists(cache_path): 80 | with open(cache_path, 'rb') as data_f: 81 | return pickle.load(data_f) 82 | else: 83 | vocabulary_word2index={} 84 | vocabulary_index2word={} 85 | vocabulary_word2index[_PAD]=PAD_ID 86 | vocabulary_index2word[PAD_ID]=_PAD 87 | vocabulary_word2index[_UNK]=UNK_ID 88 | vocabulary_index2word[UNK_ID]=_UNK 89 | 90 | vocabulary_label2index={} 91 | vocabulary_index2label={} 92 | 93 | #1.load raw data 94 | file_object = codecs.open(training_data_path, mode='r', encoding='utf-8') 95 | lines=file_object.readlines() 96 | #2.loop each line,put to counter 97 | c_inputs=Counter() 98 | c_labels=Counter() 99 | for line in lines: 100 | raw_list=line.strip().split("__label__") 101 | 102 | input_list = raw_list[0].strip().split(" ") 103 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 104 | label_list=[l.strip().replace(" ","") for l in raw_list[1:] if l!=''] 105 | c_inputs.update(input_list) 106 | c_labels.update(label_list) 107 | #return most frequency words 108 | vocab_list=c_inputs.most_common(vocab_size) 109 | label_list=c_labels.most_common() 110 | #put those words to dict 111 | for i,tuplee in enumerate(vocab_list): 112 | word,_=tuplee 113 | vocabulary_word2index[word]=i+2 114 | vocabulary_index2word[i+2]=word 115 | 116 | for i,tuplee in enumerate(label_list): 117 | label,_=tuplee;label=str(label) 118 | vocabulary_label2index[label]=i 119 | vocabulary_index2label[i]=label 120 | 121 | #save to file system if vocabulary of words not exists. 122 | if not os.path.exists(cache_path): 123 | with open(cache_path, 'ab') as data_f: 124 | pickle.dump((vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label), data_f) 125 | return vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label 126 | 127 | #training_data_path='../data/sample_multiple_label3.txt' 128 | #vocab_size=100 129 | #create_voabulary(training_data_path,vocab_size) 130 | -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_ensemble.py: -------------------------------------------------------------------------------- 1 | from p7_TextCNN_predict import get_logits_with_value_by_input 2 | from p7_TextCNN_predict_exp import get_logits_with_value_by_input_exp 3 | import tensorflow as tf 4 | def main(_): 5 | for start in range(217360): 6 | end=start+1 7 | label_list,p_list=get_logits_with_value_by_input(start,end) 8 | label_list_exp, p_list_exp=get_logits_with_value_by_input_exp(start,end) 9 | 10 | if start<5: 11 | print("----------------------------------------------------") 12 | print(start,"label_list0:",label_list,"p_list0:",p_list) 13 | print(start,"label_list1:", label_list_exp, "p_list1:", p_list_exp) 14 | else: 15 | break 16 | 17 | 18 | 19 | if __name__ == "__main__": 20 | tf.app.run() -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/zhihu_result_cnn_multilabel_v7_exp512_20170616.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 600, "number of filters") #128-->512 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
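    # (Steps 3-7 below: create a session with allow_growth so GPU memory is allocated on demand,
    #  rebuild the TextCNN graph with the filter sizes and flags defined above, restore the latest
    #  checkpoint from ckpt_dir, run the logits op one example at a time (batch_size is 1 here),
    #  and append the top-5 predicted labels for each question id to predict_target_file.)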
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512_0609.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp512_0609/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp512_0609/zhihu_result_cnn_multilabel_exp512_0609.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[2,3,5,6,7,8] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
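    # (Same inference procedure as p7_TextCNN_predict_exp512.py; this variant appears to differ
    #  only in its filter sizes [2,3,5,6,7,8], num_filters=256, and the checkpoint/output paths
    #  defined above.)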
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /TextCNN/other_experiement/p7_TextCNN_predict_exp512_simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/zhihu_result_cnn_multilabel_exp512_simple.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[7] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) 
# padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) 116 | -------------------------------------------------------------------------------- /TextRCNN/p71_TextRCNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 10 | from tflearn.data_utils import pad_sequences #to_categorical 11 | import os 12 | import codecs 13 | from p71_TextRCNN_mode2 import TextRCNN 14 | 15 | #configuration 16 | FLAGS=tf.app.flags.FLAGS 17 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 18 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 19 | tf.app.flags.DEFINE_integer("batch_size", 80, "Batch size for training/evaluating.") #批处理的大小 32-->128 20 | tf.app.flags.DEFINE_integer("decay_steps", 6000, "how many steps before decay learning rate.") #6000批处理的大小 32-->128 21 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.65一次衰减多少 22 | tf.app.flags.DEFINE_string("ckpt_dir","text_rcnn_title_desc_checkpoint2/","checkpoint location for the model") 23 | tf.app.flags.DEFINE_integer("sentence_length",100,"max sentence length") 24 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 25 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 26 | tf.app.flags.DEFINE_string("predict_target_file","text_rcnn_title_desc_checkpoint2/zhihu_result_rcnn_multilabel.csv","target file path for final prediction") 27 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 28 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 29 | tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 30 | 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 32 | # 1.load data with vocabulary of words and labels 33 | 34 | 35 | def main(_): 36 | # 1.load data with vocabulary of words and labels 37 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="rcnn") 38 | vocab_size = len(vocabulary_word2index) 39 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rcnn") 40 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 41 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 42 | testX=[] 43 | question_id_list=[] 44 | for tuple in test: 45 | question_id,question_string_list=tuple 46 | question_id_list.append(question_id) 47 | testX.append(question_string_list) 48 | # 2.Data preprocessing: Sequence padding 49 | print("start padding....") 50 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_length, value=0.) # padding to max length 51 | print("end padding...") 52 | # 3.create session. 
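    # (Unlike the TextCNN predict scripts, inference here runs in batches of FLAGS.batch_size=80;
    #  get_label_using_logits_batch() below writes the top-5 labels for every question id in each
    #  batch.)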
53 | config=tf.ConfigProto() 54 | config.gpu_options.allow_growth=True 55 | with tf.Session(config=config) as sess: 56 | # 4.Instantiate Model 57 | textRCNN=TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_length, 58 | vocab_size,FLAGS.embed_size,FLAGS.is_training,multi_label_flag=FLAGS.multi_label_flag) 59 | saver=tf.train.Saver() 60 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 61 | print("Restoring Variables from Checkpoint") 62 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) #TODO 63 | else: 64 | print("Can't find the checkpoint.going to stop") 65 | return 66 | # 5.feed data, to get logits 67 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 68 | index=0 69 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 70 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 71 | logits=sess.run(textRCNN.logits,feed_dict={textRCNN.input_x:testX2[start:end],textRCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 72 | # 6. get lable using logtis 73 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 74 | # 7. write question id and labels to file system. 75 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 76 | 77 | question_id_sublist=question_id_list[start:end] 78 | get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f) 79 | 80 | index=index+1 81 | predict_target_file_f.close() 82 | 83 | # get label using logits 84 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 85 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 86 | index_list=index_list[::-1] 87 | label_list=[] 88 | for index in index_list: 89 | label=vocabulary_index2word_label[index] 90 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 91 | return label_list 92 | 93 | # get label using logits 94 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 95 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 96 | index_list=index_list[::-1] 97 | value_list=[] 98 | label_list=[] 99 | for index in index_list: 100 | label=vocabulary_index2word_label[index] 101 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 102 | value_list.append(logits[index]) 103 | return label_list,value_list 104 | 105 | # write question id and labels to file system. 
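# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: get_label_using_logits,
# get_label_using_logits_with_value and get_label_using_logits_batch all rank labels by
# sorting the raw logits. The same top-k step written once, assuming `logits` is a 1-D
# numpy array and `vocabulary_index2word_label` maps index -> label string:
def top_k_labels_example(logits, vocabulary_index2word_label, top_number=5):
    index_list = np.argsort(logits)[-top_number:][::-1]  # indices of the k largest logits, best first
    return [vocabulary_index2word_label[i] for i in index_list]
# ---------------------------------------------------------------------------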
106 | def write_question_id_with_labels(question_id,labels_list,f): 107 | labels_string=",".join(labels_list) 108 | f.write(question_id+","+labels_string+"\n") 109 | 110 | # get label using logits 111 | def get_label_using_logits_batch(question_id_sublist,logits_batch,vocabulary_index2word_label,f,top_number=5): 112 | #print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 113 | for i,logits in enumerate(logits_batch): 114 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 115 | index_list=index_list[::-1] 116 | label_list=[] 117 | for index in index_list: 118 | label=vocabulary_index2word_label[index] 119 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 120 | #print("get_label_using_logits.label_list",label_list) 121 | write_question_id_with_labels(question_id_sublist[i], label_list, f) 122 | f.flush() 123 | #return label_list 124 | # write question id and labels to file system. 125 | def write_question_id_with_labels(question_id,labels_list,f): 126 | labels_string=",".join(labels_list) 127 | f.write(question_id+","+labels_string+"\n") 128 | 129 | if __name__ == "__main__": 130 | tf.app.run() -------------------------------------------------------------------------------- /TextRNN/p8_TextRNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from p8_TextRNN_model import TextRNN 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 80, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 12000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_rnn_checkpoint/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sequence_length",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_string("traning_data_path","train-zhihu4-only-title-all.txt","path of traning data.") #train-zhihu4-only-title-all.txt.training-data/test-zhihu4-only-title.txt--->'training-data/train-zhihu5-only-title-multilabel.txt' 28 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec.bin-100","word2vec's vocabulary and vectors") 29 | tf.app.flags.DEFINE_string("predict_target_file","text_rnn_checkpoint/zhihu_result_rnn5.csv","target file path for final prediction") 30 | 
tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 32 | def main(_): 33 | # 1.load data with vocabulary of words and labels 34 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="rnn") 35 | vocab_size = len(vocabulary_word2index) 36 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rnn") 37 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 38 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 39 | testX=[] 40 | question_id_list=[] 41 | for tuple in test: 42 | question_id,question_string_list=tuple 43 | question_id_list.append(question_id) 44 | testX.append(question_string_list) 45 | # 2.Data preprocessing: Sequence padding 46 | print("start padding....") 47 | testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length 48 | print("end padding...") 49 | # 3.create session. 50 | config=tf.ConfigProto() 51 | config.gpu_options.allow_growth=True 52 | with tf.Session(config=config) as sess: 53 | # 4.Instantiate Model 54 | textRNN=TextRNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, 55 | vocab_size, FLAGS.embed_size, FLAGS.is_training) 56 | saver=tf.train.Saver() 57 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 58 | print("Restoring Variables from Checkpoint for TextRNN") 59 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 60 | else: 61 | print("Can't find the checkpoint.going to stop") 62 | return 63 | # 5.feed data, to get logits 64 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 65 | index=0 66 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 67 | #for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 68 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 69 | logits=sess.run(textRNN.logits,feed_dict={textRNN.input_x:testX2[start:end],textRNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 70 | # 6. get lable using logtis 71 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) #logits[0] 72 | # 7. write question id and labels to file system. 
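# Note on the batching loop above: zip(range(0, N, batch_size),
# range(batch_size, N + 1, batch_size)) only yields full batches, so when
# number_of_training_data is not a multiple of FLAGS.batch_size the trailing
# examples are silently skipped. A hedged sketch of one way to also cover the
# tail, assuming the input placeholder accepts a variable batch dimension
# (kept as a comment so the original behaviour is unchanged):
#   for start in range(0, number_of_training_data, FLAGS.batch_size):
#       end = min(start + FLAGS.batch_size, number_of_training_data)
#       logits = sess.run(textRNN.logits,
#                         feed_dict={textRNN.input_x: testX2[start:end],
#                                    textRNN.dropout_keep_prob: 1})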
73 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 74 | ############################################################################################################# 75 | print("start:",start,";end:",end) 76 | question_id_sublist=question_id_list[start:end] 77 | get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f) 78 | ######################################################################################################## 79 | index=index+1 80 | predict_target_file_f.close() 81 | 82 | # get label using logits 83 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 84 | #print("get_label_using_logits:",logits) 85 | print("get_label_using_logits.shape:", logits.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | print("get_label_using_logits.label_list",label_list) 93 | return label_list 94 | 95 | # get label using logits 96 | def get_label_using_logits_batch(question_id_sublist,logits_batch,vocabulary_index2word_label,f,top_number=5): 97 | #print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 98 | for i,logits in enumerate(logits_batch): 99 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 100 | index_list=index_list[::-1] 101 | label_list=[] 102 | for index in index_list: 103 | label=vocabulary_index2word_label[index] 104 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 105 | #print("get_label_using_logits.label_list",label_list) 106 | write_question_id_with_labels(question_id_sublist[i], label_list, f) 107 | f.flush() 108 | #return label_list 109 | # write question id and labels to file system. 
110 | def write_question_id_with_labels(question_id,labels_list,f): 111 | labels_string=",".join(labels_list) 112 | f.write(question_id+","+labels_string+"\n") 113 | 114 | if __name__ == "__main__": 115 | tf.app.run() -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 | total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value 
& (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /__pycache__/graphcnn.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_GPU.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_GPU.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_generate_data.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_generate_data.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_input.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_input.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/graphcnn_option.cpython-34.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUST-KnowComp/DeepGraphCNNforTexts/bf0bb5441ecea58c5556a9969064bec074325c7a/__pycache__/graphcnn_option.cpython-34.pyc -------------------------------------------------------------------------------- /boosting/a08_boosting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | reload(sys) 4 | sys.setdefaultencoding('utf8') 5 | import tensorflow as tf 6 | 7 | #main process for boosting: 8 | #1.compute label weight after each epoch using validation data. 9 | #2.get weights for each batch during traininig process 10 | #3.compute loss using cross entropy with weights 11 | 12 | #1.compute label weight after each epoch using validation data. 13 | def compute_labels_weights(weights_label,logits,labels): 14 | """ 15 | compute weights for labels in current batch, and update weights_label(a dict) 16 | :param weights_label:a dict 17 | :param logit: [None,Vocabulary_size] 18 | :param label: [None,] 19 | :return: 20 | """ 21 | labels_predict=np.argmax(logits,axis=1) # logits:(256,108,754) 22 | for i in range(len(labels)): 23 | label=labels[i] 24 | label_predict=labels_predict[i] 25 | weight=weights_label.get(label,None) 26 | if weight==None: 27 | if label_predict == label: 28 | weights_label[label]=(1,1) 29 | else: 30 | weights_label[label]=(1,0) 31 | else: 32 | number=weight[0] 33 | correct=weight[1] 34 | number=number+1 35 | if label_predict==label: 36 | correct=correct+1 37 | weights_label[label]=(number,correct) 38 | return weights_label 39 | 40 | #2.get weights for each batch during traininig process 41 | def get_weights_for_current_batch(answer_list,weights_dict): 42 | """ 43 | get weights for current batch 44 | :param answer_list: a numpy array contain labels for a batch 45 | :param weights_dict: a dict that contain weights for all labels 46 | :return: a list. length is label size. 
47 | """ 48 | weights_list_batch=list(np.ones((len(answer_list)))) 49 | answer_list=list(answer_list) 50 | for i,label in enumerate(answer_list): 51 | acc=weights_dict[label] 52 | weights_list_batch[i]=min(1.5,1.0/(acc+0.001)) 53 | #if np.random.choice(200)==0: #print something from time to time 54 | # print("weights_list_batch:",weights_list_batch) 55 | return weights_list_batch 56 | 57 | #3.compute loss using cross entropy with weights 58 | def loss(logits,labels,weights): 59 | loss= tf.losses.sparse_softmax_cross_entropy(labels, logits,weights=weights) 60 | return loss 61 | 62 | ####################################################################### 63 | #util function 64 | def get_weights_label_as_standard_dict(weights_label): 65 | weights_dict = {} 66 | for k,v in weights_label.items(): 67 | count,correct=v 68 | weights_dict[k]=float(correct)/float(count) 69 | return weights_dict 70 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
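# On the ExponentialMovingAverage block a few lines above: evaluation restores
# the moving-average ("shadow") copies of the weights rather than the raw
# training variables, which typically generalises slightly better. A minimal
# standalone sketch of the same restore pattern (the names and the checkpoint
# path are illustrative only, not from this repo):
#   ema = tf.train.ExponentialMovingAverage(decay=0.999)
#   saver = tf.train.Saver(ema.variables_to_restore())
#   saver.restore(sess, checkpoint_path)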
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
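# Note on evaluate() above: the raw logits are squashed with a sigmoid, so each
# class gets an independent score in [0, 1] (multi-label, not softmax).
# num_iter is computed with floor, so up to EVAL_BATCH_SIZE - 1 trailing
# examples are never scored, and the all-zero row used to seed the
# concatenation is dropped again via total_predicted_value[1:].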
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | def generate_eval_index(): 39 | test_index_array = [] 40 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 41 | filepath = '../hier_eval_root' 42 | pathDir = os.listdir(filepath) 43 | for allDir in pathDir: 44 | child = os.path.join(filepath, allDir) 45 | if os.path.getsize(child): 46 | example_label_array = np.loadtxt(child,dtype=int) 47 | examlpe_array = example_label_array[:,0] 48 | label_array = example_label_array[:, 1] 49 | for root in graphcnn_option.HIER_ROOT_CODE: 50 | index = np.where(label_array==root)[0] 51 | for one in examlpe_array[index]: 52 | if one not in test_index_array: 53 | test_index_array.append(one) 54 | 55 | # for allDir in pathDir: 56 | # child = os.path.join(filepath, allDir) 57 | # os.remove(child) 58 | 59 | 60 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 61 | np.savetxt(filename,test_index_array,fmt='%d') 62 | 63 | return test_index_array 64 | 65 | 66 | def evaluate(checkpoint,test_index_array): 67 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 68 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 69 | total_predicted_value = total_predicted_value[test_index_array] 70 | 71 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 72 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 
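# Decision rule used below: every label whose sigmoid score reaches
# EVALUTION_THRESHOLD_FOR_MULTI_LABEL (0.9 in this script) is kept; the
# per-example max / argmax computed just above serve as a fallback so that an
# example with no score over the threshold still receives its single
# best-scoring label.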
73 | total_predicted_value = ( 74 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 75 | 76 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 77 | if os.path.exists(detail_filename): 78 | os.remove(detail_filename) 79 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 80 | 81 | 82 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 83 | total_remap = np.loadtxt(filename, dtype=int) 84 | 85 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 86 | graphcnn_option.HIER_labels_remap_file) 87 | remap = np.loadtxt(detail_filename, dtype=int) 88 | 89 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 90 | fr_leaf = open(filename,'a') 91 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 92 | fr_leaf_exp = open(filename, 'a') 93 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 94 | fr_root = open(filename, 'w') 95 | 96 | # rootstr_tmp = [] 97 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 98 | fr = open(detail_filename, 'w') 99 | for i in range(0, np.size(total_predicted_value, axis=0)): 100 | labels = np.where(total_predicted_value[i] == 1)[0] 101 | if len(labels) > 0: 102 | labels_remap = remap[labels, 0] 103 | for elem in labels_remap: 104 | print(elem, end=' ', file=fr) 105 | if elem in total_remap[:,0]: # leaf 106 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 107 | else: 108 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 109 | # for j in range(0,len(rootlist)): 110 | # if elem in rootlist[j]: 111 | # if rootstr[j] not in rootstr_tmp: 112 | # rootstr_tmp.append(rootstr[j]) 113 | print('', file=fr) 114 | else: 115 | # labels_remap = remap[:, 0] 116 | labels = total_predicted_value_argmax[i] 117 | labels_value = total_predicted_value_max[i] 118 | labels_remap = remap[labels, 0] 119 | # for elem in labels_remap: 120 | elem = labels_remap 121 | print(elem, file=fr) 122 | if elem in total_remap[:, 0]: # leaf 123 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 124 | else: 125 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 126 | # if labels_value < 0.5: 127 | # labels_remap = remap[:, 0] 128 | # for elem in labels_remap: 129 | # if elem not in total_remap[:, 0]: 130 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 131 | 132 | fr.close() 133 | fr_leaf.close() 134 | fr_root.close() 135 | fr_leaf_exp.close() 136 | 137 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 138 | # fr = open(filename, 'w') 139 | # for one in rootstr_tmp: 140 | # print(one) 141 | # print(one,file=fr) 142 | # fr.close() 143 | 144 | 145 | 146 | 147 | def main(argv=None): # pylint: disable=unused-argument 148 | global evalDataSet 149 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
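# main() below either post-processes the pre-computed score file for every
# test example (0..81261, when HIER_ROOT_CODE is the global hierarchy root,
# code 2143406) or only for the examples that an earlier level routed to this
# root: generate_eval_index() reads the files under ../hier_eval_root and
# keeps every example whose predicted label matches one of the codes in
# graphcnn_option.HIER_ROOT_CODE.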
150 | 151 | # test_index_array = np.array(range(0, 81262)) 152 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 153 | test_index_array = np.array(range(0,81262)) 154 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 155 | else: 156 | test_index_array = generate_eval_index() 157 | if test_index_array is None or len(test_index_array)==0: 158 | print('no hier_data need eval') 159 | return 160 | else: 161 | print('choosing for evaluation...') 162 | print('choosed number:%d' % len(test_index_array)) 163 | 164 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 165 | checkpoint = '0' 166 | 167 | # print('choosing for evaluation...') 168 | evaluate(checkpoint,test_index_array) 169 | 170 | 171 | if __name__ == '__main__': 172 | tf.app.run() 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
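# rootstr / HIER_ROOT_CODE above appear to pin this run to one node of the
# LSHTC label hierarchy (the '# ????' marks flag the values that are edited
# per node); every hier_* file name in this block is derived from rootstr, so
# runs for different nodes can coexist in the same data directory.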
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
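# xx() copies graphcnn_hier_eval_without_labels.py into every entry under '.'
# whose name starts with 'LSHTC', replacing any existing copy.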
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | --------------------------------------------------------------------------------
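# utils/utils.py above hard-codes two absolute paths and repeats the same
# count-items-per-line loop twice. A small parameterised sketch of that loop,
# kept as a comment (the paths below are placeholders, not files in this repo):
#
#   import os
#
#   def write_line_counts(src_path, dst_path):
#       # For each line of src_path, write the number of space-separated items.
#       with open(src_path) as src, open(dst_path, 'w') as dst:
#           for line in src:
#               print(len(line.strip().split(' ')), file=dst)
#
#   for name in ('label_groups', 'example_groups'):
#       write_line_counts(os.path.join('./data', name),
#                         os.path.join('./data', name + '_info'))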