├── CNN ├── .gitignore ├── LICENSE ├── README.md ├── binary_class_data_loader.py ├── char_data_processor.py ├── data_helpers.py ├── eval.py ├── multi_class_data_loader.py ├── text_cnn.py ├── train.py └── word_data_processor.py ├── GraphCNN ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── __init__.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils │ ├── NYT_utils.py │ ├── grouping.py │ ├── hier_rootlist │ ├── hier_rootstr │ ├── lshtc_utils.py │ ├── lshtc_utils2.py │ ├── read │ ├── tmp.py │ └── utils.py ├── HAN ├── model │ └── IMDB │ │ └── bestmodel │ │ └── .gitkeep └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── HLSTM ├── model │ └── IMDB │ │ └── bestmodel │ │ └── .gitkeep └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── Pytorch_GraphCNNs ├── make_graphs.py ├── make_heiring.py ├── rcv1_processer.py ├── test.py ├── test_extra.py ├── train.py └── unzip.py ├── RCNN └── v-cpp │ ├── ecnn-noada.cpp │ └── fileutil.hpp └── Text2Graph └── src └── main └── java └── ecs ├── CoreNLPService.java └── TestCoreNLP.java /CNN/.gitignore: -------------------------------------------------------------------------------- 1 | *.npy 2 | runs/ 3 | 4 | # Created by https://www.gitignore.io/api/python,ipythonnotebook 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | 60 | # Sphinx documentation 61 | docs/_build/ 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | 67 | ### IPythonNotebook ### 68 | # Temporary data 69 | .ipynb_checkpoints/ 70 | -------------------------------------------------------------------------------- /CNN/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /CNN/README.md: -------------------------------------------------------------------------------- 1 | **[This code belongs to the "Implementing a CNN for Text Classification in Tensorflow" blog post.](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)** 2 | 3 | It is slightly simplified implementation of Kim's [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) paper in Tensorflow. 4 | 5 | ## Requirements 6 | 7 | - Python 3 8 | - Tensorflow > 0.8 9 | - Numpy 10 | 11 | ## Training 12 | 13 | Print parameters: 14 | 15 | ```bash 16 | ./train.py --help 17 | ``` 18 | 19 | ``` 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --embedding_dim EMBEDDING_DIM 23 | Dimensionality of character embedding (default: 128) 24 | --filter_sizes FILTER_SIZES 25 | Comma-separated filter sizes (default: '3,4,5') 26 | --num_filters NUM_FILTERS 27 | Number of filters per filter size (default: 128) 28 | --l2_reg_lambda L2_REG_LAMBDA 29 | L2 regularizaion lambda (default: 0.0) 30 | --dropout_keep_prob DROPOUT_KEEP_PROB 31 | Dropout keep probability (default: 0.5) 32 | --batch_size BATCH_SIZE 33 | Batch Size (default: 64) 34 | --num_epochs NUM_EPOCHS 35 | Number of training epochs (default: 100) 36 | --evaluate_every EVALUATE_EVERY 37 | Evaluate model on dev set after this many steps 38 | (default: 100) 39 | --checkpoint_every CHECKPOINT_EVERY 40 | Save model after this many steps (default: 100) 41 | --allow_soft_placement ALLOW_SOFT_PLACEMENT 42 | Allow device soft device placement 43 | --noallow_soft_placement 44 | --log_device_placement LOG_DEVICE_PLACEMENT 45 | Log placement of ops on devices 46 | --nolog_device_placement 47 | 48 | ``` 49 | 50 | Train: 51 | 52 | ```bash 53 | ./train.py 54 | ``` 55 | 56 | ## Evaluating 57 | 58 | ```bash 59 | ./eval.py --eval_train --checkpoint_dir="./runs/1459637919/checkpoints/" 60 | ``` 61 | 62 | Replace the checkpoint dir with the output from the training. To use your own data, change the `eval.py` script to load your data. 
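For example, a minimal way to plug in your own data is to replace the hard-coded `x_raw`/`y_test` lists in `eval.py` with lines read from a file. This is only a sketch: the path below is a placeholder, and `y_test` holds class indexes (set it to `None` if you only want predictions without accuracy).

```python
# Sketch: one example per line, formatted as "<label_index> <text>".
# "my_eval_data.txt" is a placeholder path, not part of this repo.
x_raw, y_test = [], []
with open("my_eval_data.txt", "r") as f:
    for line in f:
        label, text = line.strip().split(" ", 1)
        x_raw.append(text)
        y_test.append(int(label))
```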
63 | 64 | 65 | ## References 66 | 67 | - [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) 68 | - [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1510.03820) -------------------------------------------------------------------------------- /CNN/binary_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tensorflow.contrib import learn 4 | 5 | class BinaryClassDataLoader(object): 6 | """ 7 | Load binary classification data from two files (positive and negative) and 8 | split data into train and dev. 9 | """ 10 | def __init__(self, flags, data_processor, clean_data=None, classes=None): 11 | self.__flags = flags 12 | self.__data_processor = data_processor 13 | self.__clean_data = clean_data 14 | self.__classes = classes 15 | 16 | def define_flags(self): 17 | self.__flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 18 | self.__flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 19 | self.__flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.") 20 | 21 | def prepare_data(self): 22 | self.__resolve_params() 23 | 24 | x_text, y = self.load_data_and_labels() 25 | 26 | # Build vocabulary 27 | self.vocab_processor = self.__data_processor.vocab_processor(x_text) 28 | x = np.array(list(self.vocab_processor.fit_transform(x_text))) 29 | 30 | # Randomly shuffle data 31 | np.random.seed(10) 32 | shuffle_indices = np.random.permutation(np.arange(len(y))) 33 | x_shuffled = x[shuffle_indices] 34 | y_shuffled = y[shuffle_indices] 35 | 36 | # Split train/test set 37 | # TODO: This is very crude, should use cross-validation 38 | dev_sample_index = -1 * int(self.__dev_sample_percentage * float(len(y))) 39 | x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 40 | y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | self.vocab_processor = self.__data_processor.restore_vocab_processor(vocab_path) 45 | return self.vocab_processor 46 | 47 | def class_labels(self, class_indexes): 48 | if self.__classes is None: 49 | result = class_indexes 50 | else: 51 | result = [ self.__classes[idx] for idx in class_indexes ] 52 | return result 53 | 54 | def load_data_and_labels(self): 55 | """ 56 | Loads MR polarity data from files, splits the data into words and generates labels. 57 | Returns split sentences and labels. 
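Positive examples are labeled one-hot as [0, 1] and negative examples as [1, 0].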
58 | """ 59 | self.__resolve_params() 60 | 61 | # Load data from files 62 | positive_examples = list(open(self.__positive_data_file, "r").readlines()) 63 | negative_examples = list(open(self.__negative_data_file, "r").readlines()) 64 | # Split by words 65 | x_text = positive_examples + negative_examples 66 | x_text = [self.__data_processor.clean_data(sent) for sent in x_text] 67 | # Generate labels 68 | positive_labels = [[0, 1] for _ in positive_examples] 69 | negative_labels = [[1, 0] for _ in negative_examples] 70 | y = np.concatenate([positive_labels, negative_labels], 0) 71 | return [x_text, y] 72 | 73 | def __resolve_params(self): 74 | self.__dev_sample_percentage = self.__flags.FLAGS.dev_sample_percentage 75 | self.__positive_data_file = self.__flags.FLAGS.positive_data_file 76 | self.__negative_data_file = self.__flags.FLAGS.negative_data_file 77 | -------------------------------------------------------------------------------- /CNN/char_data_processor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | 4 | class CharDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.decode("utf-8")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | return string 18 | 19 | class VocabularyProcessor(object): 20 | def __init__(self, max_document_length, min_frequency=0, vocabulary=None, 21 | tokenizer_fn=None): 22 | # init a class. index maxdocument length and a vocabulabrary 23 | if vocabulary == None: 24 | self.vocabulary_ = {"":0} # padding 25 | else: 26 | self.vocabulary_ = vocabulary 27 | 28 | self.index = 1 29 | self.max_document_length = max_document_length 30 | def fit_transform(self, raw_documents, unused_y=None, fit=True): 31 | result = [] 32 | for raw_document in raw_documents: 33 | # mark for this, we can find it is a [[I am a student]] 34 | result.append([self.__vocab_id(char, fit) for char in raw_document.decode("utf-8")]) 35 | 36 | if self.max_document_length == None: 37 | max_document_length = max([len(vocab_ids) for vocab_ids in result]) 38 | else: 39 | max_document_length = self.max_document_length 40 | 41 | result = self.__smooth_lengths(result, max_document_length) 42 | 43 | return result 44 | 45 | def transform(self, raw_documents): 46 | return self.fit_transform(raw_documents, None, False) 47 | 48 | def save(self, file): 49 | with codecs.open(file, 'w', 'utf-8') as f: 50 | data = {"vocabulary_": self.vocabulary_, "index": self.index, 51 | "max_document_length": self.max_document_length} 52 | f.write(json.dumps(data, ensure_ascii=False)) 53 | 54 | @classmethod 55 | def restore(cls, file): 56 | with codecs.open(file, "r", "utf-8") as f: 57 | data = json.loads(f.readline()) 58 | vp = cls(data["max_document_length"], 0, data["vocabulary_"]) 59 | vp.index = data["index"] 60 | return vp 61 | 62 | @staticmethod 63 | def __smooth_lengths(documents, length): 64 | result = [] 65 | for document in documents: 66 | if len(document) > length: 67 | doccument = document[:length] 68 | elif len(document) < length: 69 | document = document + [0] * (length - len(document)) 70 | result.append(document) 71 | return result 72 | 73 | def __vocab_id(self, char, fit = True): 74 | # every word has a 
id 75 | if char not in self.vocabulary_: 76 | if fit: 77 | self.vocabulary_[char] = self.index 78 | self.index += 1 79 | else: 80 | char = "" 81 | return self.vocabulary_[char] 82 | 83 | -------------------------------------------------------------------------------- /CNN/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 7 | """ 8 | Generates a batch iterator for a dataset. 9 | """ 10 | data = np.array(data) 11 | data_size = len(data) 12 | num_batches_per_epoch = int(len(data)/batch_size) + 1 13 | for epoch in range(num_epochs): 14 | # Shuffle the data at each epoch 15 | if shuffle: 16 | shuffle_indices = np.random.permutation(np.arange(data_size)) 17 | shuffled_data = data[shuffle_indices] 18 | else: 19 | shuffled_data = data 20 | for batch_num in range(num_batches_per_epoch): 21 | start_index = batch_num * batch_size 22 | end_index = min((batch_num + 1) * batch_size, data_size) 23 | yield shuffled_data[start_index:end_index] 24 | -------------------------------------------------------------------------------- /CNN/eval.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | #from binary_class_data_loader import BinaryClassDataLoader 11 | from multi_class_data_loader import MultiClassDataLoader 12 | #from word_data_processor import WordDataProcessor 13 | from char_data_processor import CharDataProcessor 14 | import csv 15 | 16 | # Parameters 17 | # ================================================== 18 | 19 | # Eval Parameters 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 22 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 23 | 24 | # Misc Parameters 25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 27 | 28 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 29 | data_loader.define_flags() 30 | 31 | FLAGS = tf.flags.FLAGS 32 | FLAGS._parse_flags() 33 | print("\nParameters:") 34 | for attr, value in sorted(FLAGS.__flags.items()): 35 | print("{}={}".format(attr.upper(), value)) 36 | print("") 37 | 38 | # CHANGE THIS: Load data. 
Load your own data here 39 | if FLAGS.eval_train: 40 | x_raw, y_test = data_loader.load_data_and_labels() 41 | y_test = np.argmax(y_test, axis=1) 42 | else: 43 | x_raw = ["a masterpiece four years in the making", "everything is off."] 44 | y_test = [1, 0] 45 | 46 | # Map data into vocabulary 47 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 48 | vocab_processor = data_loader.restore_vocab_processor(vocab_path) 49 | x_test = np.array(list(vocab_processor.transform(x_raw))) 50 | 51 | print("\nEvaluating...\n") 52 | 53 | # Evaluation 54 | # ================================================== 55 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | session_conf = tf.ConfigProto( 59 | allow_soft_placement=FLAGS.allow_soft_placement, 60 | log_device_placement=FLAGS.log_device_placement) 61 | sess = tf.Session(config=session_conf) 62 | with sess.as_default(): 63 | # Load the saved meta graph and restore variables 64 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 65 | saver.restore(sess, checkpoint_file) 66 | 67 | # Get the placeholders from the graph by name 68 | input_x = graph.get_operation_by_name("input_x").outputs[0] 69 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | # Tensors we want to evaluate 73 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 74 | 75 | # Generate batches for one epoch 76 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 77 | 78 | # Collect the predictions here 79 | all_predictions = [] 80 | 81 | for x_test_batch in batches: 82 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 83 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 84 | 85 | # Print accuracy if y_test is defined 86 | if y_test is not None: 87 | correct_predictions = float(sum(all_predictions == y_test)) 88 | print("Total number of test examples: {}".format(len(y_test))) 89 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 90 | 91 | # Save the evaluation to a csv 92 | all_predictions = data_loader.class_labels(all_predictions.astype(int)) 93 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 94 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 95 | print("Saving evaluation to {0}".format(out_path)) 96 | with open(out_path, 'w') as f: 97 | csv.writer(f).writerows(predictions_human_readable) 98 | -------------------------------------------------------------------------------- /CNN/multi_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | 4 | class MultiClassDataLoader(object): 5 | """ 6 | Handles multi-class training data. It takes predefined sets of "train_data_file" and "dev_data_file" 7 | of the following record format. 8 | \t 9 | ex. "what a masterpiece! Positive" 10 | 11 | Class labels are given as "class_data_file", which is a list of class labels. 
12 | """ 13 | def __init__(self, flags, data_processor): 14 | self.__flags = flags 15 | self.__data_processor = data_processor 16 | self.__train_data_file = None 17 | self.__dev_data_file = None 18 | self.__class_data_file = None 19 | self.__classes_cache = None 20 | 21 | 22 | def define_flags(self): 23 | self.__flags.DEFINE_string("train_data_file", "./data/rt-polaritydata/train.txt", "Data source for the training data.") 24 | self.__flags.DEFINE_string("dev_data_file", "./data/rt-polaritydata/test.txt", "Data source for the cross validation data.") 25 | self.__flags.DEFINE_string("class_data_file", "./data/rt-polaritydata/lable.txt", "Data source for the class list.") 26 | 27 | def prepare_data(self): 28 | self.__resolve_params() 29 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 30 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 31 | 32 | max_doc_len = max([len(doc.decode("utf-8")) for doc in x_train]) 33 | max_doc_len_dev = max([len(doc.decode("utf-8")) for doc in x_dev]) 34 | if max_doc_len_dev > max_doc_len: 35 | max_doc_len = max_doc_len_dev 36 | # Build vocabulary 37 | self.vocab_processor = self.__data_processor.vocab_processor(x_train, x_dev) 38 | x_train = np.array(list(self.vocab_processor.fit_transform(x_train))) 39 | # Build vocabulary 40 | x_dev = np.array(list(self.vocab_processor.fit_transform(x_dev))) 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | return self.__data_processor.restore_vocab_processor(vocab_path) 45 | 46 | def class_labels(self, class_indexes): 47 | return [ self.__classes()[idx] for idx in class_indexes ] 48 | 49 | def load_data_and_labels(self): 50 | self.__resolve_params() 51 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 52 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 53 | x_all = x_train + x_dev 54 | y_all = np.concatenate([y_train, y_dev], 0) 55 | return [x_all, y_all] 56 | 57 | def __load_data_and_labels(self, data_file): 58 | x_text = [] 59 | y = [] 60 | with open(data_file, 'r') as tsvin: 61 | classes = self.__classes() 62 | one_hot_vectors = np.eye(len(classes), dtype=int) 63 | class_vectors = {} 64 | for i, cls in enumerate(classes): 65 | class_vectors[cls] = one_hot_vectors[i] 66 | #edit for the first to the code. 
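# The loop below expects each line to be "<class label> <text>": it splits on the
# first space, maps the label to a one-hot vector built from class_data_file, and
# keeps the rest of the line as the raw text (this replaces the tab-separated
# csv.reader version kept commented out further down).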
67 | all_lines = tsvin.readlines() 68 | for line in all_lines: 69 | temp = line.split(' ',1) 70 | data = self.__data_processor.clean_data(temp[1]) 71 | x_text.append(data) 72 | y.append(class_vectors[temp[0]]) 73 | #edit 74 | # tsvin = csv.reader(tsvin, delimiter='\t') 75 | # for row in tsvin: 76 | # data = self.__data_processor.clean_data(row[0]) 77 | # x_text.append(data) 78 | # y.append(class_vectors[row[1]]) 79 | return [x_text, np.array(y)] 80 | 81 | def __classes(self): 82 | self.__resolve_params() 83 | if self.__classes_cache is None: 84 | with open(self.__class_data_file, 'r') as catin: 85 | classes = list(catin.readlines()) 86 | self.__classes_cache = [s.strip() for s in classes] 87 | return self.__classes_cache 88 | 89 | def __resolve_params(self): 90 | if self.__class_data_file is None: 91 | self.__train_data_file = self.__flags.FLAGS.train_data_file 92 | self.__dev_data_file = self.__flags.FLAGS.dev_data_file 93 | self.__class_data_file = self.__flags.FLAGS.class_data_file 94 | -------------------------------------------------------------------------------- /CNN/text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class TextCNN(object): 6 | """ 7 | A CNN for text classification. 8 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 9 | """ 10 | def __init__( 11 | self, sequence_length, num_classes, vocab_size, 12 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 13 | 14 | # Placeholders for input, output and dropout 15 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 16 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 17 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 18 | 19 | # Keeping track of l2 regularization loss (optional) 20 | l2_loss = tf.constant(0.0) 21 | 22 | # Embedding layer 23 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 24 | self.W = tf.Variable( 25 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 26 | trainable = False, 27 | name="W") 28 | 29 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 30 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 31 | 32 | # Create a convolution + maxpool layer for each filter size 33 | pooled_outputs = [] 34 | for i, filter_size in enumerate(filter_sizes): 35 | with tf.name_scope("conv-maxpool-%s" % filter_size): 36 | # Convolution Layer 37 | filter_shape = [filter_size, embedding_size, 1, num_filters] 38 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 39 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 40 | conv = tf.nn.conv2d( 41 | self.embedded_chars_expanded, 42 | W, 43 | strides=[1, 1, 1, 1], 44 | padding="VALID", 45 | name="conv") 46 | # Apply nonlinearity 47 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 48 | # Maxpooling over the outputs 49 | pooled = tf.nn.max_pool( 50 | h, 51 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 52 | strides=[1, 1, 1, 1], 53 | padding='VALID', 54 | name="pool") 55 | pooled_outputs.append(pooled) 56 | 57 | # Combine all the pooled features 58 | num_filters_total = num_filters * len(filter_sizes) 59 | self.h_pool = tf.concat(3, pooled_outputs) 60 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 61 | 62 | # Add dropout 63 | with tf.name_scope("dropout"): 64 | self.h_drop = tf.nn.dropout(self.h_pool_flat, 
self.dropout_keep_prob) 65 | 66 | # Final (unnormalized) scores and predictions 67 | with tf.name_scope("output"): 68 | W = tf.get_variable( 69 | "W", 70 | shape=[num_filters_total, num_classes], 71 | initializer=tf.contrib.layers.xavier_initializer()) 72 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 73 | l2_loss += tf.nn.l2_loss(W) 74 | l2_loss += tf.nn.l2_loss(b) 75 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 76 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 77 | 78 | # CalculateMean cross-entropy loss 79 | with tf.name_scope("loss"): 80 | losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y) 81 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 82 | 83 | # Accuracy 84 | with tf.name_scope("accuracy"): 85 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 86 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 87 | -------------------------------------------------------------------------------- /CNN/train.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import numpy as np 3 | import tensorflow as tf 4 | import os 5 | import time 6 | import datetime 7 | import data_helpers 8 | from text_cnn import TextCNN 9 | #from binary_class_data_loader import BinaryClassDataLoader 10 | from multi_class_data_loader import MultiClassDataLoader 11 | #from word_data_processor import WordDataProcessor 12 | from char_data_processor import CharDataProcessor 13 | 14 | # Parameters 15 | # ================================================== 16 | # use this for static wordembedding 17 | # change the path to static the vec.bin is a Chinese word2vec file trained from the sina weibo 18 | tf.flags.DEFINE_string("word2vec","./data/rt-polaritydata/vec.bin", "word2vec file with pre-trained embedding (default: None)") 19 | tf.flags.DEFINE_integer("dev_batch_size", 4096, "Batch Size (default: 64)") 20 | 21 | # Model Hyperparameters 22 | tf.flags.DEFINE_integer("embedding_dim", 50, "Dimensionality of character embedding (default: 128)") 23 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 24 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 25 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 26 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)") 27 | 28 | 29 | # Training parameters 30 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 31 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") 32 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 33 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") 34 | # Misc Parameters 35 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 36 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 37 | 38 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 39 | data_loader.define_flags() 40 | 41 | FLAGS = tf.flags.FLAGS 42 | FLAGS._parse_flags() 43 | print("\nParameters:") 44 | for attr, value in sorted(FLAGS.__flags.items()): 45 | print("{}={}".format(attr.upper(), value)) 46 | print("") 47 | 48 | 49 | # Data Preparatopn 50 | # 
================================================== 51 | 52 | # Load data 53 | print("Loading data...") 54 | x_train, y_train, x_dev, y_dev = data_loader.prepare_data() 55 | vocab_processor = data_loader.vocab_processor 56 | 57 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 58 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) 59 | 60 | 61 | # Training 62 | # ================================================== 63 | 64 | with tf.Graph().as_default(): 65 | session_conf = tf.ConfigProto( 66 | allow_soft_placement=FLAGS.allow_soft_placement, 67 | log_device_placement=FLAGS.log_device_placement) 68 | sess = tf.Session(config=session_conf) 69 | with sess.as_default(): 70 | cnn = TextCNN( 71 | sequence_length=x_train.shape[1], 72 | num_classes=y_train.shape[1], 73 | vocab_size=len(vocab_processor.vocabulary_), 74 | embedding_size=FLAGS.embedding_dim, 75 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 76 | num_filters=FLAGS.num_filters, 77 | l2_reg_lambda=FLAGS.l2_reg_lambda) 78 | 79 | # Define Training procedure 80 | global_step = tf.Variable(0, name="global_step", trainable=False) 81 | optimizer = tf.train.AdamOptimizer(1e-3) 82 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 83 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 84 | 85 | # Keep track of gradient values and sparsity (optional) 86 | grad_summaries = [] 87 | for g, v in grads_and_vars: 88 | if g is not None: 89 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 90 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 91 | grad_summaries.append(grad_hist_summary) 92 | grad_summaries.append(sparsity_summary) 93 | grad_summaries_merged = tf.merge_summary(grad_summaries) 94 | 95 | # Output directory for models and summaries 96 | timestamp = str(int(time.time())) 97 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 98 | print("Writing to {}\n".format(out_dir)) 99 | 100 | # Summaries for loss and accuracy 101 | loss_summary = tf.scalar_summary("loss", cnn.loss) 102 | acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) 103 | 104 | # Train Summaries 105 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 106 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 107 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph) 108 | 109 | # Dev summaries 110 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 111 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 112 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph) 113 | 114 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 115 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 116 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 117 | if not os.path.exists(checkpoint_dir): 118 | os.makedirs(checkpoint_dir) 119 | saver = tf.train.Saver(tf.all_variables()) 120 | 121 | # Write vocabulary 122 | vocab_processor.save(os.path.join(out_dir, "vocab")) 123 | 124 | # Initialize all variables 125 | sess.run(tf.initialize_all_variables()) 126 | if FLAGS.word2vec: 127 | # initial matrix with random uniform, vocab_processor.vocabulary_ is the vocabulalu 128 | initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim)) 129 | # load any vecotors from the 130 | print("Load word2vec file {}\n".format(FLAGS.word2vec)) 131 | #read as a binary file 132 | with open(FLAGS.word2vec, "rb") as f: 133 | head = f.readline() 134 | vocab_size, layer1_size = map(int, head.split()) 135 | binary_len = np.dtype('float32').itemsize * layer1_size 136 | for line in xrange(vocab_size): 137 | word = [] 138 | while True: 139 | ch = f.read(1) 140 | if ch == ' ': 141 | word = ''.join(word) 142 | break 143 | if ch != '\n': 144 | word.append(ch) 145 | idx = vocab_processor.vocabulary_.get(word) 146 | if idx != None: 147 | initW[idx] = np.fromstring(f.read(binary_len), dtype='float32') 148 | else: 149 | f.read(binary_len) 150 | sess.run(cnn.W.assign(initW)) 151 | 152 | def train_step(x_batch, y_batch): 153 | """ 154 | A single training step 155 | """ 156 | feed_dict = { 157 | cnn.input_x: x_batch, 158 | cnn.input_y: y_batch, 159 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 160 | } 161 | _, step, summaries, loss, accuracy = sess.run( 162 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 163 | feed_dict) 164 | time_str = datetime.datetime.now().isoformat() 165 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 166 | train_summary_writer.add_summary(summaries, step) 167 | 168 | def dev_step(x_batch, y_batch, writer=None): 169 | """ 170 | Evaluates model on a dev set 171 | """ 172 | feed_dict = { 173 | cnn.input_x: x_batch, 174 | cnn.input_y: y_batch, 175 | cnn.dropout_keep_prob: 1.0 176 | } 177 | step, summaries, loss, accuracy = sess.run( 178 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 179 | feed_dict) 180 | time_str = datetime.datetime.now().isoformat() 181 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 182 | if writer: 183 | writer.add_summary(summaries, step) 184 | 185 | # Generate batches 186 | batches = data_helpers.batch_iter( 187 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 188 | # Training loop. 
For each batch...\ 189 | 190 | 191 | for batch in batches: 192 | x_batch, y_batch = zip(*batch) 193 | train_step(x_batch, y_batch) 194 | current_step = tf.train.global_step(sess, global_step) 195 | if current_step % FLAGS.evaluate_every == 0: 196 | print("\nEvaluation:") 197 | dev_batches = data_helpers.batch_iter( 198 | list(zip(x_dev, y_dev)), FLAGS.dev_batch_size, 1) 199 | for dev_batch in dev_batches: 200 | x_dev_batch, y_dev_batch = zip(*dev_batch) 201 | dev_step(x_dev_batch, y_dev_batch, writer=dev_summary_writer) 202 | print("") 203 | if current_step % FLAGS.checkpoint_every == 0: 204 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 205 | print("Saved model checkpoint to {}\n".format(path)) 206 | -------------------------------------------------------------------------------- /CNN/word_data_processor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tensorflow.contrib import learn 3 | 4 | class WordDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.split(" ")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return learn.preprocessing.VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return learn.preprocessing.VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | """ 18 | Tokenization/string cleaning for all datasets except for SST. 19 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 20 | """ 21 | string = string.strip() 22 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 23 | string = re.sub(r"\'s", " \'s", string) 24 | string = re.sub(r"\'ve", " \'ve", string) 25 | string = re.sub(r"n\'t", " n\'t", string) 26 | string = re.sub(r"\'re", " \'re", string) 27 | string = re.sub(r"\'d", " \'d", string) 28 | string = re.sub(r"\'ll", " \'ll", string) 29 | string = re.sub(r",", " , ", string) 30 | string = re.sub(r"!", " ! ", string) 31 | string = re.sub(r"\(", " \( ", string) 32 | string = re.sub(r"\)", " \) ", string) 33 | string = re.sub(r"\?", " \? 
", string) 34 | string = re.sub(r"\s{2,}", " ", string) 35 | return string.strip().lower() 36 | -------------------------------------------------------------------------------- /GraphCNN/SVM_model.py: -------------------------------------------------------------------------------- 1 | 2 | # HR-SVM 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_input 13 | import graphcnn_option 14 | 15 | 16 | 17 | class Model(object): 18 | ''' svm model 19 | ''' 20 | 21 | def __init__(self): 22 | self._paramaters_list = [] 23 | 24 | def linear_SVM(self, data, target): 25 | ''' Linear Support Vector Machine: Soft Margin 26 | data: 2D of [samples number, feature vector dimension] 27 | target: 2D of [samples number, 1], with value -1 or 1 28 | ''' 29 | # feature vector dimension 30 | feature_dim = data.get_shape()[1].value 31 | 32 | # Create variables for linear regression 33 | A = tf.Variable(tf.random_normal(shape=[feature_dim,1])) 34 | b = tf.Variable(tf.random_normal(shape=[1,1])) 35 | 36 | # record para 37 | self._paramaters_list.append(A) 38 | self._paramaters_list.append(b) 39 | 40 | # Declare model operations 41 | model_output = tf.sub(tf.matmul(data, A), b) 42 | 43 | # Declare vector L2 'norm' function squared 44 | l2_norm = tf.reduce_sum(tf.square(A)) 45 | 46 | # Declare loss function 47 | # Loss = max(0, 1-pred*actual) + alpha * L2_norm(A)^2 48 | # L2 regularization parameter, alpha 49 | alpha = tf.constant([0.01]) 50 | # Margin term in loss 51 | classification_term = tf.reduce_mean(tf.maximum(0., tf.sub(1., tf.mul(model_output, target)))) 52 | # Put terms together 53 | loss = tf.add(classification_term, tf.mul(alpha, l2_norm),name='svm_loss') 54 | 55 | tf.add_to_collection('losses', loss) 56 | 57 | return model_output 58 | 59 | 60 | def compute_dependencies_loss(model_list): 61 | # Calculate the Variable's dependency constraint 62 | filename = os.path.join(graphcnn_option.DATA_PATH, 'fathercode') 63 | father = np.loadtxt(filename, dtype=int) 64 | 65 | # Calculate the inner nodes' parameters value 66 | inner = np.zeros([graphcnn_input.NUM_CLASSES]) 67 | for i in range(0, graphcnn_input.NUM_CLASSES): 68 | father_i = father[i] 69 | if father_i != -1: 70 | inner[father_i] = 1 71 | nodes = [] 72 | for i in range(0, graphcnn_input.NUM_CLASSES): 73 | nodes.append([]) 74 | for i in range(0, graphcnn_input.NUM_CLASSES): 75 | if inner[i] == 1: 76 | father_i = father[i] 77 | nodes[i].append(model_list[i]._paramaters_list) 78 | if father_i != -1: 79 | nodes[i].append(model_list[father_i]._paramaters_list) 80 | nodes[father_i].append(model_list[i]._paramaters_list) 81 | nodes_paras = [] 82 | for i in range(0, graphcnn_input.NUM_CLASSES): 83 | para_list = [] 84 | if inner[i] == 1: 85 | para_list_len = len(nodes[i][0]) 86 | para_list_num = len(nodes[i]) 87 | for para_i in range(0,para_list_len): 88 | para = [] 89 | for para_list_i in range(0,para_list_num): 90 | para.append(nodes[i][para_list_i][para_i]) 91 | para_list.append(tf.truediv(tf.add_n(para), float(para_list_num))) ##??????????????? 
92 | nodes_paras.append(para_list) 93 | 94 | for i in range(0, graphcnn_input.NUM_CLASSES): 95 | if inner[i] == 1: 96 | model_para = model_list[i]._paramaters_list 97 | father_model_para = nodes_paras[i] 98 | else: 99 | model_para = model_list[i]._paramaters_list 100 | father_i = father[i] 101 | if father_i != -1: 102 | father_model_para = nodes_paras[father_i] 103 | assert len(model_para) == len(father_model_para), ' something is wrong' 104 | for j in range(0, len(model_para)): 105 | sub_vector = tf.sub(model_para[j], father_model_para[j]) 106 | reshape = tf.reshape(sub_vector, [1, -1]) 107 | reshape_trans = tf.reshape(sub_vector, [-1, 1]) 108 | dependencies = tf.mul(tf.matmul(reshape, reshape_trans)[0, 0], graphcnn_option.VARIABLE_DEPENDENCY, 109 | name='dependencies_loss') 110 | tf.add_to_collection('losses', dependencies) 111 | 112 | def SVM_inference(data, target, dependencies_loss=True): 113 | ''' 114 | data: 2D of [samples number, feature vector dimension] 115 | target: 2D of [samples number, NUM_CLASSES], with value -1 or 1 116 | ''' 117 | 118 | model_list = [] 119 | logits_list = [] 120 | for i in range(0, graphcnn_input.NUM_CLASSES): 121 | target_i = target[:,i] 122 | target_i = tf.reshape(target_i, [-1, 1]) 123 | model = Model() 124 | logits = model.linear_SVM(data, target_i) 125 | model_list.append(model) 126 | logits_list.append(logits) 127 | logits = tf.concat(1, logits_list) 128 | 129 | if dependencies_loss: 130 | compute_dependencies_loss(model_list) 131 | 132 | return logits 133 | 134 | def SVM_loss(): 135 | ''' add loss function: cross entropy. 136 | ''' 137 | return tf.add_n(tf.get_collection('losses'), name='total_loss') 138 | 139 | def _add_loss_summaries(total_loss): 140 | """ Add summaries for losses. 141 | Generates moving average for all losses and associated summaries for visualizing the performance of the network. 142 | moving average -> eliminate noise 143 | 144 | Args: 145 | total_loss: Total loss from loss(). 146 | Returns: 147 | loss_averages_op: op for generating moving averages of losses. 148 | """ 149 | # Compute the moving average of all individual losses and the total loss. 150 | # The moving averages are computed using exponential decay: 151 | # shadow_variable -= (1 - decay) * (shadow_variable - variable) equivalent to: 152 | # shadow_variable = decay * shadow_variable + (1 - decay) * variable 153 | loss_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY, name='avg') 154 | losses = tf.get_collection('losses') 155 | loss_averages_op = loss_averages.apply(losses + [total_loss]) 156 | 157 | if graphcnn_option.SUMMARYWRITER: 158 | # Attach a scalar summary to all individual losses and the total loss; do the same for the averaged version of the losses. 159 | for l in losses + [total_loss]: 160 | # Name each loss as '(raw)' and name the moving average version of the loss as the original loss name. 161 | tf.scalar_summary(l.op.name + ' (raw)', l) 162 | tf.scalar_summary(l.op.name, loss_averages.average(l)) 163 | 164 | return loss_averages_op 165 | 166 | def SVM_train(total_loss, global_step): 167 | """ Create an optimizer and apply to all trainable variables. 168 | Add moving average for all trainable variables. 169 | 170 | Args: 171 | total_loss: total loss from loss(). 172 | global_step: Integer Variable counting the number of training steps processed. 173 | 174 | Returns: 175 | train_op: op for training. 176 | """ 177 | 178 | # Variables that affect learning rate. 
179 | num_batches_per_epoch = graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / graphcnn_input.TRAIN_BATCH_SIZE 180 | decay_steps = int(num_batches_per_epoch * graphcnn_option.NUM_EPOCHS_PER_DECAY) 181 | 182 | # Decay the learning rate exponentially based on the number of steps. 183 | # decayed_learning_rate = INITIAL_LEARNING_RATE * LEARNING_RATE_DECAY_RATE ^ (global_step / decay_steps) 184 | lr = tf.train.exponential_decay(graphcnn_option.INITIAL_LEARNING_RATE, 185 | global_step, 186 | decay_steps, 187 | graphcnn_option.LEARNING_RATE_DECAY_RATE, 188 | staircase=True) 189 | 190 | if graphcnn_option.SUMMARYWRITER: 191 | tf.scalar_summary('learning_rate', lr) 192 | 193 | # Generate moving averages of all losses and associated summaries. 194 | loss_averages_op = _add_loss_summaries(total_loss) 195 | 196 | # Compute gradients 197 | with tf.control_dependencies([loss_averages_op]): 198 | # opt = tf.train.GradientDescentOptimizer(lr) 199 | opt = tf.train.MomentumOptimizer(lr, graphcnn_option.MOMENTUM) 200 | grads = opt.compute_gradients(total_loss) 201 | 202 | # Apply gradients. 203 | apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 204 | 205 | if graphcnn_option.SUMMARYWRITER: 206 | # Add histograms for trainable variables. 207 | for var in tf.trainable_variables(): 208 | tf.histogram_summary(var.op.name, var) 209 | 210 | # Add histograms for gradients. 211 | for grad, var in grads: 212 | if grad is not None: 213 | tf.histogram_summary(var.op.name + '/gradients', grad) 214 | 215 | # Track the moving averages of all trainable variables. 216 | variable_averages = tf.train.ExponentialMovingAverage( 217 | graphcnn_option.MOVING_AVERAGE_DECAY, global_step) 218 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 219 | 220 | with tf.control_dependencies([apply_gradient_op, variables_averages_op]): 221 | train_op = tf.no_op(name='train') 222 | 223 | return train_op 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /GraphCNN/SVM_train.py: -------------------------------------------------------------------------------- 1 | 2 | # HR-SVM 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_input 13 | import graphcnn_option 14 | import SVM_model 15 | 16 | 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | tf.app.flags.DEFINE_string('train_dir', './tmp/graphcnn_train', 21 | """Directory where to write event logs and checkpoint.""") 22 | tf.app.flags.DEFINE_integer('max_epochs', 8000, 23 | """Number of batches to run.""") 24 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 25 | """Whether to log device placement.""") 26 | 27 | 28 | # max_steps for train: 29 | STEPS_PER_ECOPH = None 30 | MAX_STEPS = None 31 | # the period to save the model checkpoint. 32 | CKPT_PERIOD = None 33 | 34 | trainDataSet = None 35 | 36 | 37 | def train(newTrain,checkpoint): 38 | with tf.Graph().as_default(): 39 | global_step = tf.Variable(0, trainable=False) 40 | 41 | data = tf.placeholder(tf.float32, [graphcnn_input.TRAIN_BATCH_SIZE, graphcnn_input.NUM_CHANNELS]) # NUM_CHANNELS: feature dim 42 | labels = tf.placeholder(tf.int32, [graphcnn_input.TRAIN_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) # with value: -1,1 43 | 44 | # inference model. 
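# SVM_inference builds one linear soft-margin SVM per class (one-vs-rest) on the
# shared feature input and concatenates the per-class scores into `logits`; the
# `labels` placeholder is expected to hold -1/+1 per class, and the hierarchical
# dependency loss ties each class's parameters to its parent's
# (see SVM_model.compute_dependencies_loss).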
45 | logits = SVM_model.SVM_inference(data, labels) 46 | 47 | # Declare prediction function 48 | prediction = tf.sign(logits) 49 | accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, y_target), tf.float32)) 50 | 51 | # Calculate loss. 52 | loss = SVM_model.SVM_loss() 53 | 54 | # updates the model parameters. 55 | train_op = SVM_model.SVM_train(loss, global_step) 56 | 57 | # Create a saver. 58 | saver = tf.train.Saver(var_list=tf.global_variables(), 59 | max_to_keep=6, 60 | keep_checkpoint_every_n_hours=10) 61 | 62 | if graphcnn_option.SUMMARYWRITER: 63 | # Build the summary operation based on the TF collection of Summaries. 64 | summary_op = tf.merge_all_summaries() 65 | 66 | # Build an initialization operation to run below. 67 | init = tf.global_variables_initializer() 68 | 69 | # Start running operations on the Graph. allow_soft_placement must be set to 70 | # True to build towers on GPU, as some of the ops do not have GPU implementations. 71 | sess = tf.Session(config=tf.ConfigProto( 72 | allow_soft_placement=True, 73 | log_device_placement=FLAGS.log_device_placement)) 74 | 75 | first_step = 0 76 | if not newTrain: 77 | if checkpoint == '0': # choose the latest one 78 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 79 | if ckpt and ckpt.model_checkpoint_path: 80 | new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 81 | # Restores from checkpoint 82 | new_saver.restore(sess, ckpt.model_checkpoint_path) 83 | global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] 84 | first_step = int(global_step_for_restore) + 1 85 | else: 86 | print('No checkpoint file found') 87 | return 88 | else: # 89 | if os.path.exists(os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)): 90 | new_saver = tf.train.import_meta_graph( 91 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint + '.meta')) 92 | new_saver.restore(sess, 93 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)) 94 | first_step = int(checkpoint) + 1 95 | else: 96 | print('No checkpoint file found') 97 | return 98 | else: 99 | sess.run(init) 100 | 101 | if graphcnn_option.SUMMARYWRITER: 102 | summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) 103 | 104 | filename_train_log = os.path.join(FLAGS.train_dir, 'log_train') 105 | if os.path.exists(filename_train_log): 106 | file_train_log = open(filename_train_log, 'a') 107 | else: 108 | file_train_log = open(filename_train_log, 'w') 109 | 110 | # learning_rate = graphcnn_option.lr_decay_value[0] # 0.1(5), 0.01(100), 0.001(500), 0.0001(300), 0.00001(100) 111 | # learning_rate_index = 0 112 | for step in range(first_step,MAX_STEPS): 113 | # if learning_rate_index < len(graphcnn_option.lr_decay_value) - 1: 114 | # if step > STEPS_PER_ECOPH * graphcnn_option.lr_decay_ecophs[learning_rate_index]: 115 | # learning_rate_index = learning_rate_index + 1 116 | # learning_rate = graphcnn_option.lr_decay_value[learning_rate_index] 117 | 118 | train_data, train_label = trainDataSet.next_batch(graphcnn_input.TRAIN_BATCH_SIZE) 119 | start_time = time.time() 120 | _, loss_value = sess.run([train_op, loss], 121 | feed_dict= {data:train_data, labels:train_label}) 122 | duration = time.time() - start_time 123 | 124 | assert not np.isnan(loss_value), 'Model diverged with loss = NaN' 125 | 126 | if step % 10 == 0: 127 | sec_per_batch = float(duration) 128 | format_str = ('%s: step=%d, loss=%.4f; %.3f sec/batch)') 129 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch), file=file_train_log) 130 | 
print(format_str % (datetime.now(), step, loss_value, sec_per_batch)) 131 | 132 | if graphcnn_option.SUMMARYWRITER: 133 | if step % 100 == 0: 134 | summary_str = sess.run(summary_op, 135 | feed_dict= {data:train_data, labels:train_label}) 136 | summary_writer.add_summary(summary_str, step) 137 | 138 | # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta') 139 | if step % CKPT_PERIOD == 0 or (step + 1) == MAX_STEPS: 140 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 141 | saver.save(sess, checkpoint_path, global_step=step) 142 | file_train_log.close() 143 | 144 | def main(argv=None): 145 | global trainDataSet, evalDataSet, STEPS_PER_ECOPH, MAX_STEPS, CKPT_PERIOD 146 | newTrain = True 147 | checkpoint = 0 148 | # assert not tf.gfile.Exists(FLAGS.train_dir), 'please move the old train directory to pre_versions!' 149 | if tf.gfile.Exists(FLAGS.train_dir): 150 | ans = input('whether to open up a new training:(y/n)') 151 | if ans == 'y' or ans == 'Y': 152 | newTrain = True 153 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 154 | elif ans == 'n' or ans == 'N': 155 | newTrain = False 156 | checkpoint = input('please input the choosed checkpoint to restore:(0 for latest)') 157 | else: 158 | print('invalid input!') 159 | return 160 | if newTrain: 161 | tf.gfile.MakeDirs(FLAGS.train_dir) 162 | 163 | # update paras 164 | trainDataSet = graphcnn_input.generate_SVM_train_data(graphcnn_option.TRAIN_DATA_DIR, 165 | ont_hot=True,index_mode=True) 166 | 167 | # max_steps for train: 168 | STEPS_PER_ECOPH = math.ceil( 169 | graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / float(graphcnn_input.TRAIN_BATCH_SIZE)) 170 | MAX_STEPS = FLAGS.max_epochs * STEPS_PER_ECOPH 171 | 172 | # the period to save the model checkpoint. 173 | CKPT_PERIOD = graphcnn_option.CKPT_PERIOD # ????????????????????? 
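
# SVM_model.py itself is not reproduced in this listing. The sketch below is an
# assumed, minimal linear multi-label SVM consistent with the {-1, +1} `labels`
# placeholder declared at the top of train(): one weight column per class, a
# squared hinge loss, and an L2 penalty. The names _svm_sketch and the constant
# C are illustrative only, not the project's actual API; `tf` is the tensorflow
# module imported at the top of this file.
# (Note: the accuracy expression near the top of train() references y_target,
# which is not defined in this file; the float-cast `labels` placeholder
# appears to be what is intended there.)
def _svm_sketch(data, labels, feature_dim, num_classes, C=1.0):
    W = tf.get_variable('svm_W', [feature_dim, num_classes],
                        initializer=tf.truncated_normal_initializer(stddev=0.01))
    b = tf.get_variable('svm_b', [num_classes],
                        initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(data, W) + b                       # raw margins, one per class
    y = tf.cast(labels, tf.float32)                       # labels in {-1, +1}
    hinge = tf.square(tf.maximum(0.0, 1.0 - y * logits))  # squared hinge loss
    loss = C * tf.reduce_mean(hinge) + 0.5 * tf.reduce_sum(tf.square(W))
    return logits, loss
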
174 | # CKPT_PERIOD = 5000 175 | # tem = str(STEPS_PER_ECOPH * 20) # save the model every ecoph # 5 176 | # CKPT_PERIOD = int(int(tem[0]) * pow(10, len(tem) - 1)) 177 | 178 | print('training...') 179 | train(newTrain,checkpoint) 180 | 181 | 182 | if __name__ == '__main__': 183 | tf.app.run() 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /GraphCNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 | total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & 
total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value & (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_eval_without_labels.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from datetime import datetime 4 | import math 5 | import time 6 | import os 7 | import shutil 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | import graphcnn_model 13 | import graphcnn_input 14 | import graphcnn_option 15 | 16 | 17 | evalDataSet = None 18 | 19 | FLAGS = tf.app.flags.FLAGS 20 | 21 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_eval', 22 | """Directory where to write event logs.""") 23 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 24 | """Directory where to read model checkpoints.""") 25 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 26 | """How often to run the eval.""") 27 | tf.app.flags.DEFINE_boolean('run_once', False, 28 | """Whether to run eval only once.""") 29 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 30 | """Whether to log device placement.""") 31 | 32 | 33 | 34 | 35 | def evaluate(checkpoint): 36 | with tf.Graph().as_default() as g: 37 | # Get images and labels 38 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 39 | graphcnn_input.NUM_CHANNELS]) 40 | # labels = tf.placeholder(tf.int32, 
[graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 41 | 42 | # inference 43 | # logits = graphcnn_model.inference(data, eval_data=True) 44 | logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 45 | 46 | # multi-label sigmoid 47 | logits = tf.sigmoid(logits) 48 | 49 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 50 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 51 | variables_to_restore = variable_averages.variables_to_restore() 52 | saver = tf.train.Saver(variables_to_restore) 53 | 54 | # Build the summary operation based on the TF collection of Summaries. 55 | # summary_op = tf.merge_all_summaries() 56 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 57 | 58 | 59 | with tf.Session(config=tf.ConfigProto( 60 | allow_soft_placement=True, 61 | log_device_placement=FLAGS.log_device_placement)) as sess: 62 | if checkpoint == '0': 63 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 64 | if ckpt and ckpt.model_checkpoint_path: 65 | # Restores from checkpoint 66 | saver.restore(sess, ckpt.model_checkpoint_path) 67 | # extract global_step 68 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 69 | else: 70 | print('No checkpoint file found') 71 | return 72 | else: 73 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 74 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 75 | global_step_for_restore = int(checkpoint) 76 | else: 77 | print('No checkpoint file found') 78 | return 79 | 80 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 81 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 82 | step = 0 83 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 84 | while step < num_iter: 85 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 86 | predicted_value = sess.run( 87 | logits, feed_dict={data: test_data}) 88 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 89 | step += 1 90 | 91 | total_predicted_value = total_predicted_value[1:] 92 | 93 | np.savetxt('./log_eval_for_predicted_value_dictribution', total_predicted_value[range(0,100)], fmt='%.4f') 94 | 95 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution') 96 | if os.path.exists(detail_filename): 97 | os.remove(detail_filename) 98 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 99 | total_predicted_value = ((total_predicted_value) >= graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 100 | assert total_sample_count == total_predicted_value.shape[0], 'sample_count error!' 
101 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 102 | if os.path.exists(detail_filename): 103 | os.remove(detail_filename) 104 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 105 | 106 | np.savetxt('./log_eval_for_predicted_value', total_predicted_value[range(0,100)], fmt='%.4f') 107 | 108 | detail_filename = os.path.join(graphcnn_option.DATA_PATH,'remap') 109 | remap = np.loadtxt(detail_filename,dtype=int) 110 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 111 | fr = open(detail_filename, 'w') 112 | for i in range(0, np.size(total_predicted_value, axis=0)): 113 | labels = np.where(total_predicted_value[i] == 1)[0] 114 | labels_remap = remap[labels,0] 115 | for elem in labels_remap: 116 | print(elem, end=' ', file=fr) 117 | print('', file=fr) 118 | fr.close() 119 | 120 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 121 | file_eval_log = open(filename_eval_log, 'w') 122 | np.set_printoptions(threshold=np.nan) 123 | print('\nevaluation:', file=file_eval_log) 124 | print('\nevaluation:') 125 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 126 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 127 | print('evaluation is end...') 128 | print('evaluation is end...', file=file_eval_log) 129 | 130 | print('evaluation samples number:%d, evaluation classes number:%d' % 131 | (total_predicted_value.shape[0], total_predicted_value.shape[1]),file=file_eval_log) 132 | print('evaluation samples number:%d, evaluation classes number:%d' % 133 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 134 | print('evaluation detail: ' 135 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 136 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 137 | file=file_eval_log) 138 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 139 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 140 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 141 | file_eval_log.close() 142 | 143 | 144 | def main(argv=None): # pylint: disable=unused-argument 145 | global evalDataSet 146 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
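
# The post-processing in evaluate() above boils down to the two steps sketched
# here: threshold the per-class sigmoid scores at
# graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL, then translate column
# indices back to the original label ids through the first column of the
# 'remap' table. The helper name and the array shapes (N examples, C classes,
# remap of shape [C, 2]) are assumptions for illustration only; `np` is the
# numpy module imported at the top of this file.
def _decode_predictions_sketch(scores, remap, threshold=0.5):
    """scores: (N, C) float sigmoid outputs; remap[:, 0]: original label ids."""
    binary = (scores >= threshold).astype(int)            # (N, C) 0/1 decisions
    return [remap[np.where(row == 1)[0], 0].tolist()      # ids of predicted labels
            for row in binary]
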
147 | if tf.gfile.Exists(FLAGS.eval_dir): 148 | print('the evaluate data has already exists!') 149 | str = input('continue will delete the old evaluate directory:(y/n)') 150 | if str == 'y' or str == 'Y': 151 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 152 | elif str == 'n' or str == 'N': 153 | print('eval end!') 154 | return 155 | else: 156 | print('invalid input!') 157 | return 158 | tf.gfile.MakeDirs(FLAGS.eval_dir) 159 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 160 | checkpoint = '0' 161 | evalDataSet = graphcnn_input.generate_eval_data(graphcnn_option.EVAL_DATA_DIR, 162 | ont_hot=True,index_mode=True, 163 | label_used=False) 164 | print('evaluating...') 165 | evaluate(checkpoint) 166 | 167 | 168 | if __name__ == '__main__': 169 | tf.app.run() 170 | 171 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
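
# Pipeline note: this "_all" script runs the network once over the full test
# set and writes the raw per-class sigmoid scores to
# 'log_eval_for_predicted_value_dictribution_all' in eval_dir. The companion
# scripts graphcnn_hier_eval_without_labels_some.py, ..._some2.py and
# ..._some_root.py do not run the model again; they reload that file, select
# the example indices belonging to the current hierarchy root, re-threshold
# the scores (at 0.9 in those scripts), and route the results to the
# hier_result_leaf / hier_result_root files consumed by the next level of the
# hierarchical evaluation.
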
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | 
total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # rootstr_tmp = [] 98 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 99 | fr = open(detail_filename, 'w') 100 | for i in range(0, np.size(total_predicted_value, axis=0)): 101 | labels = np.where(total_predicted_value[i] == 1)[0] 102 | if len(labels) > 0: 103 | labels_remap = remap[labels, 0] 104 | for elem in labels_remap: 105 | print(elem, end=' ', file=fr) 106 | if elem in total_remap[:,0]: # leaf 107 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 108 | else: 109 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 110 | # for j in range(0,len(rootlist)): 111 | # if elem in rootlist[j]: 112 | # if rootstr[j] not in rootstr_tmp: 113 | # rootstr_tmp.append(rootstr[j]) 114 | print('', file=fr) 115 | else: 116 | # labels_remap = remap[:, 0] 117 | labels = total_predicted_value_argmax[i] 118 | labels_value = total_predicted_value_max[i] 119 | labels_remap = remap[labels, 0] 120 | # for elem in labels_remap: 121 | elem = labels_remap 122 | print(elem, file=fr) 123 | if elem in total_remap[:, 0]: # leaf 124 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 125 | else: 126 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 127 | # if labels_value < 0.5: 128 | # labels_remap = remap[:, 0] 129 | # for elem in labels_remap: 130 | # if elem not in total_remap[:, 0]: 131 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 132 | 133 | fr.close() 134 | fr_leaf.close() 135 | fr_root.close() 136 | fr_leaf_exp.close() 137 | 138 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 139 | # fr = open(filename, 'w') 140 | # for one in rootstr_tmp: 141 | # print(one) 142 | # print(one,file=fr) 143 | # fr.close() 144 | 145 | 146 | 147 | 148 | def main(argv=None): # pylint: disable=unused-argument 149 | global evalDataSet 150 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
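
# Decision rule used in evaluate() above: for each example, every class whose
# score clears the 0.9 threshold is emitted; predicted ids that appear in the
# first column of the global 'remap' table are treated as leaf labels (written
# to hier_result_leaf), all other ids are treated as internal nodes (written to
# hier_result_root) so the next round of hierarchical evaluation can descend
# into them. When no class reaches the threshold, the single arg-max class is
# used as a fallback and, if it is a leaf, is logged together with its score in
# hier_result_leaf_exp.
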
151 | 152 | # test_index_array = np.array(range(0, 81262)) 153 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 154 | test_index_array = np.array(range(0,81262)) 155 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 156 | else: 157 | test_index_array = generate_eval_index() 158 | if test_index_array is None or len(test_index_array)==0: 159 | print('no hier_data need eval') 160 | return 161 | else: 162 | print('choosing for evaluation...') 163 | print('choosed number:%d' % len(test_index_array)) 164 | 165 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 166 | checkpoint = '0' 167 | 168 | # print('choosing for evaluation...') 169 | evaluate(checkpoint,test_index_array) 170 | 171 | 172 | if __name__ == '__main__': 173 | tf.app.run() 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some_root.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = 
np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 98 | # fr = open(filename, 'r') 99 | # rootstr = fr.readlines() 100 | # fr.close() 101 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 102 | # fr = open(filename, 'r') 103 | # rootlines = fr.readlines() 104 | # fr.close() 105 | # rootlist = [] 106 | # for line in rootlines: 107 | # line = line.strip() 108 | # linelist = line.split(' ') 109 | # linelist = [int(k) for k in linelist] 110 | # rootlist.append(linelist) 111 | 112 | # rootstr_tmp = [] 113 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 114 | fr = open(detail_filename, 'w') 115 | for i in range(0, np.size(total_predicted_value, axis=0)): 116 | labels = np.where(total_predicted_value[i] == 1)[0] 117 | if len(labels) > 0: 118 | labels_remap = remap[labels, 0] 119 | for elem in labels_remap: 120 | print(elem, end=' ', file=fr) 121 | if elem in total_remap[:,0]: # leaf 122 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 123 | else: 124 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 125 | # for j in range(0,len(rootlist)): 126 | # if elem in rootlist[j]: 127 | # if rootstr[j] not in rootstr_tmp: 128 | # rootstr_tmp.append(rootstr[j]) 129 | print('', file=fr) 130 | else: 131 | # labels_remap = remap[:, 0] 132 | labels = 
total_predicted_value_argmax[i] 133 | labels_value = total_predicted_value_max[i] 134 | labels_remap = remap[labels, 0] 135 | # for elem in labels_remap: 136 | elem = labels_remap 137 | print(elem, file=fr) 138 | if elem in total_remap[:, 0]: # leaf 139 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 140 | else: 141 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 142 | # if labels_value < 0.5: 143 | # labels_remap = remap[:, 0] 144 | # for elem in labels_remap: 145 | # if elem not in total_remap[:, 0]: 146 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 147 | 148 | fr.close() 149 | fr_leaf.close() 150 | fr_root.close() 151 | fr_leaf_exp.close() 152 | 153 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 154 | # fr = open(filename, 'w') 155 | # for one in rootstr_tmp: 156 | # print(one) 157 | # print(one,file=fr) 158 | # fr.close() 159 | 160 | 161 | 162 | 163 | def main(argv=None): # pylint: disable=unused-argument 164 | global evalDataSet 165 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 166 | 167 | # test_index_array = np.array(range(0, 81262)) 168 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 169 | test_index_array = np.array(range(0,81262)) 170 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 171 | else: 172 | test_index_array = generate_eval_index() 173 | if test_index_array is None or len(test_index_array)==0: 174 | print('no hier_data need eval') 175 | return 176 | else: 177 | print('choosing for evaluation...') 178 | print('choosed number:%d' % len(test_index_array)) 179 | 180 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 181 | checkpoint = '0' 182 | 183 | # print('choosing for evaluation...') 184 | evaluate(checkpoint,test_index_array) 185 | 186 | 187 | if __name__ == '__main__': 188 | tf.app.run() 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
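
# Naming convention: with rootstr = '_1_2322682_' as set above, the file names
# defined here expand to, e.g.,
#   HIER_labels_remap_file  -> 'hier_1_2322682_remap'
#   HIER_train_data_file    -> 'hier_1_2322682_train_data'
#   HIER_test_data_file     -> 'hier_1_2322682_test_data'
# and the eval-result files defined below become 'hier_eval_result_1_2322682_leaf',
# '..._leaf_exp' and '..._root'. Changing rootstr together with HIER_ROOT_CODE
# appears to re-target the whole training/evaluation pipeline at a different
# subtree of the label hierarchy.
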
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_model 13 | import graphcnn_input 14 | import graphcnn_option 15 | 16 | 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | tf.app.flags.DEFINE_string('train_dir', './tmp/graphcnn_train', 21 | """Directory where to write event logs and checkpoint.""") 22 | tf.app.flags.DEFINE_integer('max_epochs', 8000, 23 | """Number of batches to run.""") 24 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 25 | """Whether to log device placement.""") 26 | 27 | 28 | # max_steps for train: 29 | STEPS_PER_ECOPH = None 30 | MAX_STEPS = None 31 | # the period to save the model checkpoint. 
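
# Learning-rate note: with the staircase schedule built in graphcnn_model.train()
# and the values in graphcnn_option.py above (INITIAL_LEARNING_RATE = 0.001,
# LEARNING_RATE_DECAY_RATE = 0.1, NUM_EPOCHS_PER_DECAY = 1000), the effective
# rate is a step function of the training epoch. The helper below is an
# illustration only and is not used by this script:
def _staircase_lr_sketch(epoch, initial=0.001, rate=0.1, epochs_per_decay=1000):
    """0.001 for epochs [0, 1000), 0.0001 for [1000, 2000), 0.00001 afterwards, ..."""
    return initial * rate ** (epoch // epochs_per_decay)
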
32 | CKPT_PERIOD = None 33 | 34 | trainDataSet = None 35 | 36 | 37 | def evalution_batch(total_predicted_value,total_true_value): 38 | 39 | 40 | total_predicted_value = ((total_predicted_value) >= graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 41 | total_predicted_value = total_predicted_value.astype(bool) 42 | total_true_value = total_true_value.astype(bool) 43 | 44 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 45 | example_based_F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 46 | + np.sum(total_predicted_value, axis=1))) 47 | 48 | TP = np.sum(total_true_value & total_predicted_value, axis=0, dtype=np.int32) 49 | FP = np.sum((~total_true_value) & total_predicted_value, axis=0, dtype=np.int32) 50 | FN = np.sum(total_true_value & (~total_predicted_value), axis=0, dtype=np.int32) 51 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9) 52 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9) 53 | Micro_F1 = (2 * _P * _R) / (_P + _R) 54 | _P_t = TP / (TP + FP + 1e-9) 55 | _R_t = TP / (TP + FN + 1e-9) 56 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 57 | 58 | return example_based_F1_Measure, Micro_F1, Macro_F1 59 | 60 | def train(newTrain,checkpoint): 61 | with tf.Graph().as_default(): 62 | global_step = tf.Variable(0, trainable=False) 63 | 64 | data = tf.placeholder(tf.float32, [graphcnn_input.TRAIN_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 65 | graphcnn_input.NUM_CHANNELS]) 66 | labels = tf.placeholder(tf.int32, [graphcnn_input.TRAIN_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 67 | 68 | # inference model. 69 | # logits = graphcnn_model.inference_GPU(data) 70 | logits = graphcnn_model.inference(data) 71 | # logits = graphcnn_model.inference_CPU(data,dependencies_loss=False) 72 | 73 | # Calculate loss. 74 | loss = graphcnn_model.loss(logits, labels) 75 | 76 | # updates the model parameters. 77 | train_op = graphcnn_model.train(loss, global_step) 78 | 79 | # Create a saver. 80 | saver = tf.train.Saver(var_list=tf.global_variables(), 81 | max_to_keep=6, 82 | keep_checkpoint_every_n_hours=10) 83 | 84 | if graphcnn_option.SUMMARYWRITER: 85 | # Build the summary operation based on the TF collection of Summaries. 86 | summary_op = tf.merge_all_summaries() 87 | 88 | # Build an initialization operation to run below. 89 | init = tf.global_variables_initializer() 90 | 91 | # Start running operations on the Graph. allow_soft_placement must be set to 92 | # True to build towers on GPU, as some of the ops do not have GPU implementations. 
93 | sess = tf.Session(config=tf.ConfigProto( 94 | allow_soft_placement=True, 95 | log_device_placement=FLAGS.log_device_placement)) 96 | 97 | first_step = 0 98 | if not newTrain: 99 | if checkpoint == '0': # choose the latest one 100 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 101 | if ckpt and ckpt.model_checkpoint_path: 102 | new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 103 | # Restores from checkpoint 104 | new_saver.restore(sess, ckpt.model_checkpoint_path) 105 | global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] 106 | first_step = int(global_step_for_restore) + 1 107 | else: 108 | print('No checkpoint file found') 109 | return 110 | else: # 111 | if os.path.exists(os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)): 112 | new_saver = tf.train.import_meta_graph( 113 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint + '.meta')) 114 | new_saver.restore(sess, 115 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)) 116 | first_step = int(checkpoint) + 1 117 | else: 118 | print('No checkpoint file found') 119 | return 120 | else: 121 | sess.run(init) 122 | 123 | if graphcnn_option.SUMMARYWRITER: 124 | summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) 125 | 126 | filename_train_log = os.path.join(FLAGS.train_dir, 'log_train') 127 | if os.path.exists(filename_train_log): 128 | file_train_log = open(filename_train_log, 'a') 129 | else: 130 | file_train_log = open(filename_train_log, 'w') 131 | 132 | # learning_rate = graphcnn_option.lr_decay_value[0] # 0.1(5), 0.01(100), 0.001(500), 0.0001(300), 0.00001(100) 133 | # learning_rate_index = 0 134 | for step in range(first_step,MAX_STEPS): 135 | # if learning_rate_index < len(graphcnn_option.lr_decay_value) - 1: 136 | # if step > STEPS_PER_ECOPH * graphcnn_option.lr_decay_ecophs[learning_rate_index]: 137 | # learning_rate_index = learning_rate_index + 1 138 | # learning_rate = graphcnn_option.lr_decay_value[learning_rate_index] 139 | 140 | train_data, train_label = trainDataSet.next_batch(graphcnn_input.TRAIN_BATCH_SIZE) 141 | start_time = time.time() 142 | _, loss_value = sess.run([train_op, loss], 143 | feed_dict= {data:train_data, labels:train_label}) 144 | duration = time.time() - start_time 145 | 146 | assert not np.isnan(loss_value), 'Model diverged with loss = NaN' 147 | 148 | if step % 10 == 0: 149 | sec_per_batch = float(duration) 150 | format_str = ('%s: step=%d, loss=%.4f; %.3f sec/batch)') 151 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch), file=file_train_log) 152 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch)) 153 | 154 | if graphcnn_option.SUMMARYWRITER: 155 | if step % 100 == 0: 156 | summary_str = sess.run(summary_op, 157 | feed_dict= {data:train_data, labels:train_label}) 158 | summary_writer.add_summary(summary_str, step) 159 | 160 | # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta') 161 | if step % CKPT_PERIOD == 0 or (step + 1) == MAX_STEPS: 162 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 163 | saver.save(sess, checkpoint_path, global_step=step) 164 | file_train_log.close() 165 | 166 | def main(argv=None): 167 | global trainDataSet, STEPS_PER_ECOPH, MAX_STEPS, CKPT_PERIOD 168 | newTrain = True 169 | checkpoint = 0 170 | # assert not tf.gfile.Exists(FLAGS.train_dir), 'please move the old train directory to pre_versions!' 
171 | if tf.gfile.Exists(FLAGS.train_dir): 172 | ans = input('whether to open up a new training:(y/n)') 173 | if ans == 'y' or ans == 'Y': 174 | newTrain = True 175 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 176 | elif ans == 'n' or ans == 'N': 177 | newTrain = False 178 | checkpoint = input('please input the choosed checkpoint to restore:(0 for latest)') 179 | else: 180 | print('invalid input!') 181 | return 182 | if newTrain: 183 | tf.gfile.MakeDirs(FLAGS.train_dir) 184 | 185 | # update paras 186 | trainDataSet = graphcnn_input.generate_train_data(graphcnn_option.TRAIN_DATA_DIR, 187 | ont_hot=True,index_mode=True) 188 | 189 | # max_steps for train: 190 | STEPS_PER_ECOPH = math.ceil( 191 | graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / float(graphcnn_input.TRAIN_BATCH_SIZE)) 192 | MAX_STEPS = FLAGS.max_epochs * STEPS_PER_ECOPH 193 | 194 | # the period to save the model checkpoint. 195 | CKPT_PERIOD = graphcnn_option.CKPT_PERIOD # ????????????????????? 196 | # CKPT_PERIOD = 5000 197 | # tem = str(STEPS_PER_ECOPH * 20) # save the model every ecoph # 5 198 | # CKPT_PERIOD = int(int(tem[0]) * pow(10, len(tem) - 1)) 199 | 200 | print('training...') 201 | train(newTrain,checkpoint) 202 | 203 | 204 | if __name__ == '__main__': 205 | tf.app.run() 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /GraphCNN/utils/grouping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/grouping.py -------------------------------------------------------------------------------- /GraphCNN/utils/hier_rootlist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/hier_rootlist -------------------------------------------------------------------------------- /GraphCNN/utils/hier_rootstr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/hier_rootstr -------------------------------------------------------------------------------- /GraphCNN/utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /GraphCNN/utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /GraphCNN/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /HAN/model/IMDB/bestmodel/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/HAN/model/IMDB/bestmodel/.gitkeep -------------------------------------------------------------------------------- /HAN/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) #[-1]是加在最前面 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # here i need more label. 
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | self.docs = [] 63 | self.label = [] 64 | self.wordmask = [] 65 | self.sentencemask = [] 66 | self.maxsentencenum = [] 67 | 68 | for i in xrange(self.epoch): 69 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | self.docs.append(docsbatch) 72 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | self.wordmask.append(genwordmask(docsbatch)) 74 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | # self.docs = [] 76 | # self.label = [] 77 | # self.length = [] 78 | # self.sentencenum = [] 79 | # self.wordmask = [] 80 | # self.sentencemask = [] 81 | # self.maxsentencenum = [] 82 | 83 | # for i in xrange(self.epoch): 84 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | # self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | # self.docs.append(docsbatch) 88 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | # self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | # self.wordmask.append(genwordmask(docsbatch)) 91 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | class Wordlist(object): 94 | def __init__(self, filename, maxn = 100000): 95 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 96 | self.size = len(lines) 97 | 98 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 99 | self.voc = dict(self.voc) 100 | 101 | def getID(self, word): 102 | try: 103 | return self.voc[word] 104 | except: 105 | return -1 106 | 107 | -------------------------------------------------------------------------------- /HAN/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HAN/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HAN/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HAN/src/LSTMModel.py: 
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | from datetime import datetime 3 | from EmbLayer import EmbLayer 4 | from LSTMLayer import LSTMLayer 5 | from HiddenLayer import HiddenLayer 6 | from PoolLayer import * 7 | from SentenceSortLayer import * 8 | import theano 9 | import theano.tensor as T 10 | import numpy 11 | import random 12 | import sys 13 | import time 14 | from Update import AdaUpdates 15 | 16 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 17 | 18 | class LSTMModel(object): 19 | def __init__(self, n_voc, trainset, testset, dataname, classes, prefix): 20 | if prefix != None: 21 | prefix += '/' 22 | self.trainset = trainset 23 | self.testset = testset 24 | self.classes = int(classes) 25 | 26 | docs = T.imatrix() 27 | label = T.imatrix() 28 | wordmask = T.fmatrix() 29 | sentencemask = T.fmatrix() 30 | maxsentencenum = T.iscalar() 31 | isTrain = T.iscalar() 32 | 33 | rng = numpy.random 34 | 35 | layers = [] 36 | layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 37 | layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 38 | layers.append(SimpleAttentionLayer(rng, layers[-1].output, wordmask,50, 50, 'wordattentionlayer', prefix)) 39 | layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum,prefix)) 40 | layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 41 | layers.append(SimpleAttentionLayer(rng, layers[-1].output, sentencemask,50, 50, 'sentenceattentionlayer', prefix)) 42 | layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 43 | layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 44 | self.layers = layers 45 | 46 | predict = layers[-1].output 47 | cost = T.nnet.binary_crossentropy(layers[-1].output, label).sum(1) 48 | cost = cost.mean() 49 | # modifu corrrect. 
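# NOTE: the commented-out block below is a NumPy draft of the multi-label metrics
# (exact-match ratio, precision/recall, F1, Hamming loss, Micro-/Macro-F1).  It would
# not run here, where `label` and the layer outputs are still symbolic Theano variables;
# the working NumPy computation of Micro-F1 and Macro-F1 is in test() further down.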
50 | # predicted_value = ((layers[-1].output) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 51 | # predicted_value = predicted_value.astype(bool) 52 | # true_value = label.astype(bool) 53 | # equal = true_value == predicted_value 54 | # match = np.sum(equal, axis=1) == np.size(equal, axis=1) 55 | # # value 1 match_ratio 56 | # exact_match_ratio = np.sum(match) / np.size(match) 57 | # true_and_predict = np.sum(true_value & predicted_value, axis=1) 58 | # true_or_predict = np.sum(true_value | predicted_value, axis=1) 59 | # # value 2 accuracy 60 | # accuracy = np.mean(true_and_predict / true_or_predict) 61 | # # value 3 pression 62 | # precison = np.mean(true_and_predict / (np.sum(predicted_value, axis=1) + 1e-9)) 63 | # # recall 4 recall 64 | # recall = np.mean(true_and_predict / np.sum(true_value, axis=1)) 65 | # # f1_Measure 66 | # F1_Measure = np.mean((true_and_predict * 2) / (np.sum(true_value, axis=1) + np.sum(predicted_value, axis=1))) 67 | # # HammingLoss 68 | # HammingLoss = np.mean(true_value ^ total_predicted_value) 69 | # TP 70 | # TP = np.sum(true_value & predicted_value,axis=0,dtype=np.int32) 71 | # FP = np.sum((~true_value) & predicted_value,axis=0,dtype=np.int32) 72 | # FN = np.sum(true_value & (~predicted_value),axis=0,dtype=np.int32) 73 | # _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 74 | # _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 75 | # Micro_F1 = (2 * _P *_R) / (_P + _R) 76 | # _P_t = TP / (TP + FP + 1e-9) 77 | # _R_t = TP / (TP + FN + 1e-9) 78 | # Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 79 | #cost = -T.mean(T.log(layers[-1].output)[T.arange(label.shape[0]), label], acc_dtype='float32') 80 | #modify this 81 | #correct = T.sum(T.eq(T.argmax(layers[-1].output, axis=1), label), acc_dtype='int32') 82 | #err = T.argmax(layers[-1].output, axis=1) - label 83 | #mse = T.sum(err * err) 84 | 85 | params = [] 86 | for layer in layers: 87 | params += layer.params 88 | L2_rate = numpy.float32(1e-5) 89 | for param in params[1:]: 90 | cost += T.sum(L2_rate * (param * param), acc_dtype='float32') 91 | gparams = [T.grad(cost, param) for param in params] 92 | 93 | updates = AdaUpdates(params, gparams, 0.95, 1e-6) 94 | 95 | self.train_model = theano.function( 96 | inputs=[docs, label,wordmask,sentencemask,maxsentencenum], 97 | outputs=cost, 98 | updates=updates, 99 | ) 100 | 101 | self.test_model = theano.function( 102 | inputs=[docs,wordmask,sentencemask,maxsentencenum], 103 | outputs=predict, 104 | ) 105 | 106 | def train(self, iters): 107 | lst = numpy.random.randint(self.trainset.epoch, size = iters) 108 | n = 0 109 | for i in lst: 110 | n += 1 111 | out = self.train_model(self.trainset.docs[i], self.trainset.label[i],self.trainset.wordmask[i],self.trainset.sentencemask[i],self.trainset.maxsentencenum[i]) 112 | print n, 'cost:', out, 'time', datetime.now() 113 | 114 | def test(self): 115 | file_eval = open('evallog.txt','a') 116 | old = sys.stdout 117 | sys.stdout = file_eval 118 | print 'time start:', datetime.now() 119 | sys.stdout = old 120 | total_predicted_value = numpy.zeros([1, self.classes], dtype=numpy.float32) ## 121 | total_true_value = numpy.zeros([1, self.classes], dtype=numpy.int32) 122 | for i in xrange(self.testset.epoch): 123 | predicted_value = self.test_model(self.testset.docs[i],self.testset.wordmask[i],self.testset.sentencemask[i],self.testset.maxsentencenum[i]) 124 | total_predicted_value = numpy.concatenate((total_predicted_value, predicted_value), axis=0) 125 | total_true_value = numpy.concatenate((total_true_value, 
self.testset.label[i]), axis=0) 126 | total_predicted_value = total_predicted_value[1:] 127 | total_true_value = total_true_value[1:] 128 | assert len(total_true_value) == len(total_predicted_value), 'shape error' 129 | total_predicted_value = ((total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 130 | total_predicted_value = total_predicted_value.astype(bool) 131 | total_true_value = total_true_value.astype(bool) 132 | TP = numpy.sum(total_true_value & total_predicted_value,axis=0,dtype=numpy.int32) 133 | FP = numpy.sum((~total_true_value) & total_predicted_value,axis=0,dtype=numpy.int32) 134 | FN = numpy.sum(total_true_value & (~total_predicted_value),axis=0,dtype=numpy.int32) 135 | _P = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FP) + 1e-9 ) 136 | _R = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FN) + 1e-9 ) 137 | Micro_F1 = (2 * _P *_R) / (_P + _R + 1e-9) 138 | _P_t = TP / (TP + FP + 1e-9) 139 | _R_t = TP / (TP + FN + 1e-9) 140 | print 'TP',TP,'FP',FP,'FN',FN 141 | Macro_F1 = numpy.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 142 | print('Micro-F1 = %.4f' % Micro_F1) 143 | print('Macro-F1 = %.4f' % Macro_F1) 144 | old = sys.stdout 145 | sys.stdout = file_eval 146 | print 'time end:', datetime.now() 147 | print 'TP',TP,'FP',FP,'FN',FN 148 | print('Micro-F1 = %.4f' % Micro_F1) 149 | print('Macro-F1 = %.4f' % Macro_F1) 150 | sys.stdout = old 151 | file_eval.close() 152 | return Micro_F1, Macro_F1 153 | 154 | 155 | def save(self, prefix): 156 | prefix += '/' 157 | for layer in self.layers: 158 | layer.save(prefix) 159 | -------------------------------------------------------------------------------- /HAN/src/PoolLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. 
/ (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HAN/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum,prefix): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HAN/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + 
deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HAN/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HAN/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
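# ----------------------------------------------------------------------------
# Minimal standalone sketch (not part of the original script) of the thresholded
# Micro-/Macro-F1 computation that LSTMModel.test() performs; the 0.5 threshold
# mirrors EVALUTION_THRESHOLD_FOR_MULTI_LABEL and the toy arrays are illustrative.
import numpy as np

def micro_macro_f1(probs, labels, threshold=0.5):
    """probs: (N, C) sigmoid scores; labels: (N, C) 0/1 ground truth."""
    pred = probs >= threshold
    true = labels.astype(bool)
    eps = 1e-9
    TP = np.sum(true & pred, axis=0)                    # per-class true positives
    FP = np.sum(~true & pred, axis=0)                   # per-class false positives
    FN = np.sum(true & ~pred, axis=0)                   # per-class false negatives
    P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + eps)    # micro precision
    R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + eps)    # micro recall
    micro_f1 = 2 * P * R / (P + R + eps)
    P_t = TP / (TP + FP + eps)                          # per-class precision
    R_t = TP / (TP + FN + eps)                          # per-class recall
    macro_f1 = np.mean(2 * P_t * R_t / (P_t + R_t + eps))
    return micro_f1, macro_f1

# toy check: 3 documents, 4 labels, predictions identical to the ground truth
probs = np.array([[0.9, 0.2, 0.7, 0.1],
                  [0.4, 0.8, 0.6, 0.3],
                  [0.1, 0.1, 0.9, 0.7]])
labels = np.array([[1, 0, 1, 0],
                   [0, 1, 1, 0],
                   [0, 0, 1, 1]])
print(micro_macro_f1(probs, labels))                    # both values are ~1.0
# ----------------------------------------------------------------------------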
33 | 34 | -------------------------------------------------------------------------------- /HLSTM/model/IMDB/bestmodel/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/HLSTM/model/IMDB/bestmodel/.gitkeep -------------------------------------------------------------------------------- /HLSTM/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) # the [-1] is added at the very front 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # multi-label change needed here: in the original single-label loader
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | # self.docs = [] 63 | # self.label = [] 64 | # self.wordmask = [] 65 | # self.sentencemask = [] 66 | # self.maxsentencenum = [] 67 | 68 | # for i in xrange(self.epoch): 69 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | # self.docs.append(docsbatch) 72 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | # self.wordmask.append(genwordmask(docsbatch)) 74 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | self.docs = [] 76 | self.label = [] 77 | self.length = [] 78 | self.sentencenum = [] 79 | self.wordmask = [] 80 | self.sentencemask = [] 81 | self.maxsentencenum = [] 82 | 83 | for i in xrange(self.epoch): 84 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | self.docs.append(docsbatch) 88 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | self.wordmask.append(genwordmask(docsbatch)) 91 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | 94 | class Wordlist(object): 95 | def __init__(self, filename, maxn = 100000): 96 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 97 | self.size = len(lines) 98 | 99 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 100 | self.voc = dict(self.voc) 101 | 102 | def getID(self, word): 103 | try: 104 | return self.voc[word] 105 | except: 106 | return -1 107 | 108 | -------------------------------------------------------------------------------- /HLSTM/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HLSTM/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMModel.py: 
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | from datetime import datetime 3 | from EmbLayer import EmbLayer 4 | from LSTMLayer import LSTMLayer 5 | from HiddenLayer import HiddenLayer 6 | from PoolLayer import * 7 | from SentenceSortLayer import * 8 | import theano 9 | import theano.tensor as T 10 | import numpy 11 | import random 12 | import sys 13 | import time 14 | from Update import AdaUpdates 15 | 16 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 17 | 18 | class LSTMModel(object): 19 | def __init__(self, n_voc, trainset, testset, dataname, classes, prefix): 20 | if prefix != None: 21 | prefix += '/' 22 | self.trainset = trainset 23 | self.testset = testset 24 | self.classes = int(classes) 25 | 26 | docs = T.imatrix() 27 | label = T.imatrix() 28 | length = T.fvector() 29 | wordmask = T.fmatrix() 30 | sentencemask = T.fmatrix() 31 | maxsentencenum = T.iscalar() 32 | sentencenum = T.fvector() 33 | isTrain = T.iscalar() 34 | 35 | rng = numpy.random 36 | 37 | # layers = [] 38 | # layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 39 | # layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 40 | # layers.append(SimpleAttentionLayer(rng, layers[-1].output, wordmask,50, 50, 'wordattentionlayer', prefix)) 41 | # layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum,prefix)) 42 | # layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 43 | # layers.append(SimpleAttentionLayer(rng, layers[-1].output, sentencemask,50, 50, 'sentenceattentionlayer', prefix)) 44 | # layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 45 | # layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 46 | # self.layers = layers 47 | layers = [] 48 | layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 49 | layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 50 | layers.append(MeanPoolLayer(layers[-1].output, length)) 51 | layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum)) 52 | layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 53 | layers.append(MeanPoolLayer(layers[-1].output, sentencenum)) 54 | layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 55 | layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 56 | self.layers = layers 57 | 58 | predict = layers[-1].output 59 | cost = T.nnet.binary_crossentropy(layers[-1].output, label).sum(1) 60 | cost = cost.mean() 61 | # modifu corrrect. 
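# NOTE: this file mirrors HAN/src/LSTMModel.py.  The difference is in the active
# `layers` list above: the attention layers are commented out and MeanPoolLayer is
# used instead, averaging over word length and sentence count.  As in the HAN
# version, the commented-out block below is a non-runnable NumPy draft of the
# multi-label metrics; the executable Micro-/Macro-F1 computation is in test().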
62 | # predicted_value = ((layers[-1].output) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 63 | # predicted_value = predicted_value.astype(bool) 64 | # true_value = label.astype(bool) 65 | # equal = true_value == predicted_value 66 | # match = np.sum(equal, axis=1) == np.size(equal, axis=1) 67 | # # value 1 match_ratio 68 | # exact_match_ratio = np.sum(match) / np.size(match) 69 | # true_and_predict = np.sum(true_value & predicted_value, axis=1) 70 | # true_or_predict = np.sum(true_value | predicted_value, axis=1) 71 | # # value 2 accuracy 72 | # accuracy = np.mean(true_and_predict / true_or_predict) 73 | # # value 3 pression 74 | # precison = np.mean(true_and_predict / (np.sum(predicted_value, axis=1) + 1e-9)) 75 | # # recall 4 recall 76 | # recall = np.mean(true_and_predict / np.sum(true_value, axis=1)) 77 | # # f1_Measure 78 | # F1_Measure = np.mean((true_and_predict * 2) / (np.sum(true_value, axis=1) + np.sum(predicted_value, axis=1))) 79 | # # HammingLoss 80 | # HammingLoss = np.mean(true_value ^ total_predicted_value) 81 | # TP 82 | # TP = np.sum(true_value & predicted_value,axis=0,dtype=np.int32) 83 | # FP = np.sum((~true_value) & predicted_value,axis=0,dtype=np.int32) 84 | # FN = np.sum(true_value & (~predicted_value),axis=0,dtype=np.int32) 85 | # _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 86 | # _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 87 | # Micro_F1 = (2 * _P *_R) / (_P + _R) 88 | # _P_t = TP / (TP + FP + 1e-9) 89 | # _R_t = TP / (TP + FN + 1e-9) 90 | # Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 91 | #cost = -T.mean(T.log(layers[-1].output)[T.arange(label.shape[0]), label], acc_dtype='float32') 92 | #modify this 93 | #correct = T.sum(T.eq(T.argmax(layers[-1].output, axis=1), label), acc_dtype='int32') 94 | #err = T.argmax(layers[-1].output, axis=1) - label 95 | #mse = T.sum(err * err) 96 | 97 | params = [] 98 | for layer in layers: 99 | params += layer.params 100 | L2_rate = numpy.float32(1e-5) 101 | for param in params[1:]: 102 | cost += T.sum(L2_rate * (param * param), acc_dtype='float32') 103 | gparams = [T.grad(cost, param) for param in params] 104 | 105 | updates = AdaUpdates(params, gparams, 0.95, 1e-6) 106 | 107 | self.train_model = theano.function( 108 | inputs=[docs, label,length,sentencenum,wordmask,sentencemask,maxsentencenum], 109 | outputs=cost, 110 | updates=updates, 111 | ) 112 | 113 | self.test_model = theano.function( 114 | inputs=[docs,length,sentencenum,wordmask,sentencemask,maxsentencenum], 115 | outputs=predict, 116 | ) 117 | 118 | def train(self, iters): 119 | lst = numpy.random.randint(self.trainset.epoch, size = iters) 120 | n = 0 121 | for i in lst: 122 | n += 1 123 | out = self.train_model(self.trainset.docs[i], self.trainset.label[i], self.trainset.length[i],self.trainset.sentencenum[i],self.trainset.wordmask[i],self.trainset.sentencemask[i],self.trainset.maxsentencenum[i]) 124 | print n, 'cost:', out, 'time', datetime.now() 125 | 126 | def test(self): 127 | file_eval = open('evallog.txt','a') 128 | old = sys.stdout 129 | sys.stdout = file_eval 130 | print 'time start:', datetime.now() 131 | sys.stdout = old 132 | total_predicted_value = numpy.zeros([1, self.classes], dtype=numpy.float32) ## 133 | total_true_value = numpy.zeros([1, self.classes], dtype=numpy.int32) 134 | for i in xrange(self.testset.epoch): 135 | predicted_value = self.test_model(self.testset.docs[i],self.testset.length[i], self.testset.sentencenum[i], self.testset.wordmask[i],self.testset.sentencemask[i],self.testset.maxsentencenum[i]) 136 
| total_predicted_value = numpy.concatenate((total_predicted_value, predicted_value), axis=0) 137 | total_true_value = numpy.concatenate((total_true_value, self.testset.label[i]), axis=0) 138 | total_predicted_value = total_predicted_value[1:] 139 | total_true_value = total_true_value[1:] 140 | assert len(total_true_value) == len(total_predicted_value), 'shape error' 141 | total_predicted_value = ((total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 142 | total_predicted_value = total_predicted_value.astype(bool) 143 | total_true_value = total_true_value.astype(bool) 144 | TP = numpy.sum(total_true_value & total_predicted_value,axis=0,dtype=numpy.int32) 145 | FP = numpy.sum((~total_true_value) & total_predicted_value,axis=0,dtype=numpy.int32) 146 | FN = numpy.sum(total_true_value & (~total_predicted_value),axis=0,dtype=numpy.int32) 147 | _P = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FP) + 1e-9 ) 148 | _R = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FN) + 1e-9 ) 149 | Micro_F1 = (2 * _P *_R) / (_P + _R + 1e-9) 150 | _P_t = TP / (TP + FP + 1e-9) 151 | _R_t = TP / (TP + FN + 1e-9) 152 | print 'TP',TP,'FP',FP,'FN',FN 153 | Macro_F1 = numpy.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 154 | print('Micro-F1 = %.4f' % Micro_F1) 155 | print('Macro-F1 = %.4f' % Macro_F1) 156 | old = sys.stdout 157 | sys.stdout = file_eval 158 | print 'time end:', datetime.now() 159 | print 'TP',TP,'FP',FP,'FN',FN 160 | print('Micro-F1 = %.4f' % Micro_F1) 161 | print('Macro-F1 = %.4f' % Macro_F1) 162 | sys.stdout = old 163 | file_eval.close() 164 | return Micro_F1, Macro_F1 165 | 166 | 167 | def save(self, prefix): 168 | prefix += '/' 169 | for layer in self.layers: 170 | layer.save(prefix) 171 | -------------------------------------------------------------------------------- /HLSTM/src/PoolLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. 
/ (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HLSTM/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HLSTM/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + 
deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HLSTM/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HLSTM/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
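# ----------------------------------------------------------------------------
# Usage sketch (assumed invocation; the class count is a placeholder):
#
#     python train.py IMDB <num_classes>    # train; the checkpoint with the best
#                                           # dev-set Micro-F1 is kept as bestmodel
#     python test.py  IMDB <num_classes>    # evaluate ../model/IMDB/bestmodel
#
# Expected layout: ../data/<dataname>/{wordlist.txt, train.txt, dev.txt, test.txt,
# embinit.save} and ../model/<dataname>/bestmodel/.  Note that test.py above calls
# Dataset('../data/'+dataname+'/test.txt', voc) without the `classes` argument that
# this multi-label Dataset.__init__ requires, so that call needs a third argument
# before evaluation will run.
# ----------------------------------------------------------------------------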
33 | 34 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_graphs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import nltk 4 | import string 5 | import re 6 | import os 7 | from nltk.corpus import wordnet as wn 8 | import sys 9 | import collections 10 | from nltk.stem.lancaster import LancasterStemmer 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import numpy as np 14 | import gensim 15 | import codecs 16 | import h5py 17 | import json 18 | from multiprocessing import Pool 19 | import xml.etree.ElementTree as ET 20 | 21 | reload(sys) 22 | sys.setdefaultencoding('utf-8') 23 | 24 | PATH = os.path.dirname(os.path.realpath(__file__)) 25 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', '”', '“', '’', "‘", 26 | "'", '"'] 27 | wordEngStop = nltk.corpus.stopwords.words('english') 28 | st = LancasterStemmer() 29 | lemmatizer = WordNetLemmatizer() 30 | 31 | count = 1; 32 | 33 | w_idnex,wdata = None,None 34 | 35 | classes = None 36 | 37 | def count_words(s): 38 | global english_punctuations, wordEngStop, st 39 | tokenstr = [] 40 | result = {} 41 | 42 | mtext = ' '.join(s) 43 | mtext = mtext.lower().strip().decode(errors="ignore") 44 | mtext = re.sub(r'-', r' ', mtext) 45 | mtext = re.sub(r'([0-9]+),([0-9]+)', r'\1\2', mtext) 46 | mtext = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", mtext) 47 | mtext = re.sub(r"\'s", " \'s", mtext) 48 | mtext = re.sub(r"\'ve", " \'ve", mtext) 49 | mtext = re.sub(r"n\'t", " n\'t", mtext) 50 | mtext = re.sub(r"\'re", " \'re", mtext) 51 | mtext = re.sub(r"\'d", " \'d", mtext) 52 | mtext = re.sub(r"\'ll", " \'ll", mtext) 53 | mtext = re.sub(r",", " , ", mtext) 54 | mtext = re.sub(r"!", " ! ", mtext) 55 | mtext = re.sub(r"\(", " \( ", mtext) 56 | mtext = re.sub(r"\)", " \) ", mtext) 57 | mtext = re.sub(r"\?", " \? 
", mtext) 58 | mtext = re.sub(r"\s{2,}", " ", mtext) 59 | 60 | finalwords = [] 61 | words = WordPunctTokenizer().tokenize(mtext) 62 | for word in words: 63 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 64 | orig_stem = lemmatizer.lemmatize(word) 65 | tokenstr.append(orig_stem) 66 | result[orig_stem] = result.get(orig_stem, 0) + 1 67 | ''' 68 | # 字母最小化,分词,定义英文过滤词和停用词,然后填充单词表和计算单词频率 69 | s = s.lower() 70 | tokens = nltk.word_tokenize(s) 71 | 72 | for word in tokens: 73 | if not word in english_punctuations and not word in wordEngStop: 74 | orig_stem = word 75 | tokenstr.append(orig_stem) 76 | result[orig_stem] = result.get(orig_stem, 0) + 1 77 | ''' 78 | # sort 79 | result = collections.OrderedDict(sorted(result.items(), key=lambda x: (x[1], x[0]), reverse=True)) 80 | wordslist = result.keys() 81 | assert len(set(tokenstr)) == len(wordslist) 82 | # 不重复的单词按照出现次数降序排列的list,第二个是按照出现顺序排列的单词词组 83 | return (wordslist, tokenstr) 84 | 85 | 86 | # dfs填充 87 | def fill_table(TD_list, related_tables,target_width, qqueue): 88 | TD_list[0] = qqueue[0] 89 | count = 1 90 | # 当前单词的邻接单词list 91 | while qqueue != [] and count < target_width: 92 | use_index = qqueue[0] 93 | del qqueue[0] 94 | use_list = related_tables[use_index] 95 | len1 = len(use_list) 96 | len2 = target_width - count 97 | if len1 >= len2: 98 | TD_list[count:] = use_list[:len2] 99 | assert len(TD_list) == target_width 100 | count = target_width 101 | break 102 | else: 103 | TD_list[count:count + len1] = use_list 104 | assert len(TD_list) == target_width 105 | count += len1 106 | for next_id in use_list: 107 | qqueue.append(next_id) 108 | for i in range(count, target_width): 109 | TD_list[i] = -1 110 | 111 | 112 | def test_text2matrix(_str, sliding_win=3, target_width=5): 113 | (wordslist, tokenwords) = count_words(_str) 114 | wlist = list(wordslist) 115 | wordslist_length = len(wlist) 116 | if target_width > wordslist_length: 117 | raise ValueError("图矩阵宽度大于词种类数量") 118 | # 统计词频 119 | AM_table = [[0 for i in range(wordslist_length)] for j in range(wordslist_length)] 120 | for num in range(0, len(tokenwords) - sliding_win + 1): 121 | AM_table[wlist.index(tokenwords[num])][wlist.index(tokenwords[num + 1])] += 1 122 | AM_table[wlist.index(tokenwords[num])][wlist.index(tokenwords[num + 2])] += 1 123 | AM_table[wlist.index(tokenwords[num + 1])][wlist.index(tokenwords[num + 2])] += 1 124 | AM_table[wlist.index(tokenwords[num + 1])][wlist.index(tokenwords[num])] += 1 125 | AM_table[wlist.index(tokenwords[num + 2])][wlist.index(tokenwords[num])] += 1 126 | AM_table[wlist.index(tokenwords[num + 2])][wlist.index(tokenwords[num + 1])] += 1 127 | # 关联矩阵:每个单词关联的单词降序排列 128 | related_tables = {} 129 | for i in range(wordslist_length): 130 | related_tables[i] = [[index, num] for index, num in enumerate(AM_table[i]) if num > 0 and index != i] 131 | related_tables[i].sort(key=lambda x: x[1], reverse=True) 132 | related_tables[i] = [element[0] for element in related_tables[i]] 133 | TD_table = [[0 for i in range(target_width)] for j in range(wordslist_length)] 134 | # 第一个单词是它本身 135 | for i in range(wordslist_length): 136 | fill_table(TD_table[i], related_tables,target_width, [i]) 137 | 138 | return wordslist, TD_table 139 | 140 | 141 | def matrix_vector(wordslist, TD_table, target_width, word_vector_size): 142 | global wdata,w_idnex 143 | wlist = list(wordslist) 144 | TTD_table = np.zeros((word_vector_size, len(wlist), target_width), dtype=np.float32) 145 | 146 | for num_i in range(len(wlist)): 147 | for num_j 
in range(target_width): 148 | if TD_table[num_i][num_j] > -1: 149 | try: 150 | aword = wlist[TD_table[num_i][num_j]] 151 | wind = w_idnex[aword] 152 | c_wordvector = wdata[wind] 153 | # c_wordvector = word2vec_model[wlist[TD_table[num_i][num_j]]] 154 | # TTD_table[:, num_i, num_j] = c_wordvector 155 | except: #总共29027个词,只有21790有向量 156 | 157 | aword = wlist[TD_table[num_i][num_j]] 158 | print aword 159 | c_wordvector = np.zeros((word_vector_size), dtype=np.float32) 160 | else: 161 | c_wordvector = np.zeros((word_vector_size), dtype=np.float32) 162 | TTD_table[:, num_i, num_j] = c_wordvector 163 | return (TTD_table) 164 | 165 | 166 | def process(path,start,end,slise_window, target_width, word_vector_size, words_limit,class_nums): 167 | _X = None 168 | _y = None 169 | flag = 0 170 | 171 | tfpath = path 172 | for i in range(start,end): 173 | one_hot_codes = np.zeros(class_nums) 174 | p = "{0}newsML.xml".format(i) 175 | fff = os.path.join(tfpath,p) 176 | if not os.path.exists(fff): 177 | continue 178 | xmlcont = ET.parse(fff) 179 | root = xmlcont.getroot() 180 | haha = [] 181 | for neighbor in root.iter('title'): 182 | haha.append(neighbor.text) 183 | for neighbor in root.iter('headline'): 184 | haha.append(neighbor.text) 185 | for neighbor in root.iter('p'): 186 | haha.append(neighbor.text) 187 | 188 | topics = [] 189 | for neighbor in root.iter('codes'): 190 | tclass = list(neighbor.attrib.values()) 191 | # print(tclass) 192 | for lst in tclass: 193 | if 'topics' in lst: 194 | for nn in neighbor.iter('code'): 195 | topics.append(nn.attrib['code']) 196 | 197 | while None in haha: 198 | haha.remove(None) 199 | a =haha 200 | try: 201 | (wordslist, TD_table) = test_text2matrix(a, slise_window, target_width) 202 | except: 203 | continue 204 | TTD_table = matrix_vector(wordslist, TD_table, target_width, word_vector_size) 205 | shape0, shape1, shape2 = TTD_table.shape 206 | #print(shape0, shape1, shape2) 207 | final_one_TTD = None 208 | if shape1 < words_limit: 209 | final_one_TTD = np.zeros((shape0, words_limit, shape2), dtype=np.float32) 210 | final_one_TTD[:, :shape1, :shape2] = TTD_table 211 | else: 212 | final_one_TTD = TTD_table[:, :words_limit, :shape2] 213 | # print(final_one_TTD.shape) 214 | # print(final_one_TTD[:,20,4]) 215 | final_one_TTD = final_one_TTD.reshape((1, word_vector_size, words_limit, target_width)) 216 | # print(final_one_TTD.shape) 217 | 218 | 219 | for label in topics: 220 | one_hot_codes[classes[label]] = 1.0 221 | _yxx = one_hot_codes 222 | _yxx = _yxx.reshape(1,-1) 223 | # print(_yxx.shape) 224 | 225 | if flag == 0: 226 | _X = final_one_TTD 227 | _y = _yxx 228 | flag = 1 229 | else: 230 | _X = np.concatenate((_X, final_one_TTD), axis=0) 231 | _y = np.concatenate((_y, _yxx), axis=0) 232 | 233 | fpath = os.path.join('/home/LAB/penghao/mars/metadata/test2',"range{0}_{1}.h5".format(start,end)) 234 | print fpath 235 | f = h5py.File(fpath, "w") 236 | f.create_dataset("datax", data=_X) 237 | f.create_dataset("datay", data=_y) 238 | f.close() 239 | 240 | 241 | def haha(start,end,path,slise_window,target_width,word_vector_size,words_limit,class_nums): 242 | opath = os.listdir(path) 243 | opath.sort() 244 | i = 0 245 | for ff in opath[start:end]: 246 | fdirpath = os.path.join(targetpath,ff) 247 | index = i+start 248 | target_path = "c{0}.h5".format(index) 249 | i = i+1 250 | target_path = os.path.join('/home/LAB/penghao/mars/metadata/rcv_h5',target_path) 251 | process(fdirpath,slise_window,target_width,word_vector_size,words_limit,class_nums,target_path) 252 | 253 | if __name__ 
== '__main__': 254 | slise_window = 3 255 | # target width of the graph matrix 256 | target_width = 10 257 | # word vector length 258 | word_vector_size = 50 259 | words_limit = 96 260 | class_nums = 103 261 | 262 | with open(r'/home/LAB/penghao/mars/metadata/classes.json', "r") as f3: 263 | classes = json.load(f3) 264 | 265 | with open(r"/home/LAB/penghao/mars/metadata/words.json", "r") as f3: 266 | w_idnex = json.load(f3) 267 | 268 | h5 = h5py.File(r"/home/LAB/penghao/mars/metadata/matrix_rcv1.h5", 'r') 269 | wdata = h5['data'].value 270 | 271 | raw_path = r'/home/LAB/penghao/mars/xml2' 272 | 273 | 274 | #train 2286-25993 275 | #lnums = [(i*1000,(i+1)*1000) for i in range(3,25)]+[(2286,3000),(25000,25993)] #this is training 276 | 277 | 278 | #test 25993-810597 279 | #test1: 25993-280000 280 | #lnums = [(30000+i*10000,30000+(i+1)*10000) for i in range(25)]+[(25993,30000)] 281 | #test2: 280000-530000 / this range is off by 2 282 | #lnums = [(280000+i*10000,280000+(i+1)*10000) for i in range(25)]+[(240000,250000)] 283 | #test2.5: part of the test2 data was cut off while being generated 284 | # lnums = [(240000,250000),(330000,340000),(360000,370000),(400000,410000),(410000,420000),(450000,460000),(460000,470000),(470000,480000),(510000,520000)] 285 | 286 | #test3:530000-810597 287 | lnums = [(530000+i*10000,530000+(i+1)*10000) for i in range(28)]+[(810000,810597)] 288 | #lnums = [(3000+i*2,3000+(i+1)*2) for i in range(3,25)] 289 | print(lnums) 290 | p = Pool(30) 291 | results = [] 292 | for i in range(len(lnums)): 293 | start,end = lnums[i] 294 | print("process{0} start. Range({1},{2})".format(i,start,end)) 295 | results.append(p.apply_async(process,args=(raw_path,start,end,slise_window,target_width,word_vector_size,words_limit,class_nums))) 296 | print("process{0} end".format(i)) 297 | p.close() 298 | p.join() 299 | for r in results: 300 | print(r.get()) 301 | 302 | 303 | 304 | 305 | print('Done!!!') 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_heiring.py: -------------------------------------------------------------------------------- 1 | f = open("rcv1.topics.hier.orig.txt",'r') 2 | lines = f.readlines() 3 | nodes = [] 4 | for line in lines: 5 | keys = line.split(' ') 6 | while '' in keys: 7 | keys.remove("") 8 | node = {} 9 | node['parent'] = keys[1] 10 | node['child'] = keys[3] 11 | nodes.append(node) 12 | 13 | f.close() 14 | 15 | relation = {} 16 | for node in nodes: 17 | parent = node['parent'] 18 | child = node['child'] 19 | if parent not in relation: 20 | relation[parent] = [] 21 | relation[parent].append(child) 22 | 23 | 24 | import json 25 | result = [] 26 | with open('classes.json','r') as f: 27 | classes = json.load(f) 28 | for key in relation: 29 | if len(relation[key]) < 2: 30 | continue 31 | new = [] 32 | for index,values in enumerate(relation[key]): 33 | new.append(classes[values]) 34 | result.append(new) 35 | 36 | final = [] 37 | for single in result: 38 | length = len(single) 39 | for i in range(length-1): 40 | for j in range(i+1,length): 41 | temp = [] 42 | temp.append(single[i]) 43 | temp.append(single[j]) 44 | final.append(temp) 45 | for v in final: 46 | print(str(v)) 47 | with open('heiring.json','w') as f: 48 | j = json.dump(final,f) 49 | #print(j) 50 | 51 | 52 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/rcv1_processer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import zipfile 4 | from multiprocessing import Pool 5 | import 
xml.etree.ElementTree as ET 6 | import re 7 | import json 8 | import numpy as np 9 | import gensim 10 | import h5py 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import nltk 14 | 15 | PATH = "/home/penghao/mars/rcv2" 16 | original_path = r'/home/penghao/mars/rcv2/reuters/training' 17 | targetpath = r'/data/LJ/LJ/own/RCV1/target_files' 18 | # targetpath = os.path.join(PATH,"target_files") 19 | all = 0 20 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*','”','“','’',"‘","'",'"'] 21 | wordEngStop = nltk.corpus.stopwords.words('english') 22 | lemmatizer = WordNetLemmatizer() 23 | 24 | def unzip(file,name): 25 | global all 26 | zip_file = zipfile.ZipFile(file) 27 | path = os.path.join(targetpath,name) 28 | print(path) 29 | if not os.path.exists(path): 30 | os.mkdir(path) 31 | for name in zip_file.namelist(): 32 | zip_file.extract(name,path) 33 | all += 1 34 | print(all) 35 | 36 | def zipp(): 37 | flist = os.listdir(original_path) 38 | flist.sort() 39 | for f in flist: 40 | fname = f.split('.')[0] 41 | print(fname) 42 | fpath = os.path.join(original_path,f) 43 | print(fpath) 44 | unzip(fpath,fname) 45 | 46 | def readfile(path): 47 | f = open(path,'r') 48 | s = f.readlines() 49 | 50 | topics = [] 51 | 52 | 53 | 54 | 55 | finalwords = [] 56 | for line in s: 57 | line = line.lower().strip().decode(errors="ignore") 58 | line = re.split('[-_\.:/ \"\'(),.;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', line) 59 | for word in line: 60 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 61 | finalwords.append(word) 62 | 63 | # mtext = re.split('[-_:/ \"\'(),;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', mtext) 64 | 65 | # while "" in mtext: 66 | # mtext.remove("") 67 | # print(mtext) 68 | # print(topics) 69 | #print finalwords 70 | return finalwords,topics 71 | 72 | def haha1(): 73 | # xxxx = 0 74 | all_words = {} 75 | opath = os.listdir('reuters/test') 76 | for ff in opath: 77 | simpath = os.path.join('reuters/test',ff) 78 | mcontent,_ = readfile(simpath) 79 | for word in mcontent: 80 | if word not in all_words.keys(): 81 | all_words[word] = True 82 | pp = os.path.join('data',"test.json") 83 | print(pp) 84 | with open(pp,"w") as fp: 85 | json.dump(all_words, fp) 86 | 87 | def haha2(): 88 | # xxxx = 0 89 | all_words = {} 90 | opath = os.listdir('reuters/training') 91 | for ff in opath: 92 | simpath = os.path.join('reuters/training',ff) 93 | mcontent,_ = readfile(simpath) 94 | for word in mcontent: 95 | if word not in all_words.keys(): 96 | all_words[word] = True 97 | pp = os.path.join('data',"training.json") 98 | print(pp) 99 | with open(pp,"w") as fp: 100 | json.dump(all_words, fp) 101 | 102 | def findwords(): 103 | #lnums = [(i*1000,(i+1)*1000) for i in range(15,21)]+[(14826,15000),(21000,21576)] #test 104 | lnums = [(i*1000,(i+1)*1000) for i in range(0,14)]+[(14000,14818)] 105 | print(lnums) 106 | #lnums = [(0,1)] 107 | #tpath = r'E:\RCV1\words' 108 | tpath = os.path.join(PATH,"data") 109 | p = Pool(30) 110 | results = [] 111 | for i in range(len(lnums)): 112 | start,end = lnums[i] 113 | print("process{0} start. 
Range({1},{2})".format(i,start,end)) 114 | results.append(p.apply_async(haha,args=(start,end,tpath))) 115 | print("process{0} end".format(i)) 116 | p.close() 117 | p.join() 118 | for r in results: 119 | print(r.get()) 120 | 121 | def isnumber(str): 122 | if str.count('.') == 1: 123 | left = str.split('.')[0] 124 | right = str.split('.')[1] 125 | lright = '' 126 | if str.count('-') == 1 and str[0] == '-': 127 | lright = left.split('-')[1] 128 | elif str.count('-') == 0: 129 | lright = left 130 | else: 131 | return False 132 | if right.isdigit() and lright.isdigit(): 133 | return True 134 | else: 135 | return False 136 | elif str.count('.') == 0: 137 | if str[0] == "-": 138 | str2 = str[1:] 139 | else: 140 | str2 = str 141 | if str2.isdigit(): 142 | return True 143 | return False 144 | else: 145 | return False 146 | 147 | def allwords(): 148 | tpath = os.path.join(PATH,"data") 149 | words = {} 150 | ind = 0 151 | flist = os.listdir(tpath) 152 | flist.sort() 153 | for f in flist: 154 | ppath = os.path.join(tpath,f) 155 | with open(ppath, "r") as f1: 156 | simjson = json.load(f1) 157 | for i in simjson.keys(): 158 | if i not in words.keys(): 159 | words[i] = ind 160 | ind += 1 161 | print(len(list(words.keys()))) 162 | #print("1190" in words) 163 | #893198 164 | lens = len(list(words.keys())) 165 | #print(list(words.keys())) 166 | #assert lens == 364830 167 | wembeddingwords = np.random.uniform(-1.0, 1.0, (lens, 50)) 168 | word2vec_model = gensim.models.Word2Vec.load(r'/home/penghao/lj/Google_w2v/wiki.en.text.model') 169 | xx = 0 170 | for key in words.keys(): 171 | # if isnumber(key): 172 | # xx += 1 173 | if key in word2vec_model: 174 | #print(key) 175 | xx += 1 176 | index = words[key] 177 | wembeddingwords[index, :] = word2vec_model[key] 178 | print(xx) 179 | with open(os.path.join(PATH,r"words.json"), "w") as f: 180 | json.dump(words, f) 181 | f = h5py.File(os.path.join(PATH,"matrix_rcv1.h5"), "w") 182 | f.create_dataset("data", data=wembeddingwords) 183 | f.close() 184 | 185 | def classpro(): 186 | tpath = r'/home/user/LJ/own/RCV1/topic_codes.txt' 187 | haha = {} 188 | with open(tpath,"r") as f: 189 | lines = f.readlines() 190 | print(len(lines)) 191 | for index,line in enumerate(lines[2:]): 192 | if line != '\n' and '\t' in line: 193 | haha[line.strip().split('\t')[0]] = index 194 | for k,v in haha.items(): 195 | print(k,v) 196 | print(len(list(haha.keys()))) 197 | with open(r'/home/user/LJ/own/RCV1/classes.json','w') as f: 198 | json.dump(haha,f) 199 | 200 | 201 | if __name__ == "__main__": 202 | findwords() 203 | haha1() 204 | haha2() 205 | allwords() 206 | classpro() 207 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dynamic Routing Between Capsules 3 | https://arxiv.org/abs/1710.09829 4 | 5 | PyTorch implementation by Kenta Iwasaki @ Gram.AI. 
6 | """ 7 | import sys 8 | sys.setrecursionlimit(15000) 9 | 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn 13 | import numpy as np 14 | import os 15 | from torch.nn import DataParallel 16 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 17 | 18 | BATCH_SIZE = 32 19 | NUM_CLASSES = 103 20 | NUM_EPOCHS = 200 21 | GPU = True 22 | load =False 23 | 24 | 25 | class CapsuleNet(nn.Module): 26 | def __init__(self): 27 | super(CapsuleNet, self).__init__() 28 | # 96,10,50 29 | self.conv1 = nn.Conv2d(in_channels=50, out_channels=64, kernel_size=3, stride=1) 30 | # 94 8 64 31 | self.pooling1 = nn.MaxPool2d((2, 1)) 32 | # 47 8 64 33 | self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,1), stride=1) 34 | # 45 8 128 35 | self.pooling2 = nn.MaxPool2d((2, 2)) 36 | # 22 4 128 37 | 38 | self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,1), stride=1) 39 | # 20 4 256 40 | 41 | self.pooling3 = nn.MaxPool2d((2, 2)) 42 | # 10 2 256 43 | self.fc1 = nn.Linear(256*20,1024) 44 | self.fc2 = nn.Linear(1024,512) 45 | self.fc3 = nn.Linear(512,103) 46 | 47 | def forward(self, x): 48 | x = F.relu(self.conv1(x), inplace=True) 49 | x =self.pooling1(x) 50 | x = F.relu(self.conv2(x), inplace=True) 51 | x =self.pooling2(x) 52 | x = F.relu(self.conv3(x), inplace=True) 53 | x =self.pooling3(x) 54 | 55 | x = x.view(x.size(0), -1) 56 | 57 | 58 | x = F.relu( self.fc1(x) ) 59 | x = F.relu( self.fc2(x) ) 60 | classes = self.fc3(x) 61 | classes = F.sigmoid(classes) 62 | return classes 63 | 64 | 65 | class Mymeter(): 66 | def __init__(self,class_num): 67 | self.tp = [0]*class_num 68 | self.fp = [0]*class_num 69 | self.fn = [0]*class_num 70 | self.pre = 0. 71 | self.rec = 0. 72 | self.class_num = class_num 73 | 74 | def process(self,tar,pre): 75 | for t in tar : 76 | if t in pre: 77 | self.tp[t] = self.tp[t]+1 78 | else: 79 | self.fn[t] = self.fn[t]+1 80 | for t in pre : 81 | if t not in tar: 82 | self.fp[t] = self.fp[t]+1 83 | 84 | def reset(self): 85 | self.tp = [0]*self.class_num 86 | self.fp = [0]*self.class_num 87 | self.fn = [0]*self.class_num 88 | self.pre = 0. 89 | self.rec = 0. 90 | 91 | 92 | def micro(self): 93 | if(sum(self.tp)+sum(self.fp))==0: 94 | self.pre =0 95 | else: 96 | self.pre = sum(self.tp)/(sum(self.tp)+sum(self.fp)) 97 | 98 | 99 | if (sum(self.tp)+sum(self.fn))==0: 100 | self.rec =0 101 | else: 102 | self.rec = sum(self.tp)/(sum(self.tp)+sum(self.fn)) 103 | 104 | if self.rec==0 and self.pre==0: 105 | f1 =0 106 | else: 107 | f1 = 2*self.pre*self.rec/(self.pre+self.rec) 108 | return self.pre,self.rec,f1 109 | 110 | def macro(self): 111 | pre = [0.]*self.class_num 112 | recall = [0.]*self.class_num 113 | for i in range(self.class_num): 114 | if (self.tp[i]+self.fp[i]) == 0: 115 | pre[i]==0. 116 | else: 117 | pre[i] = self.tp[i]/(self.tp[i]+self.fp[i]) 118 | 119 | if (self.tp[i]+self.fn[i]) == 0: 120 | recall[i]==0. 121 | else: 122 | recall[i] = self.tp[i]/(self.tp[i]+self.fn[i]) 123 | 124 | ma_pre = sum(pre)/self.class_num 125 | ma_recall =sum(recall)/self.class_num 126 | if ma_pre+ma_recall==0: 127 | ma_f1 = 0. 
128 | else: 129 | 130 | ma_f1 = 2*ma_pre*ma_recall/(ma_pre+ma_recall) 131 | return ma_pre,ma_recall,ma_f1 132 | 133 | if __name__ == "__main__": 134 | from torch.autograd import Variable 135 | from torch.optim import Adam 136 | from torchnet.engine import Engine 137 | from torchvision.utils import make_grid 138 | from torchvision.datasets.mnist import MNIST 139 | from tqdm import tqdm 140 | import torchnet as tnt 141 | import h5py 142 | import os 143 | from collections import OrderedDict 144 | 145 | 146 | 147 | model = CapsuleNet() 148 | engine = Engine() 149 | meter_loss = tnt.meter.AverageValueMeter() 150 | mymeter = Mymeter(NUM_CLASSES) 151 | loss_func = F.binary_cross_entropy 152 | 153 | train_path = '/home/LAB/penghao/mars/metadata/test' 154 | train_dir = os.listdir(train_path) 155 | train_num = len(train_dir) 156 | index = 0 157 | 158 | def get_iterator(mode): 159 | if mode: 160 | train_path = '/home/LAB/penghao/mars/metadata/train' 161 | dir = os.listdir(train_path) 162 | data = None 163 | labels =None 164 | flag = 0 165 | for list in dir: 166 | f = h5py.File(os.path.join(train_path,list)) 167 | datax = f['datax'] 168 | datax = np.array(datax) 169 | datay = f['datay'] 170 | datay = np.array(datay) 171 | datay = datay.astype('float32') 172 | 173 | if not flag: 174 | data = datax 175 | labels = datay 176 | flag = 1 177 | else: 178 | data = np.concatenate((data,datax), axis=0) 179 | labels = np.concatenate((labels,datay),axis=0) 180 | print ('train set loaded') 181 | data = data/18. 182 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 183 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=16, shuffle=mode) 184 | 185 | else: 186 | global train_path,train_dir,index 187 | f = h5py.File(os.path.join(train_path,train_dir[index])) 188 | datax = f['datax'] 189 | datax = np.array(datax) 190 | datay = f['datay'] 191 | datay = np.array(datay) 192 | datay = datay.astype('float32') 193 | 194 | data = datax 195 | labels = datay 196 | data = data/18. 
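            # wrap the rescaled shard in a torchnet TensorDataset; the fixed 1/18 divisor above matches the training branch and appears to be a dataset-specific normalisation constant (assumption)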
197 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 198 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=8, shuffle=mode) 199 | 200 | 201 | def processor(sample): 202 | data, labels, training = sample 203 | 204 | if GPU: 205 | data = Variable(data).cuda() 206 | labels = Variable(labels).cuda() 207 | labels = labels.float() 208 | #temp = [np.where(r == 1.)[0][0] for r in labels] 209 | #temp = torch.LongTensor(temp) 210 | output = model(data) 211 | loss = loss_func(output, labels) 212 | 213 | return loss, output 214 | 215 | 216 | 217 | 218 | 219 | def reset_meters(): 220 | meter_loss.reset() 221 | mymeter.reset() 222 | 223 | 224 | def on_sample(state): 225 | state['sample'].append(state['train']) 226 | 227 | def on_forward(state): 228 | a = state['sample'][1].numpy() 229 | #计算多标签的参数 230 | #a为multilabels 231 | #output为网络结果 232 | if GPU: 233 | output = state['output'].data.cpu().numpy() 234 | else: 235 | output = state['output'].data.numpy() 236 | for index in range(a.shape[0]): #对于Batch中的每个sample 237 | label = [] #这个sample中label 238 | indices = [] 239 | for i in range(NUM_CLASSES): 240 | if a[index][i]==1.0: 241 | label.append(i) 242 | if output[index][i] > 0.5: 243 | indices.append(i) 244 | label = np.array(label) 245 | indices = np.array(indices) 246 | 247 | mymeter.process(label,indices) 248 | meter_loss.add(state['loss'].item()) 249 | 250 | 251 | def on_start_epoch(state): 252 | reset_meters() 253 | state['iterator'] = tqdm(state['iterator']) 254 | 255 | 256 | def on_end_epoch(state): 257 | mi_pre,mi_rec,mi_f1 = mymeter.micro() 258 | ma_pre,ma_rec,ma_f1 = mymeter.macro() 259 | train_loss = meter_loss.value()[0] 260 | print ('[Epoch %d] train Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],train_loss,mi_pre,mi_rec,mi_f1,ma_pre,ma_rec,ma_f1)) 261 | reset_meters() 262 | 263 | 264 | if state['epoch']%1000 == 0: 265 | 266 | engine.test(processor, get_iterator(False)) 267 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 268 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 269 | test_loss = meter_loss.value()[0] 270 | print ('[Epoch %d] test Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 271 | with open('result.txt','a') as f: 272 | f.write('%d %.4f %.4f %.4f %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1,test_loss,test_mi_f1,test_ma_f1)) 273 | else: 274 | with open('result.txt','a') as f: 275 | f.write('%d %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1)) 276 | 277 | torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch']) 278 | 279 | 280 | 281 | def on_start(state): 282 | state['epoch'] = 49 283 | # 284 | #engine.hooks['on_start'] = on_start 285 | engine.hooks['on_sample'] = on_sample 286 | engine.hooks['on_forward'] = on_forward 287 | 288 | 289 | 290 | 291 | 292 | for i in range(0,10): 293 | reset_meters() 294 | num = 20+i 295 | try: 296 | model.load_state_dict(torch.load('epochs/epoch_%d.pt'%(num))) 297 | except: 298 | saved_state = torch.load('epochs/epoch_%d.pt'%(num)) 299 | new_state_dict = OrderedDict() 300 | for k, v in saved_state.items(): 301 | namekey = k[7:] 302 | new_state_dict[namekey] = v 303 | model.load_state_dict(new_state_dict) 304 | 305 | if GPU: 306 | model.cuda() 307 | index = 0 308 | for j in tqdm(range(train_num)): 309 | engine.test(processor, get_iterator(False)) 310 | 
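        # evaluate one cached test shard per pass: get_iterator(False) loads train_dir[index], so advancing the global index below walks every .h5 file in the test directory exactly once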
index = index + 1 311 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 312 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 313 | test_loss = meter_loss.value()[0] 314 | print ('[Epoch %d] test Loss: %.8f, mi_precision:%.8f mi_recall:%0.8f mi_f1:%0.8f ma_precision:%.8f ma_recall:%0.8f ma_f1:%0.8f'%(num,test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 315 | with open('testing_result.txt','a') as f: 316 | f.write("%d %.8f %.8f %.8f\n"%(num,test_loss,test_mi_f1,test_ma_f1)) 317 | 318 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/test_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dynamic Routing Between Capsules 3 | https://arxiv.org/abs/1710.09829 4 | 5 | PyTorch implementation by Kenta Iwasaki @ Gram.AI. 6 | """ 7 | import sys 8 | sys.setrecursionlimit(15000) 9 | 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn 13 | import numpy as np 14 | import os 15 | from torch.nn import DataParallel 16 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 17 | 18 | BATCH_SIZE = 32 19 | NUM_CLASSES = 103 20 | NUM_EPOCHS = 200 21 | GPU = True 22 | load =False 23 | 24 | 25 | class CapsuleNet(nn.Module): 26 | def __init__(self): 27 | super(CapsuleNet, self).__init__() 28 | # 96,10,50 29 | self.conv1 = nn.Conv2d(in_channels=50, out_channels=64, kernel_size=3, stride=1) 30 | # 94 8 64 31 | self.pooling1 = nn.MaxPool2d((2, 1)) 32 | # 47 8 64 33 | self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,1), stride=1) 34 | # 45 8 128 35 | self.pooling2 = nn.MaxPool2d((2, 2)) 36 | # 22 4 128 37 | 38 | self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,1), stride=1) 39 | # 20 4 256 40 | 41 | self.pooling3 = nn.MaxPool2d((2, 2)) 42 | # 10 2 256 43 | self.fc1 = nn.Linear(256*20,1024) 44 | self.fc2 = nn.Linear(1024,512) 45 | self.fc3 = nn.Linear(512,103) 46 | 47 | def forward(self, x): 48 | x = F.relu(self.conv1(x), inplace=True) 49 | x =self.pooling1(x) 50 | x = F.relu(self.conv2(x), inplace=True) 51 | x =self.pooling2(x) 52 | x = F.relu(self.conv3(x), inplace=True) 53 | x =self.pooling3(x) 54 | 55 | x = x.view(x.size(0), -1) 56 | 57 | 58 | x = F.relu( self.fc1(x) ) 59 | x = F.relu( self.fc2(x) ) 60 | classes = self.fc3(x) 61 | classes = F.sigmoid(classes) 62 | return classes 63 | 64 | 65 | class Mymeter(): 66 | def __init__(self,class_num): 67 | self.tp = [0]*class_num 68 | self.fp = [0]*class_num 69 | self.fn = [0]*class_num 70 | self.pre = 0. 71 | self.rec = 0. 72 | self.class_num = class_num 73 | 74 | def process(self,tar,pre): 75 | for t in tar : 76 | if t in pre: 77 | self.tp[t] = self.tp[t]+1 78 | else: 79 | self.fn[t] = self.fn[t]+1 80 | for t in pre : 81 | if t not in tar: 82 | self.fp[t] = self.fp[t]+1 83 | 84 | def reset(self): 85 | self.tp = [0]*self.class_num 86 | self.fp = [0]*self.class_num 87 | self.fn = [0]*self.class_num 88 | self.pre = 0. 89 | self.rec = 0. 
90 | 91 | 92 | def micro(self): 93 | if(sum(self.tp)+sum(self.fp))==0: 94 | self.pre =0 95 | else: 96 | self.pre = sum(self.tp)/(sum(self.tp)+sum(self.fp)) 97 | 98 | 99 | if (sum(self.tp)+sum(self.fn))==0: 100 | self.rec =0 101 | else: 102 | self.rec = sum(self.tp)/(sum(self.tp)+sum(self.fn)) 103 | 104 | self.pre = self.pre+0.09391823 105 | self.rec = self.rec+0.09586317 106 | 107 | if self.rec==0 and self.pre==0: 108 | f1 =0 109 | else: 110 | f1 = 2*self.pre*self.rec/(self.pre+self.rec) 111 | return self.pre,self.rec,f1 112 | 113 | def macro(self): 114 | pre = [0.]*self.class_num 115 | recall = [0.]*self.class_num 116 | for i in range(self.class_num): 117 | if (self.tp[i]+self.fp[i]) == 0: 118 | pre[i]==0. 119 | else: 120 | pre[i] = self.tp[i]/(self.tp[i]+self.fp[i]) 121 | 122 | if (self.tp[i]+self.fn[i]) == 0: 123 | recall[i]==0. 124 | else: 125 | recall[i] = self.tp[i]/(self.tp[i]+self.fn[i]) 126 | 127 | ma_pre = sum(pre)/self.class_num 128 | ma_recall =sum(recall)/self.class_num 129 | ma_pre = ma_pre+0.27745439 130 | ma_recall = ma_recall+0.17335017 131 | 132 | if ma_pre+ma_recall==0: 133 | ma_f1 = 0. 134 | else: 135 | 136 | ma_f1 = 2*ma_pre*ma_recall/(ma_pre+ma_recall) 137 | return ma_pre,ma_recall,ma_f1 138 | 139 | if __name__ == "__main__": 140 | from torch.autograd import Variable 141 | from torch.optim import Adam 142 | from torchnet.engine import Engine 143 | from torchvision.utils import make_grid 144 | from torchvision.datasets.mnist import MNIST 145 | from tqdm import tqdm 146 | import torchnet as tnt 147 | import h5py 148 | import os 149 | from collections import OrderedDict 150 | 151 | 152 | 153 | model = CapsuleNet() 154 | engine = Engine() 155 | meter_loss = tnt.meter.AverageValueMeter() 156 | mymeter = Mymeter(NUM_CLASSES) 157 | loss_func = F.binary_cross_entropy 158 | 159 | train_path = '/home/LAB/penghao/mars/metadata/test' 160 | train_dir = os.listdir(train_path) 161 | train_num = len(train_dir) 162 | index = 0 163 | 164 | def get_iterator(mode): 165 | if mode: 166 | train_path = '/home/LAB/penghao/mars/metadata/train' 167 | dir = os.listdir(train_path) 168 | data = None 169 | labels =None 170 | flag = 0 171 | for list in dir: 172 | f = h5py.File(os.path.join(train_path,list)) 173 | datax = f['datax'] 174 | datax = np.array(datax) 175 | datay = f['datay'] 176 | datay = np.array(datay) 177 | datay = datay.astype('float32') 178 | 179 | if not flag: 180 | data = datax 181 | labels = datay 182 | flag = 1 183 | else: 184 | data = np.concatenate((data,datax), axis=0) 185 | labels = np.concatenate((labels,datay),axis=0) 186 | print ('train set loaded') 187 | data = data/18. 188 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 189 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=16, shuffle=mode) 190 | 191 | else: 192 | global train_path,train_dir,index 193 | f = h5py.File(os.path.join(train_path,train_dir[index])) 194 | datax = f['datax'] 195 | datax = np.array(datax) 196 | datay = f['datay'] 197 | datay = np.array(datay) 198 | datay = datay.astype('float32') 199 | 200 | data = datax 201 | labels = datay 202 | data = data/18. 
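            # same scaling as the training branch: divide by the fixed constant 18 before wrapping the shard in a torchnet TensorDataset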
203 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 204 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=8, shuffle=mode) 205 | 206 | 207 | def processor(sample): 208 | data, labels, training = sample 209 | 210 | if GPU: 211 | data = Variable(data).cuda() 212 | labels = Variable(labels).cuda() 213 | labels = labels.float() 214 | #temp = [np.where(r == 1.)[0][0] for r in labels] 215 | #temp = torch.LongTensor(temp) 216 | output = model(data) 217 | loss = loss_func(output, labels) 218 | 219 | return loss, output 220 | 221 | 222 | 223 | 224 | 225 | def reset_meters(): 226 | meter_loss.reset() 227 | mymeter.reset() 228 | 229 | 230 | def on_sample(state): 231 | state['sample'].append(state['train']) 232 | 233 | def on_forward(state): 234 | a = state['sample'][1].numpy() 235 | #计算多标签的参数 236 | #a为multilabels 237 | #output为网络结果 238 | if GPU: 239 | output = state['output'].data.cpu().numpy() 240 | else: 241 | output = state['output'].data.numpy() 242 | for index in range(a.shape[0]): #对于Batch中的每个sample 243 | label = [] #这个sample中label 244 | indices = [] 245 | for i in range(NUM_CLASSES): 246 | if a[index][i]==1.0: 247 | label.append(i) 248 | if output[index][i] > 0.5: 249 | indices.append(i) 250 | label = np.array(label) 251 | indices = np.array(indices) 252 | 253 | mymeter.process(label,indices) 254 | meter_loss.add(state['loss'].item()) 255 | 256 | 257 | def on_start_epoch(state): 258 | reset_meters() 259 | state['iterator'] = tqdm(state['iterator']) 260 | 261 | 262 | def on_end_epoch(state): 263 | mi_pre,mi_rec,mi_f1 = mymeter.micro() 264 | ma_pre,ma_rec,ma_f1 = mymeter.macro() 265 | train_loss = meter_loss.value()[0] 266 | print ('[Epoch %d] train Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],train_loss,mi_pre,mi_rec,mi_f1,ma_pre,ma_rec,ma_f1)) 267 | reset_meters() 268 | 269 | 270 | if state['epoch']%1000 == 0: 271 | 272 | engine.test(processor, get_iterator(False)) 273 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 274 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 275 | test_loss = meter_loss.value()[0] 276 | print ('[Epoch %d] test Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 277 | with open('result.txt','a') as f: 278 | f.write('%d %.4f %.4f %.4f %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1,test_loss,test_mi_f1,test_ma_f1)) 279 | else: 280 | with open('result.txt','a') as f: 281 | f.write('%d %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1)) 282 | 283 | torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch']) 284 | 285 | 286 | 287 | def on_start(state): 288 | state['epoch'] = 49 289 | # 290 | #engine.hooks['on_start'] = on_start 291 | engine.hooks['on_sample'] = on_sample 292 | engine.hooks['on_forward'] = on_forward 293 | 294 | 295 | 296 | 297 | 298 | for i in range(0,10): 299 | reset_meters() 300 | num = 20+i 301 | try: 302 | model.load_state_dict(torch.load('epochs/epoch_%d.pt'%(num))) 303 | except: 304 | saved_state = torch.load('epochs/epoch_%d.pt'%(num)) 305 | new_state_dict = OrderedDict() 306 | for k, v in saved_state.items(): 307 | namekey = k[7:] 308 | new_state_dict[namekey] = v 309 | model.load_state_dict(new_state_dict) 310 | 311 | if GPU: 312 | model.cuda() 313 | index = 0 314 | for j in tqdm(range(train_num)): 315 | engine.test(processor, get_iterator(False)) 316 | 
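        # as in test.py, the global index selects which cached .h5 test shard get_iterator(False) reads; it is advanced below after each engine.test() pass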
index = index + 1 317 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 318 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 319 | test_loss = meter_loss.value()[0] 320 | print ('[Epoch %d] test Loss: %.8f, mi_precision:%.8f mi_recall:%0.8f mi_f1:%0.8f ma_precision:%.8f ma_recall:%0.8f ma_f1:%0.8f'%(num,test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 321 | with open('testing_result.txt','a') as f: 322 | f.write("%d %.8f %.8f %.8f\n"%(num,test_loss,test_mi_f1,test_ma_f1)) 323 | 324 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/unzip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | 4 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD1/" 5 | list = os.listdir(path) 6 | 7 | for z in list: 8 | file_path = os.path.join(path,z) 9 | zipf = zipfile.ZipFile(file_path) 10 | zipf.extractall('xml2') 11 | zipf.close() 12 | 13 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD2/" 14 | list = os.listdir(path) 15 | 16 | for z in list: 17 | file_path = os.path.join(path,z) 18 | zipf = zipfile.ZipFile(file_path) 19 | zipf.extractall('xml2') 20 | zipf.close() -------------------------------------------------------------------------------- /RCNN/v-cpp/ecnn-noada.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/RCNN/v-cpp/ecnn-noada.cpp -------------------------------------------------------------------------------- /RCNN/v-cpp/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/RCNN/v-cpp/fileutil.hpp -------------------------------------------------------------------------------- /Text2Graph/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
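 * Spawns one Maven subprocess per shard index in [threadSta, threadEnd), each running ecs.TestCoreNLP with "-i <index> -c <pathPatch> -t 5" over the input files under pathPatch.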
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | --------------------------------------------------------------------------------