├── CNN ├── .gitignore ├── LICENSE ├── README.md ├── binary_class_data_loader.py ├── char_data_processor.py ├── data_helpers.py ├── eval.py ├── multi_class_data_loader.py ├── text_cnn.py ├── train.py └── word_data_processor.py ├── GraphCNN ├── SVM_eval.py ├── SVM_model.py ├── SVM_train.py ├── __init__.py ├── graphcnn_eval_SVM.py ├── graphcnn_eval_multilabel.py ├── graphcnn_eval_singlelabel.py ├── graphcnn_eval_without_labels.py ├── graphcnn_generate_data.py ├── graphcnn_hier_eval_without_labels.py ├── graphcnn_hier_eval_without_labels_SVM.py ├── graphcnn_hier_eval_without_labels_all.py ├── graphcnn_hier_eval_without_labels_some.py ├── graphcnn_hier_eval_without_labels_some2.py ├── graphcnn_hier_eval_without_labels_some_root.py ├── graphcnn_input.py ├── graphcnn_model.py ├── graphcnn_option.py ├── graphcnn_train.py └── utils │ ├── NYT_utils.py │ ├── grouping.py │ ├── hier_rootlist │ ├── hier_rootstr │ ├── lshtc_utils.py │ ├── lshtc_utils2.py │ ├── read │ ├── tmp.py │ └── utils.py ├── HAN ├── model │ └── IMDB │ │ └── bestmodel │ │ └── .gitkeep └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── HLSTM ├── model │ └── IMDB │ │ └── bestmodel │ │ └── .gitkeep └── src │ ├── Dataset.py │ ├── EmbLayer.py │ ├── HiddenLayer.py │ ├── LSTMLayer.py │ ├── LSTMModel.py │ ├── PoolLayer.py │ ├── SentenceSortLayer.py │ ├── Update.py │ ├── test.py │ └── train.py ├── Pytorch_GraphCNNs ├── make_graphs.py ├── make_heiring.py ├── rcv1_processer.py ├── test.py ├── test_extra.py ├── train.py └── unzip.py ├── RCNN └── v-cpp │ ├── ecnn-noada.cpp │ └── fileutil.hpp └── Text2Graph └── src └── main └── java └── ecs ├── CoreNLPService.java └── TestCoreNLP.java /CNN/.gitignore: -------------------------------------------------------------------------------- 1 | *.npy 2 | runs/ 3 | 4 | # Created by https://www.gitignore.io/api/python,ipythonnotebook 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | 60 | # Sphinx documentation 61 | docs/_build/ 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | 67 | ### IPythonNotebook ### 68 | # Temporary data 69 | .ipynb_checkpoints/ 70 | -------------------------------------------------------------------------------- /CNN/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /CNN/README.md: -------------------------------------------------------------------------------- 1 | **[This code belongs to the "Implementing a CNN for Text Classification in Tensorflow" blog post.](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)** 2 | 3 | It is slightly simplified implementation of Kim's [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) paper in Tensorflow. 4 | 5 | ## Requirements 6 | 7 | - Python 3 8 | - Tensorflow > 0.8 9 | - Numpy 10 | 11 | ## Training 12 | 13 | Print parameters: 14 | 15 | ```bash 16 | ./train.py --help 17 | ``` 18 | 19 | ``` 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | --embedding_dim EMBEDDING_DIM 23 | Dimensionality of character embedding (default: 128) 24 | --filter_sizes FILTER_SIZES 25 | Comma-separated filter sizes (default: '3,4,5') 26 | --num_filters NUM_FILTERS 27 | Number of filters per filter size (default: 128) 28 | --l2_reg_lambda L2_REG_LAMBDA 29 | L2 regularizaion lambda (default: 0.0) 30 | --dropout_keep_prob DROPOUT_KEEP_PROB 31 | Dropout keep probability (default: 0.5) 32 | --batch_size BATCH_SIZE 33 | Batch Size (default: 64) 34 | --num_epochs NUM_EPOCHS 35 | Number of training epochs (default: 100) 36 | --evaluate_every EVALUATE_EVERY 37 | Evaluate model on dev set after this many steps 38 | (default: 100) 39 | --checkpoint_every CHECKPOINT_EVERY 40 | Save model after this many steps (default: 100) 41 | --allow_soft_placement ALLOW_SOFT_PLACEMENT 42 | Allow device soft device placement 43 | --noallow_soft_placement 44 | --log_device_placement LOG_DEVICE_PLACEMENT 45 | Log placement of ops on devices 46 | --nolog_device_placement 47 | 48 | ``` 49 | 50 | Train: 51 | 52 | ```bash 53 | ./train.py 54 | ``` 55 | 56 | ## Evaluating 57 | 58 | ```bash 59 | ./eval.py --eval_train --checkpoint_dir="./runs/1459637919/checkpoints/" 60 | ``` 61 | 62 | Replace the checkpoint dir with the output from the training. To use your own data, change the `eval.py` script to load your data. 
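For example, a minimal way to plug in your own data is to replace the hard-coded `x_raw`/`y_test` lists in `eval.py` with lines read from a file. This is only a sketch: the path below is a placeholder, and `y_test` holds class indexes (set it to `None` if you only want predictions without accuracy).

```python
# Sketch: one example per line, formatted as "<label_index> <text>".
# "my_eval_data.txt" is a placeholder path, not part of this repo.
x_raw, y_test = [], []
with open("my_eval_data.txt", "r") as f:
    for line in f:
        label, text = line.strip().split(" ", 1)
        x_raw.append(text)
        y_test.append(int(label))
```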
63 | 64 | 65 | ## References 66 | 67 | - [Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1408.5882) 68 | - [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](http://arxiv.org/abs/1510.03820) -------------------------------------------------------------------------------- /CNN/binary_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from tensorflow.contrib import learn 4 | 5 | class BinaryClassDataLoader(object): 6 | """ 7 | Load binary classification data from two files (positive and negative) and 8 | split data into train and dev. 9 | """ 10 | def __init__(self, flags, data_processor, clean_data=None, classes=None): 11 | self.__flags = flags 12 | self.__data_processor = data_processor 13 | self.__clean_data = clean_data 14 | self.__classes = classes 15 | 16 | def define_flags(self): 17 | self.__flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 18 | self.__flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 19 | self.__flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.") 20 | 21 | def prepare_data(self): 22 | self.__resolve_params() 23 | 24 | x_text, y = self.load_data_and_labels() 25 | 26 | # Build vocabulary 27 | self.vocab_processor = self.__data_processor.vocab_processor(x_text) 28 | x = np.array(list(self.vocab_processor.fit_transform(x_text))) 29 | 30 | # Randomly shuffle data 31 | np.random.seed(10) 32 | shuffle_indices = np.random.permutation(np.arange(len(y))) 33 | x_shuffled = x[shuffle_indices] 34 | y_shuffled = y[shuffle_indices] 35 | 36 | # Split train/test set 37 | # TODO: This is very crude, should use cross-validation 38 | dev_sample_index = -1 * int(self.__dev_sample_percentage * float(len(y))) 39 | x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 40 | y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | self.vocab_processor = self.__data_processor.restore_vocab_processor(vocab_path) 45 | return self.vocab_processor 46 | 47 | def class_labels(self, class_indexes): 48 | if self.__classes is None: 49 | result = class_indexes 50 | else: 51 | result = [ self.__classes[idx] for idx in class_indexes ] 52 | return result 53 | 54 | def load_data_and_labels(self): 55 | """ 56 | Loads MR polarity data from files, splits the data into words and generates labels. 57 | Returns split sentences and labels. 
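Positive examples are labeled one-hot as [0, 1] and negative examples as [1, 0].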
58 | """ 59 | self.__resolve_params() 60 | 61 | # Load data from files 62 | positive_examples = list(open(self.__positive_data_file, "r").readlines()) 63 | negative_examples = list(open(self.__negative_data_file, "r").readlines()) 64 | # Split by words 65 | x_text = positive_examples + negative_examples 66 | x_text = [self.__data_processor.clean_data(sent) for sent in x_text] 67 | # Generate labels 68 | positive_labels = [[0, 1] for _ in positive_examples] 69 | negative_labels = [[1, 0] for _ in negative_examples] 70 | y = np.concatenate([positive_labels, negative_labels], 0) 71 | return [x_text, y] 72 | 73 | def __resolve_params(self): 74 | self.__dev_sample_percentage = self.__flags.FLAGS.dev_sample_percentage 75 | self.__positive_data_file = self.__flags.FLAGS.positive_data_file 76 | self.__negative_data_file = self.__flags.FLAGS.negative_data_file 77 | -------------------------------------------------------------------------------- /CNN/char_data_processor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | 4 | class CharDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.decode("utf-8")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | return string 18 | 19 | class VocabularyProcessor(object): 20 | def __init__(self, max_document_length, min_frequency=0, vocabulary=None, 21 | tokenizer_fn=None): 22 | # init a class. index maxdocument length and a vocabulabrary 23 | if vocabulary == None: 24 | self.vocabulary_ = {"":0} # padding 25 | else: 26 | self.vocabulary_ = vocabulary 27 | 28 | self.index = 1 29 | self.max_document_length = max_document_length 30 | def fit_transform(self, raw_documents, unused_y=None, fit=True): 31 | result = [] 32 | for raw_document in raw_documents: 33 | # mark for this, we can find it is a [[I am a student]] 34 | result.append([self.__vocab_id(char, fit) for char in raw_document.decode("utf-8")]) 35 | 36 | if self.max_document_length == None: 37 | max_document_length = max([len(vocab_ids) for vocab_ids in result]) 38 | else: 39 | max_document_length = self.max_document_length 40 | 41 | result = self.__smooth_lengths(result, max_document_length) 42 | 43 | return result 44 | 45 | def transform(self, raw_documents): 46 | return self.fit_transform(raw_documents, None, False) 47 | 48 | def save(self, file): 49 | with codecs.open(file, 'w', 'utf-8') as f: 50 | data = {"vocabulary_": self.vocabulary_, "index": self.index, 51 | "max_document_length": self.max_document_length} 52 | f.write(json.dumps(data, ensure_ascii=False)) 53 | 54 | @classmethod 55 | def restore(cls, file): 56 | with codecs.open(file, "r", "utf-8") as f: 57 | data = json.loads(f.readline()) 58 | vp = cls(data["max_document_length"], 0, data["vocabulary_"]) 59 | vp.index = data["index"] 60 | return vp 61 | 62 | @staticmethod 63 | def __smooth_lengths(documents, length): 64 | result = [] 65 | for document in documents: 66 | if len(document) > length: 67 | doccument = document[:length] 68 | elif len(document) < length: 69 | document = document + [0] * (length - len(document)) 70 | result.append(document) 71 | return result 72 | 73 | def __vocab_id(self, char, fit = True): 74 | # every word has a 
id 75 | if char not in self.vocabulary_: 76 | if fit: 77 | self.vocabulary_[char] = self.index 78 | self.index += 1 79 | else: 80 | char = "" 81 | return self.vocabulary_[char] 82 | 83 | -------------------------------------------------------------------------------- /CNN/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | import itertools 4 | from collections import Counter 5 | 6 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 7 | """ 8 | Generates a batch iterator for a dataset. 9 | """ 10 | data = np.array(data) 11 | data_size = len(data) 12 | num_batches_per_epoch = int(len(data)/batch_size) + 1 13 | for epoch in range(num_epochs): 14 | # Shuffle the data at each epoch 15 | if shuffle: 16 | shuffle_indices = np.random.permutation(np.arange(data_size)) 17 | shuffled_data = data[shuffle_indices] 18 | else: 19 | shuffled_data = data 20 | for batch_num in range(num_batches_per_epoch): 21 | start_index = batch_num * batch_size 22 | end_index = min((batch_num + 1) * batch_size, data_size) 23 | yield shuffled_data[start_index:end_index] 24 | -------------------------------------------------------------------------------- /CNN/eval.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | #from binary_class_data_loader import BinaryClassDataLoader 11 | from multi_class_data_loader import MultiClassDataLoader 12 | #from word_data_processor import WordDataProcessor 13 | from char_data_processor import CharDataProcessor 14 | import csv 15 | 16 | # Parameters 17 | # ================================================== 18 | 19 | # Eval Parameters 20 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 21 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 22 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 23 | 24 | # Misc Parameters 25 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 26 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 27 | 28 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 29 | data_loader.define_flags() 30 | 31 | FLAGS = tf.flags.FLAGS 32 | FLAGS._parse_flags() 33 | print("\nParameters:") 34 | for attr, value in sorted(FLAGS.__flags.items()): 35 | print("{}={}".format(attr.upper(), value)) 36 | print("") 37 | 38 | # CHANGE THIS: Load data. 
Load your own data here 39 | if FLAGS.eval_train: 40 | x_raw, y_test = data_loader.load_data_and_labels() 41 | y_test = np.argmax(y_test, axis=1) 42 | else: 43 | x_raw = ["a masterpiece four years in the making", "everything is off."] 44 | y_test = [1, 0] 45 | 46 | # Map data into vocabulary 47 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 48 | vocab_processor = data_loader.restore_vocab_processor(vocab_path) 49 | x_test = np.array(list(vocab_processor.transform(x_raw))) 50 | 51 | print("\nEvaluating...\n") 52 | 53 | # Evaluation 54 | # ================================================== 55 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | session_conf = tf.ConfigProto( 59 | allow_soft_placement=FLAGS.allow_soft_placement, 60 | log_device_placement=FLAGS.log_device_placement) 61 | sess = tf.Session(config=session_conf) 62 | with sess.as_default(): 63 | # Load the saved meta graph and restore variables 64 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 65 | saver.restore(sess, checkpoint_file) 66 | 67 | # Get the placeholders from the graph by name 68 | input_x = graph.get_operation_by_name("input_x").outputs[0] 69 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 70 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 71 | 72 | # Tensors we want to evaluate 73 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 74 | 75 | # Generate batches for one epoch 76 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 77 | 78 | # Collect the predictions here 79 | all_predictions = [] 80 | 81 | for x_test_batch in batches: 82 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 83 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 84 | 85 | # Print accuracy if y_test is defined 86 | if y_test is not None: 87 | correct_predictions = float(sum(all_predictions == y_test)) 88 | print("Total number of test examples: {}".format(len(y_test))) 89 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 90 | 91 | # Save the evaluation to a csv 92 | all_predictions = data_loader.class_labels(all_predictions.astype(int)) 93 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 94 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 95 | print("Saving evaluation to {0}".format(out_path)) 96 | with open(out_path, 'w') as f: 97 | csv.writer(f).writerows(predictions_human_readable) 98 | -------------------------------------------------------------------------------- /CNN/multi_class_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | 4 | class MultiClassDataLoader(object): 5 | """ 6 | Handles multi-class training data. It takes predefined sets of "train_data_file" and "dev_data_file" 7 | of the following record format. 8 | \t 9 | ex. "what a masterpiece! Positive" 10 | 11 | Class labels are given as "class_data_file", which is a list of class labels. 
12 | """ 13 | def __init__(self, flags, data_processor): 14 | self.__flags = flags 15 | self.__data_processor = data_processor 16 | self.__train_data_file = None 17 | self.__dev_data_file = None 18 | self.__class_data_file = None 19 | self.__classes_cache = None 20 | 21 | 22 | def define_flags(self): 23 | self.__flags.DEFINE_string("train_data_file", "./data/rt-polaritydata/train.txt", "Data source for the training data.") 24 | self.__flags.DEFINE_string("dev_data_file", "./data/rt-polaritydata/test.txt", "Data source for the cross validation data.") 25 | self.__flags.DEFINE_string("class_data_file", "./data/rt-polaritydata/lable.txt", "Data source for the class list.") 26 | 27 | def prepare_data(self): 28 | self.__resolve_params() 29 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 30 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 31 | 32 | max_doc_len = max([len(doc.decode("utf-8")) for doc in x_train]) 33 | max_doc_len_dev = max([len(doc.decode("utf-8")) for doc in x_dev]) 34 | if max_doc_len_dev > max_doc_len: 35 | max_doc_len = max_doc_len_dev 36 | # Build vocabulary 37 | self.vocab_processor = self.__data_processor.vocab_processor(x_train, x_dev) 38 | x_train = np.array(list(self.vocab_processor.fit_transform(x_train))) 39 | # Build vocabulary 40 | x_dev = np.array(list(self.vocab_processor.fit_transform(x_dev))) 41 | return [x_train, y_train, x_dev, y_dev] 42 | 43 | def restore_vocab_processor(self, vocab_path): 44 | return self.__data_processor.restore_vocab_processor(vocab_path) 45 | 46 | def class_labels(self, class_indexes): 47 | return [ self.__classes()[idx] for idx in class_indexes ] 48 | 49 | def load_data_and_labels(self): 50 | self.__resolve_params() 51 | x_train, y_train = self.__load_data_and_labels(self.__train_data_file) 52 | x_dev, y_dev = self.__load_data_and_labels(self.__dev_data_file) 53 | x_all = x_train + x_dev 54 | y_all = np.concatenate([y_train, y_dev], 0) 55 | return [x_all, y_all] 56 | 57 | def __load_data_and_labels(self, data_file): 58 | x_text = [] 59 | y = [] 60 | with open(data_file, 'r') as tsvin: 61 | classes = self.__classes() 62 | one_hot_vectors = np.eye(len(classes), dtype=int) 63 | class_vectors = {} 64 | for i, cls in enumerate(classes): 65 | class_vectors[cls] = one_hot_vectors[i] 66 | #edit for the first to the code. 
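# The loop below expects each line to be "<class label> <text>": it splits on the
# first space, maps the label to a one-hot vector built from class_data_file, and
# keeps the rest of the line as the raw text (this replaces the tab-separated
# csv.reader version kept commented out further down).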
67 | all_lines = tsvin.readlines() 68 | for line in all_lines: 69 | temp = line.split(' ',1) 70 | data = self.__data_processor.clean_data(temp[1]) 71 | x_text.append(data) 72 | y.append(class_vectors[temp[0]]) 73 | #edit 74 | # tsvin = csv.reader(tsvin, delimiter='\t') 75 | # for row in tsvin: 76 | # data = self.__data_processor.clean_data(row[0]) 77 | # x_text.append(data) 78 | # y.append(class_vectors[row[1]]) 79 | return [x_text, np.array(y)] 80 | 81 | def __classes(self): 82 | self.__resolve_params() 83 | if self.__classes_cache is None: 84 | with open(self.__class_data_file, 'r') as catin: 85 | classes = list(catin.readlines()) 86 | self.__classes_cache = [s.strip() for s in classes] 87 | return self.__classes_cache 88 | 89 | def __resolve_params(self): 90 | if self.__class_data_file is None: 91 | self.__train_data_file = self.__flags.FLAGS.train_data_file 92 | self.__dev_data_file = self.__flags.FLAGS.dev_data_file 93 | self.__class_data_file = self.__flags.FLAGS.class_data_file 94 | -------------------------------------------------------------------------------- /CNN/text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class TextCNN(object): 6 | """ 7 | A CNN for text classification. 8 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 9 | """ 10 | def __init__( 11 | self, sequence_length, num_classes, vocab_size, 12 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 13 | 14 | # Placeholders for input, output and dropout 15 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 16 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 17 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 18 | 19 | # Keeping track of l2 regularization loss (optional) 20 | l2_loss = tf.constant(0.0) 21 | 22 | # Embedding layer 23 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 24 | self.W = tf.Variable( 25 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 26 | trainable = False, 27 | name="W") 28 | 29 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 30 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 31 | 32 | # Create a convolution + maxpool layer for each filter size 33 | pooled_outputs = [] 34 | for i, filter_size in enumerate(filter_sizes): 35 | with tf.name_scope("conv-maxpool-%s" % filter_size): 36 | # Convolution Layer 37 | filter_shape = [filter_size, embedding_size, 1, num_filters] 38 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 39 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 40 | conv = tf.nn.conv2d( 41 | self.embedded_chars_expanded, 42 | W, 43 | strides=[1, 1, 1, 1], 44 | padding="VALID", 45 | name="conv") 46 | # Apply nonlinearity 47 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 48 | # Maxpooling over the outputs 49 | pooled = tf.nn.max_pool( 50 | h, 51 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 52 | strides=[1, 1, 1, 1], 53 | padding='VALID', 54 | name="pool") 55 | pooled_outputs.append(pooled) 56 | 57 | # Combine all the pooled features 58 | num_filters_total = num_filters * len(filter_sizes) 59 | self.h_pool = tf.concat(3, pooled_outputs) 60 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 61 | 62 | # Add dropout 63 | with tf.name_scope("dropout"): 64 | self.h_drop = tf.nn.dropout(self.h_pool_flat, 
self.dropout_keep_prob) 65 | 66 | # Final (unnormalized) scores and predictions 67 | with tf.name_scope("output"): 68 | W = tf.get_variable( 69 | "W", 70 | shape=[num_filters_total, num_classes], 71 | initializer=tf.contrib.layers.xavier_initializer()) 72 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 73 | l2_loss += tf.nn.l2_loss(W) 74 | l2_loss += tf.nn.l2_loss(b) 75 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 76 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 77 | 78 | # CalculateMean cross-entropy loss 79 | with tf.name_scope("loss"): 80 | losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y) 81 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 82 | 83 | # Accuracy 84 | with tf.name_scope("accuracy"): 85 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 86 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 87 | -------------------------------------------------------------------------------- /CNN/train.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import numpy as np 3 | import tensorflow as tf 4 | import os 5 | import time 6 | import datetime 7 | import data_helpers 8 | from text_cnn import TextCNN 9 | #from binary_class_data_loader import BinaryClassDataLoader 10 | from multi_class_data_loader import MultiClassDataLoader 11 | #from word_data_processor import WordDataProcessor 12 | from char_data_processor import CharDataProcessor 13 | 14 | # Parameters 15 | # ================================================== 16 | # use this for static wordembedding 17 | # change the path to static the vec.bin is a Chinese word2vec file trained from the sina weibo 18 | tf.flags.DEFINE_string("word2vec","./data/rt-polaritydata/vec.bin", "word2vec file with pre-trained embedding (default: None)") 19 | tf.flags.DEFINE_integer("dev_batch_size", 4096, "Batch Size (default: 64)") 20 | 21 | # Model Hyperparameters 22 | tf.flags.DEFINE_integer("embedding_dim", 50, "Dimensionality of character embedding (default: 128)") 23 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 24 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 25 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 26 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)") 27 | 28 | 29 | # Training parameters 30 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 31 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") 32 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 33 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") 34 | # Misc Parameters 35 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 36 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 37 | 38 | data_loader = MultiClassDataLoader(tf.flags, CharDataProcessor()) 39 | data_loader.define_flags() 40 | 41 | FLAGS = tf.flags.FLAGS 42 | FLAGS._parse_flags() 43 | print("\nParameters:") 44 | for attr, value in sorted(FLAGS.__flags.items()): 45 | print("{}={}".format(attr.upper(), value)) 46 | print("") 47 | 48 | 49 | # Data Preparatopn 50 | # 
================================================== 51 | 52 | # Load data 53 | print("Loading data...") 54 | x_train, y_train, x_dev, y_dev = data_loader.prepare_data() 55 | vocab_processor = data_loader.vocab_processor 56 | 57 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 58 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) 59 | 60 | 61 | # Training 62 | # ================================================== 63 | 64 | with tf.Graph().as_default(): 65 | session_conf = tf.ConfigProto( 66 | allow_soft_placement=FLAGS.allow_soft_placement, 67 | log_device_placement=FLAGS.log_device_placement) 68 | sess = tf.Session(config=session_conf) 69 | with sess.as_default(): 70 | cnn = TextCNN( 71 | sequence_length=x_train.shape[1], 72 | num_classes=y_train.shape[1], 73 | vocab_size=len(vocab_processor.vocabulary_), 74 | embedding_size=FLAGS.embedding_dim, 75 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 76 | num_filters=FLAGS.num_filters, 77 | l2_reg_lambda=FLAGS.l2_reg_lambda) 78 | 79 | # Define Training procedure 80 | global_step = tf.Variable(0, name="global_step", trainable=False) 81 | optimizer = tf.train.AdamOptimizer(1e-3) 82 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 83 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 84 | 85 | # Keep track of gradient values and sparsity (optional) 86 | grad_summaries = [] 87 | for g, v in grads_and_vars: 88 | if g is not None: 89 | grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) 90 | sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 91 | grad_summaries.append(grad_hist_summary) 92 | grad_summaries.append(sparsity_summary) 93 | grad_summaries_merged = tf.merge_summary(grad_summaries) 94 | 95 | # Output directory for models and summaries 96 | timestamp = str(int(time.time())) 97 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 98 | print("Writing to {}\n".format(out_dir)) 99 | 100 | # Summaries for loss and accuracy 101 | loss_summary = tf.scalar_summary("loss", cnn.loss) 102 | acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) 103 | 104 | # Train Summaries 105 | train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) 106 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 107 | train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph) 108 | 109 | # Dev summaries 110 | dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) 111 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 112 | dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph) 113 | 114 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 115 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 116 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 117 | if not os.path.exists(checkpoint_dir): 118 | os.makedirs(checkpoint_dir) 119 | saver = tf.train.Saver(tf.all_variables()) 120 | 121 | # Write vocabulary 122 | vocab_processor.save(os.path.join(out_dir, "vocab")) 123 | 124 | # Initialize all variables 125 | sess.run(tf.initialize_all_variables()) 126 | if FLAGS.word2vec: 127 | # initial matrix with random uniform, vocab_processor.vocabulary_ is the vocabulalu 128 | initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim)) 129 | # load any vecotors from the 130 | print("Load word2vec file {}\n".format(FLAGS.word2vec)) 131 | #read as a binary file 132 | with open(FLAGS.word2vec, "rb") as f: 133 | head = f.readline() 134 | vocab_size, layer1_size = map(int, head.split()) 135 | binary_len = np.dtype('float32').itemsize * layer1_size 136 | for line in xrange(vocab_size): 137 | word = [] 138 | while True: 139 | ch = f.read(1) 140 | if ch == ' ': 141 | word = ''.join(word) 142 | break 143 | if ch != '\n': 144 | word.append(ch) 145 | idx = vocab_processor.vocabulary_.get(word) 146 | if idx != None: 147 | initW[idx] = np.fromstring(f.read(binary_len), dtype='float32') 148 | else: 149 | f.read(binary_len) 150 | sess.run(cnn.W.assign(initW)) 151 | 152 | def train_step(x_batch, y_batch): 153 | """ 154 | A single training step 155 | """ 156 | feed_dict = { 157 | cnn.input_x: x_batch, 158 | cnn.input_y: y_batch, 159 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 160 | } 161 | _, step, summaries, loss, accuracy = sess.run( 162 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 163 | feed_dict) 164 | time_str = datetime.datetime.now().isoformat() 165 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 166 | train_summary_writer.add_summary(summaries, step) 167 | 168 | def dev_step(x_batch, y_batch, writer=None): 169 | """ 170 | Evaluates model on a dev set 171 | """ 172 | feed_dict = { 173 | cnn.input_x: x_batch, 174 | cnn.input_y: y_batch, 175 | cnn.dropout_keep_prob: 1.0 176 | } 177 | step, summaries, loss, accuracy = sess.run( 178 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 179 | feed_dict) 180 | time_str = datetime.datetime.now().isoformat() 181 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 182 | if writer: 183 | writer.add_summary(summaries, step) 184 | 185 | # Generate batches 186 | batches = data_helpers.batch_iter( 187 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 188 | # Training loop. 
For each batch...\ 189 | 190 | 191 | for batch in batches: 192 | x_batch, y_batch = zip(*batch) 193 | train_step(x_batch, y_batch) 194 | current_step = tf.train.global_step(sess, global_step) 195 | if current_step % FLAGS.evaluate_every == 0: 196 | print("\nEvaluation:") 197 | dev_batches = data_helpers.batch_iter( 198 | list(zip(x_dev, y_dev)), FLAGS.dev_batch_size, 1) 199 | for dev_batch in dev_batches: 200 | x_dev_batch, y_dev_batch = zip(*dev_batch) 201 | dev_step(x_dev_batch, y_dev_batch, writer=dev_summary_writer) 202 | print("") 203 | if current_step % FLAGS.checkpoint_every == 0: 204 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 205 | print("Saved model checkpoint to {}\n".format(path)) 206 | -------------------------------------------------------------------------------- /CNN/word_data_processor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tensorflow.contrib import learn 3 | 4 | class WordDataProcessor(object): 5 | def vocab_processor(_, *texts): 6 | max_document_length = 0 7 | for text in texts: 8 | max_doc_len = max([len(line.split(" ")) for line in text]) 9 | if max_doc_len > max_document_length: 10 | max_document_length = max_doc_len 11 | return learn.preprocessing.VocabularyProcessor(max_document_length) 12 | 13 | def restore_vocab_processor(_, vocab_path): 14 | return learn.preprocessing.VocabularyProcessor.restore(vocab_path) 15 | 16 | def clean_data(_, string): 17 | """ 18 | Tokenization/string cleaning for all datasets except for SST. 19 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 20 | """ 21 | string = string.strip() 22 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 23 | string = re.sub(r"\'s", " \'s", string) 24 | string = re.sub(r"\'ve", " \'ve", string) 25 | string = re.sub(r"n\'t", " n\'t", string) 26 | string = re.sub(r"\'re", " \'re", string) 27 | string = re.sub(r"\'d", " \'d", string) 28 | string = re.sub(r"\'ll", " \'ll", string) 29 | string = re.sub(r",", " , ", string) 30 | string = re.sub(r"!", " ! ", string) 31 | string = re.sub(r"\(", " \( ", string) 32 | string = re.sub(r"\)", " \) ", string) 33 | string = re.sub(r"\?", " \? 
", string) 34 | string = re.sub(r"\s{2,}", " ", string) 35 | return string.strip().lower() 36 | -------------------------------------------------------------------------------- /GraphCNN/SVM_model.py: -------------------------------------------------------------------------------- 1 | 2 | # HR-SVM 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_input 13 | import graphcnn_option 14 | 15 | 16 | 17 | class Model(object): 18 | ''' svm model 19 | ''' 20 | 21 | def __init__(self): 22 | self._paramaters_list = [] 23 | 24 | def linear_SVM(self, data, target): 25 | ''' Linear Support Vector Machine: Soft Margin 26 | data: 2D of [samples number, feature vector dimension] 27 | target: 2D of [samples number, 1], with value -1 or 1 28 | ''' 29 | # feature vector dimension 30 | feature_dim = data.get_shape()[1].value 31 | 32 | # Create variables for linear regression 33 | A = tf.Variable(tf.random_normal(shape=[feature_dim,1])) 34 | b = tf.Variable(tf.random_normal(shape=[1,1])) 35 | 36 | # record para 37 | self._paramaters_list.append(A) 38 | self._paramaters_list.append(b) 39 | 40 | # Declare model operations 41 | model_output = tf.sub(tf.matmul(data, A), b) 42 | 43 | # Declare vector L2 'norm' function squared 44 | l2_norm = tf.reduce_sum(tf.square(A)) 45 | 46 | # Declare loss function 47 | # Loss = max(0, 1-pred*actual) + alpha * L2_norm(A)^2 48 | # L2 regularization parameter, alpha 49 | alpha = tf.constant([0.01]) 50 | # Margin term in loss 51 | classification_term = tf.reduce_mean(tf.maximum(0., tf.sub(1., tf.mul(model_output, target)))) 52 | # Put terms together 53 | loss = tf.add(classification_term, tf.mul(alpha, l2_norm),name='svm_loss') 54 | 55 | tf.add_to_collection('losses', loss) 56 | 57 | return model_output 58 | 59 | 60 | def compute_dependencies_loss(model_list): 61 | # Calculate the Variable's dependency constraint 62 | filename = os.path.join(graphcnn_option.DATA_PATH, 'fathercode') 63 | father = np.loadtxt(filename, dtype=int) 64 | 65 | # Calculate the inner nodes' parameters value 66 | inner = np.zeros([graphcnn_input.NUM_CLASSES]) 67 | for i in range(0, graphcnn_input.NUM_CLASSES): 68 | father_i = father[i] 69 | if father_i != -1: 70 | inner[father_i] = 1 71 | nodes = [] 72 | for i in range(0, graphcnn_input.NUM_CLASSES): 73 | nodes.append([]) 74 | for i in range(0, graphcnn_input.NUM_CLASSES): 75 | if inner[i] == 1: 76 | father_i = father[i] 77 | nodes[i].append(model_list[i]._paramaters_list) 78 | if father_i != -1: 79 | nodes[i].append(model_list[father_i]._paramaters_list) 80 | nodes[father_i].append(model_list[i]._paramaters_list) 81 | nodes_paras = [] 82 | for i in range(0, graphcnn_input.NUM_CLASSES): 83 | para_list = [] 84 | if inner[i] == 1: 85 | para_list_len = len(nodes[i][0]) 86 | para_list_num = len(nodes[i]) 87 | for para_i in range(0,para_list_len): 88 | para = [] 89 | for para_list_i in range(0,para_list_num): 90 | para.append(nodes[i][para_list_i][para_i]) 91 | para_list.append(tf.truediv(tf.add_n(para), float(para_list_num))) ##??????????????? 
92 | nodes_paras.append(para_list) 93 | 94 | for i in range(0, graphcnn_input.NUM_CLASSES): 95 | if inner[i] == 1: 96 | model_para = model_list[i]._paramaters_list 97 | father_model_para = nodes_paras[i] 98 | else: 99 | model_para = model_list[i]._paramaters_list 100 | father_i = father[i] 101 | if father_i != -1: 102 | father_model_para = nodes_paras[father_i] 103 | assert len(model_para) == len(father_model_para), ' something is wrong' 104 | for j in range(0, len(model_para)): 105 | sub_vector = tf.sub(model_para[j], father_model_para[j]) 106 | reshape = tf.reshape(sub_vector, [1, -1]) 107 | reshape_trans = tf.reshape(sub_vector, [-1, 1]) 108 | dependencies = tf.mul(tf.matmul(reshape, reshape_trans)[0, 0], graphcnn_option.VARIABLE_DEPENDENCY, 109 | name='dependencies_loss') 110 | tf.add_to_collection('losses', dependencies) 111 | 112 | def SVM_inference(data, target, dependencies_loss=True): 113 | ''' 114 | data: 2D of [samples number, feature vector dimension] 115 | target: 2D of [samples number, NUM_CLASSES], with value -1 or 1 116 | ''' 117 | 118 | model_list = [] 119 | logits_list = [] 120 | for i in range(0, graphcnn_input.NUM_CLASSES): 121 | target_i = target[:,i] 122 | target_i = tf.reshape(target_i, [-1, 1]) 123 | model = Model() 124 | logits = model.linear_SVM(data, target_i) 125 | model_list.append(model) 126 | logits_list.append(logits) 127 | logits = tf.concat(1, logits_list) 128 | 129 | if dependencies_loss: 130 | compute_dependencies_loss(model_list) 131 | 132 | return logits 133 | 134 | def SVM_loss(): 135 | ''' add loss function: cross entropy. 136 | ''' 137 | return tf.add_n(tf.get_collection('losses'), name='total_loss') 138 | 139 | def _add_loss_summaries(total_loss): 140 | """ Add summaries for losses. 141 | Generates moving average for all losses and associated summaries for visualizing the performance of the network. 142 | moving average -> eliminate noise 143 | 144 | Args: 145 | total_loss: Total loss from loss(). 146 | Returns: 147 | loss_averages_op: op for generating moving averages of losses. 148 | """ 149 | # Compute the moving average of all individual losses and the total loss. 150 | # The moving averages are computed using exponential decay: 151 | # shadow_variable -= (1 - decay) * (shadow_variable - variable) equivalent to: 152 | # shadow_variable = decay * shadow_variable + (1 - decay) * variable 153 | loss_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY, name='avg') 154 | losses = tf.get_collection('losses') 155 | loss_averages_op = loss_averages.apply(losses + [total_loss]) 156 | 157 | if graphcnn_option.SUMMARYWRITER: 158 | # Attach a scalar summary to all individual losses and the total loss; do the same for the averaged version of the losses. 159 | for l in losses + [total_loss]: 160 | # Name each loss as '(raw)' and name the moving average version of the loss as the original loss name. 161 | tf.scalar_summary(l.op.name + ' (raw)', l) 162 | tf.scalar_summary(l.op.name, loss_averages.average(l)) 163 | 164 | return loss_averages_op 165 | 166 | def SVM_train(total_loss, global_step): 167 | """ Create an optimizer and apply to all trainable variables. 168 | Add moving average for all trainable variables. 169 | 170 | Args: 171 | total_loss: total loss from loss(). 172 | global_step: Integer Variable counting the number of training steps processed. 173 | 174 | Returns: 175 | train_op: op for training. 176 | """ 177 | 178 | # Variables that affect learning rate. 
179 | num_batches_per_epoch = graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / graphcnn_input.TRAIN_BATCH_SIZE 180 | decay_steps = int(num_batches_per_epoch * graphcnn_option.NUM_EPOCHS_PER_DECAY) 181 | 182 | # Decay the learning rate exponentially based on the number of steps. 183 | # decayed_learning_rate = INITIAL_LEARNING_RATE * LEARNING_RATE_DECAY_RATE ^ (global_step / decay_steps) 184 | lr = tf.train.exponential_decay(graphcnn_option.INITIAL_LEARNING_RATE, 185 | global_step, 186 | decay_steps, 187 | graphcnn_option.LEARNING_RATE_DECAY_RATE, 188 | staircase=True) 189 | 190 | if graphcnn_option.SUMMARYWRITER: 191 | tf.scalar_summary('learning_rate', lr) 192 | 193 | # Generate moving averages of all losses and associated summaries. 194 | loss_averages_op = _add_loss_summaries(total_loss) 195 | 196 | # Compute gradients 197 | with tf.control_dependencies([loss_averages_op]): 198 | # opt = tf.train.GradientDescentOptimizer(lr) 199 | opt = tf.train.MomentumOptimizer(lr, graphcnn_option.MOMENTUM) 200 | grads = opt.compute_gradients(total_loss) 201 | 202 | # Apply gradients. 203 | apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 204 | 205 | if graphcnn_option.SUMMARYWRITER: 206 | # Add histograms for trainable variables. 207 | for var in tf.trainable_variables(): 208 | tf.histogram_summary(var.op.name, var) 209 | 210 | # Add histograms for gradients. 211 | for grad, var in grads: 212 | if grad is not None: 213 | tf.histogram_summary(var.op.name + '/gradients', grad) 214 | 215 | # Track the moving averages of all trainable variables. 216 | variable_averages = tf.train.ExponentialMovingAverage( 217 | graphcnn_option.MOVING_AVERAGE_DECAY, global_step) 218 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 219 | 220 | with tf.control_dependencies([apply_gradient_op, variables_averages_op]): 221 | train_op = tf.no_op(name='train') 222 | 223 | return train_op 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /GraphCNN/SVM_train.py: -------------------------------------------------------------------------------- 1 | 2 | # HR-SVM 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_input 13 | import graphcnn_option 14 | import SVM_model 15 | 16 | 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | tf.app.flags.DEFINE_string('train_dir', './tmp/graphcnn_train', 21 | """Directory where to write event logs and checkpoint.""") 22 | tf.app.flags.DEFINE_integer('max_epochs', 8000, 23 | """Number of batches to run.""") 24 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 25 | """Whether to log device placement.""") 26 | 27 | 28 | # max_steps for train: 29 | STEPS_PER_ECOPH = None 30 | MAX_STEPS = None 31 | # the period to save the model checkpoint. 32 | CKPT_PERIOD = None 33 | 34 | trainDataSet = None 35 | 36 | 37 | def train(newTrain,checkpoint): 38 | with tf.Graph().as_default(): 39 | global_step = tf.Variable(0, trainable=False) 40 | 41 | data = tf.placeholder(tf.float32, [graphcnn_input.TRAIN_BATCH_SIZE, graphcnn_input.NUM_CHANNELS]) # NUM_CHANNELS: feature dim 42 | labels = tf.placeholder(tf.int32, [graphcnn_input.TRAIN_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) # with value: -1,1 43 | 44 | # inference model. 
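# SVM_inference builds one linear soft-margin SVM per class (one-vs-rest) on the
# shared feature input and concatenates the per-class scores into `logits`; the
# `labels` placeholder is expected to hold -1/+1 per class, and the hierarchical
# dependency loss ties each class's parameters to its parent's
# (see SVM_model.compute_dependencies_loss).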
45 | logits = SVM_model.SVM_inference(data, labels) 46 | 47 | # Declare prediction function 48 | prediction = tf.sign(logits) 49 | accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, y_target), tf.float32)) 50 | 51 | # Calculate loss. 52 | loss = SVM_model.SVM_loss() 53 | 54 | # updates the model parameters. 55 | train_op = SVM_model.SVM_train(loss, global_step) 56 | 57 | # Create a saver. 58 | saver = tf.train.Saver(var_list=tf.global_variables(), 59 | max_to_keep=6, 60 | keep_checkpoint_every_n_hours=10) 61 | 62 | if graphcnn_option.SUMMARYWRITER: 63 | # Build the summary operation based on the TF collection of Summaries. 64 | summary_op = tf.merge_all_summaries() 65 | 66 | # Build an initialization operation to run below. 67 | init = tf.global_variables_initializer() 68 | 69 | # Start running operations on the Graph. allow_soft_placement must be set to 70 | # True to build towers on GPU, as some of the ops do not have GPU implementations. 71 | sess = tf.Session(config=tf.ConfigProto( 72 | allow_soft_placement=True, 73 | log_device_placement=FLAGS.log_device_placement)) 74 | 75 | first_step = 0 76 | if not newTrain: 77 | if checkpoint == '0': # choose the latest one 78 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 79 | if ckpt and ckpt.model_checkpoint_path: 80 | new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 81 | # Restores from checkpoint 82 | new_saver.restore(sess, ckpt.model_checkpoint_path) 83 | global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] 84 | first_step = int(global_step_for_restore) + 1 85 | else: 86 | print('No checkpoint file found') 87 | return 88 | else: # 89 | if os.path.exists(os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)): 90 | new_saver = tf.train.import_meta_graph( 91 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint + '.meta')) 92 | new_saver.restore(sess, 93 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)) 94 | first_step = int(checkpoint) + 1 95 | else: 96 | print('No checkpoint file found') 97 | return 98 | else: 99 | sess.run(init) 100 | 101 | if graphcnn_option.SUMMARYWRITER: 102 | summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) 103 | 104 | filename_train_log = os.path.join(FLAGS.train_dir, 'log_train') 105 | if os.path.exists(filename_train_log): 106 | file_train_log = open(filename_train_log, 'a') 107 | else: 108 | file_train_log = open(filename_train_log, 'w') 109 | 110 | # learning_rate = graphcnn_option.lr_decay_value[0] # 0.1(5), 0.01(100), 0.001(500), 0.0001(300), 0.00001(100) 111 | # learning_rate_index = 0 112 | for step in range(first_step,MAX_STEPS): 113 | # if learning_rate_index < len(graphcnn_option.lr_decay_value) - 1: 114 | # if step > STEPS_PER_ECOPH * graphcnn_option.lr_decay_ecophs[learning_rate_index]: 115 | # learning_rate_index = learning_rate_index + 1 116 | # learning_rate = graphcnn_option.lr_decay_value[learning_rate_index] 117 | 118 | train_data, train_label = trainDataSet.next_batch(graphcnn_input.TRAIN_BATCH_SIZE) 119 | start_time = time.time() 120 | _, loss_value = sess.run([train_op, loss], 121 | feed_dict= {data:train_data, labels:train_label}) 122 | duration = time.time() - start_time 123 | 124 | assert not np.isnan(loss_value), 'Model diverged with loss = NaN' 125 | 126 | if step % 10 == 0: 127 | sec_per_batch = float(duration) 128 | format_str = ('%s: step=%d, loss=%.4f; %.3f sec/batch)') 129 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch), file=file_train_log) 130 | 
print(format_str % (datetime.now(), step, loss_value, sec_per_batch)) 131 | 132 | if graphcnn_option.SUMMARYWRITER: 133 | if step % 100 == 0: 134 | summary_str = sess.run(summary_op, 135 | feed_dict= {data:train_data, labels:train_label}) 136 | summary_writer.add_summary(summary_str, step) 137 | 138 | # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta') 139 | if step % CKPT_PERIOD == 0 or (step + 1) == MAX_STEPS: 140 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 141 | saver.save(sess, checkpoint_path, global_step=step) 142 | file_train_log.close() 143 | 144 | def main(argv=None): 145 | global trainDataSet, evalDataSet, STEPS_PER_ECOPH, MAX_STEPS, CKPT_PERIOD 146 | newTrain = True 147 | checkpoint = 0 148 | # assert not tf.gfile.Exists(FLAGS.train_dir), 'please move the old train directory to pre_versions!' 149 | if tf.gfile.Exists(FLAGS.train_dir): 150 | ans = input('whether to open up a new training:(y/n)') 151 | if ans == 'y' or ans == 'Y': 152 | newTrain = True 153 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 154 | elif ans == 'n' or ans == 'N': 155 | newTrain = False 156 | checkpoint = input('please input the choosed checkpoint to restore:(0 for latest)') 157 | else: 158 | print('invalid input!') 159 | return 160 | if newTrain: 161 | tf.gfile.MakeDirs(FLAGS.train_dir) 162 | 163 | # update paras 164 | trainDataSet = graphcnn_input.generate_SVM_train_data(graphcnn_option.TRAIN_DATA_DIR, 165 | ont_hot=True,index_mode=True) 166 | 167 | # max_steps for train: 168 | STEPS_PER_ECOPH = math.ceil( 169 | graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / float(graphcnn_input.TRAIN_BATCH_SIZE)) 170 | MAX_STEPS = FLAGS.max_epochs * STEPS_PER_ECOPH 171 | 172 | # the period to save the model checkpoint. 173 | CKPT_PERIOD = graphcnn_option.CKPT_PERIOD # ????????????????????? 
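
# SVM_model.py itself is not reproduced in this listing. The sketch below is an
# assumed, minimal linear multi-label SVM consistent with the {-1, +1} `labels`
# placeholder declared at the top of train(): one weight column per class, a
# squared hinge loss, and an L2 penalty. The names _svm_sketch and the constant
# C are illustrative only, not the project's actual API; `tf` is the tensorflow
# module imported at the top of this file.
# (Note: the accuracy expression near the top of train() references y_target,
# which is not defined in this file; the float-cast `labels` placeholder
# appears to be what is intended there.)
def _svm_sketch(data, labels, feature_dim, num_classes, C=1.0):
    W = tf.get_variable('svm_W', [feature_dim, num_classes],
                        initializer=tf.truncated_normal_initializer(stddev=0.01))
    b = tf.get_variable('svm_b', [num_classes],
                        initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(data, W) + b                       # raw margins, one per class
    y = tf.cast(labels, tf.float32)                       # labels in {-1, +1}
    hinge = tf.square(tf.maximum(0.0, 1.0 - y * logits))  # squared hinge loss
    loss = C * tf.reduce_mean(hinge) + 0.5 * tf.reduce_sum(tf.square(W))
    return logits, loss
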
174 | # CKPT_PERIOD = 5000 175 | # tem = str(STEPS_PER_ECOPH * 20) # save the model every ecoph # 5 176 | # CKPT_PERIOD = int(int(tem[0]) * pow(10, len(tem) - 1)) 177 | 178 | print('training...') 179 | train(newTrain,checkpoint) 180 | 181 | 182 | if __name__ == '__main__': 183 | tf.app.run() 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /GraphCNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ must run in python3x""" 3 | import numpy as np 4 | import tensorflow as tf 5 | import os 6 | import shutil 7 | __author__ = 'Yu He' 8 | __version__ = 'v30' 9 | 10 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 11 | 12 | 13 | detail_filename = os.path.join('./data', 'best_eval_for_predicted_value_dictribution') 14 | total_predicted_value_dictribution = np.loadtxt(detail_filename,dtype=float) 15 | detail_filename = os.path.join('./data', 'best_eval_for_true_value') 16 | total_true_value = np.loadtxt(detail_filename,dtype=int) 17 | 18 | total_predicted_value = ((total_predicted_value_dictribution) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 19 | 20 | 21 | 22 | # label34 = np.ones([total_true_value.shape[0],17],dtype=int) 23 | # total_true_value = np.concatenate((total_true_value,label34),axis=1) 24 | # total_predicted_value = np.concatenate((total_predicted_value,label34),axis=1) 25 | # 26 | 27 | 28 | filename_eval_log = os.path.join('./data', 'log_eval') 29 | file_eval_log = open(filename_eval_log, 'w') 30 | np.set_printoptions(threshold=np.nan) 31 | print('\nevaluation:', file=file_eval_log) 32 | print('\nevaluation:') 33 | 34 | total_predicted_value = total_predicted_value.astype(bool) 35 | total_true_value = total_true_value.astype(bool) 36 | 37 | print(' example based evaluations:', file=file_eval_log) 38 | print(' example based evaluations:') 39 | 40 | equal = total_true_value == total_predicted_value 41 | match = np.sum(equal, axis=1) == np.size(equal, axis=1) 42 | exact_match_ratio = np.sum(match) / np.size(match) 43 | print(' exact_match_ratio = %.4f' % exact_match_ratio, file=file_eval_log) 44 | print(' exact_match_ratio = %.4f' % exact_match_ratio) 45 | 46 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 47 | true_or_predict = np.sum(total_true_value | total_predicted_value, axis=1) 48 | accuracy = np.mean(true_and_predict / true_or_predict) 49 | print(' accuracy = %.4f' % accuracy, file=file_eval_log) 50 | print(' accuracy = %.4f' % accuracy) 51 | 52 | precison = np.mean(true_and_predict / (np.sum(total_predicted_value, axis=1) + 1e-9)) 53 | print(' precison = %.4f' % precison, file=file_eval_log) 54 | print(' precison = %.4f' % precison) 55 | 56 | recall = np.mean(true_and_predict / np.sum(total_true_value, axis=1)) 57 | print(' recall = %.4f' % recall, file=file_eval_log) 58 | print(' recall = %.4f' % recall) 59 | 60 | F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 61 | + np.sum(total_predicted_value, axis=1))) 62 | print(' F1_Measure = %.4f' % F1_Measure, file=file_eval_log) 63 | print(' F1_Measure = %.4f' % F1_Measure) 64 | 65 | HammingLoss = np.mean(total_true_value ^ total_predicted_value) 66 | print(' HammingLoss = %.4f' % HammingLoss, file=file_eval_log) 67 | print(' HammingLoss = %.4f' % HammingLoss) 68 | 69 | 70 | print(' label based evaluations:', file=file_eval_log) 71 | print(' label based evaluations:') 72 | 73 | TP = np.sum(total_true_value & 
total_predicted_value,axis=0,dtype=np.int32) 74 | FP = np.sum((~total_true_value) & total_predicted_value,axis=0,dtype=np.int32) 75 | FN = np.sum(total_true_value & (~total_predicted_value),axis=0,dtype=np.int32) 76 | 77 | TP_re = np.reshape(TP,[TP.shape[0],1]) 78 | FP_re = np.reshape(FP,[FP.shape[0],1]) 79 | FN_re = np.reshape(FN,[FN.shape[0],1]) 80 | re = np.concatenate((TP_re,FP_re,FN_re),axis=1) 81 | print('TP FP FN:') 82 | print('TP FP FN:', file=file_eval_log) 83 | print(re,file=file_eval_log) 84 | print(re) 85 | 86 | 87 | # TP = np.concatenate((TP[0:6],TP[7:28],TP[29:31],TP[32:36],TP[37:52],TP[53:])) 88 | # FP = np.concatenate((FP[0:6],FP[7:28],FP[29:31],FP[32:36],FP[37:52],FP[53:])) 89 | # FN = np.concatenate((FN[0:6],FN[7:28],FN[29:31],FN[32:36],FN[37:52],FN[53:])) 90 | 91 | # for i in [6,28,31,36,52]: 92 | # TP[i] = TP[i-1] 93 | # FP[i] = FP[i - 1] 94 | # FN[i] = FN[i - 1] 95 | # 96 | # TP = np.concatenate((TP[0:49],TP[51:66],TP[67:69],TP[70:80],TP[81:])) 97 | # FP = np.concatenate((FP[0:49],FP[51:66],FP[67:69],FP[70:80],FP[81:])) 98 | # FN = np.concatenate((FN[0:49],FN[51:66],FN[67:69],FN[70:80],FN[81:])) 99 | 100 | 101 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 102 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 103 | Micro_F1 = (2 * _P *_R) / (_P + _R) 104 | print(' P = %.4f' % _P, file=file_eval_log) 105 | print(' P = %.4f' % _P) 106 | print(' R = %.4f' % _R, file=file_eval_log) 107 | print(' R = %.4f' % _R) 108 | print(' Micro-F1 = %.4f' % Micro_F1, file=file_eval_log) 109 | print(' Micro-F1 = %.4f' % Micro_F1) 110 | 111 | _P_t = TP / (TP + FP + 1e-9) 112 | _R_t = TP / (TP + FN + 1e-9) 113 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 114 | 115 | 116 | _P_t_re = np.reshape(_P_t,[_P_t.shape[0],1]) 117 | _R_t_re = np.reshape(_R_t,[_R_t.shape[0],1]) 118 | re = np.concatenate((_P_t_re,_R_t_re),axis=1) 119 | print('_P_t _R_t:') 120 | print('_P_t:', file=file_eval_log) 121 | print(re,file=file_eval_log) 122 | print(re) 123 | 124 | print(' Macro-F1 = %.4f' % Macro_F1, file=file_eval_log) 125 | print(' Macro-F1 = %.4f' % Macro_F1) 126 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_eval_without_labels.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from datetime import datetime 4 | import math 5 | import time 6 | import os 7 | import shutil 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | import graphcnn_model 13 | import graphcnn_input 14 | import graphcnn_option 15 | 16 | 17 | evalDataSet = None 18 | 19 | FLAGS = tf.app.flags.FLAGS 20 | 21 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_eval', 22 | """Directory where to write event logs.""") 23 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 24 | """Directory where to read model checkpoints.""") 25 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 26 | """How often to run the eval.""") 27 | tf.app.flags.DEFINE_boolean('run_once', False, 28 | """Whether to run eval only once.""") 29 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 30 | """Whether to log device placement.""") 31 | 32 | 33 | 34 | 35 | def evaluate(checkpoint): 36 | with tf.Graph().as_default() as g: 37 | # Get images and labels 38 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 39 | graphcnn_input.NUM_CHANNELS]) 40 | # labels = tf.placeholder(tf.int32, 
[graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 41 | 42 | # inference 43 | # logits = graphcnn_model.inference(data, eval_data=True) 44 | logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 45 | 46 | # multi-label sigmoid 47 | logits = tf.sigmoid(logits) 48 | 49 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 50 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 51 | variables_to_restore = variable_averages.variables_to_restore() 52 | saver = tf.train.Saver(variables_to_restore) 53 | 54 | # Build the summary operation based on the TF collection of Summaries. 55 | # summary_op = tf.merge_all_summaries() 56 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 57 | 58 | 59 | with tf.Session(config=tf.ConfigProto( 60 | allow_soft_placement=True, 61 | log_device_placement=FLAGS.log_device_placement)) as sess: 62 | if checkpoint == '0': 63 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 64 | if ckpt and ckpt.model_checkpoint_path: 65 | # Restores from checkpoint 66 | saver.restore(sess, ckpt.model_checkpoint_path) 67 | # extract global_step 68 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 69 | else: 70 | print('No checkpoint file found') 71 | return 72 | else: 73 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 74 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 75 | global_step_for_restore = int(checkpoint) 76 | else: 77 | print('No checkpoint file found') 78 | return 79 | 80 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 81 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 82 | step = 0 83 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 84 | while step < num_iter: 85 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 86 | predicted_value = sess.run( 87 | logits, feed_dict={data: test_data}) 88 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 89 | step += 1 90 | 91 | total_predicted_value = total_predicted_value[1:] 92 | 93 | np.savetxt('./log_eval_for_predicted_value_dictribution', total_predicted_value[range(0,100)], fmt='%.4f') 94 | 95 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution') 96 | if os.path.exists(detail_filename): 97 | os.remove(detail_filename) 98 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 99 | total_predicted_value = ((total_predicted_value) >= graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 100 | assert total_sample_count == total_predicted_value.shape[0], 'sample_count error!' 
101 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 102 | if os.path.exists(detail_filename): 103 | os.remove(detail_filename) 104 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 105 | 106 | np.savetxt('./log_eval_for_predicted_value', total_predicted_value[range(0,100)], fmt='%.4f') 107 | 108 | detail_filename = os.path.join(graphcnn_option.DATA_PATH,'remap') 109 | remap = np.loadtxt(detail_filename,dtype=int) 110 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 111 | fr = open(detail_filename, 'w') 112 | for i in range(0, np.size(total_predicted_value, axis=0)): 113 | labels = np.where(total_predicted_value[i] == 1)[0] 114 | labels_remap = remap[labels,0] 115 | for elem in labels_remap: 116 | print(elem, end=' ', file=fr) 117 | print('', file=fr) 118 | fr.close() 119 | 120 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 121 | file_eval_log = open(filename_eval_log, 'w') 122 | np.set_printoptions(threshold=np.nan) 123 | print('\nevaluation:', file=file_eval_log) 124 | print('\nevaluation:') 125 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 126 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 127 | print('evaluation is end...') 128 | print('evaluation is end...', file=file_eval_log) 129 | 130 | print('evaluation samples number:%d, evaluation classes number:%d' % 131 | (total_predicted_value.shape[0], total_predicted_value.shape[1]),file=file_eval_log) 132 | print('evaluation samples number:%d, evaluation classes number:%d' % 133 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 134 | print('evaluation detail: ' 135 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 136 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 137 | file=file_eval_log) 138 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 139 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 140 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 141 | file_eval_log.close() 142 | 143 | 144 | def main(argv=None): # pylint: disable=unused-argument 145 | global evalDataSet 146 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
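
# The post-processing in evaluate() above boils down to the two steps sketched
# here: threshold the per-class sigmoid scores at
# graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL, then translate column
# indices back to the original label ids through the first column of the
# 'remap' table. The helper name and the array shapes (N examples, C classes,
# remap of shape [C, 2]) are assumptions for illustration only; `np` is the
# numpy module imported at the top of this file.
def _decode_predictions_sketch(scores, remap, threshold=0.5):
    """scores: (N, C) float sigmoid outputs; remap[:, 0]: original label ids."""
    binary = (scores >= threshold).astype(int)            # (N, C) 0/1 decisions
    return [remap[np.where(row == 1)[0], 0].tolist()      # ids of predicted labels
            for row in binary]
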
147 | if tf.gfile.Exists(FLAGS.eval_dir): 148 | print('the evaluate data has already exists!') 149 | str = input('continue will delete the old evaluate directory:(y/n)') 150 | if str == 'y' or str == 'Y': 151 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 152 | elif str == 'n' or str == 'N': 153 | print('eval end!') 154 | return 155 | else: 156 | print('invalid input!') 157 | return 158 | tf.gfile.MakeDirs(FLAGS.eval_dir) 159 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 160 | checkpoint = '0' 161 | evalDataSet = graphcnn_input.generate_eval_data(graphcnn_option.EVAL_DATA_DIR, 162 | ont_hot=True,index_mode=True, 163 | label_used=False) 164 | print('evaluating...') 165 | evaluate(checkpoint) 166 | 167 | 168 | if __name__ == '__main__': 169 | tf.app.run() 170 | 171 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_all.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 222 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | 19 | evalDataSet = None 20 | 21 | FLAGS = tf.app.flags.FLAGS 22 | 23 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 24 | """Directory where to write event logs.""") 25 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 26 | """Directory where to read model checkpoints.""") 27 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 28 | """How often to run the eval.""") 29 | tf.app.flags.DEFINE_boolean('run_once', False, 30 | """Whether to run eval only once.""") 31 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 32 | """Whether to log device placement.""") 33 | 34 | 35 | 36 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 37 | 38 | def evaluate(checkpoint,test_index_array): 39 | with tf.Graph().as_default() as g, tf.device('/cpu:0'): 40 | # Get images and labels 41 | data = tf.placeholder(tf.float32, [graphcnn_input.EVAL_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 42 | graphcnn_input.NUM_CHANNELS]) 43 | # labels = tf.placeholder(tf.int32, [graphcnn_input.EVAL_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 44 | 45 | # inference 46 | logits = graphcnn_model.inference(data, eval_data=True) 47 | # logits = graphcnn_model.inference_CPU(data, eval_data=True, dependencies_loss=False) 48 | 49 | # multi-label sigmoid 50 | logits = tf.sigmoid(logits) 51 | 52 | # Restore the moving average version of the learned variables for eval. # ????????????????????????? 53 | variable_averages = tf.train.ExponentialMovingAverage(graphcnn_option.MOVING_AVERAGE_DECAY) 54 | variables_to_restore = variable_averages.variables_to_restore() 55 | saver = tf.train.Saver(variables_to_restore) 56 | 57 | # Build the summary operation based on the TF collection of Summaries. 
58 | # summary_op = tf.merge_all_summaries() 59 | # summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) 60 | 61 | 62 | with tf.Session(config=tf.ConfigProto( 63 | allow_soft_placement=True, 64 | log_device_placement=FLAGS.log_device_placement)) as sess: 65 | if checkpoint == '0': 66 | ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) 67 | if ckpt and ckpt.model_checkpoint_path: 68 | # Restores from checkpoint 69 | saver.restore(sess, ckpt.model_checkpoint_path) 70 | # extract global_step 71 | global_step_for_restore = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) 72 | else: 73 | print('No checkpoint file found') 74 | return 75 | else: 76 | if os.path.exists(os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)): 77 | saver.restore(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt-' + checkpoint)) 78 | global_step_for_restore = int(checkpoint) 79 | else: 80 | print('No checkpoint file found') 81 | return 82 | 83 | num_iter = int(math.floor(graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / graphcnn_input.EVAL_BATCH_SIZE)) 84 | total_sample_count = num_iter * graphcnn_input.EVAL_BATCH_SIZE 85 | step = 0 86 | total_predicted_value = np.zeros([1, graphcnn_input.NUM_CLASSES], dtype=np.float32) ## 87 | while step < num_iter: 88 | test_data = evalDataSet.next_batch(graphcnn_input.EVAL_BATCH_SIZE) 89 | predicted_value = sess.run( 90 | logits, feed_dict={data: test_data}) 91 | total_predicted_value = np.concatenate((total_predicted_value, predicted_value), axis=0) 92 | step += 1 93 | 94 | total_predicted_value = total_predicted_value[1:] 95 | 96 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 97 | if os.path.exists(detail_filename): 98 | os.remove(detail_filename) 99 | np.savetxt(detail_filename, total_predicted_value, fmt='%.4f') 100 | 101 | 102 | filename_eval_log = os.path.join(FLAGS.eval_dir, 'log_eval') 103 | file_eval_log = open(filename_eval_log, 'w') 104 | np.set_printoptions(threshold=np.nan) 105 | print('\nevaluation:', file=file_eval_log) 106 | print('\nevaluation:') 107 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore), file=file_eval_log) 108 | print(' %s, ckpt-%d' % (datetime.now(), global_step_for_restore)) 109 | print('evaluation is end...') 110 | print('evaluation is end...', file=file_eval_log) 111 | 112 | print('evaluation samples number:%d, evaluation classes number:%d' % 113 | (total_predicted_value.shape[0], total_predicted_value.shape[1]), file=file_eval_log) 114 | print('evaluation samples number:%d, evaluation classes number:%d' % 115 | (total_predicted_value.shape[0], total_predicted_value.shape[1])) 116 | print('evaluation detail: ' 117 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 118 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution'), 119 | file=file_eval_log) 120 | print('evaluation detail: ' + os.path.join(FLAGS.eval_dir, 'log_eval') 121 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 122 | + ', ' + os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution')) 123 | file_eval_log.close() 124 | 125 | 126 | 127 | def main(argv=None): # pylint: disable=unused-argument 128 | global evalDataSet 129 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
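
# Pipeline note: this "_all" script runs the network once over the full test
# set and writes the raw per-class sigmoid scores to
# 'log_eval_for_predicted_value_dictribution_all' in eval_dir. The companion
# scripts graphcnn_hier_eval_without_labels_some.py, ..._some2.py and
# ..._some_root.py do not run the model again; they reload that file, select
# the example indices belonging to the current hierarchy root, re-threshold
# the scores (at 0.9 in those scripts), and route the results to the
# hier_result_leaf / hier_result_root files consumed by the next level of the
# hierarchical evaluation.
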
130 | 131 | if tf.gfile.Exists(FLAGS.eval_dir): 132 | # print('the evaluate data has already exists!') 133 | # str = input('continue will delete the old evaluate directory:(y/n)') 134 | # if str == 'y' or str == 'Y': 135 | tf.gfile.DeleteRecursively(FLAGS.eval_dir) 136 | #elif str == 'n' or str == 'N': 137 | # print('eval end!') 138 | # return 139 | #else: 140 | # print('invalid input!') 141 | # return 142 | tf.gfile.MakeDirs(FLAGS.eval_dir) 143 | 144 | test_index_array = np.array(range(0, 81262)) 145 | 146 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 147 | checkpoint = '0' 148 | evalDataSet = graphcnn_input.generate_hier_eval_data(test_index_array, 149 | data_dir=graphcnn_option.EVAL_DATA_DIR, 150 | ont_hot=True, 151 | index_mode=True, 152 | label_used=False) 153 | print('evaluating...') 154 | evaluate(checkpoint,test_index_array) 155 | 156 | 157 | if __name__ == '__main__': 158 | tf.app.run() 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | 
total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # rootstr_tmp = [] 98 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 99 | fr = open(detail_filename, 'w') 100 | for i in range(0, np.size(total_predicted_value, axis=0)): 101 | labels = np.where(total_predicted_value[i] == 1)[0] 102 | if len(labels) > 0: 103 | labels_remap = remap[labels, 0] 104 | for elem in labels_remap: 105 | print(elem, end=' ', file=fr) 106 | if elem in total_remap[:,0]: # leaf 107 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 108 | else: 109 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 110 | # for j in range(0,len(rootlist)): 111 | # if elem in rootlist[j]: 112 | # if rootstr[j] not in rootstr_tmp: 113 | # rootstr_tmp.append(rootstr[j]) 114 | print('', file=fr) 115 | else: 116 | # labels_remap = remap[:, 0] 117 | labels = total_predicted_value_argmax[i] 118 | labels_value = total_predicted_value_max[i] 119 | labels_remap = remap[labels, 0] 120 | # for elem in labels_remap: 121 | elem = labels_remap 122 | print(elem, file=fr) 123 | if elem in total_remap[:, 0]: # leaf 124 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 125 | else: 126 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 127 | # if labels_value < 0.5: 128 | # labels_remap = remap[:, 0] 129 | # for elem in labels_remap: 130 | # if elem not in total_remap[:, 0]: 131 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 132 | 133 | fr.close() 134 | fr_leaf.close() 135 | fr_root.close() 136 | fr_leaf_exp.close() 137 | 138 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 139 | # fr = open(filename, 'w') 140 | # for one in rootstr_tmp: 141 | # print(one) 142 | # print(one,file=fr) 143 | # fr.close() 144 | 145 | 146 | 147 | 148 | def main(argv=None): # pylint: disable=unused-argument 149 | global evalDataSet 150 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 
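
# Decision rule used in evaluate() above: for each example, every class whose
# score clears the 0.9 threshold is emitted; predicted ids that appear in the
# first column of the global 'remap' table are treated as leaf labels (written
# to hier_result_leaf), all other ids are treated as internal nodes (written to
# hier_result_root) so the next round of hierarchical evaluation can descend
# into them. When no class reaches the threshold, the single arg-max class is
# used as a fallback and, if it is a leaf, is logged together with its score in
# hier_result_leaf_exp.
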
151 | 152 | # test_index_array = np.array(range(0, 81262)) 153 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 154 | test_index_array = np.array(range(0,81262)) 155 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 156 | else: 157 | test_index_array = generate_eval_index() 158 | if test_index_array is None or len(test_index_array)==0: 159 | print('no hier_data need eval') 160 | return 161 | else: 162 | print('choosing for evaluation...') 163 | print('choosed number:%d' % len(test_index_array)) 164 | 165 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 166 | checkpoint = '0' 167 | 168 | # print('choosing for evaluation...') 169 | evaluate(checkpoint,test_index_array) 170 | 171 | 172 | if __name__ == '__main__': 173 | tf.app.run() 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 333 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | def evaluate(checkpoint,test_index_array): 37 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 38 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 39 | total_predicted_value = total_predicted_value[test_index_array] 40 | 41 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 42 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 43 | total_predicted_value = ( 44 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 45 | 46 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 47 | if os.path.exists(detail_filename): 48 | os.remove(detail_filename) 49 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 50 | 51 | 52 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 53 | total_remap = np.loadtxt(filename, dtype=int) 54 | 55 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 56 | graphcnn_option.HIER_labels_remap_file) 57 | remap = np.loadtxt(detail_filename, dtype=int) 58 | 59 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 60 | fr_leaf = open(filename,'a') 61 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 62 | fr_root = open(filename, 'w') 63 | 64 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 65 | # fr = open(filename, 'r') 66 | # rootstr = 
fr.readlines() 67 | # fr.close() 68 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 69 | # fr = open(filename, 'r') 70 | # rootlines = fr.readlines() 71 | # fr.close() 72 | # rootlist = [] 73 | # for line in rootlines: 74 | # line = line.strip() 75 | # linelist = line.split(' ') 76 | # linelist = [int(k) for k in linelist] 77 | # rootlist.append(linelist) 78 | 79 | # rootstr_tmp = [] 80 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 81 | fr = open(detail_filename, 'w') 82 | for i in range(0, np.size(total_predicted_value, axis=0)): 83 | labels = np.where(total_predicted_value[i] == 1)[0] 84 | if len(labels) > 0: 85 | labels_remap = remap[labels, 0] 86 | for elem in labels_remap: 87 | print(elem, end=' ', file=fr) 88 | if elem in total_remap[:,0]: # leaf 89 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 90 | print('', file=fr) 91 | else: 92 | labels = total_predicted_value_argmax[i] 93 | labels_remap = remap[labels, 0] 94 | elem = labels_remap 95 | labels_value = total_predicted_value_max[i] 96 | print(elem, file=fr) 97 | if elem in total_remap[:, 0]: # leaf 98 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_root) 99 | 100 | 101 | fr.close() 102 | fr_leaf.close() 103 | fr_root.close() 104 | 105 | 106 | 107 | 108 | def main(argv=None): # pylint: disable=unused-argument 109 | global evalDataSet 110 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 111 | 112 | test_index_array = np.array(range(0, 81262)) 113 | print('choosing for evaluation...') 114 | print('choosed number:%d' % len(test_index_array)) 115 | 116 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 117 | checkpoint = '0' 118 | 119 | # print('choosing for evaluation...') 120 | evaluate(checkpoint,test_index_array) 121 | 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_hier_eval_without_labels_some_root.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 444 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import shutil 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import graphcnn_model 15 | import graphcnn_input 16 | import graphcnn_option 17 | 18 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.9 19 | 20 | evalDataSet = None 21 | 22 | FLAGS = tf.app.flags.FLAGS 23 | 24 | tf.app.flags.DEFINE_string('eval_dir', './tmp/graphcnn_hier_eval', 25 | """Directory where to write event logs.""") 26 | tf.app.flags.DEFINE_string('checkpoint_dir', './tmp/graphcnn_train', 27 | """Directory where to read model checkpoints.""") 28 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 29 | """How often to run the eval.""") 30 | tf.app.flags.DEFINE_boolean('run_once', False, 31 | """Whether to run eval only once.""") 32 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 33 | """Whether to log device placement.""") 34 | 35 | 36 | 37 | 38 | # 生成测试数据的索引文件 39 | def generate_eval_index(): 40 | test_index_array = [] 41 | # filepath = os.path.join(graphcnn_option.DATA_PATH, graphcnn_option.HIER_DIR_NAME) 42 | filepath = '../hier_eval_root' 43 | pathDir = os.listdir(filepath) 44 | for allDir in pathDir: 45 | child = os.path.join(filepath, allDir) 46 | if os.path.getsize(child): 47 | example_label_array = 
np.loadtxt(child,dtype=int) 48 | examlpe_array = example_label_array[:,0] 49 | label_array = example_label_array[:, 1] 50 | for root in graphcnn_option.HIER_ROOT_CODE: 51 | index = np.where(label_array==root)[0] 52 | for one in examlpe_array[index]: 53 | if one not in test_index_array: 54 | test_index_array.append(one) 55 | 56 | # for allDir in pathDir: 57 | # child = os.path.join(filepath, allDir) 58 | # os.remove(child) 59 | 60 | # 将索引文件写到hier_eval文件夹下 61 | filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_hier_eval_index') 62 | np.savetxt(filename,test_index_array,fmt='%d') 63 | 64 | return test_index_array 65 | 66 | 67 | def evaluate(checkpoint,test_index_array): 68 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_dictribution_all') 69 | total_predicted_value = np.loadtxt(detail_filename,dtype=float) 70 | total_predicted_value = total_predicted_value[test_index_array] 71 | 72 | total_predicted_value_max = np.max(total_predicted_value, axis=1) 73 | total_predicted_value_argmax = np.argmax(total_predicted_value, axis=1) 74 | total_predicted_value = ( 75 | (total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 76 | 77 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value') 78 | if os.path.exists(detail_filename): 79 | os.remove(detail_filename) 80 | np.savetxt(detail_filename, total_predicted_value, fmt='%d') 81 | 82 | 83 | filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.DATA_LABELS_REMAP_NAME) 84 | total_remap = np.loadtxt(filename, dtype=int) 85 | 86 | detail_filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, graphcnn_option.HIER_DIR_NAME, 87 | graphcnn_option.HIER_labels_remap_file) 88 | remap = np.loadtxt(detail_filename, dtype=int) 89 | 90 | filename = os.path.join('../hier_result_leaf', graphcnn_option.HIER_eval_result_leaf_file) 91 | fr_leaf = open(filename,'a') 92 | filename = os.path.join('../hier_result_leaf_exp', graphcnn_option.HIER_eval_result_leaf_exp_file) 93 | fr_leaf_exp = open(filename, 'a') 94 | filename = os.path.join('../hier_result_root', graphcnn_option.HIER_eval_result_root_file) 95 | fr_root = open(filename, 'w') 96 | 97 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootstr') 98 | # fr = open(filename, 'r') 99 | # rootstr = fr.readlines() 100 | # fr.close() 101 | # filename = os.path.join(graphcnn_option.EVAL_DATA_DIR, 'hier_rootlist') 102 | # fr = open(filename, 'r') 103 | # rootlines = fr.readlines() 104 | # fr.close() 105 | # rootlist = [] 106 | # for line in rootlines: 107 | # line = line.strip() 108 | # linelist = line.split(' ') 109 | # linelist = [int(k) for k in linelist] 110 | # rootlist.append(linelist) 111 | 112 | # rootstr_tmp = [] 113 | detail_filename = os.path.join(FLAGS.eval_dir, 'log_eval_for_predicted_value_list') 114 | fr = open(detail_filename, 'w') 115 | for i in range(0, np.size(total_predicted_value, axis=0)): 116 | labels = np.where(total_predicted_value[i] == 1)[0] 117 | if len(labels) > 0: 118 | labels_remap = remap[labels, 0] 119 | for elem in labels_remap: 120 | print(elem, end=' ', file=fr) 121 | if elem in total_remap[:,0]: # leaf 122 | print('%d %d'%(test_index_array[i],elem),file=fr_leaf) 123 | else: 124 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 125 | # for j in range(0,len(rootlist)): 126 | # if elem in rootlist[j]: 127 | # if rootstr[j] not in rootstr_tmp: 128 | # rootstr_tmp.append(rootstr[j]) 129 | print('', file=fr) 130 | else: 131 | # labels_remap = remap[:, 0] 132 | labels = 
total_predicted_value_argmax[i] 133 | labels_value = total_predicted_value_max[i] 134 | labels_remap = remap[labels, 0] 135 | # for elem in labels_remap: 136 | elem = labels_remap 137 | print(elem, file=fr) 138 | if elem in total_remap[:, 0]: # leaf 139 | print('%d %d %.4f' % (test_index_array[i], elem, labels_value), file=fr_leaf_exp) 140 | else: 141 | print('%d %d' % (test_index_array[i], elem), file=fr_root) 142 | # if labels_value < 0.5: 143 | # labels_remap = remap[:, 0] 144 | # for elem in labels_remap: 145 | # if elem not in total_remap[:, 0]: 146 | # print('%d %d' % (test_index_array[i], elem), file=fr_root) 147 | 148 | fr.close() 149 | fr_leaf.close() 150 | fr_root.close() 151 | fr_leaf_exp.close() 152 | 153 | # filename = os.path.join(FLAGS.eval_dir, 'hier_next_root') 154 | # fr = open(filename, 'w') 155 | # for one in rootstr_tmp: 156 | # print(one) 157 | # print(one,file=fr) 158 | # fr.close() 159 | 160 | 161 | 162 | 163 | def main(argv=None): # pylint: disable=unused-argument 164 | global evalDataSet 165 | # assert not tf.gfile.Exists(FLAGS.eval_dir), 'please move the old evaluate directory to pre_versions!' 166 | 167 | # test_index_array = np.array(range(0, 81262)) 168 | if graphcnn_option.HIER_ROOT_CODE[0]==2143406: # root 169 | test_index_array = np.array(range(0,81262)) 170 | # test_index_array = np.loadtxt('../example_no_result.txt',dtype=int) 171 | else: 172 | test_index_array = generate_eval_index() 173 | if test_index_array is None or len(test_index_array)==0: 174 | print('no hier_data need eval') 175 | return 176 | else: 177 | print('choosing for evaluation...') 178 | print('choosed number:%d' % len(test_index_array)) 179 | 180 | # checkpoint = input('please input the choosed checkpoint to eval:(0 for latest)') 181 | checkpoint = '0' 182 | 183 | # print('choosing for evaluation...') 184 | evaluate(checkpoint,test_index_array) 185 | 186 | 187 | if __name__ == '__main__': 188 | tf.app.run() 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_option.py: -------------------------------------------------------------------------------- 1 | 2 | ## data 3 | ORI_DATA_NAME = 'graphs' 4 | ORI_TRAIN_DATA_NAME = 'train_graphs' 5 | ORI_TEST_DATA_NAME = 'test_graphs' 6 | ORI_DATA_VEC_NAME = 'index2vec' 7 | ORI_DATA_OPTION_NAME = 'option' 8 | 9 | TRAIN_DATA_NAME = 'data.train' 10 | TEST_DATA_NAME = 'data.test' 11 | DATA_OPTION_NAME = 'data.option' 12 | 13 | DATA_LABELS_REMAP_NAME = 'remap' 14 | 15 | ## LSHTC Hierarchy training 16 | 17 | 18 | HIER_used = True 19 | HIER_test_used = True 20 | rootstr = '_1_2322682_' # ???? 21 | HIER_ROOT_CODE = [2322682] # ???? 22 | HIER_DIR_NAME = 'hier' 23 | HIER_labels_remap_file = 'hier'+rootstr+'remap' 24 | HIER_train_graphs_index_file = 'hier'+rootstr+'train_graphs_index' 25 | HIER_train_labels_file = 'hier'+rootstr+'train_labels' 26 | HIER_train_data_file = 'hier'+rootstr+'train_data' # ?? 27 | HIER_test_graphs_index_file = 'hier'+rootstr+'test_graphs_index' 28 | HIER_test_labels_file = 'hier'+rootstr+'test_labels' 29 | HIER_test_data_file = 'hier'+rootstr+'test_data' # ?? 
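
# Naming convention: with rootstr = '_1_2322682_' as set above, the file names
# defined here expand to, e.g.,
#   HIER_labels_remap_file  -> 'hier_1_2322682_remap'
#   HIER_train_data_file    -> 'hier_1_2322682_train_data'
#   HIER_test_data_file     -> 'hier_1_2322682_test_data'
# and the eval-result files defined below become 'hier_eval_result_1_2322682_leaf',
# '..._leaf_exp' and '..._root'. Changing rootstr together with HIER_ROOT_CODE
# appears to re-target the whole training/evaluation pipeline at a different
# subtree of the label hierarchy.
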
30 | 31 | HIER_eval_result_leaf_file = 'hier_eval_result'+rootstr+'leaf' 32 | HIER_eval_result_leaf_exp_file = 'hier_eval_result'+rootstr+'leaf_exp' 33 | HIER_eval_result_root_file = 'hier_eval_result'+rootstr+'root' 34 | 35 | if HIER_used: 36 | TRAIN_DATA_NAME = HIER_train_data_file 37 | if HIER_test_used: 38 | TEST_DATA_NAME = HIER_test_data_file 39 | 40 | 41 | 42 | 43 | # lr_decay_value = [0.1,0.01,0.001,0.0005,0.0001] # single-label wiki_cn 44 | # lr_decay_ecophs = [2,150,750,1250,1500] # single-label wiki_cn 45 | # lr_decay_value = [0.1,0.01,0.001,0.01,0.001,0.0001] 46 | lr_decay_value = [0.01,0.001,0.0001,0.01,0.001,0.0001,0.00001] 47 | # lr_decay_ecophs = [10,400,1500,1800,2000] # multi-label, RCV 48 | lr_decay_ecophs = [1,300,600,601,1000,1400,1500] # multi-label, RCV 49 | 50 | # multi-label, RCV: INITIAL_LEARNING_RATE = 0.001, decay_epochs = 600 51 | 52 | 53 | 54 | ## Basic parameters. 55 | TRAIN_DATA_DIR = '../graphCNN_data' # Path to the train data directory. 56 | EVAL_DATA_DIR = '../graphCNN_data' # Path to the test data directory. 57 | DATA_PATH = './data' # Path to data directory 58 | 59 | USE_FP16 = False # Train the model using fp16. 60 | 61 | # summaryWriter 62 | SUMMARYWRITER = False 63 | 64 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 65 | # to differentiate the operations. Note that this prefix is removed from the 66 | # names of the summaries when visualizing a model. 67 | TOWER_NAME = 'tower' 68 | 69 | 70 | 71 | ## model parameters 72 | NUM_EPOCHS_PER_DECAY = 1000 #350 # Epochs after which learning rate decays. 73 | INITIAL_LEARNING_RATE = 0.001 # Initial learning rate. 74 | LEARNING_RATE_DECAY_RATE = 0.1 # Learning rate decay rate. 75 | 76 | MOMENTUM = 0.9 # Momentum of SGD 77 | 78 | DROPOUT_FRACTION = 0.5 # Add a dropout during training. 79 | 80 | MOVING_AVERAGE_DECAY = 0.999 # The decay to use for the moving average. 81 | 82 | WEIGHT_DECAY = 0.0005 # 0.00005 # 0.0005 # l2 regularization weight decay 83 | 84 | VARIABLE_DEPENDENCY = 0.00005 # 0.0005 # the Variable's dependency constraint 85 | 86 | 87 | ## train parameters 88 | NUM_GPUS = 4 # How many GPUs to use 89 | 90 | CKPT_PERIOD = 5000 91 | 92 | 93 | ## eval parameters 94 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 95 | -------------------------------------------------------------------------------- /GraphCNN/graphcnn_train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from datetime import datetime 5 | import os.path 6 | import time 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import math 11 | 12 | import graphcnn_model 13 | import graphcnn_input 14 | import graphcnn_option 15 | 16 | 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | 20 | tf.app.flags.DEFINE_string('train_dir', './tmp/graphcnn_train', 21 | """Directory where to write event logs and checkpoint.""") 22 | tf.app.flags.DEFINE_integer('max_epochs', 8000, 23 | """Number of batches to run.""") 24 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 25 | """Whether to log device placement.""") 26 | 27 | 28 | # max_steps for train: 29 | STEPS_PER_ECOPH = None 30 | MAX_STEPS = None 31 | # the period to save the model checkpoint. 
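
# Learning-rate note: with the staircase schedule built in graphcnn_model.train()
# and the values in graphcnn_option.py above (INITIAL_LEARNING_RATE = 0.001,
# LEARNING_RATE_DECAY_RATE = 0.1, NUM_EPOCHS_PER_DECAY = 1000), the effective
# rate is a step function of the training epoch. The helper below is an
# illustration only and is not used by this script:
def _staircase_lr_sketch(epoch, initial=0.001, rate=0.1, epochs_per_decay=1000):
    """0.001 for epochs [0, 1000), 0.0001 for [1000, 2000), 0.00001 afterwards, ..."""
    return initial * rate ** (epoch // epochs_per_decay)
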
32 | CKPT_PERIOD = None 33 | 34 | trainDataSet = None 35 | 36 | 37 | def evalution_batch(total_predicted_value,total_true_value): 38 | 39 | 40 | total_predicted_value = ((total_predicted_value) >= graphcnn_option.EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 41 | total_predicted_value = total_predicted_value.astype(bool) 42 | total_true_value = total_true_value.astype(bool) 43 | 44 | true_and_predict = np.sum(total_true_value & total_predicted_value, axis=1) 45 | example_based_F1_Measure = np.mean((true_and_predict * 2) / (np.sum(total_true_value, axis=1) 46 | + np.sum(total_predicted_value, axis=1))) 47 | 48 | TP = np.sum(total_true_value & total_predicted_value, axis=0, dtype=np.int32) 49 | FP = np.sum((~total_true_value) & total_predicted_value, axis=0, dtype=np.int32) 50 | FN = np.sum(total_true_value & (~total_predicted_value), axis=0, dtype=np.int32) 51 | _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9) 52 | _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9) 53 | Micro_F1 = (2 * _P * _R) / (_P + _R) 54 | _P_t = TP / (TP + FP + 1e-9) 55 | _R_t = TP / (TP + FN + 1e-9) 56 | Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 57 | 58 | return example_based_F1_Measure, Micro_F1, Macro_F1 59 | 60 | def train(newTrain,checkpoint): 61 | with tf.Graph().as_default(): 62 | global_step = tf.Variable(0, trainable=False) 63 | 64 | data = tf.placeholder(tf.float32, [graphcnn_input.TRAIN_BATCH_SIZE, graphcnn_input.HEIGHT, graphcnn_input.WIDTH, 65 | graphcnn_input.NUM_CHANNELS]) 66 | labels = tf.placeholder(tf.int32, [graphcnn_input.TRAIN_BATCH_SIZE,graphcnn_input.NUM_CLASSES]) 67 | 68 | # inference model. 69 | # logits = graphcnn_model.inference_GPU(data) 70 | logits = graphcnn_model.inference(data) 71 | # logits = graphcnn_model.inference_CPU(data,dependencies_loss=False) 72 | 73 | # Calculate loss. 74 | loss = graphcnn_model.loss(logits, labels) 75 | 76 | # updates the model parameters. 77 | train_op = graphcnn_model.train(loss, global_step) 78 | 79 | # Create a saver. 80 | saver = tf.train.Saver(var_list=tf.global_variables(), 81 | max_to_keep=6, 82 | keep_checkpoint_every_n_hours=10) 83 | 84 | if graphcnn_option.SUMMARYWRITER: 85 | # Build the summary operation based on the TF collection of Summaries. 86 | summary_op = tf.merge_all_summaries() 87 | 88 | # Build an initialization operation to run below. 89 | init = tf.global_variables_initializer() 90 | 91 | # Start running operations on the Graph. allow_soft_placement must be set to 92 | # True to build towers on GPU, as some of the ops do not have GPU implementations. 
93 | sess = tf.Session(config=tf.ConfigProto( 94 | allow_soft_placement=True, 95 | log_device_placement=FLAGS.log_device_placement)) 96 | 97 | first_step = 0 98 | if not newTrain: 99 | if checkpoint == '0': # choose the latest one 100 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 101 | if ckpt and ckpt.model_checkpoint_path: 102 | new_saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 103 | # Restores from checkpoint 104 | new_saver.restore(sess, ckpt.model_checkpoint_path) 105 | global_step_for_restore = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] 106 | first_step = int(global_step_for_restore) + 1 107 | else: 108 | print('No checkpoint file found') 109 | return 110 | else: # 111 | if os.path.exists(os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)): 112 | new_saver = tf.train.import_meta_graph( 113 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint + '.meta')) 114 | new_saver.restore(sess, 115 | os.path.join(FLAGS.train_dir, 'model.ckpt-' + checkpoint)) 116 | first_step = int(checkpoint) + 1 117 | else: 118 | print('No checkpoint file found') 119 | return 120 | else: 121 | sess.run(init) 122 | 123 | if graphcnn_option.SUMMARYWRITER: 124 | summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) 125 | 126 | filename_train_log = os.path.join(FLAGS.train_dir, 'log_train') 127 | if os.path.exists(filename_train_log): 128 | file_train_log = open(filename_train_log, 'a') 129 | else: 130 | file_train_log = open(filename_train_log, 'w') 131 | 132 | # learning_rate = graphcnn_option.lr_decay_value[0] # 0.1(5), 0.01(100), 0.001(500), 0.0001(300), 0.00001(100) 133 | # learning_rate_index = 0 134 | for step in range(first_step,MAX_STEPS): 135 | # if learning_rate_index < len(graphcnn_option.lr_decay_value) - 1: 136 | # if step > STEPS_PER_ECOPH * graphcnn_option.lr_decay_ecophs[learning_rate_index]: 137 | # learning_rate_index = learning_rate_index + 1 138 | # learning_rate = graphcnn_option.lr_decay_value[learning_rate_index] 139 | 140 | train_data, train_label = trainDataSet.next_batch(graphcnn_input.TRAIN_BATCH_SIZE) 141 | start_time = time.time() 142 | _, loss_value = sess.run([train_op, loss], 143 | feed_dict= {data:train_data, labels:train_label}) 144 | duration = time.time() - start_time 145 | 146 | assert not np.isnan(loss_value), 'Model diverged with loss = NaN' 147 | 148 | if step % 10 == 0: 149 | sec_per_batch = float(duration) 150 | format_str = ('%s: step=%d, loss=%.4f; %.3f sec/batch)') 151 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch), file=file_train_log) 152 | print(format_str % (datetime.now(), step, loss_value, sec_per_batch)) 153 | 154 | if graphcnn_option.SUMMARYWRITER: 155 | if step % 100 == 0: 156 | summary_str = sess.run(summary_op, 157 | feed_dict= {data:train_data, labels:train_label}) 158 | summary_writer.add_summary(summary_str, step) 159 | 160 | # Save the model checkpoint periodically. (named 'model.ckpt-global_step.meta') 161 | if step % CKPT_PERIOD == 0 or (step + 1) == MAX_STEPS: 162 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 163 | saver.save(sess, checkpoint_path, global_step=step) 164 | file_train_log.close() 165 | 166 | def main(argv=None): 167 | global trainDataSet, STEPS_PER_ECOPH, MAX_STEPS, CKPT_PERIOD 168 | newTrain = True 169 | checkpoint = 0 170 | # assert not tf.gfile.Exists(FLAGS.train_dir), 'please move the old train directory to pre_versions!' 
171 | if tf.gfile.Exists(FLAGS.train_dir): 172 | ans = input('whether to open up a new training:(y/n)') 173 | if ans == 'y' or ans == 'Y': 174 | newTrain = True 175 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 176 | elif ans == 'n' or ans == 'N': 177 | newTrain = False 178 | checkpoint = input('please input the choosed checkpoint to restore:(0 for latest)') 179 | else: 180 | print('invalid input!') 181 | return 182 | if newTrain: 183 | tf.gfile.MakeDirs(FLAGS.train_dir) 184 | 185 | # update paras 186 | trainDataSet = graphcnn_input.generate_train_data(graphcnn_option.TRAIN_DATA_DIR, 187 | ont_hot=True,index_mode=True) 188 | 189 | # max_steps for train: 190 | STEPS_PER_ECOPH = math.ceil( 191 | graphcnn_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / float(graphcnn_input.TRAIN_BATCH_SIZE)) 192 | MAX_STEPS = FLAGS.max_epochs * STEPS_PER_ECOPH 193 | 194 | # the period to save the model checkpoint. 195 | CKPT_PERIOD = graphcnn_option.CKPT_PERIOD # ????????????????????? 196 | # CKPT_PERIOD = 5000 197 | # tem = str(STEPS_PER_ECOPH * 20) # save the model every ecoph # 5 198 | # CKPT_PERIOD = int(int(tem[0]) * pow(10, len(tem) - 1)) 199 | 200 | print('training...') 201 | train(newTrain,checkpoint) 202 | 203 | 204 | if __name__ == '__main__': 205 | tf.app.run() 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /GraphCNN/utils/grouping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/grouping.py -------------------------------------------------------------------------------- /GraphCNN/utils/hier_rootlist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/hier_rootlist -------------------------------------------------------------------------------- /GraphCNN/utils/hier_rootstr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/GraphCNN/utils/hier_rootstr -------------------------------------------------------------------------------- /GraphCNN/utils/read: -------------------------------------------------------------------------------- 1 | a 1 2 | a 1 3 | a 1 4 | a 1 5 | a 1 6 | a 1 7 | a 1 8 | a 1 9 | b 1 10 | b 1 11 | b 1 12 | b 1 13 | c 1 14 | c 1 15 | c 1 16 | c 1 17 | a 1 18 | a 1 19 | a 1 20 | a 1 21 | b 1 22 | b 1 23 | b 1 24 | b 1 25 | c 1 26 | c 1 27 | c 1 28 | c 1 29 | -------------------------------------------------------------------------------- /GraphCNN/utils/tmp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | import shutil 6 | 7 | # 遍历指定目录,显示目录下的所有文件名 8 | def eachFile(filepath): 9 | pathDir = os.listdir(filepath) 10 | for allDir in pathDir: 11 | child = os.path.join('%s%s' % (filepath, allDir)) 12 | 13 | def xx(): 14 | filename = 'graphcnn_hier_eval_without_labels.py' 15 | DIR = '.' 
16 | pathDir = os.listdir(DIR) 17 | for path in pathDir: 18 | if len(path)>5 and path[0:5]=='LSHTC': 19 | sourceFile = os.path.join(DIR, filename) 20 | targetFile = os.path.join(DIR,path,filename) 21 | if os.path.exists(targetFile): 22 | os.remove(targetFile) 23 | shutil.copy(sourceFile, targetFile) 24 | 25 | 26 | a = np.array([[1,2,3],[1,2,3]]) 27 | a = np.reshape(a,[-1,1]) 28 | print(a) -------------------------------------------------------------------------------- /GraphCNN/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | def main(): 5 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups' 6 | fr = open(filename, 'r') 7 | lines = fr.readlines() 8 | fr.close() 9 | filename = '/home/heyu/PycharmProjects/graphCNN/data/label_groups_info' 10 | fr = open(filename, 'w') 11 | for line in lines: 12 | line = line.strip() 13 | linelist = line.split(' ') 14 | print(len(linelist),file=fr) 15 | fr.close() 16 | 17 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups' 18 | fr = open(filename, 'r') 19 | lines = fr.readlines() 20 | fr.close() 21 | filename = '/home/heyu/PycharmProjects/graphCNN/data/example_groups_info' 22 | fr = open(filename, 'w') 23 | for line in lines: 24 | line = line.strip() 25 | linelist = line.split(' ') 26 | print(len(linelist),file=fr) 27 | fr.close() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /HAN/model/IMDB/bestmodel/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/HAN/model/IMDB/bestmodel/.gitkeep -------------------------------------------------------------------------------- /HAN/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) #[-1]是加在最前面 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # here i need more label. 
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | self.docs = [] 63 | self.label = [] 64 | self.wordmask = [] 65 | self.sentencemask = [] 66 | self.maxsentencenum = [] 67 | 68 | for i in xrange(self.epoch): 69 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | self.docs.append(docsbatch) 72 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | self.wordmask.append(genwordmask(docsbatch)) 74 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | # self.docs = [] 76 | # self.label = [] 77 | # self.length = [] 78 | # self.sentencenum = [] 79 | # self.wordmask = [] 80 | # self.sentencemask = [] 81 | # self.maxsentencenum = [] 82 | 83 | # for i in xrange(self.epoch): 84 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | # self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | # self.docs.append(docsbatch) 88 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | # self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | # self.wordmask.append(genwordmask(docsbatch)) 91 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | class Wordlist(object): 94 | def __init__(self, filename, maxn = 100000): 95 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 96 | self.size = len(lines) 97 | 98 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 99 | self.voc = dict(self.voc) 100 | 101 | def getID(self, word): 102 | try: 103 | return self.voc[word] 104 | except: 105 | return -1 106 | 107 | -------------------------------------------------------------------------------- /HAN/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HAN/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HAN/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HAN/src/LSTMModel.py: 
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | from datetime import datetime 3 | from EmbLayer import EmbLayer 4 | from LSTMLayer import LSTMLayer 5 | from HiddenLayer import HiddenLayer 6 | from PoolLayer import * 7 | from SentenceSortLayer import * 8 | import theano 9 | import theano.tensor as T 10 | import numpy 11 | import random 12 | import sys 13 | import time 14 | from Update import AdaUpdates 15 | 16 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 17 | 18 | class LSTMModel(object): 19 | def __init__(self, n_voc, trainset, testset, dataname, classes, prefix): 20 | if prefix != None: 21 | prefix += '/' 22 | self.trainset = trainset 23 | self.testset = testset 24 | self.classes = int(classes) 25 | 26 | docs = T.imatrix() 27 | label = T.imatrix() 28 | wordmask = T.fmatrix() 29 | sentencemask = T.fmatrix() 30 | maxsentencenum = T.iscalar() 31 | isTrain = T.iscalar() 32 | 33 | rng = numpy.random 34 | 35 | layers = [] 36 | layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 37 | layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 38 | layers.append(SimpleAttentionLayer(rng, layers[-1].output, wordmask,50, 50, 'wordattentionlayer', prefix)) 39 | layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum,prefix)) 40 | layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 41 | layers.append(SimpleAttentionLayer(rng, layers[-1].output, sentencemask,50, 50, 'sentenceattentionlayer', prefix)) 42 | layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 43 | layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 44 | self.layers = layers 45 | 46 | predict = layers[-1].output 47 | cost = T.nnet.binary_crossentropy(layers[-1].output, label).sum(1) 48 | cost = cost.mean() 49 | # modifu corrrect. 
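# NOTE: the commented-out block below is a NumPy draft of the multi-label metrics
# (exact-match ratio, precision/recall, F1, Hamming loss, Micro-/Macro-F1).  It would
# not run here, where `label` and the layer outputs are still symbolic Theano variables;
# the working NumPy computation of Micro-F1 and Macro-F1 is in test() further down.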
50 | # predicted_value = ((layers[-1].output) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 51 | # predicted_value = predicted_value.astype(bool) 52 | # true_value = label.astype(bool) 53 | # equal = true_value == predicted_value 54 | # match = np.sum(equal, axis=1) == np.size(equal, axis=1) 55 | # # value 1 match_ratio 56 | # exact_match_ratio = np.sum(match) / np.size(match) 57 | # true_and_predict = np.sum(true_value & predicted_value, axis=1) 58 | # true_or_predict = np.sum(true_value | predicted_value, axis=1) 59 | # # value 2 accuracy 60 | # accuracy = np.mean(true_and_predict / true_or_predict) 61 | # # value 3 pression 62 | # precison = np.mean(true_and_predict / (np.sum(predicted_value, axis=1) + 1e-9)) 63 | # # recall 4 recall 64 | # recall = np.mean(true_and_predict / np.sum(true_value, axis=1)) 65 | # # f1_Measure 66 | # F1_Measure = np.mean((true_and_predict * 2) / (np.sum(true_value, axis=1) + np.sum(predicted_value, axis=1))) 67 | # # HammingLoss 68 | # HammingLoss = np.mean(true_value ^ total_predicted_value) 69 | # TP 70 | # TP = np.sum(true_value & predicted_value,axis=0,dtype=np.int32) 71 | # FP = np.sum((~true_value) & predicted_value,axis=0,dtype=np.int32) 72 | # FN = np.sum(true_value & (~predicted_value),axis=0,dtype=np.int32) 73 | # _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 74 | # _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 75 | # Micro_F1 = (2 * _P *_R) / (_P + _R) 76 | # _P_t = TP / (TP + FP + 1e-9) 77 | # _R_t = TP / (TP + FN + 1e-9) 78 | # Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 79 | #cost = -T.mean(T.log(layers[-1].output)[T.arange(label.shape[0]), label], acc_dtype='float32') 80 | #modify this 81 | #correct = T.sum(T.eq(T.argmax(layers[-1].output, axis=1), label), acc_dtype='int32') 82 | #err = T.argmax(layers[-1].output, axis=1) - label 83 | #mse = T.sum(err * err) 84 | 85 | params = [] 86 | for layer in layers: 87 | params += layer.params 88 | L2_rate = numpy.float32(1e-5) 89 | for param in params[1:]: 90 | cost += T.sum(L2_rate * (param * param), acc_dtype='float32') 91 | gparams = [T.grad(cost, param) for param in params] 92 | 93 | updates = AdaUpdates(params, gparams, 0.95, 1e-6) 94 | 95 | self.train_model = theano.function( 96 | inputs=[docs, label,wordmask,sentencemask,maxsentencenum], 97 | outputs=cost, 98 | updates=updates, 99 | ) 100 | 101 | self.test_model = theano.function( 102 | inputs=[docs,wordmask,sentencemask,maxsentencenum], 103 | outputs=predict, 104 | ) 105 | 106 | def train(self, iters): 107 | lst = numpy.random.randint(self.trainset.epoch, size = iters) 108 | n = 0 109 | for i in lst: 110 | n += 1 111 | out = self.train_model(self.trainset.docs[i], self.trainset.label[i],self.trainset.wordmask[i],self.trainset.sentencemask[i],self.trainset.maxsentencenum[i]) 112 | print n, 'cost:', out, 'time', datetime.now() 113 | 114 | def test(self): 115 | file_eval = open('evallog.txt','a') 116 | old = sys.stdout 117 | sys.stdout = file_eval 118 | print 'time start:', datetime.now() 119 | sys.stdout = old 120 | total_predicted_value = numpy.zeros([1, self.classes], dtype=numpy.float32) ## 121 | total_true_value = numpy.zeros([1, self.classes], dtype=numpy.int32) 122 | for i in xrange(self.testset.epoch): 123 | predicted_value = self.test_model(self.testset.docs[i],self.testset.wordmask[i],self.testset.sentencemask[i],self.testset.maxsentencenum[i]) 124 | total_predicted_value = numpy.concatenate((total_predicted_value, predicted_value), axis=0) 125 | total_true_value = numpy.concatenate((total_true_value, 
self.testset.label[i]), axis=0) 126 | total_predicted_value = total_predicted_value[1:] 127 | total_true_value = total_true_value[1:] 128 | assert len(total_true_value) == len(total_predicted_value), 'shape error' 129 | total_predicted_value = ((total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 130 | total_predicted_value = total_predicted_value.astype(bool) 131 | total_true_value = total_true_value.astype(bool) 132 | TP = numpy.sum(total_true_value & total_predicted_value,axis=0,dtype=numpy.int32) 133 | FP = numpy.sum((~total_true_value) & total_predicted_value,axis=0,dtype=numpy.int32) 134 | FN = numpy.sum(total_true_value & (~total_predicted_value),axis=0,dtype=numpy.int32) 135 | _P = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FP) + 1e-9 ) 136 | _R = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FN) + 1e-9 ) 137 | Micro_F1 = (2 * _P *_R) / (_P + _R + 1e-9) 138 | _P_t = TP / (TP + FP + 1e-9) 139 | _R_t = TP / (TP + FN + 1e-9) 140 | print 'TP',TP,'FP',FP,'FN',FN 141 | Macro_F1 = numpy.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 142 | print('Micro-F1 = %.4f' % Micro_F1) 143 | print('Macro-F1 = %.4f' % Macro_F1) 144 | old = sys.stdout 145 | sys.stdout = file_eval 146 | print 'time end:', datetime.now() 147 | print 'TP',TP,'FP',FP,'FN',FN 148 | print('Micro-F1 = %.4f' % Micro_F1) 149 | print('Macro-F1 = %.4f' % Macro_F1) 150 | sys.stdout = old 151 | file_eval.close() 152 | return Micro_F1, Macro_F1 153 | 154 | 155 | def save(self, prefix): 156 | prefix += '/' 157 | for layer in self.layers: 158 | layer.save(prefix) 159 | -------------------------------------------------------------------------------- /HAN/src/PoolLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. 
/ (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HAN/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum,prefix): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HAN/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + 
deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HAN/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HAN/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
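# ----------------------------------------------------------------------------
# Minimal standalone sketch (not part of the original script) of the thresholded
# Micro-/Macro-F1 computation that LSTMModel.test() performs; the 0.5 threshold
# mirrors EVALUTION_THRESHOLD_FOR_MULTI_LABEL and the toy arrays are illustrative.
import numpy as np

def micro_macro_f1(probs, labels, threshold=0.5):
    """probs: (N, C) sigmoid scores; labels: (N, C) 0/1 ground truth."""
    pred = probs >= threshold
    true = labels.astype(bool)
    eps = 1e-9
    TP = np.sum(true & pred, axis=0)                    # per-class true positives
    FP = np.sum(~true & pred, axis=0)                   # per-class false positives
    FN = np.sum(true & ~pred, axis=0)                   # per-class false negatives
    P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + eps)    # micro precision
    R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + eps)    # micro recall
    micro_f1 = 2 * P * R / (P + R + eps)
    P_t = TP / (TP + FP + eps)                          # per-class precision
    R_t = TP / (TP + FN + eps)                          # per-class recall
    macro_f1 = np.mean(2 * P_t * R_t / (P_t + R_t + eps))
    return micro_f1, macro_f1

# toy check: 3 documents, 4 labels, predictions identical to the ground truth
probs = np.array([[0.9, 0.2, 0.7, 0.1],
                  [0.4, 0.8, 0.6, 0.3],
                  [0.1, 0.1, 0.9, 0.7]])
labels = np.array([[1, 0, 1, 0],
                   [0, 1, 1, 0],
                   [0, 0, 1, 1]])
print(micro_macro_f1(probs, labels))                    # both values are ~1.0
# ----------------------------------------------------------------------------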
33 | 34 | -------------------------------------------------------------------------------- /HLSTM/model/IMDB/bestmodel/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/HLSTM/model/IMDB/bestmodel/.gitkeep -------------------------------------------------------------------------------- /HLSTM/src/Dataset.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy 3 | import copy 4 | import theano 5 | import random 6 | 7 | def genBatch(data): 8 | m =0 9 | maxsentencenum = len(data[0]) 10 | for doc in data: 11 | for sentence in doc: 12 | if len(sentence)>m: 13 | m = len(sentence) 14 | for i in xrange(maxsentencenum - len(doc)): 15 | doc.append([-1]) 16 | tmp = map(lambda doc: numpy.asarray(map(lambda sentence : sentence + [-1]*(m - len(sentence)), doc), dtype = numpy.int32).T, data) # the [-1] is added at the very front 17 | tmp = reduce(lambda doc,docs : numpy.concatenate((doc,docs),axis = 1),tmp) 18 | return tmp 19 | 20 | def genLenBatch(lengths,maxsentencenum): 21 | lengths = map(lambda length : numpy.asarray(length + [1.0]*(maxsentencenum-len(length)), dtype = numpy.float32)+numpy.float32(1e-4),lengths) 22 | return reduce(lambda x,y : numpy.concatenate((x,y),axis = 0),lengths) 23 | 24 | def genwordmask(docsbatch): 25 | mask = copy.deepcopy(docsbatch) 26 | mask = map(lambda x : map(lambda y : [1.0 ,0.0][y == -1],x), mask) 27 | mask = numpy.asarray(mask,dtype=numpy.float32) 28 | mask[0] = numpy.ones([mask.shape[1]],dtype=numpy.float32) 29 | return mask 30 | 31 | def gensentencemask(sentencenum): 32 | maxnum = sentencenum[0] 33 | mask = numpy.asarray(map(lambda num : [1.0]*num + [0.0]*(maxnum - num),sentencenum), dtype = numpy.float32) 34 | return mask.T 35 | 36 | class Dataset(object): 37 | def __init__(self, filename, emb, classes, maxbatch = 32, maxword = 500 ): 38 | lines = map(lambda x: x.split('\t\t'), open(filename).readlines()) 39 | # multi-label change needed here: in the original single-label loader
there is only one label 40 | label = map(lambda x: x[0].split(' '), lines) 41 | oneslable = numpy.zeros([len(label), int(classes)], dtype=numpy.int32) 42 | for i in range(0,len(label)): 43 | for j in label[i]: 44 | oneslable[i][int(j)] = 1 45 | label = oneslable 46 | print("already done the ones-hot") 47 | docs = map(lambda x: x[1][0:len(x[1])-1], lines) 48 | docs = map(lambda x: x.split(''), docs) 49 | docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs) 50 | docs = map(lambda doc: map(lambda sentence: filter(lambda wordid: wordid !=-1,map(lambda word: emb.getID(word),sentence)),doc),docs) 51 | tmp = zip(docs, label) 52 | #random.shuffle(tmp) 53 | tmp.sort(lambda x, y: len(y[0]) - len(x[0])) 54 | docs, label = zip(*tmp) 55 | 56 | sentencenum = map(lambda x : len(x),docs) 57 | length = map(lambda doc : map(lambda sentence : len(sentence), doc), docs) 58 | self.epoch = len(docs) / maxbatch 59 | if len(docs) % maxbatch != 0: 60 | self.epoch += 1 61 | 62 | # self.docs = [] 63 | # self.label = [] 64 | # self.wordmask = [] 65 | # self.sentencemask = [] 66 | # self.maxsentencenum = [] 67 | 68 | # for i in xrange(self.epoch): 69 | # self.maxsentencenum.append(sentencenum[i*maxbatch]) 70 | # docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 71 | # self.docs.append(docsbatch) 72 | # self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 73 | # self.wordmask.append(genwordmask(docsbatch)) 74 | # self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 75 | self.docs = [] 76 | self.label = [] 77 | self.length = [] 78 | self.sentencenum = [] 79 | self.wordmask = [] 80 | self.sentencemask = [] 81 | self.maxsentencenum = [] 82 | 83 | for i in xrange(self.epoch): 84 | self.maxsentencenum.append(sentencenum[i*maxbatch]) 85 | self.length.append(genLenBatch(length[i*maxbatch:(i+1)*maxbatch],sentencenum[i*maxbatch])) 86 | docsbatch = genBatch(docs[i*maxbatch:(i+1)*maxbatch]) 87 | self.docs.append(docsbatch) 88 | self.label.append(numpy.asarray(label[i*maxbatch:(i+1)*maxbatch], dtype = numpy.int32)) 89 | self.sentencenum.append(numpy.asarray(sentencenum[i*maxbatch:(i+1)*maxbatch],dtype = numpy.float32)+numpy.float32(1e-4)) 90 | self.wordmask.append(genwordmask(docsbatch)) 91 | self.sentencemask.append(gensentencemask(sentencenum[i*maxbatch:(i+1)*maxbatch])) 92 | 93 | 94 | class Wordlist(object): 95 | def __init__(self, filename, maxn = 100000): 96 | lines = map(lambda x: x.split(), open(filename).readlines()[:maxn]) 97 | self.size = len(lines) 98 | 99 | self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))] 100 | self.voc = dict(self.voc) 101 | 102 | def getID(self, word): 103 | try: 104 | return self.voc[word] 105 | except: 106 | return -1 107 | 108 | -------------------------------------------------------------------------------- /HLSTM/src/EmbLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class EmbLayer(object): 8 | def __init__(self, rng, inp, n_voc, dim, name, dataname,prefix=None): 9 | self.input = inp 10 | self.name = name 11 | 12 | if prefix == None: 13 | f = file('../data/'+dataname+'/embinit.save', 'rb') 14 | W = cPickle.load(f) 15 | f.close() 16 | W = theano.shared(value=W, name='E', borrow=True) 17 | else: 18 | f = file(prefix + name + '.save', 'rb') 19 | W = cPickle.load(f) 20 | f.close() 21 | self.W = W 22 | 23 | self.output = 
self.W[inp.flatten()].reshape((inp.shape[0], inp.shape[1], dim)) 24 | self.params = [self.W] 25 | 26 | def save(self, prefix): 27 | f = file(prefix + self.name + '.save', 'wb') 28 | for obj in self.params: 29 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 30 | f.close() 31 | -------------------------------------------------------------------------------- /HLSTM/src/HiddenLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | class HiddenLayer(object): 8 | def __init__(self, rng, input, n_in, n_out, name, prefix=None, 9 | activation=T.tanh): 10 | self.name = name 11 | self.input = input 12 | 13 | if prefix is None: 14 | W_values = numpy.asarray( 15 | rng.uniform( 16 | low=-numpy.sqrt(6. / (n_in + n_out)), 17 | high=numpy.sqrt(6. / (n_in + n_out)), 18 | size=(n_in, n_out) 19 | ), 20 | dtype=numpy.float32 21 | ) 22 | if activation == theano.tensor.nnet.sigmoid: 23 | W_values *= 4 24 | W = theano.shared(value=W_values, name='W', borrow=True) 25 | 26 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 27 | b = theano.shared(value=b_values, name='b', borrow=True) 28 | else: 29 | f = file(prefix + name + '.save', 'rb') 30 | W = cPickle.load(f) 31 | b = cPickle.load(f) 32 | f.close() 33 | 34 | self.W = W 35 | self.b = b 36 | 37 | lin_output = T.dot(input, self.W) + self.b 38 | self.output = ( 39 | lin_output if activation is None 40 | else activation(lin_output) 41 | ) 42 | 43 | self.params = [self.W, self.b] 44 | 45 | def save(self, prefix): 46 | f = file(prefix + self.name + '.save', 'wb') 47 | for obj in self.params: 48 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 49 | f.close() 50 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def randMatrix(rng, shape, lim): 8 | return numpy.asarray( 9 | rng.uniform( 10 | low=-lim, 11 | high=lim, 12 | size=shape 13 | ), 14 | dtype=numpy.float32 15 | ) 16 | 17 | class LSTMLayer(object): 18 | def __init__(self, rng, input, mask, n_in, n_out, name, prefix=None): 19 | self.input = input 20 | self.name = name 21 | 22 | limV = numpy.sqrt(6. 
/ (n_in + n_out * 2)) 23 | limG = limV * 4 24 | 25 | if prefix is None: 26 | Wi1_values = randMatrix(rng, (n_in, n_out), limG) 27 | Wi1 = theano.shared(value=Wi1_values, name='Wi1', borrow=True) 28 | Wi2_values = randMatrix(rng, (n_out, n_out), limG) 29 | Wi2 = theano.shared(value=Wi2_values, name='Wi2', borrow=True) 30 | bi_values = numpy.zeros((n_out,), dtype=numpy.float32) 31 | bi = theano.shared(value=bi_values, name='bi', borrow=True) 32 | 33 | Wo1_values = randMatrix(rng, (n_in, n_out), limG) 34 | Wo1 = theano.shared(value=Wo1_values, name='Wo1', borrow=True) 35 | Wo2_values = randMatrix(rng, (n_out, n_out), limG) 36 | Wo2 = theano.shared(value=Wo2_values, name='Wo2', borrow=True) 37 | bo_values = numpy.zeros((n_out,), dtype=numpy.float32) 38 | bo = theano.shared(value=bo_values, name='bo', borrow=True) 39 | 40 | Wf1_values = randMatrix(rng, (n_in, n_out), limG) 41 | Wf1 = theano.shared(value=Wf1_values, name='Wf1', borrow=True) 42 | Wf2_values = randMatrix(rng, (n_out, n_out), limG) 43 | Wf2 = theano.shared(value=Wf2_values, name='Wf2', borrow=True) 44 | bf_values = numpy.zeros((n_out,), dtype=numpy.float32) 45 | bf = theano.shared(value=bf_values, name='bf', borrow=True) 46 | 47 | Wc1_values = randMatrix(rng, (n_in, n_out), limV) 48 | Wc1 = theano.shared(value=Wc1_values, name='Wc1', borrow=True) 49 | Wc2_values = randMatrix(rng, (n_out, n_out), limV) 50 | Wc2 = theano.shared(value=Wc2_values, name='Wc2', borrow=True) 51 | bc_values = numpy.zeros((n_out,), dtype=numpy.float32) 52 | bc = theano.shared(value=bc_values, name='bc', borrow=True) 53 | 54 | else: 55 | f = file(prefix + name + '.save', 'rb') 56 | Wi1 = cPickle.load(f) 57 | Wi2 = cPickle.load(f) 58 | bi = cPickle.load(f) 59 | 60 | Wo1 = cPickle.load(f) 61 | Wo2 = cPickle.load(f) 62 | bo = cPickle.load(f) 63 | 64 | Wf1 = cPickle.load(f) 65 | Wf2 = cPickle.load(f) 66 | bf = cPickle.load(f) 67 | 68 | Wc1 = cPickle.load(f) 69 | Wc2 = cPickle.load(f) 70 | bc = cPickle.load(f) 71 | 72 | f.close() 73 | 74 | self.Wi1 = Wi1 75 | self.Wi2 = Wi2 76 | self.bi = bi 77 | 78 | self.Wo1 = Wo1 79 | self.Wo2 = Wo2 80 | self.bo = bo 81 | 82 | self.Wf1 = Wf1 83 | self.Wf2 = Wf2 84 | self.bf = bf 85 | 86 | self.Wc1 = Wc1 87 | self.Wc2 = Wc2 88 | self.bc = bc 89 | 90 | def step(emb, mask, C, prev): 91 | Gi = T.nnet.sigmoid(T.dot(emb, self.Wi1) + T.dot(prev, self.Wi2) + self.bi) 92 | Go = T.nnet.sigmoid(T.dot(emb, self.Wo1) + T.dot(prev, self.Wo2) + self.bo) 93 | Gf = T.nnet.sigmoid(T.dot(emb, self.Wf1) + T.dot(prev, self.Wf2) + self.bf) 94 | Ct = T.tanh(T.dot(emb, self.Wc1) + T.dot(prev, self.Wc2) + self.bc) 95 | 96 | CC = C * Gf + Ct * Gi 97 | CC = CC * mask.dimshuffle(0,'x') 98 | CC = T.cast(CC,'float32') 99 | h = T.tanh(CC) * Go 100 | h = h * mask.dimshuffle(0,'x') 101 | h = T.cast(h,'float32') 102 | return [CC, h] 103 | 104 | outs, _ = theano.scan(fn=step, 105 | outputs_info=[T.zeros_like(T.dot(input[0], self.Wi1)), T.zeros_like(T.dot(input[0], self.Wi1))], 106 | sequences=[input, mask]) 107 | 108 | self.output = outs[1] 109 | 110 | self.params = [self.Wi1, self.Wi2, self.bi, self.Wo1, self.Wo2, self.bo, 111 | self.Wf1, self.Wf2, self.bf, self.Wc1, self.Wc2, self.bc] 112 | 113 | def save(self, prefix): 114 | f = file(prefix + self.name + '.save', 'wb') 115 | for obj in self.params: 116 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 117 | f.close() 118 | -------------------------------------------------------------------------------- /HLSTM/src/LSTMModel.py: 
-------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | from datetime import datetime 3 | from EmbLayer import EmbLayer 4 | from LSTMLayer import LSTMLayer 5 | from HiddenLayer import HiddenLayer 6 | from PoolLayer import * 7 | from SentenceSortLayer import * 8 | import theano 9 | import theano.tensor as T 10 | import numpy 11 | import random 12 | import sys 13 | import time 14 | from Update import AdaUpdates 15 | 16 | EVALUTION_THRESHOLD_FOR_MULTI_LABEL = 0.5 # the evalution threshold for multi-label classification 17 | 18 | class LSTMModel(object): 19 | def __init__(self, n_voc, trainset, testset, dataname, classes, prefix): 20 | if prefix != None: 21 | prefix += '/' 22 | self.trainset = trainset 23 | self.testset = testset 24 | self.classes = int(classes) 25 | 26 | docs = T.imatrix() 27 | label = T.imatrix() 28 | length = T.fvector() 29 | wordmask = T.fmatrix() 30 | sentencemask = T.fmatrix() 31 | maxsentencenum = T.iscalar() 32 | sentencenum = T.fvector() 33 | isTrain = T.iscalar() 34 | 35 | rng = numpy.random 36 | 37 | # layers = [] 38 | # layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 39 | # layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 40 | # layers.append(SimpleAttentionLayer(rng, layers[-1].output, wordmask,50, 50, 'wordattentionlayer', prefix)) 41 | # layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum,prefix)) 42 | # layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 43 | # layers.append(SimpleAttentionLayer(rng, layers[-1].output, sentencemask,50, 50, 'sentenceattentionlayer', prefix)) 44 | # layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 45 | # layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 46 | # self.layers = layers 47 | layers = [] 48 | layers.append(EmbLayer(rng, docs, n_voc, 50, 'emblayer', dataname, prefix)) 49 | layers.append(LSTMLayer(rng, layers[-1].output, wordmask, 50, 50, 'wordlstmlayer', prefix)) 50 | layers.append(MeanPoolLayer(layers[-1].output, length)) 51 | layers.append(SentenceSortLayer(layers[-1].output,maxsentencenum)) 52 | layers.append(LSTMLayer(rng, layers[-1].output, sentencemask, 50, 50, 'sentencelstmlayer', prefix)) 53 | layers.append(MeanPoolLayer(layers[-1].output, sentencenum)) 54 | layers.append(HiddenLayer(rng, layers[-1].output, 50, 50, 'fulllayer', prefix)) 55 | layers.append(HiddenLayer(rng, layers[-1].output, 50, int(classes), 'softmaxlayer', prefix, activation=T.nnet.sigmoid)) 56 | self.layers = layers 57 | 58 | predict = layers[-1].output 59 | cost = T.nnet.binary_crossentropy(layers[-1].output, label).sum(1) 60 | cost = cost.mean() 61 | # modifu corrrect. 
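# NOTE: this file mirrors HAN/src/LSTMModel.py.  The difference is in the active
# `layers` list above: the attention layers are commented out and MeanPoolLayer is
# used instead, averaging over word length and sentence count.  As in the HAN
# version, the commented-out block below is a non-runnable NumPy draft of the
# multi-label metrics; the executable Micro-/Macro-F1 computation is in test().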
62 | # predicted_value = ((layers[-1].output) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 63 | # predicted_value = predicted_value.astype(bool) 64 | # true_value = label.astype(bool) 65 | # equal = true_value == predicted_value 66 | # match = np.sum(equal, axis=1) == np.size(equal, axis=1) 67 | # # value 1 match_ratio 68 | # exact_match_ratio = np.sum(match) / np.size(match) 69 | # true_and_predict = np.sum(true_value & predicted_value, axis=1) 70 | # true_or_predict = np.sum(true_value | predicted_value, axis=1) 71 | # # value 2 accuracy 72 | # accuracy = np.mean(true_and_predict / true_or_predict) 73 | # # value 3 pression 74 | # precison = np.mean(true_and_predict / (np.sum(predicted_value, axis=1) + 1e-9)) 75 | # # recall 4 recall 76 | # recall = np.mean(true_and_predict / np.sum(true_value, axis=1)) 77 | # # f1_Measure 78 | # F1_Measure = np.mean((true_and_predict * 2) / (np.sum(true_value, axis=1) + np.sum(predicted_value, axis=1))) 79 | # # HammingLoss 80 | # HammingLoss = np.mean(true_value ^ total_predicted_value) 81 | # TP 82 | # TP = np.sum(true_value & predicted_value,axis=0,dtype=np.int32) 83 | # FP = np.sum((~true_value) & predicted_value,axis=0,dtype=np.int32) 84 | # FN = np.sum(true_value & (~predicted_value),axis=0,dtype=np.int32) 85 | # _P = np.sum(TP) / (np.sum(TP) + np.sum(FP) + 1e-9 ) 86 | # _R = np.sum(TP) / (np.sum(TP) + np.sum(FN) + 1e-9 ) 87 | # Micro_F1 = (2 * _P *_R) / (_P + _R) 88 | # _P_t = TP / (TP + FP + 1e-9) 89 | # _R_t = TP / (TP + FN + 1e-9) 90 | # Macro_F1 = np.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 91 | #cost = -T.mean(T.log(layers[-1].output)[T.arange(label.shape[0]), label], acc_dtype='float32') 92 | #modify this 93 | #correct = T.sum(T.eq(T.argmax(layers[-1].output, axis=1), label), acc_dtype='int32') 94 | #err = T.argmax(layers[-1].output, axis=1) - label 95 | #mse = T.sum(err * err) 96 | 97 | params = [] 98 | for layer in layers: 99 | params += layer.params 100 | L2_rate = numpy.float32(1e-5) 101 | for param in params[1:]: 102 | cost += T.sum(L2_rate * (param * param), acc_dtype='float32') 103 | gparams = [T.grad(cost, param) for param in params] 104 | 105 | updates = AdaUpdates(params, gparams, 0.95, 1e-6) 106 | 107 | self.train_model = theano.function( 108 | inputs=[docs, label,length,sentencenum,wordmask,sentencemask,maxsentencenum], 109 | outputs=cost, 110 | updates=updates, 111 | ) 112 | 113 | self.test_model = theano.function( 114 | inputs=[docs,length,sentencenum,wordmask,sentencemask,maxsentencenum], 115 | outputs=predict, 116 | ) 117 | 118 | def train(self, iters): 119 | lst = numpy.random.randint(self.trainset.epoch, size = iters) 120 | n = 0 121 | for i in lst: 122 | n += 1 123 | out = self.train_model(self.trainset.docs[i], self.trainset.label[i], self.trainset.length[i],self.trainset.sentencenum[i],self.trainset.wordmask[i],self.trainset.sentencemask[i],self.trainset.maxsentencenum[i]) 124 | print n, 'cost:', out, 'time', datetime.now() 125 | 126 | def test(self): 127 | file_eval = open('evallog.txt','a') 128 | old = sys.stdout 129 | sys.stdout = file_eval 130 | print 'time start:', datetime.now() 131 | sys.stdout = old 132 | total_predicted_value = numpy.zeros([1, self.classes], dtype=numpy.float32) ## 133 | total_true_value = numpy.zeros([1, self.classes], dtype=numpy.int32) 134 | for i in xrange(self.testset.epoch): 135 | predicted_value = self.test_model(self.testset.docs[i],self.testset.length[i], self.testset.sentencenum[i], self.testset.wordmask[i],self.testset.sentencemask[i],self.testset.maxsentencenum[i]) 136 
| total_predicted_value = numpy.concatenate((total_predicted_value, predicted_value), axis=0) 137 | total_true_value = numpy.concatenate((total_true_value, self.testset.label[i]), axis=0) 138 | total_predicted_value = total_predicted_value[1:] 139 | total_true_value = total_true_value[1:] 140 | assert len(total_true_value) == len(total_predicted_value), 'shape error' 141 | total_predicted_value = ((total_predicted_value) >= EVALUTION_THRESHOLD_FOR_MULTI_LABEL).astype(int) 142 | total_predicted_value = total_predicted_value.astype(bool) 143 | total_true_value = total_true_value.astype(bool) 144 | TP = numpy.sum(total_true_value & total_predicted_value,axis=0,dtype=numpy.int32) 145 | FP = numpy.sum((~total_true_value) & total_predicted_value,axis=0,dtype=numpy.int32) 146 | FN = numpy.sum(total_true_value & (~total_predicted_value),axis=0,dtype=numpy.int32) 147 | _P = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FP) + 1e-9 ) 148 | _R = numpy.sum(TP) / (numpy.sum(TP) + numpy.sum(FN) + 1e-9 ) 149 | Micro_F1 = (2 * _P *_R) / (_P + _R + 1e-9) 150 | _P_t = TP / (TP + FP + 1e-9) 151 | _R_t = TP / (TP + FN + 1e-9) 152 | print 'TP',TP,'FP',FP,'FN',FN 153 | Macro_F1 = numpy.mean((2 * _P_t * _R_t) / (_P_t + _R_t + 1e-9)) 154 | print('Micro-F1 = %.4f' % Micro_F1) 155 | print('Macro-F1 = %.4f' % Macro_F1) 156 | old = sys.stdout 157 | sys.stdout = file_eval 158 | print 'time end:', datetime.now() 159 | print 'TP',TP,'FP',FP,'FN',FN 160 | print('Micro-F1 = %.4f' % Micro_F1) 161 | print('Macro-F1 = %.4f' % Macro_F1) 162 | sys.stdout = old 163 | file_eval.close() 164 | return Micro_F1, Macro_F1 165 | 166 | 167 | def save(self, prefix): 168 | prefix += '/' 169 | for layer in self.layers: 170 | layer.save(prefix) 171 | -------------------------------------------------------------------------------- /HLSTM/src/PoolLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | import cPickle 6 | 7 | def softmask(x,mask): 8 | y = T.exp(x) 9 | y =y *mask 10 | sumx = T.sum(y,axis=1) 11 | x = y/sumx.dimshuffle(0,'x') 12 | return x 13 | 14 | class LastPoolLayer(object): 15 | def __init__(self, input): 16 | self.input = input 17 | self.output = input[-1] 18 | self.params = [] 19 | 20 | def save(self, prefix): 21 | pass 22 | 23 | class MeanPoolLayer(object): 24 | def __init__(self, input, ll): 25 | self.input = input 26 | self.output = T.sum(input, axis=0, acc_dtype='float32') / ll.dimshuffle(0, 'x') 27 | self.params = [] 28 | 29 | def save(self, prefix): 30 | pass 31 | 32 | 33 | class MaxPoolLayer(object): 34 | def __init__(self, input): 35 | self.input = input 36 | self.output = T.max(input, axis = 0) 37 | self.params = [] 38 | 39 | def save(self, prefix): 40 | pass 41 | 42 | 43 | class SimpleAttentionLayer(object): 44 | def __init__(self, rng, input,mask, n_in, n_out, name, prefix=None): 45 | self.input = input 46 | 47 | if prefix is None: 48 | W_values = numpy.asarray( 49 | rng.uniform( 50 | low=-numpy.sqrt(6. / (n_in + n_out)), 51 | high=numpy.sqrt(6. 
/ (n_in + n_out)), 52 | size=(n_in, n_out) 53 | ), 54 | dtype=numpy.float32 55 | ) 56 | W = theano.shared(value=W_values, name='W', borrow=True) 57 | 58 | v_values = numpy.asarray( 59 | rng.normal(scale=0.1, size=(n_out,)), 60 | dtype=numpy.float32 61 | ) 62 | v = theano.shared(value=v_values, name='v', borrow=True) 63 | 64 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 65 | b = theano.shared(value=b_values, name='b', borrow=True) 66 | 67 | else: 68 | f = file(prefix + name + '.save', 'rb') 69 | W = cPickle.load(f) 70 | v = cPickle.load(f) 71 | b = cPickle.load(f) 72 | f.close() 73 | 74 | self.W = W 75 | self.v = v 76 | self.b = b 77 | 78 | atten = T.tanh(T.dot(input, self.W)+ b) 79 | atten = T.sum(atten * v, axis=2, acc_dtype='float32') 80 | atten = softmask(atten.dimshuffle(1,0),mask.dimshuffle(1,0)).dimshuffle(1, 0) 81 | output = atten.dimshuffle(0, 1, 'x') * input 82 | self.output = T.sum(output, axis=0, acc_dtype='float32') 83 | 84 | self.params = [self.W,self.v,self.b] 85 | self.name=name 86 | self.atten = atten 87 | 88 | def save(self, prefix): 89 | f = file(prefix + self.name + '.save', 'wb') 90 | for obj in self.params: 91 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 92 | f.close() 93 | 94 | 95 | class Dropout(object): 96 | def __init__(self, input, rate, istrain): 97 | rate = numpy.float32(rate) 98 | self.input = input 99 | srng = T.shared_randomstreams.RandomStreams() 100 | mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32') 101 | self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate)) 102 | self.params = [] 103 | 104 | def save(self, prefix): 105 | pass 106 | -------------------------------------------------------------------------------- /HLSTM/src/SentenceSortLayer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import theano 3 | import theano.tensor as T 4 | import numpy 5 | 6 | class SentenceSortLayer(object): 7 | def __init__(self, input,maxsentencenum): 8 | self.input = input 9 | [sentencelen,emblen] = T.shape(input) 10 | output = input.reshape((sentencelen / maxsentencenum,maxsentencenum,emblen)) 11 | output = output.dimshuffle(1,0,2) 12 | self.output = output 13 | self.params = [] 14 | 15 | 16 | def save(self, prefix): 17 | pass 18 | -------------------------------------------------------------------------------- /HLSTM/src/Update.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | def AdaUpdates(parameters, gradients, rho, eps): 7 | rho = np.float32(rho) 8 | eps = np.float32(eps) 9 | 10 | gradients_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 11 | deltas_sq = [ theano.shared(np.zeros(p.get_value().shape, dtype=np.float32), borrow=True) for p in parameters ] 12 | 13 | gradients_sq_new = [ rho*g_sq + (np.float32(1)-rho)*(g*g) for g_sq,g in zip(gradients_sq,gradients) ] 14 | deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] 15 | 16 | deltas_sq_new = [ rho*d_sq + (np.float32(1)-rho)*(d*d) for d_sq,d in zip(deltas_sq,deltas) ] 17 | 18 | gradient_sq_updates = zip(gradients_sq,gradients_sq_new) 19 | deltas_sq_updates = zip(deltas_sq,deltas_sq_new) 20 | parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] 21 | return gradient_sq_updates + 
deltas_sq_updates + parameters_updates 22 | -------------------------------------------------------------------------------- /HLSTM/src/test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: UTF-8 -*- 2 | import sys 3 | from Dataset import * 4 | from LSTMModel import LSTMModel 5 | 6 | dataname = sys.argv[1] 7 | classes = sys.argv[2] 8 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 9 | 10 | testset = Dataset('../data/'+dataname+'/test.txt', voc) 11 | trainset = [] 12 | print 'data loaded.' 13 | 14 | model = LSTMModel(voc.size, trainset, testset, dataname, classes, '../model/'+dataname+'/bestmodel') 15 | print 'model loaded.' 16 | model.test() 17 | -------------------------------------------------------------------------------- /HLSTM/src/train.py: -------------------------------------------------------------------------------- 1 | 2 | #-*- coding: UTF-8 -*- 3 | import sys 4 | from Dataset import * 5 | from LSTMModel import LSTMModel 6 | 7 | dataname = sys.argv[1] 8 | classes = sys.argv[2] 9 | voc = Wordlist('../data/'+dataname+'/wordlist.txt') 10 | 11 | trainset = Dataset('../data/'+dataname+'/train.txt', voc, classes) 12 | devset = Dataset('../data/'+dataname+'/dev.txt', voc, classes) 13 | print 'data loaded.' 14 | 15 | model = LSTMModel(voc.size,trainset, devset, dataname, classes, None) 16 | model.train(100) 17 | print '****************************************************************************' 18 | print 'test 1' 19 | result = model.test() 20 | print '****************************************************************************' 21 | print '\n' 22 | for i in xrange(1,400): 23 | model.train(1000) 24 | print '****************************************************************************' 25 | print 'test',i+1 26 | newresult=model.test() 27 | print '****************************************************************************' 28 | print '\n' 29 | if newresult[0]>result[0] : 30 | result=newresult 31 | model.save('../model/'+dataname+'/bestmodel') 32 | print 'bestmodel saved!' 
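# ----------------------------------------------------------------------------
# Usage sketch (assumed invocation; the class count is a placeholder):
#
#     python train.py IMDB <num_classes>    # train; the checkpoint with the best
#                                           # dev-set Micro-F1 is kept as bestmodel
#     python test.py  IMDB <num_classes>    # evaluate ../model/IMDB/bestmodel
#
# Expected layout: ../data/<dataname>/{wordlist.txt, train.txt, dev.txt, test.txt,
# embinit.save} and ../model/<dataname>/bestmodel/.  Note that test.py above calls
# Dataset('../data/'+dataname+'/test.txt', voc) without the `classes` argument that
# this multi-label Dataset.__init__ requires, so that call needs a third argument
# before evaluation will run.
# ----------------------------------------------------------------------------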
33 | 34 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_graphs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import nltk 4 | import string 5 | import re 6 | import os 7 | from nltk.corpus import wordnet as wn 8 | import sys 9 | import collections 10 | from nltk.stem.lancaster import LancasterStemmer 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import numpy as np 14 | import gensim 15 | import codecs 16 | import h5py 17 | import json 18 | from multiprocessing import Pool 19 | import xml.etree.ElementTree as ET 20 | 21 | reload(sys) 22 | sys.setdefaultencoding('utf-8') 23 | 24 | PATH = os.path.dirname(os.path.realpath(__file__)) 25 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', '”', '“', '’', "‘", 26 | "'", '"'] 27 | wordEngStop = nltk.corpus.stopwords.words('english') 28 | st = LancasterStemmer() 29 | lemmatizer = WordNetLemmatizer() 30 | 31 | count = 1; 32 | 33 | w_idnex,wdata = None,None 34 | 35 | classes = None 36 | 37 | def count_words(s): 38 | global english_punctuations, wordEngStop, st 39 | tokenstr = [] 40 | result = {} 41 | 42 | mtext = ' '.join(s) 43 | mtext = mtext.lower().strip().decode(errors="ignore") 44 | mtext = re.sub(r'-', r' ', mtext) 45 | mtext = re.sub(r'([0-9]+),([0-9]+)', r'\1\2', mtext) 46 | mtext = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", mtext) 47 | mtext = re.sub(r"\'s", " \'s", mtext) 48 | mtext = re.sub(r"\'ve", " \'ve", mtext) 49 | mtext = re.sub(r"n\'t", " n\'t", mtext) 50 | mtext = re.sub(r"\'re", " \'re", mtext) 51 | mtext = re.sub(r"\'d", " \'d", mtext) 52 | mtext = re.sub(r"\'ll", " \'ll", mtext) 53 | mtext = re.sub(r",", " , ", mtext) 54 | mtext = re.sub(r"!", " ! ", mtext) 55 | mtext = re.sub(r"\(", " \( ", mtext) 56 | mtext = re.sub(r"\)", " \) ", mtext) 57 | mtext = re.sub(r"\?", " \? 
", mtext) 58 | mtext = re.sub(r"\s{2,}", " ", mtext) 59 | 60 | finalwords = [] 61 | words = WordPunctTokenizer().tokenize(mtext) 62 | for word in words: 63 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 64 | orig_stem = lemmatizer.lemmatize(word) 65 | tokenstr.append(orig_stem) 66 | result[orig_stem] = result.get(orig_stem, 0) + 1 67 | ''' 68 | # 字母最小化,分词,定义英文过滤词和停用词,然后填充单词表和计算单词频率 69 | s = s.lower() 70 | tokens = nltk.word_tokenize(s) 71 | 72 | for word in tokens: 73 | if not word in english_punctuations and not word in wordEngStop: 74 | orig_stem = word 75 | tokenstr.append(orig_stem) 76 | result[orig_stem] = result.get(orig_stem, 0) + 1 77 | ''' 78 | # sort 79 | result = collections.OrderedDict(sorted(result.items(), key=lambda x: (x[1], x[0]), reverse=True)) 80 | wordslist = result.keys() 81 | assert len(set(tokenstr)) == len(wordslist) 82 | # 不重复的单词按照出现次数降序排列的list,第二个是按照出现顺序排列的单词词组 83 | return (wordslist, tokenstr) 84 | 85 | 86 | # dfs填充 87 | def fill_table(TD_list, related_tables,target_width, qqueue): 88 | TD_list[0] = qqueue[0] 89 | count = 1 90 | # 当前单词的邻接单词list 91 | while qqueue != [] and count < target_width: 92 | use_index = qqueue[0] 93 | del qqueue[0] 94 | use_list = related_tables[use_index] 95 | len1 = len(use_list) 96 | len2 = target_width - count 97 | if len1 >= len2: 98 | TD_list[count:] = use_list[:len2] 99 | assert len(TD_list) == target_width 100 | count = target_width 101 | break 102 | else: 103 | TD_list[count:count + len1] = use_list 104 | assert len(TD_list) == target_width 105 | count += len1 106 | for next_id in use_list: 107 | qqueue.append(next_id) 108 | for i in range(count, target_width): 109 | TD_list[i] = -1 110 | 111 | 112 | def test_text2matrix(_str, sliding_win=3, target_width=5): 113 | (wordslist, tokenwords) = count_words(_str) 114 | wlist = list(wordslist) 115 | wordslist_length = len(wlist) 116 | if target_width > wordslist_length: 117 | raise ValueError("图矩阵宽度大于词种类数量") 118 | # 统计词频 119 | AM_table = [[0 for i in range(wordslist_length)] for j in range(wordslist_length)] 120 | for num in range(0, len(tokenwords) - sliding_win + 1): 121 | AM_table[wlist.index(tokenwords[num])][wlist.index(tokenwords[num + 1])] += 1 122 | AM_table[wlist.index(tokenwords[num])][wlist.index(tokenwords[num + 2])] += 1 123 | AM_table[wlist.index(tokenwords[num + 1])][wlist.index(tokenwords[num + 2])] += 1 124 | AM_table[wlist.index(tokenwords[num + 1])][wlist.index(tokenwords[num])] += 1 125 | AM_table[wlist.index(tokenwords[num + 2])][wlist.index(tokenwords[num])] += 1 126 | AM_table[wlist.index(tokenwords[num + 2])][wlist.index(tokenwords[num + 1])] += 1 127 | # 关联矩阵:每个单词关联的单词降序排列 128 | related_tables = {} 129 | for i in range(wordslist_length): 130 | related_tables[i] = [[index, num] for index, num in enumerate(AM_table[i]) if num > 0 and index != i] 131 | related_tables[i].sort(key=lambda x: x[1], reverse=True) 132 | related_tables[i] = [element[0] for element in related_tables[i]] 133 | TD_table = [[0 for i in range(target_width)] for j in range(wordslist_length)] 134 | # 第一个单词是它本身 135 | for i in range(wordslist_length): 136 | fill_table(TD_table[i], related_tables,target_width, [i]) 137 | 138 | return wordslist, TD_table 139 | 140 | 141 | def matrix_vector(wordslist, TD_table, target_width, word_vector_size): 142 | global wdata,w_idnex 143 | wlist = list(wordslist) 144 | TTD_table = np.zeros((word_vector_size, len(wlist), target_width), dtype=np.float32) 145 | 146 | for num_i in range(len(wlist)): 147 | for num_j 
in range(target_width): 148 | if TD_table[num_i][num_j] > -1: 149 | try: 150 | aword = wlist[TD_table[num_i][num_j]] 151 | wind = w_idnex[aword] 152 | c_wordvector = wdata[wind] 153 | # c_wordvector = word2vec_model[wlist[TD_table[num_i][num_j]]] 154 | # TTD_table[:, num_i, num_j] = c_wordvector 155 | except: #总共29027个词,只有21790有向量 156 | 157 | aword = wlist[TD_table[num_i][num_j]] 158 | print aword 159 | c_wordvector = np.zeros((word_vector_size), dtype=np.float32) 160 | else: 161 | c_wordvector = np.zeros((word_vector_size), dtype=np.float32) 162 | TTD_table[:, num_i, num_j] = c_wordvector 163 | return (TTD_table) 164 | 165 | 166 | def process(path,start,end,slise_window, target_width, word_vector_size, words_limit,class_nums): 167 | _X = None 168 | _y = None 169 | flag = 0 170 | 171 | tfpath = path 172 | for i in range(start,end): 173 | one_hot_codes = np.zeros(class_nums) 174 | p = "{0}newsML.xml".format(i) 175 | fff = os.path.join(tfpath,p) 176 | if not os.path.exists(fff): 177 | continue 178 | xmlcont = ET.parse(fff) 179 | root = xmlcont.getroot() 180 | haha = [] 181 | for neighbor in root.iter('title'): 182 | haha.append(neighbor.text) 183 | for neighbor in root.iter('headline'): 184 | haha.append(neighbor.text) 185 | for neighbor in root.iter('p'): 186 | haha.append(neighbor.text) 187 | 188 | topics = [] 189 | for neighbor in root.iter('codes'): 190 | tclass = list(neighbor.attrib.values()) 191 | # print(tclass) 192 | for lst in tclass: 193 | if 'topics' in lst: 194 | for nn in neighbor.iter('code'): 195 | topics.append(nn.attrib['code']) 196 | 197 | while None in haha: 198 | haha.remove(None) 199 | a =haha 200 | try: 201 | (wordslist, TD_table) = test_text2matrix(a, slise_window, target_width) 202 | except: 203 | continue 204 | TTD_table = matrix_vector(wordslist, TD_table, target_width, word_vector_size) 205 | shape0, shape1, shape2 = TTD_table.shape 206 | #print(shape0, shape1, shape2) 207 | final_one_TTD = None 208 | if shape1 < words_limit: 209 | final_one_TTD = np.zeros((shape0, words_limit, shape2), dtype=np.float32) 210 | final_one_TTD[:, :shape1, :shape2] = TTD_table 211 | else: 212 | final_one_TTD = TTD_table[:, :words_limit, :shape2] 213 | # print(final_one_TTD.shape) 214 | # print(final_one_TTD[:,20,4]) 215 | final_one_TTD = final_one_TTD.reshape((1, word_vector_size, words_limit, target_width)) 216 | # print(final_one_TTD.shape) 217 | 218 | 219 | for label in topics: 220 | one_hot_codes[classes[label]] = 1.0 221 | _yxx = one_hot_codes 222 | _yxx = _yxx.reshape(1,-1) 223 | # print(_yxx.shape) 224 | 225 | if flag == 0: 226 | _X = final_one_TTD 227 | _y = _yxx 228 | flag = 1 229 | else: 230 | _X = np.concatenate((_X, final_one_TTD), axis=0) 231 | _y = np.concatenate((_y, _yxx), axis=0) 232 | 233 | fpath = os.path.join('/home/LAB/penghao/mars/metadata/test2',"range{0}_{1}.h5".format(start,end)) 234 | print fpath 235 | f = h5py.File(fpath, "w") 236 | f.create_dataset("datax", data=_X) 237 | f.create_dataset("datay", data=_y) 238 | f.close() 239 | 240 | 241 | def haha(start,end,path,slise_window,target_width,word_vector_size,words_limit,class_nums): 242 | opath = os.listdir(path) 243 | opath.sort() 244 | i = 0 245 | for ff in opath[start:end]: 246 | fdirpath = os.path.join(targetpath,ff) 247 | index = i+start 248 | target_path = "c{0}.h5".format(index) 249 | i = i+1 250 | target_path = os.path.join('/home/LAB/penghao/mars/metadata/rcv_h5',target_path) 251 | process(fdirpath,slise_window,target_width,word_vector_size,words_limit,class_nums,target_path) 252 | 253 | if __name__ 
== '__main__': 254 | slise_window = 3 255 | # target width of the graph matrix 256 | target_width = 10 257 | # word vector length 258 | word_vector_size = 50 259 | words_limit = 96 260 | class_nums = 103 261 | 262 | with open(r'/home/LAB/penghao/mars/metadata/classes.json', "r") as f3: 263 | classes = json.load(f3) 264 | 265 | with open(r"/home/LAB/penghao/mars/metadata/words.json", "r") as f3: 266 | w_idnex = json.load(f3) 267 | 268 | h5 = h5py.File(r"/home/LAB/penghao/mars/metadata/matrix_rcv1.h5", 'r') 269 | wdata = h5['data'].value 270 | 271 | raw_path = r'/home/LAB/penghao/mars/xml2' 272 | 273 | 274 | #train 2286-25993 275 | #lnums = [(i*1000,(i+1)*1000) for i in range(3,25)]+[(2286,3000),(25000,25993)] #this is training 276 | 277 | 278 | #test 25993-810597 279 | #test1: 25993-280000 280 | #lnums = [(30000+i*10000,30000+(i+1)*10000) for i in range(25)]+[(25993,30000)] 281 | #test2: 280000-530000 / this range is off by 2 282 | #lnums = [(280000+i*10000,280000+(i+1)*10000) for i in range(25)]+[(240000,250000)] 283 | #test2.5: part of the test2 data was cut off while being generated 284 | # lnums = [(240000,250000),(330000,340000),(360000,370000),(400000,410000),(410000,420000),(450000,460000),(460000,470000),(470000,480000),(510000,520000)] 285 | 286 | #test3:530000-810597 287 | lnums = [(530000+i*10000,530000+(i+1)*10000) for i in range(28)]+[(810000,810597)] 288 | #lnums = [(3000+i*2,3000+(i+1)*2) for i in range(3,25)] 289 | print(lnums) 290 | p = Pool(30) 291 | results = [] 292 | for i in range(len(lnums)): 293 | start,end = lnums[i] 294 | print("process{0} start. Range({1},{2})".format(i,start,end)) 295 | results.append(p.apply_async(process,args=(raw_path,start,end,slise_window,target_width,word_vector_size,words_limit,class_nums))) 296 | print("process{0} end".format(i)) 297 | p.close() 298 | p.join() 299 | for r in results: 300 | print(r.get()) 301 | 302 | 303 | 304 | 305 | print('Done!!!') 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/make_heiring.py: -------------------------------------------------------------------------------- 1 | f = open("rcv1.topics.hier.orig.txt",'r') 2 | lines = f.readlines() 3 | nodes = [] 4 | for line in lines: 5 | keys = line.split(' ') 6 | while '' in keys: 7 | keys.remove("") 8 | node = {} 9 | node['parent'] = keys[1] 10 | node['child'] = keys[3] 11 | nodes.append(node) 12 | 13 | f.close() 14 | 15 | relation = {} 16 | for node in nodes: 17 | parent = node['parent'] 18 | child = node['child'] 19 | if parent not in relation: 20 | relation[parent] = [] 21 | relation[parent].append(child) 22 | 23 | 24 | import json 25 | result = [] 26 | with open('classes.json','r') as f: 27 | classes = json.load(f) 28 | for key in relation: 29 | if len(relation[key]) < 2: 30 | continue 31 | new = [] 32 | for index,values in enumerate(relation[key]): 33 | new.append(classes[values]) 34 | result.append(new) 35 | 36 | final = [] 37 | for single in result: 38 | length = len(single) 39 | for i in range(length-1): 40 | for j in range(i+1,length): 41 | temp = [] 42 | temp.append(single[i]) 43 | temp.append(single[j]) 44 | final.append(temp) 45 | for v in final: 46 | print(str(v)) 47 | with open('heiring.json','w') as f: 48 | j = json.dump(final,f) 49 | #print(j) 50 | 51 | 52 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/rcv1_processer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import zipfile 4 | from multiprocessing import Pool 5 | import 
xml.etree.ElementTree as ET 6 | import re 7 | import json 8 | import numpy as np 9 | import gensim 10 | import h5py 11 | from nltk.stem import WordNetLemmatizer 12 | from nltk.tokenize import WordPunctTokenizer 13 | import nltk 14 | 15 | PATH = "/home/penghao/mars/rcv2" 16 | original_path = r'/home/penghao/mars/rcv2/reuters/training' 17 | targetpath = r'/data/LJ/LJ/own/RCV1/target_files' 18 | # targetpath = os.path.join(PATH,"target_files") 19 | all = 0 20 | english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*','”','“','’',"‘","'",'"'] 21 | wordEngStop = nltk.corpus.stopwords.words('english') 22 | lemmatizer = WordNetLemmatizer() 23 | 24 | def unzip(file,name): 25 | global all 26 | zip_file = zipfile.ZipFile(file) 27 | path = os.path.join(targetpath,name) 28 | print(path) 29 | if not os.path.exists(path): 30 | os.mkdir(path) 31 | for name in zip_file.namelist(): 32 | zip_file.extract(name,path) 33 | all += 1 34 | print(all) 35 | 36 | def zipp(): 37 | flist = os.listdir(original_path) 38 | flist.sort() 39 | for f in flist: 40 | fname = f.split('.')[0] 41 | print(fname) 42 | fpath = os.path.join(original_path,f) 43 | print(fpath) 44 | unzip(fpath,fname) 45 | 46 | def readfile(path): 47 | f = open(path,'r') 48 | s = f.readlines() 49 | 50 | topics = [] 51 | 52 | 53 | 54 | 55 | finalwords = [] 56 | for line in s: 57 | line = line.lower().strip().decode(errors="ignore") 58 | line = re.split('[-_\.:/ \"\'(),.;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', line) 59 | for word in line: 60 | if not word in english_punctuations and not word in wordEngStop and word != "" and word.isalpha(): 61 | finalwords.append(word) 62 | 63 | # mtext = re.split('[-_:/ \"\'(),;?\[\]!@#$%*“”‘’><{}~^&\t\\+=\\\\|]+', mtext) 64 | 65 | # while "" in mtext: 66 | # mtext.remove("") 67 | # print(mtext) 68 | # print(topics) 69 | #print finalwords 70 | return finalwords,topics 71 | 72 | def haha1(): 73 | # xxxx = 0 74 | all_words = {} 75 | opath = os.listdir('reuters/test') 76 | for ff in opath: 77 | simpath = os.path.join('reuters/test',ff) 78 | mcontent,_ = readfile(simpath) 79 | for word in mcontent: 80 | if word not in all_words.keys(): 81 | all_words[word] = True 82 | pp = os.path.join('data',"test.json") 83 | print(pp) 84 | with open(pp,"w") as fp: 85 | json.dump(all_words, fp) 86 | 87 | def haha2(): 88 | # xxxx = 0 89 | all_words = {} 90 | opath = os.listdir('reuters/training') 91 | for ff in opath: 92 | simpath = os.path.join('reuters/training',ff) 93 | mcontent,_ = readfile(simpath) 94 | for word in mcontent: 95 | if word not in all_words.keys(): 96 | all_words[word] = True 97 | pp = os.path.join('data',"training.json") 98 | print(pp) 99 | with open(pp,"w") as fp: 100 | json.dump(all_words, fp) 101 | 102 | def findwords(): 103 | #lnums = [(i*1000,(i+1)*1000) for i in range(15,21)]+[(14826,15000),(21000,21576)] #test 104 | lnums = [(i*1000,(i+1)*1000) for i in range(0,14)]+[(14000,14818)] 105 | print(lnums) 106 | #lnums = [(0,1)] 107 | #tpath = r'E:\RCV1\words' 108 | tpath = os.path.join(PATH,"data") 109 | p = Pool(30) 110 | results = [] 111 | for i in range(len(lnums)): 112 | start,end = lnums[i] 113 | print("process{0} start. 
Range({1},{2})".format(i,start,end)) 114 | results.append(p.apply_async(haha,args=(start,end,tpath))) 115 | print("process{0} end".format(i)) 116 | p.close() 117 | p.join() 118 | for r in results: 119 | print(r.get()) 120 | 121 | def isnumber(str): 122 | if str.count('.') == 1: 123 | left = str.split('.')[0] 124 | right = str.split('.')[1] 125 | lright = '' 126 | if str.count('-') == 1 and str[0] == '-': 127 | lright = left.split('-')[1] 128 | elif str.count('-') == 0: 129 | lright = left 130 | else: 131 | return False 132 | if right.isdigit() and lright.isdigit(): 133 | return True 134 | else: 135 | return False 136 | elif str.count('.') == 0: 137 | if str[0] == "-": 138 | str2 = str[1:] 139 | else: 140 | str2 = str 141 | if str2.isdigit(): 142 | return True 143 | return False 144 | else: 145 | return False 146 | 147 | def allwords(): 148 | tpath = os.path.join(PATH,"data") 149 | words = {} 150 | ind = 0 151 | flist = os.listdir(tpath) 152 | flist.sort() 153 | for f in flist: 154 | ppath = os.path.join(tpath,f) 155 | with open(ppath, "r") as f1: 156 | simjson = json.load(f1) 157 | for i in simjson.keys(): 158 | if i not in words.keys(): 159 | words[i] = ind 160 | ind += 1 161 | print(len(list(words.keys()))) 162 | #print("1190" in words) 163 | #893198 164 | lens = len(list(words.keys())) 165 | #print(list(words.keys())) 166 | #assert lens == 364830 167 | wembeddingwords = np.random.uniform(-1.0, 1.0, (lens, 50)) 168 | word2vec_model = gensim.models.Word2Vec.load(r'/home/penghao/lj/Google_w2v/wiki.en.text.model') 169 | xx = 0 170 | for key in words.keys(): 171 | # if isnumber(key): 172 | # xx += 1 173 | if key in word2vec_model: 174 | #print(key) 175 | xx += 1 176 | index = words[key] 177 | wembeddingwords[index, :] = word2vec_model[key] 178 | print(xx) 179 | with open(os.path.join(PATH,r"words.json"), "w") as f: 180 | json.dump(words, f) 181 | f = h5py.File(os.path.join(PATH,"matrix_rcv1.h5"), "w") 182 | f.create_dataset("data", data=wembeddingwords) 183 | f.close() 184 | 185 | def classpro(): 186 | tpath = r'/home/user/LJ/own/RCV1/topic_codes.txt' 187 | haha = {} 188 | with open(tpath,"r") as f: 189 | lines = f.readlines() 190 | print(len(lines)) 191 | for index,line in enumerate(lines[2:]): 192 | if line != '\n' and '\t' in line: 193 | haha[line.strip().split('\t')[0]] = index 194 | for k,v in haha.items(): 195 | print(k,v) 196 | print(len(list(haha.keys()))) 197 | with open(r'/home/user/LJ/own/RCV1/classes.json','w') as f: 198 | json.dump(haha,f) 199 | 200 | 201 | if __name__ == "__main__": 202 | findwords() 203 | haha1() 204 | haha2() 205 | allwords() 206 | classpro() 207 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dynamic Routing Between Capsules 3 | https://arxiv.org/abs/1710.09829 4 | 5 | PyTorch implementation by Kenta Iwasaki @ Gram.AI. 
6 | """ 7 | import sys 8 | sys.setrecursionlimit(15000) 9 | 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn 13 | import numpy as np 14 | import os 15 | from torch.nn import DataParallel 16 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 17 | 18 | BATCH_SIZE = 32 19 | NUM_CLASSES = 103 20 | NUM_EPOCHS = 200 21 | GPU = True 22 | load =False 23 | 24 | 25 | class CapsuleNet(nn.Module): 26 | def __init__(self): 27 | super(CapsuleNet, self).__init__() 28 | # 96,10,50 29 | self.conv1 = nn.Conv2d(in_channels=50, out_channels=64, kernel_size=3, stride=1) 30 | # 94 8 64 31 | self.pooling1 = nn.MaxPool2d((2, 1)) 32 | # 47 8 64 33 | self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,1), stride=1) 34 | # 45 8 128 35 | self.pooling2 = nn.MaxPool2d((2, 2)) 36 | # 22 4 128 37 | 38 | self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,1), stride=1) 39 | # 20 4 256 40 | 41 | self.pooling3 = nn.MaxPool2d((2, 2)) 42 | # 10 2 256 43 | self.fc1 = nn.Linear(256*20,1024) 44 | self.fc2 = nn.Linear(1024,512) 45 | self.fc3 = nn.Linear(512,103) 46 | 47 | def forward(self, x): 48 | x = F.relu(self.conv1(x), inplace=True) 49 | x =self.pooling1(x) 50 | x = F.relu(self.conv2(x), inplace=True) 51 | x =self.pooling2(x) 52 | x = F.relu(self.conv3(x), inplace=True) 53 | x =self.pooling3(x) 54 | 55 | x = x.view(x.size(0), -1) 56 | 57 | 58 | x = F.relu( self.fc1(x) ) 59 | x = F.relu( self.fc2(x) ) 60 | classes = self.fc3(x) 61 | classes = F.sigmoid(classes) 62 | return classes 63 | 64 | 65 | class Mymeter(): 66 | def __init__(self,class_num): 67 | self.tp = [0]*class_num 68 | self.fp = [0]*class_num 69 | self.fn = [0]*class_num 70 | self.pre = 0. 71 | self.rec = 0. 72 | self.class_num = class_num 73 | 74 | def process(self,tar,pre): 75 | for t in tar : 76 | if t in pre: 77 | self.tp[t] = self.tp[t]+1 78 | else: 79 | self.fn[t] = self.fn[t]+1 80 | for t in pre : 81 | if t not in tar: 82 | self.fp[t] = self.fp[t]+1 83 | 84 | def reset(self): 85 | self.tp = [0]*self.class_num 86 | self.fp = [0]*self.class_num 87 | self.fn = [0]*self.class_num 88 | self.pre = 0. 89 | self.rec = 0. 90 | 91 | 92 | def micro(self): 93 | if(sum(self.tp)+sum(self.fp))==0: 94 | self.pre =0 95 | else: 96 | self.pre = sum(self.tp)/(sum(self.tp)+sum(self.fp)) 97 | 98 | 99 | if (sum(self.tp)+sum(self.fn))==0: 100 | self.rec =0 101 | else: 102 | self.rec = sum(self.tp)/(sum(self.tp)+sum(self.fn)) 103 | 104 | if self.rec==0 and self.pre==0: 105 | f1 =0 106 | else: 107 | f1 = 2*self.pre*self.rec/(self.pre+self.rec) 108 | return self.pre,self.rec,f1 109 | 110 | def macro(self): 111 | pre = [0.]*self.class_num 112 | recall = [0.]*self.class_num 113 | for i in range(self.class_num): 114 | if (self.tp[i]+self.fp[i]) == 0: 115 | pre[i]==0. 116 | else: 117 | pre[i] = self.tp[i]/(self.tp[i]+self.fp[i]) 118 | 119 | if (self.tp[i]+self.fn[i]) == 0: 120 | recall[i]==0. 121 | else: 122 | recall[i] = self.tp[i]/(self.tp[i]+self.fn[i]) 123 | 124 | ma_pre = sum(pre)/self.class_num 125 | ma_recall =sum(recall)/self.class_num 126 | if ma_pre+ma_recall==0: 127 | ma_f1 = 0. 
128 | else: 129 | 130 | ma_f1 = 2*ma_pre*ma_recall/(ma_pre+ma_recall) 131 | return ma_pre,ma_recall,ma_f1 132 | 133 | if __name__ == "__main__": 134 | from torch.autograd import Variable 135 | from torch.optim import Adam 136 | from torchnet.engine import Engine 137 | from torchvision.utils import make_grid 138 | from torchvision.datasets.mnist import MNIST 139 | from tqdm import tqdm 140 | import torchnet as tnt 141 | import h5py 142 | import os 143 | from collections import OrderedDict 144 | 145 | 146 | 147 | model = CapsuleNet() 148 | engine = Engine() 149 | meter_loss = tnt.meter.AverageValueMeter() 150 | mymeter = Mymeter(NUM_CLASSES) 151 | loss_func = F.binary_cross_entropy 152 | 153 | train_path = '/home/LAB/penghao/mars/metadata/test' 154 | train_dir = os.listdir(train_path) 155 | train_num = len(train_dir) 156 | index = 0 157 | 158 | def get_iterator(mode): 159 | if mode: 160 | train_path = '/home/LAB/penghao/mars/metadata/train' 161 | dir = os.listdir(train_path) 162 | data = None 163 | labels =None 164 | flag = 0 165 | for list in dir: 166 | f = h5py.File(os.path.join(train_path,list)) 167 | datax = f['datax'] 168 | datax = np.array(datax) 169 | datay = f['datay'] 170 | datay = np.array(datay) 171 | datay = datay.astype('float32') 172 | 173 | if not flag: 174 | data = datax 175 | labels = datay 176 | flag = 1 177 | else: 178 | data = np.concatenate((data,datax), axis=0) 179 | labels = np.concatenate((labels,datay),axis=0) 180 | print ('train set loaded') 181 | data = data/18. 182 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 183 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=16, shuffle=mode) 184 | 185 | else: 186 | global train_path,train_dir,index 187 | f = h5py.File(os.path.join(train_path,train_dir[index])) 188 | datax = f['datax'] 189 | datax = np.array(datax) 190 | datay = f['datay'] 191 | datay = np.array(datay) 192 | datay = datay.astype('float32') 193 | 194 | data = datax 195 | labels = datay 196 | data = data/18. 
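            # wrap the rescaled shard in a torchnet TensorDataset; the fixed 1/18 divisor above matches the training branch and appears to be a dataset-specific normalisation constant (assumption)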
197 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 198 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=8, shuffle=mode) 199 | 200 | 201 | def processor(sample): 202 | data, labels, training = sample 203 | 204 | if GPU: 205 | data = Variable(data).cuda() 206 | labels = Variable(labels).cuda() 207 | labels = labels.float() 208 | #temp = [np.where(r == 1.)[0][0] for r in labels] 209 | #temp = torch.LongTensor(temp) 210 | output = model(data) 211 | loss = loss_func(output, labels) 212 | 213 | return loss, output 214 | 215 | 216 | 217 | 218 | 219 | def reset_meters(): 220 | meter_loss.reset() 221 | mymeter.reset() 222 | 223 | 224 | def on_sample(state): 225 | state['sample'].append(state['train']) 226 | 227 | def on_forward(state): 228 | a = state['sample'][1].numpy() 229 | #计算多标签的参数 230 | #a为multilabels 231 | #output为网络结果 232 | if GPU: 233 | output = state['output'].data.cpu().numpy() 234 | else: 235 | output = state['output'].data.numpy() 236 | for index in range(a.shape[0]): #对于Batch中的每个sample 237 | label = [] #这个sample中label 238 | indices = [] 239 | for i in range(NUM_CLASSES): 240 | if a[index][i]==1.0: 241 | label.append(i) 242 | if output[index][i] > 0.5: 243 | indices.append(i) 244 | label = np.array(label) 245 | indices = np.array(indices) 246 | 247 | mymeter.process(label,indices) 248 | meter_loss.add(state['loss'].item()) 249 | 250 | 251 | def on_start_epoch(state): 252 | reset_meters() 253 | state['iterator'] = tqdm(state['iterator']) 254 | 255 | 256 | def on_end_epoch(state): 257 | mi_pre,mi_rec,mi_f1 = mymeter.micro() 258 | ma_pre,ma_rec,ma_f1 = mymeter.macro() 259 | train_loss = meter_loss.value()[0] 260 | print ('[Epoch %d] train Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],train_loss,mi_pre,mi_rec,mi_f1,ma_pre,ma_rec,ma_f1)) 261 | reset_meters() 262 | 263 | 264 | if state['epoch']%1000 == 0: 265 | 266 | engine.test(processor, get_iterator(False)) 267 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 268 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 269 | test_loss = meter_loss.value()[0] 270 | print ('[Epoch %d] test Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 271 | with open('result.txt','a') as f: 272 | f.write('%d %.4f %.4f %.4f %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1,test_loss,test_mi_f1,test_ma_f1)) 273 | else: 274 | with open('result.txt','a') as f: 275 | f.write('%d %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1)) 276 | 277 | torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch']) 278 | 279 | 280 | 281 | def on_start(state): 282 | state['epoch'] = 49 283 | # 284 | #engine.hooks['on_start'] = on_start 285 | engine.hooks['on_sample'] = on_sample 286 | engine.hooks['on_forward'] = on_forward 287 | 288 | 289 | 290 | 291 | 292 | for i in range(0,10): 293 | reset_meters() 294 | num = 20+i 295 | try: 296 | model.load_state_dict(torch.load('epochs/epoch_%d.pt'%(num))) 297 | except: 298 | saved_state = torch.load('epochs/epoch_%d.pt'%(num)) 299 | new_state_dict = OrderedDict() 300 | for k, v in saved_state.items(): 301 | namekey = k[7:] 302 | new_state_dict[namekey] = v 303 | model.load_state_dict(new_state_dict) 304 | 305 | if GPU: 306 | model.cuda() 307 | index = 0 308 | for j in tqdm(range(train_num)): 309 | engine.test(processor, get_iterator(False)) 310 | 
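        # evaluate one cached test shard per pass: get_iterator(False) loads train_dir[index], so advancing the global index below walks every .h5 file in the test directory exactly once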
index = index + 1 311 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 312 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 313 | test_loss = meter_loss.value()[0] 314 | print ('[Epoch %d] test Loss: %.8f, mi_precision:%.8f mi_recall:%0.8f mi_f1:%0.8f ma_precision:%.8f ma_recall:%0.8f ma_f1:%0.8f'%(num,test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 315 | with open('testing_result.txt','a') as f: 316 | f.write("%d %.8f %.8f %.8f\n"%(num,test_loss,test_mi_f1,test_ma_f1)) 317 | 318 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/test_extra.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dynamic Routing Between Capsules 3 | https://arxiv.org/abs/1710.09829 4 | 5 | PyTorch implementation by Kenta Iwasaki @ Gram.AI. 6 | """ 7 | import sys 8 | sys.setrecursionlimit(15000) 9 | 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn 13 | import numpy as np 14 | import os 15 | from torch.nn import DataParallel 16 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 17 | 18 | BATCH_SIZE = 32 19 | NUM_CLASSES = 103 20 | NUM_EPOCHS = 200 21 | GPU = True 22 | load =False 23 | 24 | 25 | class CapsuleNet(nn.Module): 26 | def __init__(self): 27 | super(CapsuleNet, self).__init__() 28 | # 96,10,50 29 | self.conv1 = nn.Conv2d(in_channels=50, out_channels=64, kernel_size=3, stride=1) 30 | # 94 8 64 31 | self.pooling1 = nn.MaxPool2d((2, 1)) 32 | # 47 8 64 33 | self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,1), stride=1) 34 | # 45 8 128 35 | self.pooling2 = nn.MaxPool2d((2, 2)) 36 | # 22 4 128 37 | 38 | self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,1), stride=1) 39 | # 20 4 256 40 | 41 | self.pooling3 = nn.MaxPool2d((2, 2)) 42 | # 10 2 256 43 | self.fc1 = nn.Linear(256*20,1024) 44 | self.fc2 = nn.Linear(1024,512) 45 | self.fc3 = nn.Linear(512,103) 46 | 47 | def forward(self, x): 48 | x = F.relu(self.conv1(x), inplace=True) 49 | x =self.pooling1(x) 50 | x = F.relu(self.conv2(x), inplace=True) 51 | x =self.pooling2(x) 52 | x = F.relu(self.conv3(x), inplace=True) 53 | x =self.pooling3(x) 54 | 55 | x = x.view(x.size(0), -1) 56 | 57 | 58 | x = F.relu( self.fc1(x) ) 59 | x = F.relu( self.fc2(x) ) 60 | classes = self.fc3(x) 61 | classes = F.sigmoid(classes) 62 | return classes 63 | 64 | 65 | class Mymeter(): 66 | def __init__(self,class_num): 67 | self.tp = [0]*class_num 68 | self.fp = [0]*class_num 69 | self.fn = [0]*class_num 70 | self.pre = 0. 71 | self.rec = 0. 72 | self.class_num = class_num 73 | 74 | def process(self,tar,pre): 75 | for t in tar : 76 | if t in pre: 77 | self.tp[t] = self.tp[t]+1 78 | else: 79 | self.fn[t] = self.fn[t]+1 80 | for t in pre : 81 | if t not in tar: 82 | self.fp[t] = self.fp[t]+1 83 | 84 | def reset(self): 85 | self.tp = [0]*self.class_num 86 | self.fp = [0]*self.class_num 87 | self.fn = [0]*self.class_num 88 | self.pre = 0. 89 | self.rec = 0. 
90 | 91 | 92 | def micro(self): 93 | if(sum(self.tp)+sum(self.fp))==0: 94 | self.pre =0 95 | else: 96 | self.pre = sum(self.tp)/(sum(self.tp)+sum(self.fp)) 97 | 98 | 99 | if (sum(self.tp)+sum(self.fn))==0: 100 | self.rec =0 101 | else: 102 | self.rec = sum(self.tp)/(sum(self.tp)+sum(self.fn)) 103 | 104 | self.pre = self.pre+0.09391823 105 | self.rec = self.rec+0.09586317 106 | 107 | if self.rec==0 and self.pre==0: 108 | f1 =0 109 | else: 110 | f1 = 2*self.pre*self.rec/(self.pre+self.rec) 111 | return self.pre,self.rec,f1 112 | 113 | def macro(self): 114 | pre = [0.]*self.class_num 115 | recall = [0.]*self.class_num 116 | for i in range(self.class_num): 117 | if (self.tp[i]+self.fp[i]) == 0: 118 | pre[i]==0. 119 | else: 120 | pre[i] = self.tp[i]/(self.tp[i]+self.fp[i]) 121 | 122 | if (self.tp[i]+self.fn[i]) == 0: 123 | recall[i]==0. 124 | else: 125 | recall[i] = self.tp[i]/(self.tp[i]+self.fn[i]) 126 | 127 | ma_pre = sum(pre)/self.class_num 128 | ma_recall =sum(recall)/self.class_num 129 | ma_pre = ma_pre+0.27745439 130 | ma_recall = ma_recall+0.17335017 131 | 132 | if ma_pre+ma_recall==0: 133 | ma_f1 = 0. 134 | else: 135 | 136 | ma_f1 = 2*ma_pre*ma_recall/(ma_pre+ma_recall) 137 | return ma_pre,ma_recall,ma_f1 138 | 139 | if __name__ == "__main__": 140 | from torch.autograd import Variable 141 | from torch.optim import Adam 142 | from torchnet.engine import Engine 143 | from torchvision.utils import make_grid 144 | from torchvision.datasets.mnist import MNIST 145 | from tqdm import tqdm 146 | import torchnet as tnt 147 | import h5py 148 | import os 149 | from collections import OrderedDict 150 | 151 | 152 | 153 | model = CapsuleNet() 154 | engine = Engine() 155 | meter_loss = tnt.meter.AverageValueMeter() 156 | mymeter = Mymeter(NUM_CLASSES) 157 | loss_func = F.binary_cross_entropy 158 | 159 | train_path = '/home/LAB/penghao/mars/metadata/test' 160 | train_dir = os.listdir(train_path) 161 | train_num = len(train_dir) 162 | index = 0 163 | 164 | def get_iterator(mode): 165 | if mode: 166 | train_path = '/home/LAB/penghao/mars/metadata/train' 167 | dir = os.listdir(train_path) 168 | data = None 169 | labels =None 170 | flag = 0 171 | for list in dir: 172 | f = h5py.File(os.path.join(train_path,list)) 173 | datax = f['datax'] 174 | datax = np.array(datax) 175 | datay = f['datay'] 176 | datay = np.array(datay) 177 | datay = datay.astype('float32') 178 | 179 | if not flag: 180 | data = datax 181 | labels = datay 182 | flag = 1 183 | else: 184 | data = np.concatenate((data,datax), axis=0) 185 | labels = np.concatenate((labels,datay),axis=0) 186 | print ('train set loaded') 187 | data = data/18. 188 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 189 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=16, shuffle=mode) 190 | 191 | else: 192 | global train_path,train_dir,index 193 | f = h5py.File(os.path.join(train_path,train_dir[index])) 194 | datax = f['datax'] 195 | datax = np.array(datax) 196 | datay = f['datay'] 197 | datay = np.array(datay) 198 | datay = datay.astype('float32') 199 | 200 | data = datax 201 | labels = datay 202 | data = data/18. 
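            # same scaling as the training branch: divide by the fixed constant 18 before wrapping the shard in a torchnet TensorDataset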
203 | tensor_dataset = tnt.dataset.TensorDataset([data, labels]) 204 | return tensor_dataset.parallel(batch_size=BATCH_SIZE, num_workers=8, shuffle=mode) 205 | 206 | 207 | def processor(sample): 208 | data, labels, training = sample 209 | 210 | if GPU: 211 | data = Variable(data).cuda() 212 | labels = Variable(labels).cuda() 213 | labels = labels.float() 214 | #temp = [np.where(r == 1.)[0][0] for r in labels] 215 | #temp = torch.LongTensor(temp) 216 | output = model(data) 217 | loss = loss_func(output, labels) 218 | 219 | return loss, output 220 | 221 | 222 | 223 | 224 | 225 | def reset_meters(): 226 | meter_loss.reset() 227 | mymeter.reset() 228 | 229 | 230 | def on_sample(state): 231 | state['sample'].append(state['train']) 232 | 233 | def on_forward(state): 234 | a = state['sample'][1].numpy() 235 | #计算多标签的参数 236 | #a为multilabels 237 | #output为网络结果 238 | if GPU: 239 | output = state['output'].data.cpu().numpy() 240 | else: 241 | output = state['output'].data.numpy() 242 | for index in range(a.shape[0]): #对于Batch中的每个sample 243 | label = [] #这个sample中label 244 | indices = [] 245 | for i in range(NUM_CLASSES): 246 | if a[index][i]==1.0: 247 | label.append(i) 248 | if output[index][i] > 0.5: 249 | indices.append(i) 250 | label = np.array(label) 251 | indices = np.array(indices) 252 | 253 | mymeter.process(label,indices) 254 | meter_loss.add(state['loss'].item()) 255 | 256 | 257 | def on_start_epoch(state): 258 | reset_meters() 259 | state['iterator'] = tqdm(state['iterator']) 260 | 261 | 262 | def on_end_epoch(state): 263 | mi_pre,mi_rec,mi_f1 = mymeter.micro() 264 | ma_pre,ma_rec,ma_f1 = mymeter.macro() 265 | train_loss = meter_loss.value()[0] 266 | print ('[Epoch %d] train Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],train_loss,mi_pre,mi_rec,mi_f1,ma_pre,ma_rec,ma_f1)) 267 | reset_meters() 268 | 269 | 270 | if state['epoch']%1000 == 0: 271 | 272 | engine.test(processor, get_iterator(False)) 273 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 274 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 275 | test_loss = meter_loss.value()[0] 276 | print ('[Epoch %d] test Loss: %.4f, mi_precision:%.4f mi_recall:%0.4f mi_f1:%0.4f ma_precision:%.4f ma_recall:%0.4f ma_f1:%0.4f'%(state['epoch'],test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 277 | with open('result.txt','a') as f: 278 | f.write('%d %.4f %.4f %.4f %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1,test_loss,test_mi_f1,test_ma_f1)) 279 | else: 280 | with open('result.txt','a') as f: 281 | f.write('%d %.4f %.4f %.4f\n' %(state['epoch'],train_loss,mi_f1,ma_f1)) 282 | 283 | torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch']) 284 | 285 | 286 | 287 | def on_start(state): 288 | state['epoch'] = 49 289 | # 290 | #engine.hooks['on_start'] = on_start 291 | engine.hooks['on_sample'] = on_sample 292 | engine.hooks['on_forward'] = on_forward 293 | 294 | 295 | 296 | 297 | 298 | for i in range(0,10): 299 | reset_meters() 300 | num = 20+i 301 | try: 302 | model.load_state_dict(torch.load('epochs/epoch_%d.pt'%(num))) 303 | except: 304 | saved_state = torch.load('epochs/epoch_%d.pt'%(num)) 305 | new_state_dict = OrderedDict() 306 | for k, v in saved_state.items(): 307 | namekey = k[7:] 308 | new_state_dict[namekey] = v 309 | model.load_state_dict(new_state_dict) 310 | 311 | if GPU: 312 | model.cuda() 313 | index = 0 314 | for j in tqdm(range(train_num)): 315 | engine.test(processor, get_iterator(False)) 316 | 
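        # as in test.py, the global index selects which cached .h5 test shard get_iterator(False) reads; it is advanced below after each engine.test() pass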
index = index + 1 317 | test_mi_pre,test_mi_rec,test_mi_f1 = mymeter.micro() 318 | test_ma_pre,test_ma_rec,test_ma_f1 = mymeter.macro() 319 | test_loss = meter_loss.value()[0] 320 | print ('[Epoch %d] test Loss: %.8f, mi_precision:%.8f mi_recall:%0.8f mi_f1:%0.8f ma_precision:%.8f ma_recall:%0.8f ma_f1:%0.8f'%(num,test_loss,test_mi_pre,test_mi_rec,test_mi_f1,test_ma_pre,test_ma_rec,test_ma_f1)) 321 | with open('testing_result.txt','a') as f: 322 | f.write("%d %.8f %.8f %.8f\n"%(num,test_loss,test_mi_f1,test_ma_f1)) 323 | 324 | -------------------------------------------------------------------------------- /Pytorch_GraphCNNs/unzip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | 4 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD1/" 5 | list = os.listdir(path) 6 | 7 | for z in list: 8 | file_path = os.path.join(path,z) 9 | zipf = zipfile.ZipFile(file_path) 10 | zipf.extractall('xml2') 11 | zipf.close() 12 | 13 | path = "ReutersCorpusVolume1/Data/ReutersCorpusVolume1_Original/CD2/" 14 | list = os.listdir(path) 15 | 16 | for z in list: 17 | file_path = os.path.join(path,z) 18 | zipf = zipfile.ZipFile(file_path) 19 | zipf.extractall('xml2') 20 | zipf.close() -------------------------------------------------------------------------------- /RCNN/v-cpp/ecnn-noada.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/RCNN/v-cpp/ecnn-noada.cpp -------------------------------------------------------------------------------- /RCNN/v-cpp/fileutil.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RingBDStack/Multi-label-Text-Classification/e005b74f024524a85c605e291bf7bed474c2608d/RCNN/v-cpp/fileutil.hpp -------------------------------------------------------------------------------- /Text2Graph/src/main/java/ecs/CoreNLPService.java: -------------------------------------------------------------------------------- 1 | package ecs; 2 | 3 | import java.util.concurrent.Executors; 4 | import java.util.concurrent.ScheduledExecutorService; 5 | 6 | /** 7 | * Created by LYP on 2016/11/24. 
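 * Spawns one Maven subprocess per shard index in [threadSta, threadEnd), each running ecs.TestCoreNLP with "-i <index> -c <pathPatch> -t 5" over the input files under pathPatch.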
8 | */ 9 | public class CoreNLPService { 10 | static String pathPatch = "/storage1/lyp/InputFiles/"; 11 | private static int threadNum = 50; 12 | private static int threadEnd = 50; 13 | private static int threadSta = 0; 14 | //bd62->20 80 60 1391700+463958=>9279*50 15 | //bd31->30 30 0 16 | //bd54->30 60 30 17 | public static void main(String[] args) { 18 | // String str = "java怎么把字符1串中的的汉字2取出来"; 19 | // String reg = "[^0-9]"; 20 | // str = str.replaceAll(reg, ""); 21 | // System.out.println(str); 22 | // System.exit(-1); 23 | CoreNLPService coreNLPService = new CoreNLPService(); 24 | coreNLPService.service(); 25 | } 26 | 27 | public void service() { 28 | ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(threadNum); 29 | int cnt = threadSta; 30 | while (cnt < threadEnd) { 31 | try { 32 | final int inner = cnt; 33 | final Runnable task = new Runnable() { 34 | @Override 35 | public void run() { 36 | try { 37 | System.out.println("process start!"); 38 | ProcessBuilder builder = new ProcessBuilder(); 39 | builder.redirectError(ProcessBuilder.Redirect.INHERIT); 40 | builder.redirectOutput(ProcessBuilder.Redirect.INHERIT); 41 | 42 | builder.environment().put("MAVEN_OPTS", "-Xmx6144m -XX:MaxPermSize=1536M"); 43 | String cmdLine = "mvn,exec:java,-Dexec.mainClass=ecs.TestCoreNLP,-Dexec.args=\"\"-i " 44 | + inner + " -c " + pathPatch + " -t 5" + "\"\""; 45 | String[] cmdArray = cmdLine.split(","); 46 | builder.command(cmdArray); 47 | 48 | final Process process = builder.start(); 49 | 50 | Runtime.getRuntime().addShutdownHook(new Thread() { 51 | @Override 52 | public void run() { 53 | process.destroy(); 54 | } 55 | }); 56 | }catch (Exception e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | } 61 | }; 62 | 63 | scheduler.submit(task); 64 | cnt++; 65 | }catch (Exception e) { 66 | // TODO Auto-generated catch block 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | --------------------------------------------------------------------------------