├── LICENSE.txt ├── README.md ├── data-sample.txt ├── definitions ├── OieExample.py ├── OieFeatures.py ├── __init__.py └── settings.py ├── evaluation ├── OieEvaluation.py └── __init__.py ├── learning ├── NegativeExampleGenerator.py ├── OieData.py ├── OieInduction.py ├── OieModel.py ├── Optimizers.py ├── __init__.py └── models │ ├── __init__.py │ ├── decoders │ ├── Bilinear.py │ ├── BilinearPlusSP.py │ ├── SelectionalPreferences.py │ └── __init__.py │ └── encoders │ ├── RelationClassifier.py │ └── __init__.py └── processing ├── OiePreprocessor.py └── __init__.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # relation-autoencoder 2 | This is the code used in the paper [Discrete-State Variational Autoencoders for Joint Discovery and Factorization of Relations](https://transacl.org/ojs/index.php/tacl/article/viewFile/761/190) by Diego Marcheggiani and Ivan Titov. 3 | 4 | If you use this code, please cite us. 
5 | 6 | Dependencies 7 | ----------- 8 | - [theano](http://deeplearning.net/software/theano/) 9 | - [numpy](http://www.numpy.org/) 10 | - [scipy](https://www.scipy.org/) 11 | - [nltk](http://www.nltk.org/) 12 | 13 | 14 | Data Processing 15 | -------------- 16 | To run the model, the first thing to do is create a dataset. 17 | You need a file like data-sample.txt. 18 | The file must be tab-separated, with the following fields: 19 | 20 | lexicalized dependency path between the arguments (entities) of the relation 21 | first entity 22 | second entity 23 | entity types of the first and second entity 24 | trigger word 25 | id of the sentence 26 | raw sentence 27 | POS tags of the entire sentence 28 | relation between the two entities, if any (used only for evaluation) 29 | 30 | 31 | In order to create the dataset, run the OiePreprocessor.py script once for each dataset partition: train, dev, and test. 32 |

33 | python processing/OiePreprocessor.py --batch-name train data-sample.txt sample.pk 
34 | python processing/OiePreprocessor.py --batch-name dev data-sample.txt sample.pk
35 | python processing/OiePreprocessor.py --batch-name test data-sample.txt sample.pk
36 | 
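The resulting pickle stores four objects in a fixed order, which is the order in which `loadData` (in learning/OieInduction.py and evaluation/OieEvaluation.py) reads them back. Below is a minimal inspection sketch, assuming Python 2 and the repository root on your PYTHONPATH (the pickle contains a FeatureLexicon instance from processing/OiePreprocessor.py, so that module must be importable when unpickling):

```python
# Minimal inspection sketch (assumes Python 2, the repository root on PYTHONPATH,
# and that sample.pk was produced by the three commands above).
import pickle

with open('sample.pk', 'rb') as pklFile:
    featureExtrs = pickle.load(pklFile)      # feature extractors used when the dataset was built
    relationLexicon = pickle.load(pklFile)   # FeatureLexicon: feature string <-> id mappings
    data = pickle.load(pklFile)              # dict of OieExample lists keyed by batch name
    goldStandard = pickle.load(pklFile)      # gold relation labels per partition (evaluation only)

print data.keys()                            # e.g. the batch names added above
print len(data['train']), 'training examples'
```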
37 | 38 | Now your dataset, with all the indexed features, is in sample.pk. 39 | 40 | Training Models 41 | ------------ 42 | To train the model, run the OieInduction.py script with all the required arguments: 43 |

44 | python learning/OieInduction.py --pickled_dataset sample.pk --model_name discrete-autoencoder --model AC --optimization 1 --epochs 10 --batch_size 100 --relations_number 10 --negative_samples_number 5 --l2_regularization 0.1 --alpha 0.1 --seed 2 --embed_size 10 --learning_rate 0.1
45 | 
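During training, each epoch prints the training error and, against the gold relations, the B³ scores (precision, recall, F1, F0.5) of the induced clustering, computed on the training set or on the dev and test sets when those batches are present in the pickle. Note that the output locations in definitions/settings.py are empty strings by default: the trained inducer is pickled to settings.models_path under the --model_name you pass, and per-epoch cluster assignments (and, periodically, the posteriors) are written to settings.clusters_path only when settings.debug is False. A minimal example configuration, where the two directory values are placeholders to adapt to your machine:

```python
# definitions/settings.py -- example values (the two directories are placeholders)
models_path = '/path/to/models/'      # saveModel() pickles the trained inducer here
clusters_path = '/path/to/clusters/'  # cluster assignments and posteriors are written here
debug = False                         # with the default (True), clusterings and posteriors are not saved
```

A saved model can later be reloaded with loadModel from learning/OieInduction.py, using the same --model_name string.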
46 | 47 | 48 | For any questions, please drop me a mail at marcheggiani [at] uva [dot] nl. 49 | -------------------------------------------------------------------------------- /definitions/OieExample.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | class OieExample (object): 4 | 5 | def __init__(self, arg1, arg2, features, trigger, relation=''): 6 | self.features = features 7 | self.arg1 = arg1 8 | self.arg2 = arg2 9 | self.relation = relation 10 | self.trigger = trigger 11 | 12 | def setFeatures(self, features): 13 | self.features = features -------------------------------------------------------------------------------- /definitions/OieFeatures.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import nltk 4 | import re, string 5 | import settings 6 | import pickle 7 | 8 | parsing = 0 9 | entities = 1 10 | trig = 2 11 | sentence = 3 12 | pos = 4 13 | docPath = 5 14 | # ======= Relation features ======= 15 | stopwords_list = nltk.corpus.stopwords.words('english') 16 | _digits = re.compile('\d') 17 | def bow(info, arg1, arg2): 18 | return info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 19 | 20 | def bow_clean(info, arg1, arg2): 21 | bow = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 22 | result = [] 23 | tmp = [] 24 | for word in bow: 25 | for pun in string.punctuation: 26 | word = word.strip(pun) 27 | if word != '': 28 | tmp.append(word.lower()) 29 | for word in tmp: 30 | if word not in stopwords_list and not _digits.search(word) and not word[0].isupper(): 31 | result.append(word) 32 | return result 33 | 34 | def before_arg1(info, arg1, arg2): 35 | before = info[sentence][:info[sentence].find(arg1)] 36 | beforeSplit = before.lower().strip().split(' ') 37 | beforeSplit = [word for word in beforeSplit if word not in string.punctuation] 38 | # print beforeSplit 39 | if len(beforeSplit) > 1: 40 | return [beforeSplit[-2], beforeSplit[-1]] 41 | elif len(beforeSplit) == 1: 42 | if beforeSplit[0] != '': 43 | return [beforeSplit[-1]] 44 | else: 45 | return [] 46 | else: 47 | return [] 48 | 49 | 50 | def after_arg2(info, arg1, arg2): 51 | after = info[sentence][info[sentence].rfind(arg2)+len(arg2):] 52 | afterSplit = after.lower().strip().split(' ') 53 | afterSplit = [word for word in afterSplit if word not in string.punctuation] 54 | if len(afterSplit) > 1: 55 | return [a for a in afterSplit[0: 2]] 56 | elif len(afterSplit) == 1: 57 | if afterSplit[0] != '': 58 | return [afterSplit[0]] 59 | else: 60 | return [] 61 | else: 62 | return [] 63 | 64 | def bigrams(info, arg1, arg2): 65 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 66 | tmp = [] 67 | for word in between: 68 | for pun in string.punctuation: 69 | word = word.strip(pun) 70 | if word != '': 71 | tmp.append(word.lower()) 72 | return [x[0]+'_'+x[1] for x in zip(tmp, tmp[1:])] 73 | 74 | def trigrams(info, arg1, arg2): 75 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 76 | tmp = [] 77 | for word in between: 78 | for pun in string.punctuation: 79 | word = word.strip(pun) 80 | if word != '': 81 | tmp.append(word.lower()) 82 | return [x[0]+'_'+x[1]+'_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])] 83 | 84 | def skiptrigrams(info, arg1, arg2): 85 | between = 
info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 86 | tmp = [] 87 | for word in between: 88 | for pun in string.punctuation: 89 | word = word.strip(pun) 90 | if word != '': 91 | tmp.append(word.lower()) 92 | return [x[0]+'_X_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])] 93 | 94 | def skipfourgrams(info, arg1, arg2): 95 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 96 | tmp = [] 97 | for word in between: 98 | for pun in string.punctuation: 99 | word = word.strip(pun) 100 | if word != '': 101 | tmp.append(word.lower()) 102 | return [x[0]+'_X_'+x[2] + '_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] +\ 103 | [x[0]+'_'+x[1]+'_X_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] 104 | 105 | def trigger(info, arg1, arg2): 106 | return info[trig].replace('TRIGGER:', '') 107 | 108 | def entityTypes(info, arg1, arg2): 109 | return info[entities] 110 | 111 | def entity1Type(info, arg1, arg2): 112 | return info[entities].split('-')[0] 113 | 114 | def entity2Type(info, arg1, arg2): 115 | return info[entities].split('-')[1] 116 | 117 | def arg1(info, arg1, arg2): 118 | return arg1 119 | 120 | def arg1_lower(info, arg1, arg2): 121 | return arg1.lower() 122 | 123 | def arg1unigrams(info, arg1, arg2): 124 | return arg1.lower().split() 125 | 126 | def arg2(info, arg1, arg2): 127 | return arg2 128 | 129 | def arg2_lower(info, arg1, arg2): 130 | return arg2.lower() 131 | 132 | def arg2unigrams(info, arg1, arg2): 133 | return arg2.lower().split() 134 | 135 | def lexicalPattern(info, arg1, arg2): 136 | # return info[parsing] 137 | p = info[parsing].replace('->', ' ').replace('<-', ' ').split() 138 | result = [] 139 | for num, x in enumerate(p): 140 | if num % 2 != 0: 141 | result.append(x) 142 | return '_'.join(result) 143 | 144 | def dependencyParsing(info, arg1, arg2): 145 | return info[parsing] 146 | 147 | 148 | def rightDep(info, arg1, arg2): 149 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split() 150 | return ''.join(p[:3]) 151 | 152 | def leftDep(info, arg1, arg2): 153 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split() 154 | return ''.join(p[-3:]) 155 | 156 | def posPatternPath(info, arg1, arg2): 157 | words = info[sentence].split() 158 | postags = info[pos].split() 159 | assert len(postags) == len(words), 'error' 160 | a = [] 161 | for w in xrange(len(words)): 162 | a.append((words[w], postags[w])) 163 | # a = info[4].split() 164 | if a: 165 | # print arg1, words 166 | # print [a.index(item) for item in a if item[0] == arg1.split()[-1]],'aaaaaaa' 167 | beginList = [a.index(item) for item in a if item[0] == arg1.split()[-1]] 168 | # print beginList 169 | endList = [a.index(item) for item in a if item[0] == arg2.split()[0]] 170 | # print endList 171 | if len(beginList) > 0 and len(endList) > 0: 172 | # posPattern = [item[1] for item in a if beginList[0] > a.index(item) > endList[0]] 173 | posPattern = [] 174 | for num, item in enumerate(a): 175 | if beginList[0] < num < endList[0]: 176 | posPattern.append(item[1]) 177 | # print posPattern 178 | return '_'.join(posPattern) 179 | else: 180 | return '' 181 | else: 182 | return '' 183 | 184 | 185 | def getBasicCleanFeatures(): 186 | features = [trigger, entityTypes, arg1_lower, arg2_lower, bow_clean, entity1Type, entity2Type, lexicalPattern, 187 | posPatternPath] 188 | return features 189 | 190 | -------------------------------------------------------------------------------- /definitions/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /definitions/settings.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | models_path = '' 4 | clusters_path = '' 5 | 6 | lda_pairs_path = '' 7 | relations2IdDictionary = '' 8 | 9 | external_embeddings_path = '' 10 | debug = True 11 | 12 | elems_to_visualize = 5 13 | 14 | low = -1.e-3 15 | high = 1.e-3 16 | 17 | -------------------------------------------------------------------------------- /evaluation/OieEvaluation.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import pickle 4 | import math 5 | import argparse 6 | import os 7 | import sys 8 | from processing.OiePreprocessor import FeatureLexicon 9 | 10 | class singleLabelClusterEvaluation: 11 | def __init__(self, referencePath, file, validationPath=''): 12 | self.relations = {} 13 | if file: 14 | if validationPath != '': 15 | self.referenceSets, self.assessableElemSet = self.createValidationReferenceSets(referencePath, 16 | validationPath) 17 | else: 18 | self.referenceSets, self.assessableElemSet = self.createReferenceSets(referencePath) 19 | 20 | else: 21 | self.referenceSets, self.assessableElemSet = self.createReferenceSetsFromData(referencePath) 22 | # print self.referenceSets 23 | # print self.assessableElemSet 24 | 25 | def createResponse(self, response): 26 | self.numberOfElements, self.responseSets = self.createResponseSets(response) 27 | # print self.responseSets 28 | 29 | 30 | 31 | def b3precision(self, response_a, reference_a): 32 | # print response_a.intersection(self.assessableElemSet), 'in precision' 33 | return len(response_a.intersection(reference_a)) / float(len(response_a.intersection(self.assessableElemSet))) 34 | 35 | def b3recall(self, response_a, reference_a): 36 | return len(response_a.intersection(reference_a)) / float(len(reference_a)) 37 | 38 | 39 | 40 | def b3TotalElementPrecision(self): 41 | totalPrecision = 0.0 42 | for c in self.responseSets: 43 | for r in self.responseSets[c]: 44 | if r in self.assessableElemSet: 45 | # print r 46 | totalPrecision += self.b3precision(self.responseSets[c], 47 | self.findCluster(r, self.referenceSets)) 48 | 49 | return totalPrecision / float(len(self.assessableElemSet)) 50 | 51 | def b3TotalElementRecall(self): 52 | totalRecall = 0.0 53 | for c in self.responseSets: 54 | for r in self.responseSets[c]: 55 | if r in self.assessableElemSet: 56 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets)) 57 | 58 | return totalRecall / float(len(self.assessableElemSet)) 59 | 60 | 61 | def b3TotalClusterPrecision(self): 62 | totalPrecision = 0.0 63 | for c in self.responseSets: 64 | for r in self.responseSets[c]: 65 | if r in self.assessableElemSet: 66 | totalPrecision += self.b3precision(self.responseSets[c], 67 | self.findCluster(r, self.referenceSets)) / \ 68 | float(len(self.responseSets)*len(self.responseSets[c])) 69 | return totalPrecision 70 | 71 | def b3TotalClusterRecall(self): 72 | totalRecall = 0.0 73 | for c in self.responseSets: 74 | for r in self.responseSets[c]: 75 | if r in self.assessableElemSet: 76 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets)) / \ 77 | float(len(self.responseSets)*len(self.responseSets[c])) 78 | 79 | return totalRecall 80 | 81 | 82 | def 
createResponseSets(self, response): 83 | responseSets = {} 84 | numElem = 0 85 | for c in response: 86 | if len(response[c]) > 0: 87 | numElem += len(response[c]) 88 | responseSets[c] = set(response[c]) 89 | 90 | return numElem, responseSets 91 | 92 | 93 | 94 | def createReferenceSets(self, referencePath): 95 | with open(referencePath, 'r') as f: 96 | relations = {} 97 | c = 0 98 | for line in f: 99 | lineSplit = line.split('\t') 100 | relations[c] = lineSplit[-1].strip().split(' ') 101 | c += 1 102 | self.relations = relations 103 | referenceSets = {} 104 | assessableElems = set() 105 | for rel in relations: 106 | if relations[rel][0] != '': 107 | assessableElems.add(rel) 108 | if relations[rel][0] in referenceSets: 109 | referenceSets[relations[rel][0]].add(rel) 110 | else: 111 | referenceSets[relations[rel][0]] = set([rel]) 112 | return referenceSets, assessableElems 113 | 114 | def createValidationReferenceSets(self, referencePath, validationPath): 115 | # referencePath is usually the entire training set 116 | with open(referencePath, 'r') as f, open(validationPath, 'r') as f1: 117 | validationSet = {} 118 | for line in f1: 119 | if line not in validationSet: 120 | validationSet[line] = 1 121 | 122 | relations = {} 123 | c = 0 124 | for line in f: 125 | if line in validationSet: 126 | lineSplit = line.split('\t') 127 | relations[c] = lineSplit[-1].strip().split(' ') 128 | else: 129 | relations[c] = [''] 130 | c += 1 131 | # self.relationsValid = relations 132 | referenceSets = {} 133 | assessableElems = set() 134 | for rel in relations: 135 | if relations[rel][0] != '': 136 | assessableElems.add(rel) 137 | if relations[rel][0] in referenceSets: 138 | referenceSets[relations[rel][0]].add(rel) 139 | else: 140 | referenceSets[relations[rel][0]] = set([rel]) 141 | return referenceSets, assessableElems 142 | 143 | 144 | def createReferenceSetsFromData(self, relations): 145 | self.relations = relations 146 | referenceSets = {} 147 | assessableElems = set() 148 | for rel in relations: 149 | if relations[rel][0] != '': 150 | # print 'category', category 151 | assessableElems.add(rel) 152 | if relations[rel][0] in referenceSets: 153 | referenceSets[relations[rel][0]].add(rel) 154 | else: 155 | referenceSets[relations[rel][0]] = set([rel]) 156 | return referenceSets, assessableElems 157 | 158 | def findCluster(self, a, setsDictionary): 159 | foundClusters = [] 160 | for c in setsDictionary: 161 | if a in setsDictionary[c]: 162 | return setsDictionary[c] 163 | # foundClusters.append(setsDictionary[c]) 164 | # return foundClusters 165 | 166 | def muc3Recall(self): 167 | numerator = 0.0 168 | denominator = 0.0 169 | for c in self.referenceSets: 170 | numerator += len(self.referenceSets[c]) - self.overlap(self.referenceSets[c], self.responseSets) 171 | denominator += len(self.referenceSets[c]) - 1 172 | if denominator == 0.0: 173 | return 0.0 174 | else: 175 | return numerator / denominator 176 | 177 | def muc3Precision(self): 178 | numerator = 0.0 179 | denominator = 0.0 180 | for c in self.responseSets: 181 | if len(self.responseSets[c]) > 0: 182 | # print self.lenAssessableResponseCat(self.responseSets[c]), self.overlap(self.responseSets[c], self.referenceSets) 183 | numerator += self.lenAssessableResponseCat(self.responseSets[c]) - self.overlap(self.responseSets[c], self.referenceSets) 184 | lenRespo = self.lenAssessableResponseCat(self.responseSets[c]) 185 | if lenRespo != 0: 186 | denominator += self.lenAssessableResponseCat(self.responseSets[c]) - 1 187 | if denominator == 0.0: 188 | return 
0.0 189 | else: 190 | return numerator / denominator 191 | 192 | def overlap(self, a, setsDictionary): 193 | numberIntersections = 0 194 | for c in setsDictionary: 195 | if len(a.intersection(setsDictionary[c])) > 0: 196 | numberIntersections += 1 197 | return numberIntersections 198 | 199 | 200 | def lenAssessableResponseCat(self, responesSet_c): 201 | length = 0 202 | for r in responesSet_c: 203 | if r in self.assessableElemSet: 204 | length += 1 205 | return length 206 | 207 | def printEvaluation(self, validOrTrain): 208 | 209 | 210 | recB3 = self.b3TotalElementRecall() 211 | precB3 = self.b3TotalElementPrecision() 212 | betasquare = math.pow(0.5, 2) 213 | if recB3 == 0.0 and precB3 == 0.0: 214 | F1B3 = 0.0 215 | F05B3 = 0.0 216 | else: 217 | betasquare = math.pow(0.5, 2) 218 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3) 219 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3) 220 | 221 | print validOrTrain, ' Elementwise B3 F1 =', F1B3, 'F0.5 =', F05B3, 'B3 recall =', recB3, 'B3 precision =', precB3 222 | 223 | 224 | 225 | 226 | def getF05(self): 227 | recB3 = self.b3TotalElementRecall() 228 | precB3 = self.b3TotalElementPrecision() 229 | betasquare = math.pow(0.5, 2) 230 | if recB3 == 0.0 and precB3 == 0.0: 231 | F05B3 = 0.0 232 | else: 233 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3) 234 | return F05B3 235 | 236 | def getF1(self): 237 | recB3 = self.b3TotalElementRecall() 238 | precB3 = self.b3TotalElementPrecision() 239 | 240 | if recB3 == 0.0 and precB3 == 0.0: 241 | F1B3 = 0.0 242 | else: 243 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3) 244 | return F1B3 245 | 246 | def loadData(pickled_dataset): 247 | 248 | if not os.path.exists(pickled_dataset): 249 | print "Pickled dataset not found" 250 | sys.exit() 251 | 252 | pklFile = open(pickled_dataset, 'rb') 253 | 254 | featureExtrs = pickle.load(pklFile) 255 | 256 | relationLexicon = pickle.load(pklFile) 257 | 258 | data = pickle.load(pklFile) 259 | 260 | goldStandard = pickle.load(pklFile) 261 | 262 | pklFile.close() 263 | 264 | 265 | return goldStandard 266 | 267 | def getCommandArgs(): 268 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model') 269 | 270 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True, 271 | help='the pickled dataset file (produced by OiePreprocessor.py)') 272 | parser.add_argument('--pickled_results', metavar='pickled_results', nargs='?', required=True, 273 | help='the pickled results file (produced by OiePreprocessor.py)') 274 | 275 | 276 | return parser.parse_args() 277 | 278 | 279 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /learning/NegativeExampleGenerator.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import numpy as np 5 | 6 | 7 | class NegativeExampleGenerator(object): 8 | def __init__(self, rand, negSamplingCum): 9 | self._rand = rand 10 | self._negSamplingCum = negSamplingCum 11 | # self._neg2SamplingCum = neg2SamplingCum 12 | # self._negSamplingDistrPower = negSamplingDistrPower 13 | # self._compute_unigram_distribution() 14 | 15 | def _univariate_distr_sample(self, sampleSize=1): 16 | return 
[self._negSamplingCum.searchsorted(self._rand.uniform(0, self._negSamplingCum[-1])) 17 | for i in xrange(0, sampleSize)] 18 | 19 | def generate_random_negative_example(self, positiveArgs, negativeExampleNum): 20 | l = positiveArgs.shape[0] # number of positive instances 21 | n = negativeExampleNum # number of negative examples generated per instance 22 | 23 | negativeArgs = np.zeros((n, l), dtype=np.int32) 24 | for instance_idx in xrange(l): 25 | samples = self._univariate_distr_sample(n) 26 | for negNum_idx in xrange(n): 27 | negativeArgs[negNum_idx, instance_idx] = samples[negNum_idx] 28 | return negativeArgs 29 | -------------------------------------------------------------------------------- /learning/OieData.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import math as m 5 | import numpy as np 6 | import scipy.sparse as sp 7 | import theano 8 | from definitions import settings 9 | import cPickle as pickle 10 | 11 | class MatrixDataSet: 12 | # matrix formatted dataset 13 | def __init__(self, arguments1, arguments2, argFeatures, negArgs1, negArgs2): 14 | self.args1 = arguments1 # (l) 15 | self.args2 = arguments2 # (l) 16 | self.xFeats = argFeatures # (l, h) 17 | self.neg1 = negArgs1 # (n, l) 18 | self.neg2 = negArgs2 # (n, l) 19 | 20 | 21 | class MatrixDataSetNoEncoding: 22 | # matrix formatted dataset 23 | def __init__(self, arguments1, arguments2, realProbs): 24 | self.args1 = arguments1 # (l) 25 | self.args2 = arguments2 # (l) 26 | self.realProbs = realProbs # (l, r) 27 | 28 | 29 | 30 | 31 | 32 | class DataSetManager: 33 | def __init__(self, oieDataset, featureLex, rng, negSamplesNum, relationNum, negSamplingDistrPower=0.75): 34 | 35 | self.negSamplesNum = negSamplesNum # the number of negative samples considered 36 | 37 | self.negSamplingDistrPower = negSamplingDistrPower # the sampling distribution for negative sampling 38 | 39 | self.rng = rng 40 | 41 | self.relationNum = relationNum 42 | 43 | # id2Str, str2Id 44 | self.featureLex = featureLex 45 | 46 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr 47 | self._extractArgsMappings(oieDataset) 48 | 49 | # each examples csr_matrix[exampleNum x getDimensionality()], labels are numpy.array 50 | 51 | 52 | # self.validExs = self._extractExamples(oieDataset['dev']) 53 | 54 | self.trainExs = self._extractExamples(oieDataset['train']) 55 | if 'dev' in oieDataset: 56 | self.validExs = self._extractExamples(oieDataset['dev']) 57 | else: 58 | self.validExs = None 59 | 60 | if 'test' in oieDataset: 61 | self.testExs = self._extractExamples(oieDataset["test"]) 62 | else: 63 | self.testExs = None 64 | 65 | def _sample(self, cutoffs): 66 | idx = cutoffs.searchsorted(self.rng.uniform(0, cutoffs[-1])) 67 | return idx 68 | 69 | 70 | def _sample1(self, distr): 71 | 72 | # check numpy, it should have some efficient ways to sample from multinomials 73 | val = self.rng.uniform() 74 | pos = 0 75 | for idx in xrange(len(distr)): 76 | pos += distr[idx] 77 | if pos > val: 78 | return idx 79 | return len(distr) - 1 80 | 81 | 82 | def _extractExamples(self, oieExamples): 83 | 84 | l = len(oieExamples) 85 | n = self.negSamplesNum 86 | 87 | args1 = np.zeros(l, dtype=np.int32) # 88 | args2 = np.zeros(l, dtype=np.int32) # 89 | 90 | 91 | neg1 = np.zeros((n, l), dtype=np.int32) # 92 | neg2 = np.zeros((n, l), dtype=np.int32) # 93 | 94 | 95 | # print self.featureLex.getDimensionality() 96 | xFeatsDok = sp.dok_matrix((l, self.featureLex.getDimensionality()), 
dtype=theano.config.floatX) 97 | # @UndefinedVariable float32 98 | 99 | for i, oieEx in enumerate(oieExamples): 100 | args1[i] = self.arg2Id[oieEx.arg1] 101 | args2[i] = self.arg2Id[oieEx.arg2] 102 | 103 | for feat in oieEx.features: 104 | xFeatsDok[i, feat] = 1 105 | 106 | # should do it differently (sample random indexes during training), see below 107 | 108 | for k in xrange(n): 109 | neg1[k, i] = self._sample(self.negSamplingCum) 110 | 111 | for k in xrange(n): 112 | neg2[k, i] = self._sample(self.negSamplingCum) 113 | 114 | 115 | 116 | xFeats = sp.csr_matrix(xFeatsDok, dtype="float32") 117 | 118 | return MatrixDataSet(args1, args2, xFeats, neg1, neg2) 119 | 120 | def _indexElements(self, elements): 121 | 122 | idx = 0 123 | id2Elem = {} 124 | elem2Id = {} 125 | for x in elements: 126 | id2Elem[idx] = x 127 | elem2Id[x] = idx 128 | idx += 1 129 | return id2Elem, elem2Id 130 | 131 | def _extractArgsMappings(self, oieDataset): 132 | 133 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr 134 | argFreqs = {} 135 | for key in oieDataset: 136 | for oieEx in oieDataset[key]: # here it iterates over train, test, dev. 137 | if oieEx.arg1 not in argFreqs: 138 | argFreqs[oieEx.arg1] = 1 139 | else: 140 | argFreqs[oieEx.arg1] += 1 141 | 142 | if oieEx.arg2 not in argFreqs: 143 | argFreqs[oieEx.arg2] = 1 144 | else: 145 | argFreqs[oieEx.arg2] += 1 146 | 147 | 148 | 149 | self.id2Arg, self.arg2Id = self._indexElements(argFreqs) 150 | 151 | 152 | argSampFreqs = [float(argFreqs[self.id2Arg[val]]) for val in xrange(len(self.id2Arg))] 153 | argSampFreqsPowered = map(lambda x: m.pow(x, self.negSamplingDistrPower), argSampFreqs) 154 | norm1 = reduce(lambda x, y: x + y, argSampFreqsPowered) 155 | self.negSamplingDistr = map(lambda x: x / norm1, argSampFreqsPowered) 156 | self.negSamplingCum = np.cumsum(self.negSamplingDistr) 157 | 158 | 159 | 160 | 161 | def getArgVocSize(self): 162 | return len(self.arg2Id) 163 | 164 | 165 | def getDimensionality(self): 166 | return self.featureLex.getDimensionality() 167 | 168 | def getNegNum(self): 169 | return self.negSamplesNum 170 | 171 | def getTrainSet(self): 172 | return self.trainExs 173 | 174 | def getValidSet(self): 175 | return self.validExs 176 | 177 | def getTestSet(self): 178 | return self.testExs 179 | 180 | def getRelationNum(self): 181 | return self.relationNum 182 | 183 | def getExampleFeatures(self, id): 184 | a = [] 185 | for e in self.trainExs.xFeats[id].nonzero()[1]: 186 | feat = self.featureLex.getStrPruned(e) 187 | if (self.featureLex.getStrPruned(e).find('trigger') > -1 or 188 | self.featureLex.getStrPruned(e).find('arg1') > -1 or 189 | self.featureLex.getStrPruned(e).find('arg2') > -1): 190 | a.append(feat) 191 | # else: # only for debugging purposes, should be commented 192 | # a.append(feat) 193 | return a 194 | 195 | def getExampleFeature(self, id, feature): 196 | for e in self.trainExs.xFeats[id].nonzero()[1]: 197 | feat = self.featureLex.getStrPruned(e) 198 | if self.featureLex.getStrPruned(e).find(feature) > -1: 199 | return feat 200 | return None 201 | 202 | def getExampleFeatureValid(self, id, feature): 203 | for e in self.validExs.xFeats[id].nonzero()[1]: 204 | feat = self.featureLex.getStrPruned(e) 205 | if self.featureLex.getStrPruned(e).find(feature) > -1: 206 | return feat 207 | return None 208 | 209 | def getExampleFeatureTest(self, id, feature): 210 | for e in self.testExs.xFeats[id].nonzero()[1]: 211 | feat = self.featureLex.getStrPruned(e) 212 | if self.featureLex.getStrPruned(e).find(feature) > 
-1: 213 | return feat 214 | return None 215 | 216 | def getNegSamplingCum(self): 217 | return self.negSamplingCum 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /learning/OieInduction.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import argparse 4 | import os 5 | 6 | import numpy as np 7 | 8 | import sys 9 | import time 10 | import cPickle as pickle 11 | import operator 12 | from theano import sparse 13 | import theano 14 | import theano.tensor as T 15 | from learning.OieModel import OieModelFunctions 16 | 17 | from learning.OieData import DataSetManager 18 | from learning.OieData import MatrixDataSet 19 | from processing.OiePreprocessor import FeatureLexicon 20 | from evaluation.OieEvaluation import singleLabelClusterEvaluation 21 | import definitions.settings as settings 22 | from learning.NegativeExampleGenerator import NegativeExampleGenerator 23 | from collections import OrderedDict 24 | 25 | class ReconstructInducer(object): 26 | 27 | def __init__(self, data, goldStandard, rand, epochNum, learningRate, batchSize, embedSize, lambdaL1, lambdaL2, 28 | optimization, modelName, model, fixedSampling, extEmb, extendedReg, 29 | frequentEval, alpha): 30 | self.rand = rand 31 | self.data = data 32 | self.goldStandard = goldStandard 33 | self.optimization = optimization 34 | self.modelName = modelName 35 | self.model = model 36 | self.relationNum = data.getRelationNum() 37 | self.extEmb = extEmb 38 | self.extendedReg = extendedReg 39 | self.frequentEval = frequentEval 40 | self.alpha = alpha 41 | 42 | self.modelID = model + '_' + modelName+'_maxepoch'+str(epochNum)+'_lr'+str(learningRate)\ 43 | + '_embedsize' + str(embedSize) + '_l1' + str(lambdaL1) + '_l2' + str(lambdaL2) \ 44 | + '_opt' + str(optimization) + '_rel_num' + str(self.relationNum)+ \ 45 | '_batch' + str(batchSize) + '_negs' + str(data.negSamplesNum) 46 | 47 | self.modelFunc = OieModelFunctions(rand, data.getDimensionality(), embedSize, self.relationNum, 48 | data.getArgVocSize(), model, self.data, self.extEmb, self.extendedReg, 49 | self.alpha) 50 | 51 | self.embedSize = embedSize 52 | self.epochNum = epochNum 53 | self.learningRate = learningRate 54 | self.batchSize = batchSize 55 | self.lambdaL1 = lambdaL1 56 | self.lambdaL2 = lambdaL2 57 | self.fixedSampling = fixedSampling 58 | self.negativeSampler = NegativeExampleGenerator(rand, data.getNegSamplingCum()) 59 | self.accumulator = [] 60 | 61 | 62 | 63 | def _makeShared(self, matrixDataset, borrow=True): 64 | 65 | sharedMatrix = MatrixDataSet( 66 | arguments1=theano.shared(matrixDataset.args1, borrow=borrow), 67 | arguments2=theano.shared(matrixDataset.args2, borrow=borrow), 68 | argFeatures=theano.shared(matrixDataset.xFeats, borrow=borrow), 69 | negArgs1=theano.shared(matrixDataset.neg1, borrow=borrow), 70 | negArgs2=theano.shared(matrixDataset.neg2, borrow=borrow) 71 | ) 72 | return sharedMatrix 73 | 74 | 75 | def compileFunction(self, learningRate, epochNum, batchSize, lambda1, lambda2): 76 | 77 | trainDataNP = self.data.getTrainSet() 78 | trainData = self._makeShared(trainDataNP) 79 | 80 | validDataNP = self.data.getValidSet() 81 | 82 | testDataNP = self.data.getTestSet() 83 | 84 | if validDataNP is not None: 85 | validData = self._makeShared(validDataNP) 86 | 87 | if testDataNP is not None: 88 | testData = self._makeShared(testDataNP) 89 | 90 | # build the symbolic computation 91 | 92 | batchIdx = T.lscalar() # index to a [mini]batch 93 | xFeats 
= sparse.csr_matrix(name='x', dtype='float32') # l, h 94 | 95 | args1 = T.ivector() # l 96 | args2 = T.ivector() # l 97 | neg1 = T.imatrix() # n, l 98 | neg2 = T.imatrix() # n, l 99 | 100 | print "Starting to build train err computation (not compiling it yet)" 101 | adjust = float(batchSize) / float(trainDataNP.args1.shape[0]) 102 | 103 | cost = self.modelFunc.buildTrainErrComputation(batchSize, self.data.getNegNum(), 104 | xFeats, args1, args2, neg1, neg2) + \ 105 | (lambda1 * self.modelFunc.L1 * adjust) + \ 106 | (lambda2 * self.modelFunc.L2 * adjust) 107 | 108 | if self.optimization == 1: 109 | from learning.Optimizers import AdaGrad 110 | ada = AdaGrad(self.modelFunc.params) 111 | updates = ada.update(self.learningRate, self.modelFunc.params, cost) 112 | if False: 113 | adaEncoder = AdaGrad(self.modelFunc.relationClassifiers.params) 114 | updatesEncoder = adaEncoder.update(self.learningRate, self.modelFunc.relationClassifiers.params, cost) 115 | 116 | adaDecoder = AdaGrad(self.modelFunc.argProjector.params) 117 | updatesDecoder = adaDecoder.update(self.learningRate, self.modelFunc.argProjector.params, cost) 118 | 119 | elif self.optimization == 0: 120 | from learning.Optimizers import SGD 121 | sgd = SGD() 122 | updates = sgd.update(self.learningRate, self.modelFunc.params, cost) 123 | 124 | 125 | 126 | print "Compiling train function..." 127 | 128 | 129 | 130 | trainModel = theano.function(inputs=[batchIdx, neg1, neg2], 131 | outputs=cost, 132 | updates=updates, 133 | givens={ 134 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 135 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 136 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 137 | } 138 | ) 139 | if False: 140 | trainEncoder = theano.function(inputs=[batchIdx, neg1, neg2], 141 | outputs=cost, 142 | updates=updatesEncoder, 143 | givens={ 144 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 145 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 146 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 147 | } 148 | ) 149 | trainDecoder = theano.function(inputs=[batchIdx, neg1, neg2], 150 | outputs=cost, 151 | updates=updatesDecoder, 152 | givens={ 153 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 154 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 155 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 156 | } 157 | ) 158 | 159 | prediction = self.modelFunc.buildLabelComputation(batchSize, xFeats) 160 | 161 | print "Compiling label function (for training)..." 162 | labelTrain = theano.function(inputs=[batchIdx], 163 | outputs=prediction, 164 | updates=[], 165 | givens={ 166 | xFeats: trainData.xFeats[batchIdx * batchSize:(batchIdx + 1) * batchSize]}) 167 | 168 | if validDataNP is not None: 169 | print "Compiling label function (for validation)..." 170 | labelValid = theano.function(inputs=[batchIdx], 171 | outputs=prediction, 172 | updates=[], 173 | givens={xFeats: validData.xFeats[batchIdx * batchSize: 174 | (batchIdx + 1) * batchSize]}) 175 | if testDataNP is not None: 176 | print "Compiling label function (for test)..." 177 | labelTest = theano.function(inputs=[batchIdx], 178 | outputs=prediction, 179 | updates=[], 180 | givens={xFeats: testData.xFeats[batchIdx * batchSize: 181 | (batchIdx + 1) * batchSize]}) 182 | 183 | 184 | print "Done with compiling function." 
185 | if validDataNP is not None and testDataNP is not None: 186 | 187 | return trainModel, labelTest, labelValid 188 | else: 189 | if False: 190 | return trainEncoder, trainDecoder, labelTrain 191 | else: 192 | return trainModel, labelTrain 193 | 194 | def learn(self): 195 | trainDataNP = self.data.getTrainSet() 196 | validDataNP = self.data.getValidSet() 197 | testDataNP = self.data.getTestSet() 198 | 199 | print "Starting to compile functions" 200 | 201 | 202 | if validDataNP is not None and testDataNP is not None: 203 | trainModel, labelTest, labelValid = self.compileFunction(self.learningRate, self.epochNum, 204 | self.batchSize, self.lambdaL1, self.lambdaL2) 205 | else: 206 | if False: 207 | trainEncoder, trainDecoder, labelTrain = self.compileFunction(self.learningRate, self.epochNum, 208 | self.batchSize, self.lambdaL1, self.lambdaL2) 209 | else: 210 | trainModel, labelTrain = self.compileFunction(self.learningRate, self.epochNum, 211 | self.batchSize, self.lambdaL1, self.lambdaL2) 212 | 213 | 214 | ############### 215 | # TRAIN MODEL # 216 | ############### 217 | 218 | # compute number of minibatches for training, validation and testing 219 | trainBatchNum = trainDataNP.args1.shape[0] / self.batchSize 220 | 221 | if validDataNP is not None and testDataNP is not None: 222 | validBatchNum = validDataNP.args1.shape[0] / self.batchSize 223 | validEval = singleLabelClusterEvaluation(self.goldStandard['dev'], False) 224 | 225 | testBatchNum = testDataNP.args1.shape[0] / self.batchSize 226 | testEval = singleLabelClusterEvaluation(self.goldStandard['test'], False) 227 | else: 228 | trainEval = singleLabelClusterEvaluation(self.goldStandard['train'], False) 229 | 230 | print str(trainBatchNum * self.batchSize) + " training examples, " 231 | # print trainDataNP.args1.shape[0], self.batchSize, trainBatchNum 232 | print '... 
training the model' 233 | startTime = time.clock() 234 | 235 | doneLooping = False 236 | epoch = 0 237 | 238 | 239 | while (epoch < self.epochNum) and (not doneLooping): 240 | negativeSamples1 = self.negativeSampler.generate_random_negative_example(trainDataNP.args1, 241 | self.data.getNegNum()) 242 | negativeSamples2 = self.negativeSampler.generate_random_negative_example(trainDataNP.args2, 243 | self.data.getNegNum()) 244 | 245 | err = 0 246 | epochStartTime = time.clock() 247 | 248 | epoch += 1 249 | print '\nEPOCH ' + str(epoch) 250 | for idx in xrange(trainBatchNum): 251 | if not self.fixedSampling: 252 | neg1 = negativeSamples1[:, idx * self.batchSize: (idx + 1) * self.batchSize] 253 | neg2 = negativeSamples2[:, idx * self.batchSize: (idx + 1) * self.batchSize] 254 | else: 255 | neg1 = trainDataNP.neg1[:, idx * self.batchSize: (idx + 1) * self.batchSize] 256 | neg2 = trainDataNP.neg2[:, idx * self.batchSize: (idx + 1) * self.batchSize] 257 | 258 | 259 | ls = trainModel(idx, neg1, neg2) 260 | err += ls 261 | 262 | # self.modelFunc.argProjector.normalize() 263 | # print('.'), 264 | if self.frequentEval: 265 | if validDataNP is not None and testDataNP is not None: 266 | if idx % 1 == 0: 267 | print(str(idx * batchSize)), 268 | print idx, '############################################################' 269 | validCluster = self.getClustersSets(labelValid, validBatchNum) 270 | validEval.createResponse(validCluster) 271 | validEval.printEvaluation('Validation') 272 | 273 | testCluster = self.getClustersSets(labelTest, testBatchNum) 274 | testEval.createResponse(testCluster) 275 | testEval.printEvaluation('Test') 276 | else: 277 | print(str(idx * batchSize)), 278 | print idx, '############################################################' 279 | trainClusters = self.getClustersPopulation(labelTrain, trainBatchNum) 280 | print trainClusters 281 | print 282 | 283 | 284 | epochEndTime = time.clock() 285 | 286 | print 'Training error ', str(err) 287 | print "Epoch time = " + str(epochEndTime - epochStartTime) 288 | 289 | if validDataNP is None or testDataNP is None: 290 | print 'Training Set' 291 | # print labelTrain(1)[1] 292 | trainClusters = self.getClustersSets(labelTrain, trainBatchNum) 293 | posteriorsTrain = [labelTrain(i)[1] for i in xrange(trainBatchNum)] 294 | trainPosteriors = [item for sublist in posteriorsTrain for item in sublist] 295 | # for p, probs in enumerate(predictions): 296 | # print p, probs 297 | trainEval.createResponse(trainClusters) 298 | if self.modelName != 'Test': 299 | trainEval.printEvaluation('Training') 300 | 301 | if self.modelName == 'Test': 302 | self.getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize) 303 | else: 304 | getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize) 305 | if not settings.debug: 306 | pickleClustering(trainClusters, self.modelID+'_epoch'+str(epoch)) 307 | if epoch % 5 == 0 and epoch > 0: 308 | picklePosteriors(trainPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)) 309 | 310 | if validDataNP is not None and testDataNP is not None: 311 | 312 | validCluster = self.getClustersSets(labelValid, validBatchNum) 313 | posteriorsValid = [labelValid(i)[1] for i in xrange(validBatchNum)] 314 | validPosteriors = [item for sublist in posteriorsValid for item in sublist] 315 | validEval.createResponse(validCluster) 316 | validEval.printEvaluation('Validation') 317 | getClustersWithFrequenciesValid(validCluster, self.data, settings.elems_to_visualize) 318 | if not settings.debug: 319 | 
pickleClustering(validCluster, self.modelID+'_epoch'+str(epoch)+'_valid') 320 | if epoch % 5 == 0 and epoch > 0: 321 | picklePosteriors(validPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_valid') 322 | 323 | testCluster = self.getClustersSets(labelTest, testBatchNum) 324 | posteriorsTest = [labelTest(i)[1] for i in xrange(testBatchNum)] 325 | testPosteriors = [item for sublist in posteriorsTest for item in sublist] 326 | testEval.createResponse(testCluster) 327 | testEval.printEvaluation('Test') 328 | getClustersWithFrequenciesTest(testCluster, self.data, settings.elems_to_visualize) 329 | if not settings.debug: 330 | pickleClustering(testCluster, self.modelID+'_epoch'+str(epoch)+'_test') 331 | if epoch % 5 == 0 and epoch > 0: 332 | picklePosteriors(testPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_test') 333 | 334 | 335 | endTime = time.clock() 336 | print 'Optimization complete' 337 | print 'The code run for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (endTime - startTime)) 338 | print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + 339 | ' ran for %.1fs' % ((endTime - startTime))) 340 | 341 | 342 | 343 | 344 | def getClustersSets(self, labelTrain, trainBatchNum): 345 | clusters = {} 346 | for i in xrange(self.relationNum): 347 | clusters[i] = set() 348 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 349 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 350 | for j in xrange(len(predictions)): 351 | clusters[predictions[j]].add(j) 352 | return clusters 353 | 354 | def getClustersPopulation(self, labelTrain, trainBatchNum): 355 | clusters = {} 356 | for i in xrange(self.relationNum): 357 | clusters[i] = 0 358 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 359 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 360 | for j in xrange(len(predictions)): 361 | clusters[predictions[j]] += 1 362 | return clusters 363 | 364 | def getClusters(self, labelTrain, trainBatchNum, train_dev): 365 | clusters = {} 366 | for i in xrange(self.relationNum): 367 | clusters[i] = [] 368 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 369 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 370 | for j in xrange(len(predictions)): 371 | clusters[predictions[j]].append(self.data.getExampleRelation(j, train_dev)) 372 | return clusters 373 | 374 | 375 | def getClusteredFreq(self, clusters): 376 | clustFreq = {} 377 | for i in xrange(self.relationNum): 378 | clustFreq[i] = {} 379 | j = 0 380 | for c in clusters: 381 | for feat in clusters[c]: 382 | if feat in clustFreq[j]: 383 | clustFreq[j][feat] += 1 384 | else: 385 | clustFreq[j][feat] = 1 386 | clustFreq[j] = sorted(clustFreq[j].iteritems(), key=operator.itemgetter(1), reverse=True) 387 | j += 1 388 | return clustFreq 389 | 390 | def printFirstK(self, k, clusterFreq): 391 | for c in clusterFreq: 392 | print clusterFreq[c][:k] 393 | 394 | 395 | def getClustersWithFrequencies(self, clusterSets, data, threshold): 396 | for c in clusterSets: 397 | frequency = {} 398 | print c, 399 | for elem in clusterSets[c]: 400 | trig = self.goldStandard['train'][elem][0] 401 | if trig in frequency: 402 | frequency[trig] += 1 403 | else: 404 | frequency[trig] = 1 405 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 406 | if len(sorted_freq) < threshold: 407 | for el in 
sorted_freq: 408 | print el, 409 | else: 410 | count = 0 411 | for el in sorted_freq: 412 | if count > threshold: 413 | break 414 | else: 415 | print el, 416 | count += 1 417 | print '' 418 | 419 | 420 | def saveModel(model, name): 421 | pklProtocol = 2 422 | pklFile = open(settings.models_path + name, 'wb') 423 | pickle.dump(model, pklFile, protocol=pklProtocol) 424 | 425 | def loadModel(name): 426 | pklFile = open(settings.models_path + name, 'rb') 427 | return pickle.load(pklFile) 428 | 429 | def loadData(args, rng, negativeSamples, relationNum, modelType): 430 | 431 | if not os.path.exists(args.pickled_dataset): 432 | print "Pickled dataset not found" 433 | sys.exit() 434 | 435 | tStart = time.time() 436 | print "Found existing pickled dataset, loading...", 437 | 438 | pklFile = open(args.pickled_dataset, 'rb') 439 | 440 | featureExtrs = pickle.load(pklFile) 441 | 442 | relationLexicon = pickle.load(pklFile) 443 | 444 | data = pickle.load(pklFile) 445 | 446 | goldStandard = pickle.load(pklFile) 447 | 448 | pklFile.close() 449 | tEnd = time.time() 450 | print "Done (" + str(tEnd - tStart) + "s.)" 451 | 452 | trigs = False 453 | 454 | 455 | indexedDataset = DataSetManager(data, relationLexicon, rng, negativeSamples, relationNum, trigs) 456 | 457 | print "Produced indexed dataset" 458 | 459 | return indexedDataset, goldStandard 460 | 461 | def pickleClustering(clustering, clusteringName): 462 | pklProtocol = 2 463 | pklFile = open(settings.clusters_path + clusteringName, 'wb') 464 | pickle.dump(clustering, pklFile, protocol=pklProtocol) 465 | 466 | 467 | def picklePosteriors(posteriors, posteriorsName): 468 | pklProtocol = 2 469 | pklFile = open(settings.clusters_path + posteriorsName, 'wb') 470 | pickle.dump(posteriors, pklFile, protocol=pklProtocol) 471 | 472 | def getClustersWithInfo(clusterSets, data, threshold): 473 | for c in clusterSets: 474 | print c, 475 | if len(clusterSets[c]) < threshold: 476 | for elem in clusterSets[c]: 477 | print elem, data.getExampleFeatures(elem), 478 | else: 479 | count = 0 480 | for elem in clusterSets[c]: 481 | if count > threshold: 482 | break 483 | else: 484 | print elem, data.getExampleFeatures(elem), 485 | count += 1 486 | print '' 487 | 488 | 489 | def getClustersWithFrequencies(clusterSets, data, threshold): 490 | for c in clusterSets: 491 | frequency = {} 492 | print c, 493 | for elem in clusterSets[c]: 494 | trig = data.getExampleFeature(elem, 'trigger') 495 | if trig is not None: 496 | trig = trig.replace('trigger#', '') 497 | if trig in frequency: 498 | frequency[trig] += 1 499 | else: 500 | frequency[trig] = 1 501 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 502 | if len(sorted_freq) < threshold: 503 | for el in sorted_freq: 504 | print el, 505 | else: 506 | count = 0 507 | for el in sorted_freq: 508 | if count > threshold: 509 | break 510 | else: 511 | print el, 512 | count += 1 513 | print '' 514 | 515 | 516 | def getClustersWithFrequenciesValid(clusterSets, data, threshold): 517 | for c in clusterSets: 518 | frequency = {} 519 | print c, 520 | for elem in clusterSets[c]: 521 | trig = data.getExampleFeatureValid(elem, 'trigger') 522 | if trig is not None: 523 | trig = trig.replace('trigger#', '') 524 | if trig in frequency: 525 | frequency[trig] += 1 526 | else: 527 | frequency[trig] = 1 528 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 529 | if len(sorted_freq) < threshold: 530 | for el in sorted_freq: 531 | print el, 532 | else: 533 | count = 0 534 | for el in 
sorted_freq: 535 | if count > threshold: 536 | break 537 | else: 538 | print el, 539 | count += 1 540 | print '' 541 | 542 | 543 | def getClustersWithFrequenciesTest(clusterSets, data, threshold): 544 | for c in clusterSets: 545 | frequency = {} 546 | print c, 547 | for elem in clusterSets[c]: 548 | trig = data.getExampleFeatureTest(elem, 'trigger') 549 | if trig is not None: 550 | trig = trig.replace('trigger#', '') 551 | if trig in frequency: 552 | frequency[trig] += 1 553 | else: 554 | frequency[trig] = 1 555 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 556 | if len(sorted_freq) < threshold: 557 | for el in sorted_freq: 558 | print el, 559 | else: 560 | count = 0 561 | for el in sorted_freq: 562 | if count > threshold: 563 | break 564 | else: 565 | print el, 566 | count += 1 567 | print '' 568 | 569 | def getClustersWithRelationLabels(clusterSets, data, evaluation, threshold): 570 | for c in clusterSets: 571 | print c, 572 | if len(clusterSets[c]) < threshold: 573 | for elem in clusterSets[c]: 574 | if evaluation.relations[elem][0] != '': 575 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem], 576 | else: 577 | count = 0 578 | for elem in clusterSets[c]: 579 | if count > threshold: 580 | break 581 | else: 582 | if evaluation.relations[elem][0] != '': 583 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem], 584 | count += 1 585 | print '' 586 | 587 | 588 | def getCommandArgs(): 589 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model') 590 | 591 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True, 592 | help='the pickled dataset file (produced by OiePreprocessor.py)') 593 | 594 | parser.add_argument('--epochs', metavar='epochs', nargs='?', type=int, default=100, 595 | help='maximum number of epochs') 596 | 597 | parser.add_argument('--learning_rate', metavar='learning_rate', nargs='?', type=float, default=0.1, 598 | help='initial learning rate') 599 | 600 | parser.add_argument('--batch_size', metavar='batch_size', nargs='?', type=int, default=50, 601 | help='size of the minibatches') 602 | 603 | parser.add_argument('--embed_size', metavar='embed_size', nargs='?', type=int, default=30, 604 | help='size of the embeddings') 605 | 606 | parser.add_argument('--relations_number', metavar='relations_number', type=int, nargs='?', default=3, 607 | help='number of relations to induce') 608 | 609 | parser.add_argument('--negative_samples_number', metavar='negative_samples_number', nargs='?', type=int, default=5, 610 | help='number of negative samples') 611 | 612 | parser.add_argument('--l1_regularization', metavar='l1_regularization', nargs='?', type=float, default=0.0, 613 | help='lambda value of L1 regularization') 614 | 615 | parser.add_argument('--l2_regularization', metavar='l2_regularization', nargs='?', type=float, default=0.0, 616 | help='lambda value of L2 regularization') 617 | 618 | parser.add_argument('--optimization', metavar='optimization', nargs='?', type=int, default='0', 619 | help='optimization algorithm 0 SGD, 1 ADAGrad, 2 ADADelta. 
Default SGD.') 620 | 621 | parser.add_argument('--model_name', metavar='model_name', nargs='?', required=True, type=str, 622 | help='Name or ID of the model') 623 | 624 | parser.add_argument('--model', metavar='model', nargs='?', type=str, required=True, 625 | help='Model type: choose among A, C, AC.') 626 | 627 | parser.add_argument('--fixed_sampling', metavar='fixed_sampling', nargs='?', default='False', 628 | help='fixed/dynamic sampling switch, default False') 629 | 630 | parser.add_argument('--ext_emb', metavar='ext_emb', nargs='?', default='False', 631 | help='external embeddings, default False') 632 | 633 | parser.add_argument('--extended_reg', metavar='extended_reg', nargs='?', default='False', 634 | help='extended regularization on reconstruction parameters, default False') 635 | 636 | parser.add_argument('--frequent_eval', metavar='frequent_eval', nargs='?', default='False', 637 | help='using frequent evaluation, default False') 638 | 639 | parser.add_argument('--seed', metavar='seed', nargs='?', type=int, default=2, 640 | help='random seed, default 2') 641 | 642 | parser.add_argument('--alpha', metavar='alpha', nargs='?', type=float, default=1.0, 643 | help='alpha coefficient for scaling the entropy term') 644 | 645 | 646 | return parser.parse_args() 647 | 648 | 649 | 650 | 651 | 652 | if __name__ == '__main__': 653 | print "Relation Learner" 654 | 655 | args = getCommandArgs() 656 | print args 657 | rseed = args.seed 658 | rand = np.random.RandomState(seed=rseed) 659 | 660 | 661 | negativeSamples = args.negative_samples_number 662 | numberRelations = args.relations_number 663 | indexedData, goldStandard = loadData(args, rand, negativeSamples, numberRelations, args.model) 664 | 665 | 666 | maxEpochs = args.epochs 667 | learningRate = args.learning_rate 668 | batchSize = args.batch_size 669 | embedSize = args.embed_size 670 | lambdaL1 = args.l1_regularization 671 | lambdaL2 = args.l2_regularization 672 | optimization = args.optimization 673 | modelName = args.model_name 674 | model = args.model 675 | fixedSampling = eval(args.fixed_sampling) 676 | extEmb = eval(args.ext_emb) 677 | extendedReg = eval(args.extended_reg) 678 | frequentEval = eval(args.frequent_eval) 679 | alpha = args.alpha 680 | inducer = ReconstructInducer(indexedData, goldStandard, rand, maxEpochs, learningRate, 681 | batchSize, embedSize, lambdaL1, lambdaL2, optimization, modelName, 682 | model, fixedSampling, extEmb, extendedReg, 683 | frequentEval, alpha) 684 | 685 | 686 | 687 | inducer.learn() 688 | 689 | saveModel(inducer, inducer.modelName) 690 | 691 | -------------------------------------------------------------------------------- /learning/OieModel.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import theano.tensor as T 5 | import theano 6 | from models.encoders.RelationClassifier import IndependentRelationClassifiers 7 | 8 | class OieModelFunctions(object): 9 | 10 | def __init__(self, rng, featureDim, embedSize, relationNum, argVocSize, model, 11 | data, extEmb, extendedReg, alpha): 12 | self.rng = rng 13 | 14 | self.h = featureDim 15 | self.k = embedSize 16 | self.r = relationNum 17 | 18 | self.a = argVocSize 19 | self.model = model 20 | self.relationClassifiers = IndependentRelationClassifiers(rng, featureDim, relationNum) 21 | self.params = self.relationClassifiers.params 22 | self.alpha = alpha 23 | print 'Feature space size =', self.h 24 | print 'Argument vocabulary size =', argVocSize 25 | 26 | self.L1 = 
T.sum(abs(self.relationClassifiers.W)) 27 | 28 | self.L2 = T.sum(T.sqr(self.relationClassifiers.W)) # + T.sum(T.sqr(self.relationClassifiers.Wb)) 29 | 30 | if self.model == 'A': 31 | print 'Bilinear Model' 32 | from models.decoders.Bilinear import Bilinear 33 | 34 | self.argProjector = Bilinear(rng, embedSize, relationNum, self.a, data, extEmb) 35 | self.params += self.argProjector.params 36 | if extendedReg: 37 | self.L1 += T.sum(abs(self.argProjector.C)) 38 | self.L2 += T.sum(T.sqr(self.argProjector.C)) 39 | 40 | elif self.model == 'AC': 41 | print 'Bilinear + Selectional Preferences Model' 42 | from models.decoders.BilinearPlusSP import BilinearPlusSP 43 | 44 | self.argProjector = BilinearPlusSP(rng, embedSize, relationNum, self.a, data, extEmb) 45 | self.params += self.argProjector.params 46 | if extendedReg: 47 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2)) + T.sum(abs(self.argProjector.C)) 48 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2)) + T.sum(T.sqr(self.argProjector.C)) 49 | 50 | 51 | elif self.model == 'C': 52 | print 'Selectional Preferences' 53 | from models.decoders.SelectionalPreferences import SelectionalPreferences 54 | 55 | self.argProjector = SelectionalPreferences(rng, embedSize, relationNum, self.a, data, extEmb) 56 | self.params += self.argProjector.params 57 | if extendedReg: 58 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2)) 59 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2)) 60 | 61 | 62 | 63 | def buildTrainErrComputation(self, batchSize, negNum, xFeats, args1, args2, neg1, neg2): 64 | l = batchSize 65 | n = negNum 66 | 67 | # print xFeats 68 | print "Relation classifiers..." 69 | # relationLabeler.output are probabilities of relations assignment arranged in a tensor [l, r] 70 | relationProbs = self.relationClassifiers.compRelationProbsFunc(xFeats=xFeats) 71 | print "Arg projection..." 72 | 73 | entropy = self.alpha * -T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l] 74 | 75 | if self.model == 'A': 76 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 77 | 78 | 79 | elif self.model == 'AC': 80 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 81 | 82 | 83 | elif self.model == 'C': 84 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 85 | 86 | 87 | resError = -T.mean(allScores) 88 | print "Done with building the graph..." 
89 | # resError = theano.printing.Print("resError ")(resError) 90 | return resError 91 | 92 | 93 | 94 | 95 | def buildLabelComputation(self, batchSize, xFeats): 96 | # xFeats [ l * e, h ] matrix 97 | return self.relationClassifiers.labelFunct(batchSize, xFeats) 98 | 99 | 100 | def buildRelationProbComputation(self, batchSize, xFeats): 101 | return self.relationClassifiers.compRelationProbsFunc(xFeats) 102 | 103 | -------------------------------------------------------------------------------- /learning/Optimizers.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | 7 | 8 | 9 | class AdaGrad(object): 10 | def __init__(self, params): 11 | self.accumulator = [] 12 | for para_i in params: 13 | eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) 14 | self.accumulator.append(theano.shared(eps_p, borrow=True)) 15 | 16 | def update(self, learningRate, params, cost): 17 | print 'AdaGrad takes the floor' 18 | grads = T.grad(cost, params) 19 | updates = [] 20 | for param_i, grad_i, acc_i in zip(params, grads, self.accumulator): 21 | acc = acc_i + T.sqr(grad_i) 22 | updates.append((param_i, param_i - learningRate * grad_i / (T.sqrt(acc)+1e-6))) 23 | updates.append((acc_i, acc)) 24 | return updates 25 | 26 | 27 | class SGD(object): 28 | def update(self, learningRate, params, cost): 29 | print 'SGD takes the floor' 30 | grads = T.grad(cost, params) 31 | updates = [] 32 | for param_i, grad_i in zip(params, grads): 33 | updates.append((param_i, param_i - learningRate * grad_i)) 34 | return updates 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /learning/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /learning/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /learning/models/decoders/Bilinear.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | from collections import OrderedDict 9 | import cPickle as pickle 10 | 11 | class Bilinear(object): 12 | 13 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb): 14 | 15 | self.k = embedSize 16 | self.r = relationNum 17 | self.a = argVocSize 18 | 19 | a = self.a 20 | k = self.k 21 | r = self.r 22 | 23 | 24 | 25 | # KxK matrix for each argument-argument for each relation 26 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX) 27 | 28 | 29 | self.C = theano.shared(value=CNP, name='C') 30 | # self.C = theano.printing.Print("C = ")(self.C) 31 | # argument embeddings 32 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) 33 | 34 | if ex_emb: 35 | import gensim 36 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 37 | for idArg in xrange(self.a): 38 | arg = data.id2Arg[idArg].lower().split(' ') 39 | new = np.zeros(k, dtype=theano.config.floatX) 40 | size = 0 41 | for ar in arg: 42 | if ar in external_embeddings: 43 | new += 
external_embeddings[ar] 44 | size += 1 45 | if size > 0: 46 | ANP[idArg] = new/size 47 | 48 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 49 | 50 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 51 | name='Ab', borrow=True) 52 | 53 | self.updates = OrderedDict({self.A: self.A / T.sqrt(T.sum(T.sqr(self.A), axis=0))}) 54 | self.normalize = theano.function([], [], updates=self.updates) 55 | 56 | # self.params = [self.C, self.A] 57 | self.params = [self.C, self.A, self.Ab] 58 | 59 | 60 | 61 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC): 62 | 63 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 64 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 65 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l] 66 | # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l] 67 | return Asecond 68 | 69 | def negFactorization1(self, batchSize, negEmbA, argsEmbB, wC): 70 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 71 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n] 72 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n] 73 | return Asecond 74 | 75 | def negFactorization2(self, batchSize, argsEmbA, negEmbB, wC): 76 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 77 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 78 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n] 79 | return Asecond 80 | 81 | 82 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 83 | argembed1 = self.A[args1] 84 | argembed2 = self.A[args2] 85 | 86 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) 87 | one = self.factorization(batchSize=l, 88 | argsEmbA=argembed1, 89 | argsEmbB=argembed2, 90 | wC=weightedC) # [l,n] 91 | 92 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 93 | 94 | logScoresP = T.log(T.nnet.sigmoid(u)) 95 | 96 | allScores = logScoresP 97 | allScores = T.concatenate([allScores, entropy, entropy]) 98 | 99 | 100 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 101 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 102 | negOne = self.negFactorization1(batchSize=l, 103 | negEmbA=negembed1, 104 | argsEmbB=argembed2, 105 | wC=weightedC) 106 | 107 | negTwo = self.negFactorization2(batchSize=l, 108 | argsEmbA=argembed1, 109 | negEmbB=negembed2, 110 | wC=weightedC) 111 | 112 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0), 113 | negTwo + self.Ab[neg2].dimshuffle(1, 0)]) 114 | logScores = T.log(T.nnet.sigmoid(-g)) 115 | allScores = T.concatenate([allScores, logScores.flatten()]) 116 | return allScores 117 | 118 | 119 | -------------------------------------------------------------------------------- /learning/models/decoders/BilinearPlusSP.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | import cPickle as pickle 9 | 10 | class BilinearPlusSP(object): 11 | 12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb, ): 13 | 14 | 
self.k = embedSize 15 | self.r = relationNum 16 | self.a = argVocSize 17 | 18 | a = self.a 19 | k = self.k 20 | r = self.r 21 | 22 | 23 | # KxK matrix for each argument-argument for each relation 24 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX) 25 | # @UndefinedVariable 26 | self.C = theano.shared(value=CNP, name='C') 27 | # self.C = theano.printing.Print("C = ")(self.C) 28 | 29 | # Selectional Preferences 30 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 31 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 32 | self.C1 = theano.shared(value=Ca1NP, name='C1') 33 | self.C2 = theano.shared(value=Ca2NP, name='C2') 34 | # argument embeddings 35 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable 36 | 37 | if ex_emb: 38 | import gensim 39 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 40 | 41 | for idArg in xrange(self.a): 42 | arg = data.id2Arg[idArg].lower().split(' ') 43 | new = np.zeros(k, dtype=theano.config.floatX) 44 | size = 0 45 | for ar in arg: 46 | if ar in external_embeddings: 47 | new += external_embeddings[ar] 48 | size += 1 49 | if size > 0: 50 | ANP[idArg] = new/size 51 | 52 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 53 | 54 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 55 | name='Ab', borrow=True) 56 | 57 | self.params = [self.C, self.A, self.Ab, self.C1, self.C2] 58 | 59 | 60 | 61 | 62 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC, wC1, wC2): 63 | # l = batchSize 64 | # k = self.k # embed size 65 | # r = self.r # relation number 66 | 67 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # + self.Cb # [l, k, k] * [l, k] = [l, k] 68 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l] 69 | spFirst = T.batched_dot(wC1, argsEmbA) 70 | spSecond = T.batched_dot(wC2, argsEmbB) 71 | return Asecond + spFirst + spSecond 72 | 73 | 74 | 75 | def negLeftFactorization(self, batchSize, negEmbA, argsEmbB, wC, wC1, wC2): 76 | # l = batchSize 77 | # k = self.k # embed size 78 | # r = self.r # relation number 79 | 80 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n] 81 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n] 82 | 83 | spAfirst = T.batched_tensordot(wC1, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 84 | 85 | spSecond = T.batched_dot(wC2, argsEmbB) 86 | 87 | return Asecond + spAfirst + spSecond.reshape((batchSize, 1)) 88 | 89 | def negRightFactorization(self, batchSize, argsEmbA, negEmbB, wC, wC1, wC2): 90 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 91 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n] 92 | spFirst = T.batched_dot(wC1, argsEmbA) 93 | spAsecond = T.batched_tensordot(wC2, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 94 | return Asecond + spAsecond + spFirst.reshape((batchSize, 1)) 95 | 96 | 97 | 98 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 99 | weightedC1 = T.dot(relationProbs, self.C1.dimshuffle(1, 0)) 100 | weightedC2 = T.dot(relationProbs, self.C2.dimshuffle(1, 0)) 101 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) 
102 | 103 | 104 | argembed1 = self.A[args1] 105 | argembed2 = self.A[args2] 106 | 107 | one = self.factorization(batchSize=l, 108 | argsEmbA=argembed1, 109 | argsEmbB=argembed2, 110 | wC=weightedC, 111 | wC1=weightedC1, 112 | wC2=weightedC2) 113 | 114 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 115 | logScoresP = T.log(T.nnet.sigmoid(u)) 116 | 117 | allScores = logScoresP 118 | allScores = T.concatenate([allScores, entropy, entropy]) 119 | 120 | 121 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 122 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 123 | negOne = self.negLeftFactorization(batchSize=l, 124 | negEmbA=negembed1, 125 | argsEmbB=argembed2, 126 | wC=weightedC, 127 | wC1=weightedC1, 128 | wC2=weightedC2) 129 | 130 | negTwo = self.negRightFactorization(batchSize=l, 131 | argsEmbA=argembed1, 132 | negEmbB=negembed2, 133 | wC=weightedC, 134 | wC1=weightedC1, 135 | wC2=weightedC2) 136 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0), 137 | negTwo + self.Ab[neg2].dimshuffle(1, 0)]) 138 | logScores = T.log(T.nnet.sigmoid(-g)) 139 | allScores = T.concatenate([allScores, logScores.flatten()]) 140 | 141 | return allScores 142 | 143 | -------------------------------------------------------------------------------- /learning/models/decoders/SelectionalPreferences.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | import cPickle as pickle 9 | 10 | class SelectionalPreferences(object): 11 | 12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb): 13 | 14 | self.k = embedSize 15 | self.r = relationNum 16 | self.a = argVocSize 17 | 18 | a = self.a 19 | k = self.k 20 | r = self.r 21 | 22 | 23 | # Selectional Preferences 24 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 25 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 26 | self.C1 = theano.shared(value=Ca1NP, name='C1') 27 | self.C2 = theano.shared(value=Ca2NP, name='C2') 28 | 29 | # argument embeddings 30 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable 31 | 32 | if ex_emb: 33 | import gensim 34 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 35 | 36 | for idArg in xrange(self.a): 37 | arg = data.id2Arg[idArg].lower().split(' ') 38 | new = np.zeros(k, dtype=theano.config.floatX) 39 | size = 0 40 | for ar in arg: 41 | if ar in external_embeddings: 42 | new += external_embeddings[ar] 43 | size += 1 44 | if size > 0: 45 | ANP[idArg] = new/size 46 | 47 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 48 | 49 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 50 | name='Ab', borrow=True) 51 | 52 | self.params = [self.A, self.C1, self.C2, self.Ab] 53 | 54 | 55 | 56 | 57 | 58 | def leftMostFactorization(self, batchSize, args, wC1): 59 | l = batchSize 60 | k = self.k # embed size 61 | r = self.r # relation number 62 | argEmbeds = self.A[args.flatten()] 63 | Afirst = T.batched_dot(wC1, argEmbeds) 64 | return Afirst 65 | 66 | def rightMostFactorization(self, batchSize, args, wC2): 67 | l = batchSize 68 | k = self.k # embed size 69 | r = self.r # relation number 70 | argEmbeds2 = self.A[args.flatten()] 71 | Asecond = T.batched_dot(wC2, argEmbeds2) 72 | 
return Asecond 73 | 74 | 75 | 76 | def negLeftMostFactorization(self, batchSize, negEmbed, wC1): 77 | # l = batchSize 78 | # k = self.k # embed size 79 | # r = self.r # relation number 80 | Afirst = T.batched_tensordot(wC1, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 81 | return Afirst 82 | 83 | def negRightMostFactorization(self, batchSize, negEmbed, wC2): 84 | # l = batchSize 85 | # k = self.k # embed size 86 | # r = self.r # relation number 87 | Asecond = T.batched_tensordot(wC2, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 88 | return Asecond 89 | 90 | 91 | 92 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 93 | weightedC1= T.dot(relationProbs, self.C1.dimshuffle(1, 0)) 94 | weightedC2= T.dot(relationProbs, self.C2.dimshuffle(1, 0)) 95 | 96 | left1 = self.leftMostFactorization(batchSize=l, args=args1, wC1=weightedC1) 97 | right1 = self.rightMostFactorization(batchSize=l, args=args2, wC2=weightedC2) 98 | one = left1 + right1 99 | 100 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 101 | logScoresP = T.log(T.nnet.sigmoid(u)) 102 | allScores = logScoresP 103 | allScores = T.concatenate([allScores, entropy, entropy]) 104 | 105 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 106 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 107 | negative1 = self.negLeftMostFactorization(batchSize=l, 108 | negEmbed=negembed1, 109 | wC1=weightedC1) 110 | negative2 = self.negRightMostFactorization(batchSize=l, 111 | negEmbed=negembed2, 112 | wC2=weightedC2) 113 | 114 | negOne = negative1.dimshuffle(1, 0) + right1 115 | negTwo = negative2.dimshuffle(1, 0) + left1 116 | g = T.concatenate([negOne + self.Ab[neg1], negTwo + self.Ab[neg2]]) 117 | logScores = T.log(T.nnet.sigmoid(-g)) 118 | allScores = T.concatenate([allScores, logScores.flatten()]) 119 | 120 | return allScores 121 | 122 | 123 | -------------------------------------------------------------------------------- /learning/models/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /learning/models/encoders/RelationClassifier.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | from theano import sparse 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | 9 | class IndependentRelationClassifiers(object): 10 | # rng is a random generator, 11 | # featureDim is the dimension of the feature space 12 | # relationNum is the number of possible relations (classes of relations) 13 | 14 | def __init__(self, rng, featureDim, relationNum): 15 | 16 | # dimensionality of feature space 17 | self.h = featureDim 18 | # relation num 19 | self.r = relationNum 20 | # print str(np.sqrt(6. 
/ (self.h + self.r))) 21 | # w_bound = np.sqrt(self.h * self.r) 22 | 23 | # print str(1.0 / w_bound) 24 | print 'low bound =', settings.low, 'high bound =', settings.high 25 | self.W = theano.shared(np.asarray(rng.uniform( 26 | low=settings.low, 27 | high=settings.high, 28 | size=(self.h, self.r)), dtype=theano.config.floatX), # @UndefinedVariable 29 | name='W', borrow=True) 30 | # npW = np.zeros((3,3),dtype=theano.config.floatX) 31 | # npW[0,0] = 1.e+40 32 | # npW[1,1] = 1.e+40 33 | # npW[2,2] = 1.e+40 34 | 35 | # @UndefinedVariable 36 | # self.W = theano.shared(value=np.asarray(npW)) 37 | 38 | self.Wb = theano.shared(value=np.zeros(self.r, 39 | dtype=theano.config.floatX), # @UndefinedVariable 40 | name='Wb', borrow=True) 41 | 42 | self.params = [self.W, self.Wb] 43 | # self.params = [self.Wb] 44 | # self.params = [] 45 | 46 | def compRelationProbsFunc(self, xFeats): 47 | # xFeats [l, h] matrix 48 | # xFeats = theano.printing.Print("xFeats")(xFeats) 49 | # self.Wb = theano.printing.Print("Wb ") (self.Wb) 50 | # self.W = theano.printing.Print("W ") (self.W) 51 | # scores of each role by a classifier 52 | relationScores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r] 53 | #relationScores = theano.printing.Print("relationScores=")(relationScores) 54 | 55 | # convert it to probabilities 56 | relationProbs = T.nnet.softmax(relationScores) 57 | #relationProbs = theano.printing.Print("relationProbs = ")(relationProbs) 58 | 59 | 60 | return relationProbs # [l, r] 61 | 62 | 63 | def labelFunct(self, batchSize, xFeats): 64 | # xFeats [l, h] 65 | # l = batchSize 66 | # self.W = theano.printing.Print("W ") (self.W) 67 | # self.Wb = theano.printing.Print("Wb ") (self.Wb) 68 | scores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r] 69 | relationProbs = T.nnet.softmax(scores) 70 | # scores = theano.printing.Print("scores ") (scores) 71 | labels = T.argmax(scores, axis=1) # [l, r] => [l] 72 | # labels = theano.printing.Print("labels ") (labels) 73 | return (labels, relationProbs) -------------------------------------------------------------------------------- /learning/models/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /processing/OiePreprocessor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import time 7 | from definitions import OieFeatures 8 | from definitions import OieExample 9 | print sys.path 10 | import cPickle as pickle 11 | 12 | 13 | class FeatureLexicon: 14 | 15 | def __init__(self): 16 | self.nextId = 0 17 | self.id2Str = {} 18 | self.str2Id = {} 19 | self.id2freq = {} 20 | self.nextIdPruned = 0 21 | self.id2StrPruned = {} 22 | self.str2IdPruned = {} 23 | 24 | def getOrAdd(self, s): 25 | if s not in self.str2Id: 26 | self.id2Str[self.nextId] = s 27 | self.str2Id[s] = self.nextId 28 | self.id2freq[self.nextId] = 1 29 | self.nextId += 1 30 | else: 31 | self.id2freq[self.str2Id[s]] += 1 32 | return self.str2Id[s] 33 | 34 | 35 | def getOrAddPruned(self, s): 36 | if s not in self.str2IdPruned: 37 | self.id2StrPruned[self.nextIdPruned] = s 38 | self.str2IdPruned[s] = self.nextIdPruned 39 | self.nextIdPruned += 1 40 | return self.str2IdPruned[s] 41 | 42 | def getId(self, s): 43 | if s not in self.str2Id: 44 | return None 45 | return self.str2Id[s] 46 | 47 | def getStr(self, 
idx): 48 | if idx not in self.id2Str: 49 | return None 50 | else: 51 | return self.id2Str[idx] 52 | 53 | def getStrPruned(self, idx): 54 | if idx not in self.id2StrPruned: 55 | return None 56 | else: 57 | return self.id2StrPruned[idx] 58 | 59 | def getFreq(self, idx): 60 | if idx not in self.id2freq: 61 | return None 62 | return self.id2freq[idx] 63 | 64 | 65 | def getDimensionality(self): 66 | return self.nextIdPruned 67 | # return self.nextId 68 | 69 | 70 | def getFeatures(lexicon, featureExs, info, arg1=None, arg2=None, expand=False): 71 | feats = [] 72 | for f in featureExs: 73 | res = f(info, arg1, arg2) 74 | if res is not None: 75 | if type(res) == list: 76 | for el in res: 77 | featStrId = f.__name__ + "#" + el 78 | if expand: 79 | feats.append(lexicon.getOrAdd(featStrId)) 80 | else: 81 | featId = lexicon.getId(featStrId) 82 | if featId is not None: 83 | feats.append(featId) 84 | else: 85 | featStrId = f.__name__ + "#" + res 86 | if expand: 87 | feats.append(lexicon.getOrAdd(featStrId)) 88 | else: 89 | featId = lexicon.getId(featStrId) 90 | if featId is not None: 91 | feats.append(featId) 92 | 93 | return feats 94 | 95 | def getFeaturesThreshold(lexicon, featureExs, info, arg1=None, arg2=None, expand=False, threshold=0): 96 | feats = [] 97 | for f in featureExs: 98 | res = f(info, arg1, arg2) 99 | if res is not None: 100 | if type(res) == list: 101 | for el in res: 102 | featStrId = f.__name__ + "#" + el 103 | if expand: 104 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold: 105 | feats.append(lexicon.getOrAddPruned(featStrId)) 106 | else: 107 | featId = lexicon.getId(featStrId) 108 | if featId is not None: 109 | if lexicon.id2freq[featId] > threshold: 110 | feats.append(lexicon.getOrAddPruned(featStrId)) 111 | else: 112 | featStrId = f.__name__ + "#" + res 113 | if expand: 114 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold: 115 | feats.append(lexicon.getOrAddPruned(featStrId)) 116 | else: 117 | featId = lexicon.getId(featStrId) 118 | if featId is not None: 119 | if lexicon.id2freq[featId] > threshold: 120 | feats.append(lexicon.getOrAddPruned(featStrId)) 121 | 122 | return feats 123 | 124 | def prepareArgParser(): 125 | parser = argparse.ArgumentParser(description='Processes an Oie file and adds its representations ' 126 | 'to a Python pickled file.') 127 | 128 | parser.add_argument('input_file', metavar='input-file', help='input file in the Yao format') 129 | 130 | parser.add_argument('pickled_dataset', metavar='pickled-dataset', help='pickle file to be used to store output ' 131 | '(created if empty)') 132 | 133 | parser.add_argument('--batch-name', default="train", nargs="?", help='name used as a reference in the pickled file') 134 | 135 | parser.add_argument('--features', default="basic", nargs="?", help='features (basic vs ?)') 136 | parser.add_argument('--threshold', default="0", nargs="?", type=int, help='minimum feature frequency') 137 | 138 | 139 | 140 | parser.add_argument('--test-mode', action='store_true', 141 | help='used for test files ' 142 | '(the feature space is not expanded to include previously unseen features)') 143 | 144 | 145 | return parser 146 | 147 | def loadExamples(fileName): 148 | count = 0 149 | with open(fileName, 'r') as fp: 150 | relationExamples = [] 151 | for line in fp: 152 | line = line.strip() 153 | if len(line) == 0 or len(line.split()) == 0: 154 | raise IOError 155 | 156 | else: 157 | fields = line.split('\t') 158 | assert len(fields) == 9, "a problem with the file format (# fields is wrong) len is " \ 159 | + str(len(fields)) + 
"instead of 9" 160 | # this will be 10 161 | relationExamples.append([str(count)] + fields) 162 | count += 1 163 | 164 | return relationExamples 165 | 166 | # if __name__ == '__main__': 167 | # examples = loadExamples('/Users/admin/isti/amsterdam/data/candidate-100.txt') 168 | # print "Using basic features" 169 | # argFeatureExtrs = OieFeatures.getBasicFeatures() 170 | # ex = examples[0] 171 | # print ex 172 | # features = argFeatureExtrs 173 | # 174 | # s = [] 175 | # for f in features: 176 | # res = f([ex[1], ex[4], ex[5], ex[7]], ex[2], ex[3]) 177 | # if res is not None: 178 | # s.append(f.__name__ + "#" + res) 179 | # 180 | # print s, 'dd' 181 | 182 | if __name__ == '__main__': 183 | 184 | tStart = time.time() 185 | 186 | print "Parameters: " + str(sys.argv[1::]) 187 | parser = prepareArgParser() 188 | args = parser.parse_args() 189 | 190 | print "Parsed params: " + str(args) 191 | 192 | print "Loading sentences...", 193 | relationExamples = loadExamples(args.input_file) 194 | 195 | tEnd = time.time() 196 | print "Done (" + str(tEnd - tStart) + "s.)" 197 | 198 | # predFeatureExtrs = definitions.SrlFeatures.getJohanssonPredDisFeatures() 199 | # 200 | featureExtrs = None 201 | if args.features == "basic": 202 | print "Using rich features" 203 | featureExtrs = OieFeatures.getBasicCleanFeatures() 204 | 205 | relationLexicon = FeatureLexicon() 206 | 207 | dataset = {} 208 | goldstandard = {} 209 | 210 | if os.path.exists(args.pickled_dataset): 211 | tStart = time.time() 212 | print "Found existing pickled dataset, loading...", 213 | 214 | pklFile = open(args.pickled_dataset, 'rb') 215 | 216 | featureExtrs = pickle.load(pklFile) 217 | relationLexicon = pickle.load(pklFile) 218 | dataset = pickle.load(pklFile) 219 | goldstandard = pickle.load(pklFile) 220 | 221 | pklFile.close() 222 | tEnd = time.time() 223 | print "Done (" + str(tEnd - tStart) + "s.)" 224 | 225 | tStart = time.time() 226 | print "Processing relation Examples", 227 | 228 | examples = [] 229 | relationLabels = {} 230 | if args.batch_name in dataset: 231 | examples = dataset[args.batch_name] 232 | relationLabels = goldstandard[args.batch_name] 233 | else: 234 | dataset[args.batch_name] = examples 235 | goldstandard[args.batch_name] = relationLabels 236 | 237 | reIdx = 0 238 | c = 0 239 | for re in relationExamples: 240 | getFeatures(relationLexicon, featureExtrs, [re[1], re[4], re[5], re[7], re[8], re[6]], 241 | re[2], re[3], True) 242 | for re in relationExamples: 243 | reIdx += 1 244 | if reIdx % 1000 == 0: 245 | print ".", 246 | if reIdx % 10000 == 0: 247 | print reIdx, 248 | 249 | 250 | relationE = '' 251 | if re[9] != '': 252 | relationE = re[9] 253 | # print re[9] 254 | # if re[10] != '': 255 | # if relationE != '': 256 | # relationE += ' '+re[10] 257 | # else: 258 | # relationE = re[10] 259 | 260 | ex = OieExample.OieExample(re[2], re[3], getFeaturesThreshold(relationLexicon, 261 | featureExtrs, 262 | [re[1], re[4], re[5], re[7], re[8], re[6]], 263 | # [re[1], re[4], re[5], re[7]], 264 | re[2], re[3], True, threshold=args.threshold), re[5] 265 | ,relation=relationE 266 | ) 267 | relationLabels[c] = re[-1].strip().split(' ') 268 | c += 1 269 | 270 | examples.append(ex) 271 | 272 | 273 | tEnd = time.time() 274 | print "Done (" + str(tEnd - tStart) + "s.), processed " + str(len(examples)) 275 | 276 | tStart = time.time() 277 | print "Pickling the dataset...", 278 | 279 | pklFile = open(args.pickled_dataset, 'wb') 280 | #pklFile = gzip.GzipFile(args.pickled_dataset, 'wb') 281 | 282 | pklProtocol = 2 283 | 
pickle.dump(featureExtrs, pklFile, protocol=pklProtocol) 284 | pickle.dump(relationLexicon, pklFile, protocol=pklProtocol) 285 | pickle.dump(dataset, pklFile, protocol=pklProtocol) 286 | pickle.dump(goldstandard, pklFile, protocol=pklProtocol) 287 | 288 | tEnd = time.time() 289 | print "Done (" + str(tEnd - tStart) + "s.)" -------------------------------------------------------------------------------- /processing/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | --------------------------------------------------------------------------------
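For reference, the following standalone NumPy sketch (not part of the repository code) illustrates the tensor shapes referred to by the comments in learning/models/encoders/RelationClassifier.py and learning/models/decoders/Bilinear.py: the encoder maps a batch of feature vectors [l, h] to relation posteriors [l, r], and the decoder scores an argument pair with the expected bilinear form a1^T (sum_r p_r C_r) a2, to which OieModel.py adds the alpha-scaled entropy term. The toy dimensions, random inputs, and softmax/sigmoid helpers below are illustrative assumptions only.

# Illustrative NumPy-only sketch of the model's shapes (hypothetical toy sizes; not repository code)
import numpy as np

rng = np.random.RandomState(2)
l, h, r, k, a = 4, 10, 3, 5, 20   # batch, feature dim, relations, embed size, argument vocabulary
alpha = 1.0

xFeats = rng.rand(l, h)                      # dense stand-in for the sparse feature matrix [l, h]
W, Wb = rng.randn(h, r) * 0.01, np.zeros(r)  # classifier weights and biases
C = rng.randn(k, k, r) * 0.1                 # one k x k bilinear slice per relation
A, Ab = rng.randn(a, k) * 0.01, np.zeros(a)  # argument embeddings and biases
args1, args2 = rng.randint(a, size=l), rng.randint(a, size=l)

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Encoder: [l, h] x [h, r] -> [l, r] relation posteriors
relationProbs = softmax(xFeats.dot(W) + Wb)

# Decoder: weight the bilinear tensor by the posteriors, [l, r] * [k, k, r] -> [l, k, k]
weightedC = np.einsum('lr,pqr->lpq', relationProbs, C)
emb1, emb2 = A[args1], A[args2]              # [l, k] argument embeddings
pairScore = np.einsum('lp,lpq,lq->l', emb1, weightedC, emb2)   # a1^T (sum_r p_r C_r) a2

# Positive log-scores with argument biases, plus the entropy regularizer
u = np.concatenate([pairScore + Ab[args1], pairScore + Ab[args2]])
posLogScores = np.log(sigmoid(u))
entropy = alpha * -(relationProbs * np.log(relationProbs)).sum(axis=1)
print(np.mean(np.concatenate([posLogScores, entropy, entropy])))

The n negative argument samples per position are scored with the same weighted bilinear form and enter the objective as log(sigmoid(-g)); OieModel.buildTrainErrComputation then minimises the negative mean of all concatenated scores.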