├── LICENSE.txt ├── README.md ├── data-sample.txt ├── definitions ├── OieExample.py ├── OieFeatures.py ├── __init__.py └── settings.py ├── evaluation ├── OieEvaluation.py └── __init__.py ├── learning ├── NegativeExampleGenerator.py ├── OieData.py ├── OieInduction.py ├── OieModel.py ├── Optimizers.py ├── __init__.py └── models │ ├── __init__.py │ ├── decoders │ ├── Bilinear.py │ ├── BilinearPlusSP.py │ ├── SelectionalPreferences.py │ └── __init__.py │ └── encoders │ ├── RelationClassifier.py │ └── __init__.py └── processing ├── OiePreprocessor.py └── __init__.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # relation-autoencoder 2 | This is the code used in the paper [Discrete-State Variational Autoencoders for Joint Discovery and Factorization of Relations](https://transacl.org/ojs/index.php/tacl/article/viewFile/761/190) by Diego Marcheggiani and Ivan Titov. 3 | 4 | If you use this code, please cite us. 
5 |
6 | Dependencies
7 | -----------
8 | - [theano](http://deeplearning.net/software/theano/)
9 | - [numpy](http://www.numpy.org/)
10 | - [scipy](https://www.scipy.org/)
11 | - [nltk](http://www.nltk.org/)
12 |
13 |
14 | Data Processing
15 | --------------
16 | To run the model, the first thing to do is create a dataset.
17 | You need a file like data-sample.txt.
18 | The file must be tab-separated, with the following fields:
19 |
20 | lexicalized dependency path between arguments (entities) of the relation
21 | first entity
22 | second entity
23 | entity types of the first and second entity
24 | trigger word
25 | id of the sentence
26 | raw sentence
27 | pos tags of the entire sentence
28 | relation between the two entities, if any (used only for evaluation)
29 |
30 |
31 | In order to create the dataset, you need to run the OiePreprocessor.py script once for each dataset partition: train, dev, and test.
32 |
33 | python processing/OiePreprocessor.py --batch-name train data-sample.txt sample.pk
34 | python processing/OiePreprocessor.py --batch-name dev data-sample.txt sample.pk
35 | python processing/OiePreprocessor.py --batch-name test data-sample.txt sample.pk
36 |
37 |
38 | Now, your dataset with all the indexed features is in sample.pk
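If you just want to sanity-check the pickle, here is a minimal sketch (not part of the repository); it reads the four objects in the same order as loadData in learning/OieInduction.py, and assumes the repository root is on your PYTHONPATH:

    import pickle
    from processing.OiePreprocessor import FeatureLexicon  # so pickle can resolve the lexicon class

    with open('sample.pk', 'rb') as f:
        featureExtractors = pickle.load(f)  # feature-extraction functions used at preprocessing time
        featureLexicon = pickle.load(f)     # FeatureLexicon: feature <-> id mappings
        dataset = pickle.load(f)            # dict: partition name -> list of OieExample objects
        goldStandard = pickle.load(f)       # dict: partition name -> gold relation annotations

    print(dataset.keys())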
39 |
40 | Training Models
41 | ------------
42 | To train the model, run OieInduction.py with all the required arguments:
43 |
44 | python learning/OieInduction.py --pickled_dataset sample.pk --model_name discrete-autoencoder --model AC --optimization 1 --epochs 10 --batch_size 100 --relations_number 10 --negative_samples_number 5 --l2_regularization 0.1 --alpha 0.1 --seed 2 --embed_size 10 --learning_rate 0.1
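A few of these options map directly onto the code in learning/OieInduction.py: --optimization 1 selects the AdaGrad updates from learning/Optimizers.py (0 falls back to plain SGD), --relations_number is the number of relation clusters the model induces, and --negative_samples_number is the number of negative argument samples drawn per instance.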
45 |
46 |
47 |
48 | For any questions, please drop me a mail at marcheggiani [at] uva [dot] nl.
49 |
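Before training, you may also want to set the output paths in definitions/settings.py: per-epoch clusterings and posteriors are pickled under clusters_path (see pickleClustering and picklePosteriors in learning/OieInduction.py) unless settings.debug is True, and saveModel writes under models_path. Example values (illustrative only, adjust to your machine):

    # definitions/settings.py -- example values, not shipped defaults
    models_path = '/tmp/relation-autoencoder/models/'
    clusters_path = '/tmp/relation-autoencoder/clusters/'
    debug = False  # when True, per-epoch clusterings and posteriors are not written to disk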
--------------------------------------------------------------------------------
/definitions/OieExample.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | class OieExample (object):
4 |
5 | def __init__(self, arg1, arg2, features, trigger, relation=''):
6 | self.features = features
7 | self.arg1 = arg1
8 | self.arg2 = arg2
9 | self.relation = relation
10 | self.trigger = trigger
11 |
12 | def setFeatures(self, features):
13 | self.features = features
--------------------------------------------------------------------------------
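A minimal usage sketch for definitions/OieExample.py above (hypothetical values; in the pipeline the feature ids come from the FeatureLexicon built by OiePreprocessor.py):

    from definitions.OieExample import OieExample

    # feature ids index into the feature lexicon; relation is the gold label, if known
    ex = OieExample(arg1='Steve Jobs', arg2='Apple', features=[3, 17, 42],
                    trigger='founded', relation='org:founded_by')
    print(ex.arg1)
    print(ex.features)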
/definitions/OieFeatures.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | import nltk
4 | import re, string
5 | import settings
6 | import pickle
7 |
8 | parsing = 0
9 | entities = 1
10 | trig = 2
11 | sentence = 3
12 | pos = 4
13 | docPath = 5
14 | # ======= Relation features =======
15 | stopwords_list = nltk.corpus.stopwords.words('english')
16 | _digits = re.compile(r'\d')
17 | def bow(info, arg1, arg2):
18 | return info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
19 |
20 | def bow_clean(info, arg1, arg2):
21 | bow = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
22 | result = []
23 | tmp = []
24 | for word in bow:
25 | for pun in string.punctuation:
26 | word = word.strip(pun)
27 | if word != '':
28 | tmp.append(word.lower())
29 | for word in tmp:
30 | if word not in stopwords_list and not _digits.search(word) and not word[0].isupper():
31 | result.append(word)
32 | return result
33 |
34 | def before_arg1(info, arg1, arg2):
35 | before = info[sentence][:info[sentence].find(arg1)]
36 | beforeSplit = before.lower().strip().split(' ')
37 | beforeSplit = [word for word in beforeSplit if word not in string.punctuation]
38 | # print beforeSplit
39 | if len(beforeSplit) > 1:
40 | return [beforeSplit[-2], beforeSplit[-1]]
41 | elif len(beforeSplit) == 1:
42 | if beforeSplit[0] != '':
43 | return [beforeSplit[-1]]
44 | else:
45 | return []
46 | else:
47 | return []
48 |
49 |
50 | def after_arg2(info, arg1, arg2):
51 | after = info[sentence][info[sentence].rfind(arg2)+len(arg2):]
52 | afterSplit = after.lower().strip().split(' ')
53 | afterSplit = [word for word in afterSplit if word not in string.punctuation]
54 | if len(afterSplit) > 1:
55 | return [a for a in afterSplit[0: 2]]
56 | elif len(afterSplit) == 1:
57 | if afterSplit[0] != '':
58 | return [afterSplit[0]]
59 | else:
60 | return []
61 | else:
62 | return []
63 |
64 | def bigrams(info, arg1, arg2):
65 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
66 | tmp = []
67 | for word in between:
68 | for pun in string.punctuation:
69 | word = word.strip(pun)
70 | if word != '':
71 | tmp.append(word.lower())
72 | return [x[0]+'_'+x[1] for x in zip(tmp, tmp[1:])]
73 |
74 | def trigrams(info, arg1, arg2):
75 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
76 | tmp = []
77 | for word in between:
78 | for pun in string.punctuation:
79 | word = word.strip(pun)
80 | if word != '':
81 | tmp.append(word.lower())
82 | return [x[0]+'_'+x[1]+'_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]
83 |
84 | def skiptrigrams(info, arg1, arg2):
85 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
86 | tmp = []
87 | for word in between:
88 | for pun in string.punctuation:
89 | word = word.strip(pun)
90 | if word != '':
91 | tmp.append(word.lower())
92 | return [x[0]+'_X_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])]
93 |
94 | def skipfourgrams(info, arg1, arg2):
95 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split()
96 | tmp = []
97 | for word in between:
98 | for pun in string.punctuation:
99 | word = word.strip(pun)
100 | if word != '':
101 | tmp.append(word.lower())
102 | return [x[0]+'_X_'+x[2] + '_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] +\
103 | [x[0]+'_'+x[1]+'_X_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])]
104 |
105 | def trigger(info, arg1, arg2):
106 | return info[trig].replace('TRIGGER:', '')
107 |
108 | def entityTypes(info, arg1, arg2):
109 | return info[entities]
110 |
111 | def entity1Type(info, arg1, arg2):
112 | return info[entities].split('-')[0]
113 |
114 | def entity2Type(info, arg1, arg2):
115 | return info[entities].split('-')[1]
116 |
117 | def arg1(info, arg1, arg2):
118 | return arg1
119 |
120 | def arg1_lower(info, arg1, arg2):
121 | return arg1.lower()
122 |
123 | def arg1unigrams(info, arg1, arg2):
124 | return arg1.lower().split()
125 |
126 | def arg2(info, arg1, arg2):
127 | return arg2
128 |
129 | def arg2_lower(info, arg1, arg2):
130 | return arg2.lower()
131 |
132 | def arg2unigrams(info, arg1, arg2):
133 | return arg2.lower().split()
134 |
135 | def lexicalPattern(info, arg1, arg2):
136 | # return info[parsing]
137 | p = info[parsing].replace('->', ' ').replace('<-', ' ').split()
138 | result = []
139 | for num, x in enumerate(p):
140 | if num % 2 != 0:
141 | result.append(x)
142 | return '_'.join(result)
143 |
144 | def dependencyParsing(info, arg1, arg2):
145 | return info[parsing]
146 |
147 |
148 | def rightDep(info, arg1, arg2):
149 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
150 | return ''.join(p[:3])
151 |
152 | def leftDep(info, arg1, arg2):
153 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split()
154 | return ''.join(p[-3:])
155 |
156 | def posPatternPath(info, arg1, arg2):
157 | words = info[sentence].split()
158 | postags = info[pos].split()
159 | assert len(postags) == len(words), 'number of POS tags does not match number of tokens'
160 | a = []
161 | for w in xrange(len(words)):
162 | a.append((words[w], postags[w]))
163 | # a = info[4].split()
164 | if a:
165 | # print arg1, words
166 | # print [a.index(item) for item in a if item[0] == arg1.split()[-1]],'aaaaaaa'
167 | beginList = [a.index(item) for item in a if item[0] == arg1.split()[-1]]
168 | # print beginList
169 | endList = [a.index(item) for item in a if item[0] == arg2.split()[0]]
170 | # print endList
171 | if len(beginList) > 0 and len(endList) > 0:
172 | # posPattern = [item[1] for item in a if beginList[0] > a.index(item) > endList[0]]
173 | posPattern = []
174 | for num, item in enumerate(a):
175 | if beginList[0] < num < endList[0]:
176 | posPattern.append(item[1])
177 | # print posPattern
178 | return '_'.join(posPattern)
179 | else:
180 | return ''
181 | else:
182 | return ''
183 |
184 |
185 | def getBasicCleanFeatures():
186 | features = [trigger, entityTypes, arg1_lower, arg2_lower, bow_clean, entity1Type, entity2Type, lexicalPattern,
187 | posPatternPath]
188 | return features
189 |
190 |
--------------------------------------------------------------------------------
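A small sketch of how the feature extractors in definitions/OieFeatures.py above can be applied to a single hand-built example. The info list follows the index constants at the top of the file (parsing, entities, trig, sentence, pos, docPath); the field values here are illustrative, not taken from data-sample.txt, and the snippet assumes the repository's Python 2 environment with the NLTK stopwords corpus installed:

    from definitions import OieFeatures

    info = [
        'nsubj->founded<-dobj',                # lexicalized dependency path between the arguments
        'PERSON-ORGANIZATION',                 # entity-type pair
        'TRIGGER:founded',                     # trigger field
        'Steve Jobs founded Apple in 1976 .',  # raw sentence
        'NNP NNP VBD NNP IN CD .',             # POS tags, one per token
        'sent_0',                              # sentence id
    ]
    arg1, arg2 = 'Steve Jobs', 'Apple'

    for feat in OieFeatures.getBasicCleanFeatures():
        print('%s: %s' % (feat.__name__, feat(info, arg1, arg2)))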
/definitions/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'admin'
2 |
--------------------------------------------------------------------------------
/definitions/settings.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | models_path = ''
4 | clusters_path = ''
5 |
6 | lda_pairs_path = ''
7 | relations2IdDictionary = ''
8 |
9 | external_embeddings_path = ''
10 | debug = True
11 |
12 | elems_to_visualize = 5
13 |
14 | low = -1.e-3
15 | high = 1.e-3
16 |
17 |
--------------------------------------------------------------------------------
/evaluation/OieEvaluation.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | import pickle
4 | import math
5 | import argparse
6 | import os
7 | import sys
8 | from processing.OiePreprocessor import FeatureLexicon
9 |
10 | class singleLabelClusterEvaluation:
11 | def __init__(self, referencePath, file, validationPath=''):
12 | self.relations = {}
13 | if file:
14 | if validationPath != '':
15 | self.referenceSets, self.assessableElemSet = self.createValidationReferenceSets(referencePath,
16 | validationPath)
17 | else:
18 | self.referenceSets, self.assessableElemSet = self.createReferenceSets(referencePath)
19 |
20 | else:
21 | self.referenceSets, self.assessableElemSet = self.createReferenceSetsFromData(referencePath)
22 | # print self.referenceSets
23 | # print self.assessableElemSet
24 |
25 | def createResponse(self, response):
26 | self.numberOfElements, self.responseSets = self.createResponseSets(response)
27 | # print self.responseSets
28 |
29 |
30 |
31 | def b3precision(self, response_a, reference_a):
32 | # print response_a.intersection(self.assessableElemSet), 'in precision'
33 | return len(response_a.intersection(reference_a)) / float(len(response_a.intersection(self.assessableElemSet)))
34 |
35 | def b3recall(self, response_a, reference_a):
36 | return len(response_a.intersection(reference_a)) / float(len(reference_a))
37 |
38 |
39 |
40 | def b3TotalElementPrecision(self):
41 | totalPrecision = 0.0
42 | for c in self.responseSets:
43 | for r in self.responseSets[c]:
44 | if r in self.assessableElemSet:
45 | # print r
46 | totalPrecision += self.b3precision(self.responseSets[c],
47 | self.findCluster(r, self.referenceSets))
48 |
49 | return totalPrecision / float(len(self.assessableElemSet))
50 |
51 | def b3TotalElementRecall(self):
52 | totalRecall = 0.0
53 | for c in self.responseSets:
54 | for r in self.responseSets[c]:
55 | if r in self.assessableElemSet:
56 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets))
57 |
58 | return totalRecall / float(len(self.assessableElemSet))
59 |
60 |
61 | def b3TotalClusterPrecision(self):
62 | totalPrecision = 0.0
63 | for c in self.responseSets:
64 | for r in self.responseSets[c]:
65 | if r in self.assessableElemSet:
66 | totalPrecision += self.b3precision(self.responseSets[c],
67 | self.findCluster(r, self.referenceSets)) / \
68 | float(len(self.responseSets)*len(self.responseSets[c]))
69 | return totalPrecision
70 |
71 | def b3TotalClusterRecall(self):
72 | totalRecall = 0.0
73 | for c in self.responseSets:
74 | for r in self.responseSets[c]:
75 | if r in self.assessableElemSet:
76 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets)) / \
77 | float(len(self.responseSets)*len(self.responseSets[c]))
78 |
79 | return totalRecall
80 |
81 |
82 | def createResponseSets(self, response):
83 | responseSets = {}
84 | numElem = 0
85 | for c in response:
86 | if len(response[c]) > 0:
87 | numElem += len(response[c])
88 | responseSets[c] = set(response[c])
89 |
90 | return numElem, responseSets
91 |
92 |
93 |
94 | def createReferenceSets(self, referencePath):
95 | with open(referencePath, 'r') as f:
96 | relations = {}
97 | c = 0
98 | for line in f:
99 | lineSplit = line.split('\t')
100 | relations[c] = lineSplit[-1].strip().split(' ')
101 | c += 1
102 | self.relations = relations
103 | referenceSets = {}
104 | assessableElems = set()
105 | for rel in relations:
106 | if relations[rel][0] != '':
107 | assessableElems.add(rel)
108 | if relations[rel][0] in referenceSets:
109 | referenceSets[relations[rel][0]].add(rel)
110 | else:
111 | referenceSets[relations[rel][0]] = set([rel])
112 | return referenceSets, assessableElems
113 |
114 | def createValidationReferenceSets(self, referencePath, validationPath):
115 | # referencePath is usually the entire training set
116 | with open(referencePath, 'r') as f, open(validationPath, 'r') as f1:
117 | validationSet = {}
118 | for line in f1:
119 | if line not in validationSet:
120 | validationSet[line] = 1
121 |
122 | relations = {}
123 | c = 0
124 | for line in f:
125 | if line in validationSet:
126 | lineSplit = line.split('\t')
127 | relations[c] = lineSplit[-1].strip().split(' ')
128 | else:
129 | relations[c] = ['']
130 | c += 1
131 | # self.relationsValid = relations
132 | referenceSets = {}
133 | assessableElems = set()
134 | for rel in relations:
135 | if relations[rel][0] != '':
136 | assessableElems.add(rel)
137 | if relations[rel][0] in referenceSets:
138 | referenceSets[relations[rel][0]].add(rel)
139 | else:
140 | referenceSets[relations[rel][0]] = set([rel])
141 | return referenceSets, assessableElems
142 |
143 |
144 | def createReferenceSetsFromData(self, relations):
145 | self.relations = relations
146 | referenceSets = {}
147 | assessableElems = set()
148 | for rel in relations:
149 | if relations[rel][0] != '':
150 | # print 'category', category
151 | assessableElems.add(rel)
152 | if relations[rel][0] in referenceSets:
153 | referenceSets[relations[rel][0]].add(rel)
154 | else:
155 | referenceSets[relations[rel][0]] = set([rel])
156 | return referenceSets, assessableElems
157 |
158 | def findCluster(self, a, setsDictionary):
159 | foundClusters = []
160 | for c in setsDictionary:
161 | if a in setsDictionary[c]:
162 | return setsDictionary[c]
163 | # foundClusters.append(setsDictionary[c])
164 | # return foundClusters
165 |
166 | def muc3Recall(self):
167 | numerator = 0.0
168 | denominator = 0.0
169 | for c in self.referenceSets:
170 | numerator += len(self.referenceSets[c]) - self.overlap(self.referenceSets[c], self.responseSets)
171 | denominator += len(self.referenceSets[c]) - 1
172 | if denominator == 0.0:
173 | return 0.0
174 | else:
175 | return numerator / denominator
176 |
177 | def muc3Precision(self):
178 | numerator = 0.0
179 | denominator = 0.0
180 | for c in self.responseSets:
181 | if len(self.responseSets[c]) > 0:
182 | # print self.lenAssessableResponseCat(self.responseSets[c]), self.overlap(self.responseSets[c], self.referenceSets)
183 | numerator += self.lenAssessableResponseCat(self.responseSets[c]) - self.overlap(self.responseSets[c], self.referenceSets)
184 | lenRespo = self.lenAssessableResponseCat(self.responseSets[c])
185 | if lenRespo != 0:
186 | denominator += self.lenAssessableResponseCat(self.responseSets[c]) - 1
187 | if denominator == 0.0:
188 | return 0.0
189 | else:
190 | return numerator / denominator
191 |
192 | def overlap(self, a, setsDictionary):
193 | numberIntersections = 0
194 | for c in setsDictionary:
195 | if len(a.intersection(setsDictionary[c])) > 0:
196 | numberIntersections += 1
197 | return numberIntersections
198 |
199 |
200 | def lenAssessableResponseCat(self, responesSet_c):
201 | length = 0
202 | for r in responesSet_c:
203 | if r in self.assessableElemSet:
204 | length += 1
205 | return length
206 |
207 | def printEvaluation(self, validOrTrain):
208 |
209 |
210 | recB3 = self.b3TotalElementRecall()
211 | precB3 = self.b3TotalElementPrecision()
212 | betasquare = math.pow(0.5, 2)
213 | if recB3 == 0.0 and precB3 == 0.0:
214 | F1B3 = 0.0
215 | F05B3 = 0.0
216 | else:
217 | betasquare = math.pow(0.5, 2)
218 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3)
219 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3)
220 |
221 | print validOrTrain, ' Elementwise B3 F1 =', F1B3, 'F0.5 =', F05B3, 'B3 recall =', recB3, 'B3 precision =', precB3
222 |
223 |
224 |
225 |
226 | def getF05(self):
227 | recB3 = self.b3TotalElementRecall()
228 | precB3 = self.b3TotalElementPrecision()
229 | betasquare = math.pow(0.5, 2)
230 | if recB3 == 0.0 and precB3 == 0.0:
231 | F05B3 = 0.0
232 | else:
233 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3)
234 | return F05B3
235 |
236 | def getF1(self):
237 | recB3 = self.b3TotalElementRecall()
238 | precB3 = self.b3TotalElementPrecision()
239 |
240 | if recB3 == 0.0 and precB3 == 0.0:
241 | F1B3 = 0.0
242 | else:
243 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3)
244 | return F1B3
245 |
246 | def loadData(pickled_dataset):
247 |
248 | if not os.path.exists(pickled_dataset):
249 | print "Pickled dataset not found"
250 | sys.exit()
251 |
252 | pklFile = open(pickled_dataset, 'rb')
253 |
254 | featureExtrs = pickle.load(pklFile)
255 |
256 | relationLexicon = pickle.load(pklFile)
257 |
258 | data = pickle.load(pklFile)
259 |
260 | goldStandard = pickle.load(pklFile)
261 |
262 | pklFile.close()
263 |
264 |
265 | return goldStandard
266 |
267 | def getCommandArgs():
268 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model')
269 |
270 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True,
271 | help='the pickled dataset file (produced by OiePreprocessor.py)')
272 | parser.add_argument('--pickled_results', metavar='pickled_results', nargs='?', required=True,
273 | help='the pickled results file (produced by OieInduction.py)')
274 |
275 |
276 | return parser.parse_args()
277 |
278 |
279 |
--------------------------------------------------------------------------------
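A toy example of the B3 scoring in evaluation/OieEvaluation.py above (hypothetical labels; with file=False the constructor takes an in-memory dict rather than a file path, and the snippet assumes the repository's Python 2 environment):

    from evaluation.OieEvaluation import singleLabelClusterEvaluation

    # gold standard: example index -> list whose first element is the gold relation ('' = unlabeled)
    gold = {0: ['born_in'], 1: ['born_in'], 2: ['works_for'], 3: ['']}
    evaluator = singleLabelClusterEvaluation(gold, False)

    # induced clustering: cluster id -> example indices
    evaluator.createResponse({0: [0, 1, 3], 1: [2]})
    evaluator.printEvaluation('Toy')   # prints B3 F1, F0.5, recall, precision
    print(evaluator.getF1())           # 1.0 here: unlabeled example 3 is ignored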
/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'admin'
2 |
--------------------------------------------------------------------------------
/learning/NegativeExampleGenerator.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 |
4 | import numpy as np
5 |
6 |
7 | class NegativeExampleGenerator(object):
8 | def __init__(self, rand, negSamplingCum):
9 | self._rand = rand
10 | self._negSamplingCum = negSamplingCum
11 | # self._neg2SamplingCum = neg2SamplingCum
12 | # self._negSamplingDistrPower = negSamplingDistrPower
13 | # self._compute_unigram_distribution()
14 |
15 | def _univariate_distr_sample(self, sampleSize=1):
16 | return [self._negSamplingCum.searchsorted(self._rand.uniform(0, self._negSamplingCum[-1]))
17 | for i in xrange(0, sampleSize)]
18 |
19 | def generate_random_negative_example(self, positiveArgs, negativeExampleNum):
20 | l = positiveArgs.shape[0] # number of positive instances
21 | n = negativeExampleNum # number of negative examples generated per instance
22 |
23 | negativeArgs = np.zeros((n, l), dtype=np.int32)
24 | for instance_idx in xrange(l):
25 | samples = self._univariate_distr_sample(n)
26 | for negNum_idx in xrange(n):
27 | negativeArgs[negNum_idx, instance_idx] = samples[negNum_idx]
28 | return negativeArgs
29 |
--------------------------------------------------------------------------------
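A short sketch of how the sampler in learning/NegativeExampleGenerator.py above is driven: DataSetManager (next file) builds the cumulative unigram^0.75 distribution over argument ids, and the sampler draws negatives with searchsorted. Hypothetical frequencies, Python 2 environment assumed:

    import numpy as np
    from learning.NegativeExampleGenerator import NegativeExampleGenerator

    freqs = np.array([10., 5., 1.])                        # raw frequencies of three argument ids
    powered = freqs ** 0.75
    negSamplingCum = np.cumsum(powered / powered.sum())    # cumulative distribution, last entry 1.0

    sampler = NegativeExampleGenerator(np.random.RandomState(2), negSamplingCum)
    positiveArgs = np.array([0, 1, 2, 0], dtype=np.int32)  # argument ids of 4 positive instances
    neg = sampler.generate_random_negative_example(positiveArgs, 5)
    print(neg.shape)                                       # (5, 4): five sampled negative ids per instance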
/learning/OieData.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 |
4 | import math as m
5 | import numpy as np
6 | import scipy.sparse as sp
7 | import theano
8 | from definitions import settings
9 | import cPickle as pickle
10 |
11 | class MatrixDataSet:
12 | # matrix formatted dataset
13 | def __init__(self, arguments1, arguments2, argFeatures, negArgs1, negArgs2):
14 | self.args1 = arguments1 # (l)
15 | self.args2 = arguments2 # (l)
16 | self.xFeats = argFeatures # (l, h)
17 | self.neg1 = negArgs1 # (n, l)
18 | self.neg2 = negArgs2 # (n, l)
19 |
20 |
21 | class MatrixDataSetNoEncoding:
22 | # matrix formatted dataset
23 | def __init__(self, arguments1, arguments2, realProbs):
24 | self.args1 = arguments1 # (l)
25 | self.args2 = arguments2 # (l)
26 | self.realProbs = realProbs # (l, r)
27 |
28 |
29 |
30 |
31 |
32 | class DataSetManager:
33 | def __init__(self, oieDataset, featureLex, rng, negSamplesNum, relationNum, negSamplingDistrPower=0.75):
34 |
35 | self.negSamplesNum = negSamplesNum # the number of negative samples considered
36 |
37 | self.negSamplingDistrPower = negSamplingDistrPower # the sampling distribution for negative sampling
38 |
39 | self.rng = rng
40 |
41 | self.relationNum = relationNum
42 |
43 | # id2Str, str2Id
44 | self.featureLex = featureLex
45 |
46 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr
47 | self._extractArgsMappings(oieDataset)
48 |
49 | # each examples csr_matrix[exampleNum x getDimensionality()], labels are numpy.array
50 |
51 |
52 | # self.validExs = self._extractExamples(oieDataset['dev'])
53 |
54 | self.trainExs = self._extractExamples(oieDataset['train'])
55 | if 'dev' in oieDataset:
56 | self.validExs = self._extractExamples(oieDataset['dev'])
57 | else:
58 | self.validExs = None
59 |
60 | if 'test' in oieDataset:
61 | self.testExs = self._extractExamples(oieDataset["test"])
62 | else:
63 | self.testExs = None
64 |
65 | def _sample(self, cutoffs):
66 | idx = cutoffs.searchsorted(self.rng.uniform(0, cutoffs[-1]))
67 | return idx
68 |
69 |
70 | def _sample1(self, distr):
71 |
72 | # check numpy, it should have some efficient ways to sample from multinomials
73 | val = self.rng.uniform()
74 | pos = 0
75 | for idx in xrange(len(distr)):
76 | pos += distr[idx]
77 | if pos > val:
78 | return idx
79 | return len(distr) - 1
80 |
81 |
82 | def _extractExamples(self, oieExamples):
83 |
84 | l = len(oieExamples)
85 | n = self.negSamplesNum
86 |
87 | args1 = np.zeros(l, dtype=np.int32) #
88 | args2 = np.zeros(l, dtype=np.int32) #
89 |
90 |
91 | neg1 = np.zeros((n, l), dtype=np.int32) #
92 | neg2 = np.zeros((n, l), dtype=np.int32) #
93 |
94 |
95 | # print self.featureLex.getDimensionality()
96 | xFeatsDok = sp.dok_matrix((l, self.featureLex.getDimensionality()), dtype=theano.config.floatX)
97 | # @UndefinedVariable float32
98 |
99 | for i, oieEx in enumerate(oieExamples):
100 | args1[i] = self.arg2Id[oieEx.arg1]
101 | args2[i] = self.arg2Id[oieEx.arg2]
102 |
103 | for feat in oieEx.features:
104 | xFeatsDok[i, feat] = 1
105 |
106 | # should do it differently (sample random indexes during training), see below
107 |
108 | for k in xrange(n):
109 | neg1[k, i] = self._sample(self.negSamplingCum)
110 |
111 | for k in xrange(n):
112 | neg2[k, i] = self._sample(self.negSamplingCum)
113 |
114 |
115 |
116 | xFeats = sp.csr_matrix(xFeatsDok, dtype="float32")
117 |
118 | return MatrixDataSet(args1, args2, xFeats, neg1, neg2)
119 |
120 | def _indexElements(self, elements):
121 |
122 | idx = 0
123 | id2Elem = {}
124 | elem2Id = {}
125 | for x in elements:
126 | id2Elem[idx] = x
127 | elem2Id[x] = idx
128 | idx += 1
129 | return id2Elem, elem2Id
130 |
131 | def _extractArgsMappings(self, oieDataset):
132 |
133 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr
134 | argFreqs = {}
135 | for key in oieDataset:
136 | for oieEx in oieDataset[key]: # here it iterates over train, test, dev.
137 | if oieEx.arg1 not in argFreqs:
138 | argFreqs[oieEx.arg1] = 1
139 | else:
140 | argFreqs[oieEx.arg1] += 1
141 |
142 | if oieEx.arg2 not in argFreqs:
143 | argFreqs[oieEx.arg2] = 1
144 | else:
145 | argFreqs[oieEx.arg2] += 1
146 |
147 |
148 |
149 | self.id2Arg, self.arg2Id = self._indexElements(argFreqs)
150 |
151 |
152 | argSampFreqs = [float(argFreqs[self.id2Arg[val]]) for val in xrange(len(self.id2Arg))]
153 | argSampFreqsPowered = map(lambda x: m.pow(x, self.negSamplingDistrPower), argSampFreqs)
154 | norm1 = reduce(lambda x, y: x + y, argSampFreqsPowered)
155 | self.negSamplingDistr = map(lambda x: x / norm1, argSampFreqsPowered)
156 | self.negSamplingCum = np.cumsum(self.negSamplingDistr)
157 |
158 |
159 |
160 |
161 | def getArgVocSize(self):
162 | return len(self.arg2Id)
163 |
164 |
165 | def getDimensionality(self):
166 | return self.featureLex.getDimensionality()
167 |
168 | def getNegNum(self):
169 | return self.negSamplesNum
170 |
171 | def getTrainSet(self):
172 | return self.trainExs
173 |
174 | def getValidSet(self):
175 | return self.validExs
176 |
177 | def getTestSet(self):
178 | return self.testExs
179 |
180 | def getRelationNum(self):
181 | return self.relationNum
182 |
183 | def getExampleFeatures(self, id):
184 | a = []
185 | for e in self.trainExs.xFeats[id].nonzero()[1]:
186 | feat = self.featureLex.getStrPruned(e)
187 | if (self.featureLex.getStrPruned(e).find('trigger') > -1 or
188 | self.featureLex.getStrPruned(e).find('arg1') > -1 or
189 | self.featureLex.getStrPruned(e).find('arg2') > -1):
190 | a.append(feat)
191 | # else: # only for debugging purposes, should be commented
192 | # a.append(feat)
193 | return a
194 |
195 | def getExampleFeature(self, id, feature):
196 | for e in self.trainExs.xFeats[id].nonzero()[1]:
197 | feat = self.featureLex.getStrPruned(e)
198 | if self.featureLex.getStrPruned(e).find(feature) > -1:
199 | return feat
200 | return None
201 |
202 | def getExampleFeatureValid(self, id, feature):
203 | for e in self.validExs.xFeats[id].nonzero()[1]:
204 | feat = self.featureLex.getStrPruned(e)
205 | if self.featureLex.getStrPruned(e).find(feature) > -1:
206 | return feat
207 | return None
208 |
209 | def getExampleFeatureTest(self, id, feature):
210 | for e in self.testExs.xFeats[id].nonzero()[1]:
211 | feat = self.featureLex.getStrPruned(e)
212 | if self.featureLex.getStrPruned(e).find(feature) > -1:
213 | return feat
214 | return None
215 |
216 | def getNegSamplingCum(self):
217 | return self.negSamplingCum
218 |
219 |
220 |
221 |
--------------------------------------------------------------------------------
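To make the shapes in learning/OieData.py above concrete, here is a tiny stand-alone sketch of the (l, h) sparse feature matrix that _extractExamples builds (toy ids, no dependence on the rest of the repo):

    import scipy.sparse as sp

    l, h = 3, 5                                   # 3 examples, 5 features in the lexicon
    xFeatsDok = sp.dok_matrix((l, h), dtype='float32')
    exampleFeatures = [[0, 2], [1], [2, 4]]       # active feature ids per example
    for i, feats in enumerate(exampleFeatures):
        for feat in feats:
            xFeatsDok[i, feat] = 1
    xFeats = sp.csr_matrix(xFeatsDok, dtype='float32')
    print(xFeats.toarray())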
/learning/OieInduction.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | import argparse
4 | import os
5 |
6 | import numpy as np
7 |
8 | import sys
9 | import time
10 | import cPickle as pickle
11 | import operator
12 | from theano import sparse
13 | import theano
14 | import theano.tensor as T
15 | from learning.OieModel import OieModelFunctions
16 |
17 | from learning.OieData import DataSetManager
18 | from learning.OieData import MatrixDataSet
19 | from processing.OiePreprocessor import FeatureLexicon
20 | from evaluation.OieEvaluation import singleLabelClusterEvaluation
21 | import definitions.settings as settings
22 | from learning.NegativeExampleGenerator import NegativeExampleGenerator
23 | from collections import OrderedDict
24 |
25 | class ReconstructInducer(object):
26 |
27 | def __init__(self, data, goldStandard, rand, epochNum, learningRate, batchSize, embedSize, lambdaL1, lambdaL2,
28 | optimization, modelName, model, fixedSampling, extEmb, extendedReg,
29 | frequentEval, alpha):
30 | self.rand = rand
31 | self.data = data
32 | self.goldStandard = goldStandard
33 | self.optimization = optimization
34 | self.modelName = modelName
35 | self.model = model
36 | self.relationNum = data.getRelationNum()
37 | self.extEmb = extEmb
38 | self.extendedReg = extendedReg
39 | self.frequentEval = frequentEval
40 | self.alpha = alpha
41 |
42 | self.modelID = model + '_' + modelName+'_maxepoch'+str(epochNum)+'_lr'+str(learningRate)\
43 | + '_embedsize' + str(embedSize) + '_l1' + str(lambdaL1) + '_l2' + str(lambdaL2) \
44 | + '_opt' + str(optimization) + '_rel_num' + str(self.relationNum)+ \
45 | '_batch' + str(batchSize) + '_negs' + str(data.negSamplesNum)
46 |
47 | self.modelFunc = OieModelFunctions(rand, data.getDimensionality(), embedSize, self.relationNum,
48 | data.getArgVocSize(), model, self.data, self.extEmb, self.extendedReg,
49 | self.alpha)
50 |
51 | self.embedSize = embedSize
52 | self.epochNum = epochNum
53 | self.learningRate = learningRate
54 | self.batchSize = batchSize
55 | self.lambdaL1 = lambdaL1
56 | self.lambdaL2 = lambdaL2
57 | self.fixedSampling = fixedSampling
58 | self.negativeSampler = NegativeExampleGenerator(rand, data.getNegSamplingCum())
59 | self.accumulator = []
60 |
61 |
62 |
63 | def _makeShared(self, matrixDataset, borrow=True):
64 |
65 | sharedMatrix = MatrixDataSet(
66 | arguments1=theano.shared(matrixDataset.args1, borrow=borrow),
67 | arguments2=theano.shared(matrixDataset.args2, borrow=borrow),
68 | argFeatures=theano.shared(matrixDataset.xFeats, borrow=borrow),
69 | negArgs1=theano.shared(matrixDataset.neg1, borrow=borrow),
70 | negArgs2=theano.shared(matrixDataset.neg2, borrow=borrow)
71 | )
72 | return sharedMatrix
73 |
74 |
75 | def compileFunction(self, learningRate, epochNum, batchSize, lambda1, lambda2):
76 |
77 | trainDataNP = self.data.getTrainSet()
78 | trainData = self._makeShared(trainDataNP)
79 |
80 | validDataNP = self.data.getValidSet()
81 |
82 | testDataNP = self.data.getTestSet()
83 |
84 | if validDataNP is not None:
85 | validData = self._makeShared(validDataNP)
86 |
87 | if testDataNP is not None:
88 | testData = self._makeShared(testDataNP)
89 |
90 | # build the symbolic computation
91 |
92 | batchIdx = T.lscalar() # index to a [mini]batch
93 | xFeats = sparse.csr_matrix(name='x', dtype='float32') # l, h
94 |
95 | args1 = T.ivector() # l
96 | args2 = T.ivector() # l
97 | neg1 = T.imatrix() # n, l
98 | neg2 = T.imatrix() # n, l
99 |
100 | print "Starting to build train err computation (not compiling it yet)"
101 | adjust = float(batchSize) / float(trainDataNP.args1.shape[0])
102 |
103 | cost = self.modelFunc.buildTrainErrComputation(batchSize, self.data.getNegNum(),
104 | xFeats, args1, args2, neg1, neg2) + \
105 | (lambda1 * self.modelFunc.L1 * adjust) + \
106 | (lambda2 * self.modelFunc.L2 * adjust)
107 |
108 | if self.optimization == 1:
109 | from learning.Optimizers import AdaGrad
110 | ada = AdaGrad(self.modelFunc.params)
111 | updates = ada.update(self.learningRate, self.modelFunc.params, cost)
112 | if False:
113 | adaEncoder = AdaGrad(self.modelFunc.relationClassifiers.params)
114 | updatesEncoder = adaEncoder.update(self.learningRate, self.modelFunc.relationClassifiers.params, cost)
115 |
116 | adaDecoder = AdaGrad(self.modelFunc.argProjector.params)
117 | updatesDecoder = adaDecoder.update(self.learningRate, self.modelFunc.argProjector.params, cost)
118 |
119 | elif self.optimization == 0:
120 | from learning.Optimizers import SGD
121 | sgd = SGD()
122 | updates = sgd.update(self.learningRate, self.modelFunc.params, cost)
123 |
124 |
125 |
126 | print "Compiling train function..."
127 |
128 |
129 |
130 | trainModel = theano.function(inputs=[batchIdx, neg1, neg2],
131 | outputs=cost,
132 | updates=updates,
133 | givens={
134 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize],
135 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize],
136 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize]
137 | }
138 | )
139 | if False:
140 | trainEncoder = theano.function(inputs=[batchIdx, neg1, neg2],
141 | outputs=cost,
142 | updates=updatesEncoder,
143 | givens={
144 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize],
145 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize],
146 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize]
147 | }
148 | )
149 | trainDecoder = theano.function(inputs=[batchIdx, neg1, neg2],
150 | outputs=cost,
151 | updates=updatesDecoder,
152 | givens={
153 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize],
154 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize],
155 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize]
156 | }
157 | )
158 |
159 | prediction = self.modelFunc.buildLabelComputation(batchSize, xFeats)
160 |
161 | print "Compiling label function (for training)..."
162 | labelTrain = theano.function(inputs=[batchIdx],
163 | outputs=prediction,
164 | updates=[],
165 | givens={
166 | xFeats: trainData.xFeats[batchIdx * batchSize:(batchIdx + 1) * batchSize]})
167 |
168 | if validDataNP is not None:
169 | print "Compiling label function (for validation)..."
170 | labelValid = theano.function(inputs=[batchIdx],
171 | outputs=prediction,
172 | updates=[],
173 | givens={xFeats: validData.xFeats[batchIdx * batchSize:
174 | (batchIdx + 1) * batchSize]})
175 | if testDataNP is not None:
176 | print "Compiling label function (for test)..."
177 | labelTest = theano.function(inputs=[batchIdx],
178 | outputs=prediction,
179 | updates=[],
180 | givens={xFeats: testData.xFeats[batchIdx * batchSize:
181 | (batchIdx + 1) * batchSize]})
182 |
183 |
184 | print "Done with compiling function."
185 | if validDataNP is not None and testDataNP is not None:
186 |
187 | return trainModel, labelTest, labelValid
188 | else:
189 | if False:
190 | return trainEncoder, trainDecoder, labelTrain
191 | else:
192 | return trainModel, labelTrain
193 |
194 | def learn(self):
195 | trainDataNP = self.data.getTrainSet()
196 | validDataNP = self.data.getValidSet()
197 | testDataNP = self.data.getTestSet()
198 |
199 | print "Starting to compile functions"
200 |
201 |
202 | if validDataNP is not None and testDataNP is not None:
203 | trainModel, labelTest, labelValid = self.compileFunction(self.learningRate, self.epochNum,
204 | self.batchSize, self.lambdaL1, self.lambdaL2)
205 | else:
206 | if False:
207 | trainEncoder, trainDecoder, labelTrain = self.compileFunction(self.learningRate, self.epochNum,
208 | self.batchSize, self.lambdaL1, self.lambdaL2)
209 | else:
210 | trainModel, labelTrain = self.compileFunction(self.learningRate, self.epochNum,
211 | self.batchSize, self.lambdaL1, self.lambdaL2)
212 |
213 |
214 | ###############
215 | # TRAIN MODEL #
216 | ###############
217 |
218 | # compute number of minibatches for training, validation and testing
219 | trainBatchNum = trainDataNP.args1.shape[0] / self.batchSize
220 |
221 | if validDataNP is not None and testDataNP is not None:
222 | validBatchNum = validDataNP.args1.shape[0] / self.batchSize
223 | validEval = singleLabelClusterEvaluation(self.goldStandard['dev'], False)
224 |
225 | testBatchNum = testDataNP.args1.shape[0] / self.batchSize
226 | testEval = singleLabelClusterEvaluation(self.goldStandard['test'], False)
227 | else:
228 | trainEval = singleLabelClusterEvaluation(self.goldStandard['train'], False)
229 |
230 | print str(trainBatchNum * self.batchSize) + " training examples, "
231 | # print trainDataNP.args1.shape[0], self.batchSize, trainBatchNum
232 | print '... training the model'
233 | startTime = time.clock()
234 |
235 | doneLooping = False
236 | epoch = 0
237 |
238 |
239 | while (epoch < self.epochNum) and (not doneLooping):
240 | negativeSamples1 = self.negativeSampler.generate_random_negative_example(trainDataNP.args1,
241 | self.data.getNegNum())
242 | negativeSamples2 = self.negativeSampler.generate_random_negative_example(trainDataNP.args2,
243 | self.data.getNegNum())
244 |
245 | err = 0
246 | epochStartTime = time.clock()
247 |
248 | epoch += 1
249 | print '\nEPOCH ' + str(epoch)
250 | for idx in xrange(trainBatchNum):
251 | if not self.fixedSampling:
252 | neg1 = negativeSamples1[:, idx * self.batchSize: (idx + 1) * self.batchSize]
253 | neg2 = negativeSamples2[:, idx * self.batchSize: (idx + 1) * self.batchSize]
254 | else:
255 | neg1 = trainDataNP.neg1[:, idx * self.batchSize: (idx + 1) * self.batchSize]
256 | neg2 = trainDataNP.neg2[:, idx * self.batchSize: (idx + 1) * self.batchSize]
257 |
258 |
259 | ls = trainModel(idx, neg1, neg2)
260 | err += ls
261 |
262 | # self.modelFunc.argProjector.normalize()
263 | # print('.'),
264 | if self.frequentEval:
265 | if validDataNP is not None and testDataNP is not None:
266 | if idx % 1 == 0:
267 | print(str(idx * self.batchSize)),
268 | print idx, '############################################################'
269 | validCluster = self.getClustersSets(labelValid, validBatchNum)
270 | validEval.createResponse(validCluster)
271 | validEval.printEvaluation('Validation')
272 |
273 | testCluster = self.getClustersSets(labelTest, testBatchNum)
274 | testEval.createResponse(testCluster)
275 | testEval.printEvaluation('Test')
276 | else:
277 | print(str(idx * self.batchSize)),
278 | print idx, '############################################################'
279 | trainClusters = self.getClustersPopulation(labelTrain, trainBatchNum)
280 | print trainClusters
281 | print
282 |
283 |
284 | epochEndTime = time.clock()
285 |
286 | print 'Training error ', str(err)
287 | print "Epoch time = " + str(epochEndTime - epochStartTime)
288 |
289 | if validDataNP is None or testDataNP is None:
290 | print 'Training Set'
291 | # print labelTrain(1)[1]
292 | trainClusters = self.getClustersSets(labelTrain, trainBatchNum)
293 | posteriorsTrain = [labelTrain(i)[1] for i in xrange(trainBatchNum)]
294 | trainPosteriors = [item for sublist in posteriorsTrain for item in sublist]
295 | # for p, probs in enumerate(predictions):
296 | # print p, probs
297 | trainEval.createResponse(trainClusters)
298 | if self.modelName != 'Test':
299 | trainEval.printEvaluation('Training')
300 |
301 | if self.modelName == 'Test':
302 | self.getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize)
303 | else:
304 | getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize)
305 | if not settings.debug:
306 | pickleClustering(trainClusters, self.modelID+'_epoch'+str(epoch))
307 | if epoch % 5 == 0 and epoch > 0:
308 | picklePosteriors(trainPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch))
309 |
310 | if validDataNP is not None and testDataNP is not None:
311 |
312 | validCluster = self.getClustersSets(labelValid, validBatchNum)
313 | posteriorsValid = [labelValid(i)[1] for i in xrange(validBatchNum)]
314 | validPosteriors = [item for sublist in posteriorsValid for item in sublist]
315 | validEval.createResponse(validCluster)
316 | validEval.printEvaluation('Validation')
317 | getClustersWithFrequenciesValid(validCluster, self.data, settings.elems_to_visualize)
318 | if not settings.debug:
319 | pickleClustering(validCluster, self.modelID+'_epoch'+str(epoch)+'_valid')
320 | if epoch % 5 == 0 and epoch > 0:
321 | picklePosteriors(validPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_valid')
322 |
323 | testCluster = self.getClustersSets(labelTest, testBatchNum)
324 | posteriorsTest = [labelTest(i)[1] for i in xrange(testBatchNum)]
325 | testPosteriors = [item for sublist in posteriorsTest for item in sublist]
326 | testEval.createResponse(testCluster)
327 | testEval.printEvaluation('Test')
328 | getClustersWithFrequenciesTest(testCluster, self.data, settings.elems_to_visualize)
329 | if not settings.debug:
330 | pickleClustering(testCluster, self.modelID+'_epoch'+str(epoch)+'_test')
331 | if epoch % 5 == 0 and epoch > 0:
332 | picklePosteriors(testPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_test')
333 |
334 |
335 | endTime = time.clock()
336 | print 'Optimization complete'
337 | print 'The code ran for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (endTime - startTime))
338 | print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
339 | ' ran for %.1fs' % ((endTime - startTime)))
340 |
341 |
342 |
343 |
344 | def getClustersSets(self, labelTrain, trainBatchNum):
345 | clusters = {}
346 | for i in xrange(self.relationNum):
347 | clusters[i] = set()
348 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)]
349 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list
350 | for j in xrange(len(predictions)):
351 | clusters[predictions[j]].add(j)
352 | return clusters
353 |
354 | def getClustersPopulation(self, labelTrain, trainBatchNum):
355 | clusters = {}
356 | for i in xrange(self.relationNum):
357 | clusters[i] = 0
358 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)]
359 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list
360 | for j in xrange(len(predictions)):
361 | clusters[predictions[j]] += 1
362 | return clusters
363 |
364 | def getClusters(self, labelTrain, trainBatchNum, train_dev):
365 | clusters = {}
366 | for i in xrange(self.relationNum):
367 | clusters[i] = []
368 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)]
369 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list
370 | for j in xrange(len(predictions)):
371 | clusters[predictions[j]].append(self.data.getExampleRelation(j, train_dev))
372 | return clusters
373 |
374 |
375 | def getClusteredFreq(self, clusters):
376 | clustFreq = {}
377 | for i in xrange(self.relationNum):
378 | clustFreq[i] = {}
379 | j = 0
380 | for c in clusters:
381 | for feat in clusters[c]:
382 | if feat in clustFreq[j]:
383 | clustFreq[j][feat] += 1
384 | else:
385 | clustFreq[j][feat] = 1
386 | clustFreq[j] = sorted(clustFreq[j].iteritems(), key=operator.itemgetter(1), reverse=True)
387 | j += 1
388 | return clustFreq
389 |
390 | def printFirstK(self, k, clusterFreq):
391 | for c in clusterFreq:
392 | print clusterFreq[c][:k]
393 |
394 |
395 | def getClustersWithFrequencies(self, clusterSets, data, threshold):
396 | for c in clusterSets:
397 | frequency = {}
398 | print c,
399 | for elem in clusterSets[c]:
400 | trig = self.goldStandard['train'][elem][0]
401 | if trig in frequency:
402 | frequency[trig] += 1
403 | else:
404 | frequency[trig] = 1
405 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
406 | if len(sorted_freq) < threshold:
407 | for el in sorted_freq:
408 | print el,
409 | else:
410 | count = 0
411 | for el in sorted_freq:
412 | if count > threshold:
413 | break
414 | else:
415 | print el,
416 | count += 1
417 | print ''
418 |
419 |
420 | def saveModel(model, name):
421 | pklProtocol = 2
422 | pklFile = open(settings.models_path + name, 'wb')
423 | pickle.dump(model, pklFile, protocol=pklProtocol)
424 |
425 | def loadModel(name):
426 | pklFile = open(settings.models_path + name, 'rb')
427 | return pickle.load(pklFile)
428 |
429 | def loadData(args, rng, negativeSamples, relationNum, modelType):
430 |
431 | if not os.path.exists(args.pickled_dataset):
432 | print "Pickled dataset not found"
433 | sys.exit()
434 |
435 | tStart = time.time()
436 | print "Found existing pickled dataset, loading...",
437 |
438 | pklFile = open(args.pickled_dataset, 'rb')
439 |
440 | featureExtrs = pickle.load(pklFile)
441 |
442 | relationLexicon = pickle.load(pklFile)
443 |
444 | data = pickle.load(pklFile)
445 |
446 | goldStandard = pickle.load(pklFile)
447 |
448 | pklFile.close()
449 | tEnd = time.time()
450 | print "Done (" + str(tEnd - tStart) + "s.)"
451 |
452 | trigs = False
453 |
454 |
455 | indexedDataset = DataSetManager(data, relationLexicon, rng, negativeSamples, relationNum, trigs)
456 |
457 | print "Produced indexed dataset"
458 |
459 | return indexedDataset, goldStandard
460 |
461 | def pickleClustering(clustering, clusteringName):
462 | pklProtocol = 2
463 | pklFile = open(settings.clusters_path + clusteringName, 'wb')
464 | pickle.dump(clustering, pklFile, protocol=pklProtocol)
465 |
466 |
467 | def picklePosteriors(posteriors, posteriorsName):
468 | pklProtocol = 2
469 | pklFile = open(settings.clusters_path + posteriorsName, 'wb')
470 | pickle.dump(posteriors, pklFile, protocol=pklProtocol)
471 |
472 | def getClustersWithInfo(clusterSets, data, threshold):
473 | for c in clusterSets:
474 | print c,
475 | if len(clusterSets[c]) < threshold:
476 | for elem in clusterSets[c]:
477 | print elem, data.getExampleFeatures(elem),
478 | else:
479 | count = 0
480 | for elem in clusterSets[c]:
481 | if count > threshold:
482 | break
483 | else:
484 | print elem, data.getExampleFeatures(elem),
485 | count += 1
486 | print ''
487 |
488 |
489 | def getClustersWithFrequencies(clusterSets, data, threshold):
490 | for c in clusterSets:
491 | frequency = {}
492 | print c,
493 | for elem in clusterSets[c]:
494 | trig = data.getExampleFeature(elem, 'trigger')
495 | if trig is not None:
496 | trig = trig.replace('trigger#', '')
497 | if trig in frequency:
498 | frequency[trig] += 1
499 | else:
500 | frequency[trig] = 1
501 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
502 | if len(sorted_freq) < threshold:
503 | for el in sorted_freq:
504 | print el,
505 | else:
506 | count = 0
507 | for el in sorted_freq:
508 | if count > threshold:
509 | break
510 | else:
511 | print el,
512 | count += 1
513 | print ''
514 |
515 |
516 | def getClustersWithFrequenciesValid(clusterSets, data, threshold):
517 | for c in clusterSets:
518 | frequency = {}
519 | print c,
520 | for elem in clusterSets[c]:
521 | trig = data.getExampleFeatureValid(elem, 'trigger')
522 | if trig is not None:
523 | trig = trig.replace('trigger#', '')
524 | if trig in frequency:
525 | frequency[trig] += 1
526 | else:
527 | frequency[trig] = 1
528 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
529 | if len(sorted_freq) < threshold:
530 | for el in sorted_freq:
531 | print el,
532 | else:
533 | count = 0
534 | for el in sorted_freq:
535 | if count > threshold:
536 | break
537 | else:
538 | print el,
539 | count += 1
540 | print ''
541 |
542 |
543 | def getClustersWithFrequenciesTest(clusterSets, data, threshold):
544 | for c in clusterSets:
545 | frequency = {}
546 | print c,
547 | for elem in clusterSets[c]:
548 | trig = data.getExampleFeatureTest(elem, 'trigger')
549 | if trig is not None:
550 | trig = trig.replace('trigger#', '')
551 | if trig in frequency:
552 | frequency[trig] += 1
553 | else:
554 | frequency[trig] = 1
555 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
556 | if len(sorted_freq) < threshold:
557 | for el in sorted_freq:
558 | print el,
559 | else:
560 | count = 0
561 | for el in sorted_freq:
562 | if count > threshold:
563 | break
564 | else:
565 | print el,
566 | count += 1
567 | print ''
568 |
569 | def getClustersWithRelationLabels(clusterSets, data, evaluation, threshold):
570 | for c in clusterSets:
571 | print c,
572 | if len(clusterSets[c]) < threshold:
573 | for elem in clusterSets[c]:
574 | if evaluation.relations[elem][0] != '':
575 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem],
576 | else:
577 | count = 0
578 | for elem in clusterSets[c]:
579 | if count > threshold:
580 | break
581 | else:
582 | if evaluation.relations[elem][0] != '':
583 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem],
584 | count += 1
585 | print ''
586 |
587 |
588 | def getCommandArgs():
589 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model')
590 |
591 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True,
592 | help='the pickled dataset file (produced by OiePreprocessor.py)')
593 |
594 | parser.add_argument('--epochs', metavar='epochs', nargs='?', type=int, default=100,
595 | help='maximum number of epochs')
596 |
597 | parser.add_argument('--learning_rate', metavar='learning_rate', nargs='?', type=float, default=0.1,
598 | help='initial learning rate')
599 |
600 | parser.add_argument('--batch_size', metavar='batch_size', nargs='?', type=int, default=50,
601 | help='size of the minibatches')
602 |
603 | parser.add_argument('--embed_size', metavar='embed_size', nargs='?', type=int, default=30,
604 | help='initial learning rate')
605 |
606 | parser.add_argument('--relations_number', metavar='relations_number', type=int, nargs='?', default=3,
607 | help='number of relations to induce')
608 |
609 | parser.add_argument('--negative_samples_number', metavar='negative_samples_number', nargs='?', type=int, default=5,
610 | help='number of negative samples')
611 |
612 | parser.add_argument('--l1_regularization', metavar='l1_regularization', nargs='?', type=float, default=0.0,
613 | help='lambda value of L1 regularization')
614 |
615 | parser.add_argument('--l2_regularization', metavar='l2_regularization', nargs='?', type=float, default=0.0,
616 | help='lambda value of L2 regularization')
617 |
618 | parser.add_argument('--optimization', metavar='optimization', nargs='?', type=int, default=0,
619 | help='optimization algorithm: 0 SGD, 1 AdaGrad, 2 AdaDelta. Default SGD.')
620 |
621 | parser.add_argument('--model_name', metavar='model_name', nargs='?', required=True, type=str,
622 | help='Name or ID of the model')
623 |
624 | parser.add_argument('--model', metavar='model', nargs='?', type=str, required=True,
625 | help='Model Type choose among A, C, AC.')
626 |
627 | parser.add_argument('--fixed_sampling', metavar='fixed_sampling', nargs='?', default='False',
628 | help='fixed/dynamic sampling switch, default False')
629 |
630 | parser.add_argument('--ext_emb', metavar='ext_emb', nargs='?', default='False',
631 | help='external embeddings, default False')
632 |
633 | parser.add_argument('--extended_reg', metavar='extended_reg', nargs='?', default='False',
634 | help='extended regularization on reconstruction parameters, default False')
635 |
636 | parser.add_argument('--frequent_eval', metavar='frequent_eval', nargs='?', default='False',
637 | help='using frequent evaluation, default False')
638 |
639 | parser.add_argument('--seed', metavar='seed', nargs='?', type=int, default=2,
640 | help='random seed, default 2')
641 |
642 | parser.add_argument('--alpha', metavar='alpha', nargs='?', type=float, default=1.0,
643 | help='alpha coefficient for scaling the entropy term')
644 |
645 |
646 | return parser.parse_args()
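# Example invocation (illustrative only; the dataset path, model name and
# hyperparameter values are placeholders), run e.g. from the repository root:
#
#   python learning/OieInduction.py --pickled_dataset dataset.pkl \
#       --model_name demo --model AC --relations_number 10 \
#       --epochs 20 --optimization 1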
647 |
648 |
649 |
650 |
651 |
652 | if __name__ == '__main__':
653 | print "Relation Learner"
654 |
655 | args = getCommandArgs()
656 | print args
657 | rseed = args.seed
658 | rand = np.random.RandomState(seed=rseed)
659 |
660 |
661 | negativeSamples = args.negative_samples_number
662 | numberRelations = args.relations_number
663 | indexedData, goldStandard = loadData(args, rand, negativeSamples, numberRelations, args.model)
664 |
665 |
666 | maxEpochs = args.epochs
667 | learningRate = args.learning_rate
668 | batchSize = args.batch_size
669 | embedSize = args.embed_size
670 | lambdaL1 = args.l1_regularization
671 | lambdaL2 = args.l2_regularization
672 | optimization = args.optimization
673 | modelName = args.model_name
674 | model = args.model
675 | fixedSampling = eval(args.fixed_sampling)
676 | extEmb = eval(args.ext_emb)
677 | extendedReg = eval(args.extended_reg)
678 | frequentEval = eval(args.frequent_eval)
679 | alpha = args.alpha
680 | inducer = ReconstructInducer(indexedData, goldStandard, rand, maxEpochs, learningRate,
681 | batchSize, embedSize, lambdaL1, lambdaL2, optimization, modelName,
682 | model, fixedSampling, extEmb, extendedReg,
683 | frequentEval, alpha)
684 |
685 |
686 |
687 | inducer.learn()
688 |
689 | saveModel(inducer, inducer.modelName)
690 |
691 |
--------------------------------------------------------------------------------
/learning/OieModel.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 |
4 | import theano.tensor as T
5 | import theano
6 | from models.encoders.RelationClassifier import IndependentRelationClassifiers
7 |
8 | class OieModelFunctions(object):
9 |
10 | def __init__(self, rng, featureDim, embedSize, relationNum, argVocSize, model,
11 | data, extEmb, extendedReg, alpha):
12 | self.rng = rng
13 |
14 | self.h = featureDim
15 | self.k = embedSize
16 | self.r = relationNum
17 |
18 | self.a = argVocSize
19 | self.model = model
20 | self.relationClassifiers = IndependentRelationClassifiers(rng, featureDim, relationNum)
21 | self.params = self.relationClassifiers.params
22 | self.alpha = alpha
23 | print 'Feature space size =', self.h
24 | print 'Argument vocabulary size =', argVocSize
25 |
26 | self.L1 = T.sum(abs(self.relationClassifiers.W))
27 |
28 | self.L2 = T.sum(T.sqr(self.relationClassifiers.W)) # + T.sum(T.sqr(self.relationClassifiers.Wb))
29 |
30 | if self.model == 'A':
31 | print 'Bilinear Model'
32 | from models.decoders.Bilinear import Bilinear
33 |
34 | self.argProjector = Bilinear(rng, embedSize, relationNum, self.a, data, extEmb)
35 | self.params += self.argProjector.params
36 | if extendedReg:
37 | self.L1 += T.sum(abs(self.argProjector.C))
38 | self.L2 += T.sum(T.sqr(self.argProjector.C))
39 |
40 | elif self.model == 'AC':
41 | print 'Bilinear + Selectional Preferences Model'
42 | from models.decoders.BilinearPlusSP import BilinearPlusSP
43 |
44 | self.argProjector = BilinearPlusSP(rng, embedSize, relationNum, self.a, data, extEmb)
45 | self.params += self.argProjector.params
46 | if extendedReg:
47 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2)) + T.sum(abs(self.argProjector.C))
48 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2)) + T.sum(T.sqr(self.argProjector.C))
49 |
50 |
51 | elif self.model == 'C':
52 | print 'Selectional Preferences'
53 | from models.decoders.SelectionalPreferences import SelectionalPreferences
54 |
55 | self.argProjector = SelectionalPreferences(rng, embedSize, relationNum, self.a, data, extEmb)
56 | self.params += self.argProjector.params
57 | if extendedReg:
58 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2))
59 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2))
60 |
61 |
62 |
63 | def buildTrainErrComputation(self, batchSize, negNum, xFeats, args1, args2, neg1, neg2):
64 | l = batchSize
65 | n = negNum
66 |
67 | # print xFeats
68 | print "Relation classifiers..."
69 | # relationProbs holds the probabilities of relation assignments arranged in a matrix [l, r]
70 | relationProbs = self.relationClassifiers.compRelationProbsFunc(xFeats=xFeats)
71 | print "Arg projection..."
72 |
73 | entropy = self.alpha * -T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l]
74 |
75 | if self.model == 'A':
76 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy)
77 |
78 |
79 | elif self.model == 'AC':
80 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy)
81 |
82 |
83 | elif self.model == 'C':
84 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy)
85 |
86 |
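# Note: in all three variants getScores() returns, per example, the log-sigmoid
# scores of the observed argument pair (added once with each argument's bias),
# the alpha-scaled entropy of p(r|x) (also appended twice), and the log-sigmoid
# of the negated scores of the sampled negative arguments; the cost below is
# the negative mean over all of these terms.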
87 | resError = -T.mean(allScores)
88 | print "Done with building the graph..."
89 | # resError = theano.printing.Print("resError ")(resError)
90 | return resError
91 |
92 |
93 |
94 |
95 | def buildLabelComputation(self, batchSize, xFeats):
96 | # xFeats [ l * e, h ] matrix
97 | return self.relationClassifiers.labelFunct(batchSize, xFeats)
98 |
99 |
100 | def buildRelationProbComputation(self, batchSize, xFeats):
101 | return self.relationClassifiers.compRelationProbsFunc(xFeats)
102 |
103 |
--------------------------------------------------------------------------------
/learning/Optimizers.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | import numpy as np
4 | import theano
5 | import theano.tensor as T
6 |
7 |
8 |
9 | class AdaGrad(object):
10 | def __init__(self, params):
11 | self.accumulator = []
12 | for para_i in params:
13 | eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
14 | self.accumulator.append(theano.shared(eps_p, borrow=True))
15 |
16 | def update(self, learningRate, params, cost):
17 | print 'AdaGrad takes the floor'
18 | grads = T.grad(cost, params)
19 | updates = []
20 | for param_i, grad_i, acc_i in zip(params, grads, self.accumulator):
21 | acc = acc_i + T.sqr(grad_i)
22 | updates.append((param_i, param_i - learningRate * grad_i / (T.sqrt(acc)+1e-6)))
23 | updates.append((acc_i, acc))
24 | return updates
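# Standard AdaGrad rule: squared gradients are accumulated per parameter
# (acc <- acc + g^2) and each step is scaled by 1 / (sqrt(acc) + 1e-6),
# so parameters with a history of large gradients take smaller steps.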
25 |
26 |
27 | class SGD(object):
28 | def update(self, learningRate, params, cost):
29 | print 'SGD takes the floor'
30 | grads = T.grad(cost, params)
31 | updates = []
32 | for param_i, grad_i in zip(params, grads):
33 | updates.append((param_i, param_i - learningRate * grad_i))
34 | return updates
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/learning/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'admin'
2 |
--------------------------------------------------------------------------------
/learning/models/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
--------------------------------------------------------------------------------
/learning/models/decoders/Bilinear.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
3 | import math
4 | import theano
5 | from definitions import settings
6 | import numpy as np
7 | import theano.tensor as T
8 | from collections import OrderedDict
9 | import cPickle as pickle
10 |
11 | class Bilinear(object):
12 |
13 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb):
14 |
15 | self.k = embedSize
16 | self.r = relationNum
17 | self.a = argVocSize
18 |
19 | a = self.a
20 | k = self.k
21 | r = self.r
22 |
23 |
24 |
25 | # one KxK argument-argument interaction matrix per relation
26 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX)
27 |
28 |
29 | self.C = theano.shared(value=CNP, name='C')
30 | # self.C = theano.printing.Print("C = ")(self.C)
31 | # argument embeddings
32 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)
33 |
34 | if ex_emb:
35 | import gensim
36 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
37 | for idArg in xrange(self.a):
38 | arg = data.id2Arg[idArg].lower().split(' ')
39 | new = np.zeros(k, dtype=theano.config.floatX)
40 | size = 0
41 | for ar in arg:
42 | if ar in external_embeddings:
43 | new += external_embeddings[ar]
44 | size += 1
45 | if size > 0:
46 | ANP[idArg] = new/size
47 |
48 | self.A = theano.shared(value=ANP, name='A') # (a1, k)
49 |
50 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable
51 | name='Ab', borrow=True)
52 |
53 | self.updates = OrderedDict({self.A: self.A / T.sqrt(T.sum(T.sqr(self.A), axis=0))})
54 | self.normalize = theano.function([], [], updates=self.updates)
55 |
56 | # self.params = [self.C, self.A]
57 | self.params = [self.C, self.A, self.Ab]
58 |
59 |
60 |
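# Scoring sketch: getScores() below mixes the per-relation matrices with the
# relation posterior, weightedC = sum_r p(r|x) * C[:, :, r], and scores an
# argument pair as a1^T weightedC a2 plus the argument biases Ab; the sampled
# negative arguments are scored with the same weighted matrix.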
61 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC):
62 |
63 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k]
64 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k]
65 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l]
66 | # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l]
67 | return Asecond
68 |
69 | def negFactorization1(self, batchSize, negEmbA, argsEmbB, wC):
70 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k]
71 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n]
72 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n]
73 | return Asecond
74 |
75 | def negFactorization2(self, batchSize, argsEmbA, negEmbB, wC):
76 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k]
77 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k]
78 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n]
79 | return Asecond
80 |
81 |
82 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
83 | argembed1 = self.A[args1]
84 | argembed2 = self.A[args2]
85 |
86 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])
87 | one = self.factorization(batchSize=l,
88 | argsEmbA=argembed1,
89 | argsEmbB=argembed2,
90 | wC=weightedC) # [l,n]
91 |
92 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
93 |
94 | logScoresP = T.log(T.nnet.sigmoid(u))
95 |
96 | allScores = logScoresP
97 | allScores = T.concatenate([allScores, entropy, entropy])
98 |
99 |
100 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
101 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
102 | negOne = self.negFactorization1(batchSize=l,
103 | negEmbA=negembed1,
104 | argsEmbB=argembed2,
105 | wC=weightedC)
106 |
107 | negTwo = self.negFactorization2(batchSize=l,
108 | argsEmbA=argembed1,
109 | negEmbB=negembed2,
110 | wC=weightedC)
111 |
112 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
113 | negTwo + self.Ab[neg2].dimshuffle(1, 0)])
114 | logScores = T.log(T.nnet.sigmoid(-g))
115 | allScores = T.concatenate([allScores, logScores.flatten()])
116 | return allScores
117 |
118 |
119 |
--------------------------------------------------------------------------------
/learning/models/decoders/BilinearPlusSP.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
3 | import math
4 | import theano
5 | from definitions import settings
6 | import numpy as np
7 | import theano.tensor as T
8 | import cPickle as pickle
9 |
10 | class BilinearPlusSP(object):
11 |
12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb):
13 |
14 | self.k = embedSize
15 | self.r = relationNum
16 | self.a = argVocSize
17 |
18 | a = self.a
19 | k = self.k
20 | r = self.r
21 |
22 |
23 | # one KxK argument-argument interaction matrix per relation
24 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX)
25 | # @UndefinedVariable
26 | self.C = theano.shared(value=CNP, name='C')
27 | # self.C = theano.printing.Print("C = ")(self.C)
28 |
29 | # Selectional Preferences
30 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
31 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
32 | self.C1 = theano.shared(value=Ca1NP, name='C1')
33 | self.C2 = theano.shared(value=Ca2NP, name='C2')
34 | # argument embeddings
35 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable
36 |
37 | if ex_emb:
38 | import gensim
39 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
40 |
41 | for idArg in xrange(self.a):
42 | arg = data.id2Arg[idArg].lower().split(' ')
43 | new = np.zeros(k, dtype=theano.config.floatX)
44 | size = 0
45 | for ar in arg:
46 | if ar in external_embeddings:
47 | new += external_embeddings[ar]
48 | size += 1
49 | if size > 0:
50 | ANP[idArg] = new/size
51 |
52 | self.A = theano.shared(value=ANP, name='A') # (a1, k)
53 |
54 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable
55 | name='Ab', borrow=True)
56 |
57 | self.params = [self.C, self.A, self.Ab, self.C1, self.C2]
58 |
59 |
60 |
61 |
62 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC, wC1, wC2):
63 | # l = batchSize
64 | # k = self.k # embed size
65 | # r = self.r # relation number
66 |
67 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # + self.Cb # [l, k, k] * [l, k] = [l, k]
68 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l]
69 | spFirst = T.batched_dot(wC1, argsEmbA)
70 | spSecond = T.batched_dot(wC2, argsEmbB)
71 | return Asecond + spFirst + spSecond
72 |
73 |
74 |
75 | def negLeftFactorization(self, batchSize, negEmbA, argsEmbB, wC, wC1, wC2):
76 | # l = batchSize
77 | # k = self.k # embed size
78 | # r = self.r # relation number
79 |
80 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n]
81 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n]
82 |
83 | spAfirst = T.batched_tensordot(wC1, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n]
84 |
85 | spSecond = T.batched_dot(wC2, argsEmbB)
86 |
87 | return Asecond + spAfirst + spSecond.reshape((batchSize, 1))
88 |
89 | def negRightFactorization(self, batchSize, argsEmbA, negEmbB, wC, wC1, wC2):
90 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k]
91 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n]
92 | spFirst = T.batched_dot(wC1, argsEmbA)
93 | spAsecond = T.batched_tensordot(wC2, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n]
94 | return Asecond + spAsecond + spFirst.reshape((batchSize, 1))
95 |
96 |
97 |
98 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
99 | weightedC1 = T.dot(relationProbs, self.C1.dimshuffle(1, 0))
100 | weightedC2 = T.dot(relationProbs, self.C2.dimshuffle(1, 0))
101 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])
102 |
103 |
104 | argembed1 = self.A[args1]
105 | argembed2 = self.A[args2]
106 |
107 | one = self.factorization(batchSize=l,
108 | argsEmbA=argembed1,
109 | argsEmbB=argembed2,
110 | wC=weightedC,
111 | wC1=weightedC1,
112 | wC2=weightedC2)
113 |
114 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
115 | logScoresP = T.log(T.nnet.sigmoid(u))
116 |
117 | allScores = logScoresP
118 | allScores = T.concatenate([allScores, entropy, entropy])
119 |
120 |
121 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
122 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
123 | negOne = self.negLeftFactorization(batchSize=l,
124 | negEmbA=negembed1,
125 | argsEmbB=argembed2,
126 | wC=weightedC,
127 | wC1=weightedC1,
128 | wC2=weightedC2)
129 |
130 | negTwo = self.negRightFactorization(batchSize=l,
131 | argsEmbA=argembed1,
132 | negEmbB=negembed2,
133 | wC=weightedC,
134 | wC1=weightedC1,
135 | wC2=weightedC2)
136 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
137 | negTwo + self.Ab[neg2].dimshuffle(1, 0)])
138 | logScores = T.log(T.nnet.sigmoid(-g))
139 | allScores = T.concatenate([allScores, logScores.flatten()])
140 |
141 | return allScores
142 |
143 |
--------------------------------------------------------------------------------
/learning/models/decoders/SelectionalPreferences.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
3 | import math
4 | import theano
5 | from definitions import settings
6 | import numpy as np
7 | import theano.tensor as T
8 | import cPickle as pickle
9 |
10 | class SelectionalPreferences(object):
11 |
12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb):
13 |
14 | self.k = embedSize
15 | self.r = relationNum
16 | self.a = argVocSize
17 |
18 | a = self.a
19 | k = self.k
20 | r = self.r
21 |
22 |
23 | # Selectional Preferences
24 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
25 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
26 | self.C1 = theano.shared(value=Ca1NP, name='C1')
27 | self.C2 = theano.shared(value=Ca2NP, name='C2')
28 |
29 | # argument embeddings
30 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable
31 |
32 | if ex_emb:
33 | import gensim
34 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
35 |
36 | for idArg in xrange(self.a):
37 | arg = data.id2Arg[idArg].lower().split(' ')
38 | new = np.zeros(k, dtype=theano.config.floatX)
39 | size = 0
40 | for ar in arg:
41 | if ar in external_embeddings:
42 | new += external_embeddings[ar]
43 | size += 1
44 | if size > 0:
45 | ANP[idArg] = new/size
46 |
47 | self.A = theano.shared(value=ANP, name='A') # (a1, k)
48 |
49 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable
50 | name='Ab', borrow=True)
51 |
52 | self.params = [self.A, self.C1, self.C2, self.Ab]
53 |
54 |
55 |
56 |
57 |
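# Scoring sketch: getScores() below mixes the per-relation preference vectors
# with the relation posterior, wC1 = p(r|x) dot C1^T and wC2 = p(r|x) dot C2^T,
# and scores an argument pair as wC1 . a1 + wC2 . a2 plus the argument biases Ab.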
58 | def leftMostFactorization(self, batchSize, args, wC1):
59 | l = batchSize
60 | k = self.k # embed size
61 | r = self.r # relation number
62 | argEmbeds = self.A[args.flatten()]
63 | Afirst = T.batched_dot(wC1, argEmbeds)
64 | return Afirst
65 |
66 | def rightMostFactorization(self, batchSize, args, wC2):
67 | l = batchSize
68 | k = self.k # embed size
69 | r = self.r # relation number
70 | argEmbeds2 = self.A[args.flatten()]
71 | Asecond = T.batched_dot(wC2, argEmbeds2)
72 | return Asecond
73 |
74 |
75 |
76 | def negLeftMostFactorization(self, batchSize, negEmbed, wC1):
77 | # l = batchSize
78 | # k = self.k # embed size
79 | # r = self.r # relation number
80 | Afirst = T.batched_tensordot(wC1, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n]
81 | return Afirst
82 |
83 | def negRightMostFactorization(self, batchSize, negEmbed, wC2):
84 | # l = batchSize
85 | # k = self.k # embed size
86 | # r = self.r # relation number
87 | Asecond = T.batched_tensordot(wC2, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n]
88 | return Asecond
89 |
90 |
91 |
92 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
93 | weightedC1 = T.dot(relationProbs, self.C1.dimshuffle(1, 0))
94 | weightedC2 = T.dot(relationProbs, self.C2.dimshuffle(1, 0))
95 |
96 | left1 = self.leftMostFactorization(batchSize=l, args=args1, wC1=weightedC1)
97 | right1 = self.rightMostFactorization(batchSize=l, args=args2, wC2=weightedC2)
98 | one = left1 + right1
99 |
100 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])
101 | logScoresP = T.log(T.nnet.sigmoid(u))
102 | allScores = logScoresP
103 | allScores = T.concatenate([allScores, entropy, entropy])
104 |
105 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
106 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
107 | negative1 = self.negLeftMostFactorization(batchSize=l,
108 | negEmbed=negembed1,
109 | wC1=weightedC1)
110 | negative2 = self.negRightMostFactorization(batchSize=l,
111 | negEmbed=negembed2,
112 | wC2=weightedC2)
113 |
114 | negOne = negative1.dimshuffle(1, 0) + right1
115 | negTwo = negative2.dimshuffle(1, 0) + left1
116 | g = T.concatenate([negOne + self.Ab[neg1], negTwo + self.Ab[neg2]])
117 | logScores = T.log(T.nnet.sigmoid(-g))
118 | allScores = T.concatenate([allScores, logScores.flatten()])
119 |
120 | return allScores
121 |
122 |
123 |
--------------------------------------------------------------------------------
/learning/models/decoders/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
--------------------------------------------------------------------------------
/learning/models/encoders/RelationClassifier.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | from theano import sparse
4 | import theano
5 | from definitions import settings
6 | import numpy as np
7 | import theano.tensor as T
8 |
9 | class IndependentRelationClassifiers(object):
10 | # rng is a random generator,
11 | # featureDim is the dimension of the feature space
12 | # relationNum is the number of possible relations (classes of relations)
13 |
14 | def __init__(self, rng, featureDim, relationNum):
15 |
16 | # dimensionality of feature space
17 | self.h = featureDim
18 | # relation num
19 | self.r = relationNum
20 | # print str(np.sqrt(6. / (self.h + self.r)))
21 | # w_bound = np.sqrt(self.h * self.r)
22 |
23 | # print str(1.0 / w_bound)
24 | print 'low bound =', settings.low, 'high bound =', settings.high
25 | self.W = theano.shared(np.asarray(rng.uniform(
26 | low=settings.low,
27 | high=settings.high,
28 | size=(self.h, self.r)), dtype=theano.config.floatX), # @UndefinedVariable
29 | name='W', borrow=True)
30 | # npW = np.zeros((3,3),dtype=theano.config.floatX)
31 | # npW[0,0] = 1.e+40
32 | # npW[1,1] = 1.e+40
33 | # npW[2,2] = 1.e+40
34 |
35 | # @UndefinedVariable
36 | # self.W = theano.shared(value=np.asarray(npW))
37 |
38 | self.Wb = theano.shared(value=np.zeros(self.r,
39 | dtype=theano.config.floatX), # @UndefinedVariable
40 | name='Wb', borrow=True)
41 |
42 | self.params = [self.W, self.Wb]
43 | # self.params = [self.Wb]
44 | # self.params = []
45 |
46 | def compRelationProbsFunc(self, xFeats):
47 | # xFeats [l, h] matrix
48 | # xFeats = theano.printing.Print("xFeats")(xFeats)
49 | # self.Wb = theano.printing.Print("Wb ") (self.Wb)
50 | # self.W = theano.printing.Print("W ") (self.W)
51 | # scores of each role by a classifier
52 | relationScores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r]
53 | #relationScores = theano.printing.Print("relationScores=")(relationScores)
54 |
55 | # convert it to probabilities
56 | relationProbs = T.nnet.softmax(relationScores)
57 | #relationProbs = theano.printing.Print("relationProbs = ")(relationProbs)
58 |
59 |
60 | return relationProbs # [l, r]
61 |
62 |
63 | def labelFunct(self, batchSize, xFeats):
64 | # xFeats [l, h]
65 | # l = batchSize
66 | # self.W = theano.printing.Print("W ") (self.W)
67 | # self.Wb = theano.printing.Print("Wb ") (self.Wb)
68 | scores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r]
69 | relationProbs = T.nnet.softmax(scores)
70 | # scores = theano.printing.Print("scores ") (scores)
71 | labels = T.argmax(scores, axis=1) # [l, r] => [l]
72 | # labels = theano.printing.Print("labels ") (labels)
73 | return (labels, relationProbs)
--------------------------------------------------------------------------------
/learning/models/encoders/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'enfry'
2 |
--------------------------------------------------------------------------------
/processing/OiePreprocessor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'diego'
2 |
3 | import argparse
4 | import os
5 | import sys
6 | import time
7 | from definitions import OieFeatures
8 | from definitions import OieExample
9 | print sys.path
10 | import cPickle as pickle
11 |
12 |
13 | class FeatureLexicon:
14 |
15 | def __init__(self):
16 | self.nextId = 0
17 | self.id2Str = {}
18 | self.str2Id = {}
19 | self.id2freq = {}
20 | self.nextIdPruned = 0
21 | self.id2StrPruned = {}
22 | self.str2IdPruned = {}
23 |
24 | def getOrAdd(self, s):
25 | if s not in self.str2Id:
26 | self.id2Str[self.nextId] = s
27 | self.str2Id[s] = self.nextId
28 | self.id2freq[self.nextId] = 1
29 | self.nextId += 1
30 | else:
31 | self.id2freq[self.str2Id[s]] += 1
32 | return self.str2Id[s]
33 |
34 |
35 | def getOrAddPruned(self, s):
36 | if s not in self.str2IdPruned:
37 | self.id2StrPruned[self.nextIdPruned] = s
38 | self.str2IdPruned[s] = self.nextIdPruned
39 | self.nextIdPruned += 1
40 | return self.str2IdPruned[s]
41 |
42 | def getId(self, s):
43 | if s not in self.str2Id:
44 | return None
45 | return self.str2Id[s]
46 |
47 | def getStr(self, idx):
48 | if idx not in self.id2Str:
49 | return None
50 | else:
51 | return self.id2Str[idx]
52 |
53 | def getStrPruned(self, idx):
54 | if idx not in self.id2StrPruned:
55 | return None
56 | else:
57 | return self.id2StrPruned[idx]
58 |
59 | def getFreq(self, idx):
60 | if idx not in self.id2freq:
61 | return None
62 | return self.id2freq[idx]
63 |
64 |
65 | def getDimensionality(self):
66 | return self.nextIdPruned
67 | # return self.nextId
68 |
69 |
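# The lexicon maintains two feature spaces: getOrAdd() grows the full space and
# tracks feature frequencies, while getOrAddPruned() grows the pruned space of
# features that passed the frequency threshold; getDimensionality() reports the
# pruned size, which is presumably what the model uses as its feature dimension.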
70 | def getFeatures(lexicon, featureExs, info, arg1=None, arg2=None, expand=False):
71 | feats = []
72 | for f in featureExs:
73 | res = f(info, arg1, arg2)
74 | if res is not None:
75 | if type(res) == list:
76 | for el in res:
77 | featStrId = f.__name__ + "#" + el
78 | if expand:
79 | feats.append(lexicon.getOrAdd(featStrId))
80 | else:
81 | featId = lexicon.getId(featStrId)
82 | if featId is not None:
83 | feats.append(featId)
84 | else:
85 | featStrId = f.__name__ + "#" + res
86 | if expand:
87 | feats.append(lexicon.getOrAdd(featStrId))
88 | else:
89 | featId = lexicon.getId(featStrId)
90 | if featId is not None:
91 | feats.append(featId)
92 |
93 | return feats
94 |
95 | def getFeaturesThreshold(lexicon, featureExs, info, arg1=None, arg2=None, expand=False, threshold=0):
96 | feats = []
97 | for f in featureExs:
98 | res = f(info, arg1, arg2)
99 | if res is not None:
100 | if type(res) == list:
101 | for el in res:
102 | featStrId = f.__name__ + "#" + el
103 | if expand:
104 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold:
105 | feats.append(lexicon.getOrAddPruned(featStrId))
106 | else:
107 | featId = lexicon.getId(featStrId)
108 | if featId is not None:
109 | if lexicon.id2freq[featId] > threshold:
110 | feats.append(lexicon.getOrAddPruned(featStrId))
111 | else:
112 | featStrId = f.__name__ + "#" + res
113 | if expand:
114 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold:
115 | feats.append(lexicon.getOrAddPruned(featStrId))
116 | else:
117 | featId = lexicon.getId(featStrId)
118 | if featId is not None:
119 | if lexicon.id2freq[featId] > threshold:
120 | feats.append(lexicon.getOrAddPruned(featStrId))
121 |
122 | return feats
123 |
124 | def prepareArgParser():
125 | parser = argparse.ArgumentParser(description='Processes an Oie file and adds its representations '
126 | 'to a Python pickled file.')
127 |
128 | parser.add_argument('input_file', metavar='input-file', help='input file in the Yao format')
129 |
130 | parser.add_argument('pickled_dataset', metavar='pickled-dataset', help='pickle file to be used to store output '
131 | '(created if it does not exist)')
132 |
133 | parser.add_argument('--batch-name', default="train", nargs="?", help='name used as a reference in the pickled file')
134 |
135 | parser.add_argument('--features', default="basic", nargs="?", help='features (basic vs ?)')
136 | parser.add_argument('--threshold', default="0", nargs="?", type=int, help='minimum feature frequency')
137 |
138 |
139 |
140 | parser.add_argument('--test-mode', action='store_true',
141 | help='used for test files '
142 | '(the feature space is not expanded to include previously unseen features)')
143 |
144 |
145 | return parser
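# Example invocation (illustrative only; file names are placeholders):
#
#   python processing/OiePreprocessor.py --batch-name train --threshold 5 \
#       candidates.txt dataset.pkl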
146 |
147 | def loadExamples(fileName):
148 | count = 0
149 | with open(fileName, 'r') as fp:
150 | relationExamples = []
151 | for line in fp:
152 | line = line.strip()
153 | if len(line) == 0 or len(line.split()) == 0:
154 | raise IOError
155 |
156 | else:
157 | fields = line.split('\t')
158 | assert len(fields) == 9, "a problem with the file format (# fields is wrong) len is " \
159 | + str(len(fields)) + " instead of 9"
160 | # this will be 10
161 | relationExamples.append([str(count)] + fields)
162 | count += 1
163 |
164 | return relationExamples
165 |
166 | # if __name__ == '__main__':
167 | # examples = loadExamples('/Users/admin/isti/amsterdam/data/candidate-100.txt')
168 | # print "Using basic features"
169 | # argFeatureExtrs = OieFeatures.getBasicFeatures()
170 | # ex = examples[0]
171 | # print ex
172 | # features = argFeatureExtrs
173 | #
174 | # s = []
175 | # for f in features:
176 | # res = f([ex[1], ex[4], ex[5], ex[7]], ex[2], ex[3])
177 | # if res is not None:
178 | # s.append(f.__name__ + "#" + res)
179 | #
180 | # print s, 'dd'
181 |
182 | if __name__ == '__main__':
183 |
184 | tStart = time.time()
185 |
186 | print "Parameters: " + str(sys.argv[1::])
187 | parser = prepareArgParser()
188 | args = parser.parse_args()
189 |
190 | print "Parsed params: " + str(args)
191 |
192 | print "Loading sentences...",
193 | relationExamples = loadExamples(args.input_file)
194 |
195 | tEnd = time.time()
196 | print "Done (" + str(tEnd - tStart) + "s.)"
197 |
198 | # predFeatureExtrs = definitions.SrlFeatures.getJohanssonPredDisFeatures()
199 | #
200 | featureExtrs = None
201 | if args.features == "basic":
202 | print "Using rich features"
203 | featureExtrs = OieFeatures.getBasicCleanFeatures()
204 |
205 | relationLexicon = FeatureLexicon()
206 |
207 | dataset = {}
208 | goldstandard = {}
209 |
210 | if os.path.exists(args.pickled_dataset):
211 | tStart = time.time()
212 | print "Found existing pickled dataset, loading...",
213 |
214 | pklFile = open(args.pickled_dataset, 'rb')
215 |
216 | featureExtrs = pickle.load(pklFile)
217 | relationLexicon = pickle.load(pklFile)
218 | dataset = pickle.load(pklFile)
219 | goldstandard = pickle.load(pklFile)
220 |
221 | pklFile.close()
222 | tEnd = time.time()
223 | print "Done (" + str(tEnd - tStart) + "s.)"
224 |
225 | tStart = time.time()
226 | print "Processing relation Examples",
227 |
228 | examples = []
229 | relationLabels = {}
230 | if args.batch_name in dataset:
231 | examples = dataset[args.batch_name]
232 | relationLabels = goldstandard[args.batch_name]
233 | else:
234 | dataset[args.batch_name] = examples
235 | goldstandard[args.batch_name] = relationLabels
236 |
237 | reIdx = 0
238 | c = 0
239 | for re in relationExamples:
240 | getFeatures(relationLexicon, featureExtrs, [re[1], re[4], re[5], re[7], re[8], re[6]],
241 | re[2], re[3], True)
242 | for re in relationExamples:
243 | reIdx += 1
244 | if reIdx % 1000 == 0:
245 | print ".",
246 | if reIdx % 10000 == 0:
247 | print reIdx,
248 |
249 |
250 | relationE = ''
251 | if re[9] != '':
252 | relationE = re[9]
253 | # print re[9]
254 | # if re[10] != '':
255 | # if relationE != '':
256 | # relationE += ' '+re[10]
257 | # else:
258 | # relationE = re[10]
259 |
260 | ex = OieExample.OieExample(re[2], re[3], getFeaturesThreshold(relationLexicon,
261 | featureExtrs,
262 | [re[1], re[4], re[5], re[7], re[8], re[6]],
263 | # [re[1], re[4], re[5], re[7]],
264 | re[2], re[3], True, threshold=args.threshold), re[5]
265 | ,relation=relationE
266 | )
267 | relationLabels[c] = re[-1].strip().split(' ')
268 | c += 1
269 |
270 | examples.append(ex)
271 |
272 |
273 | tEnd = time.time()
274 | print "Done (" + str(tEnd - tStart) + "s.), processed " + str(len(examples))
275 |
276 | tStart = time.time()
277 | print "Pickling the dataset...",
278 |
279 | pklFile = open(args.pickled_dataset, 'wb')
280 | #pklFile = gzip.GzipFile(args.pickled_dataset, 'wb')
281 |
282 | pklProtocol = 2
283 | pickle.dump(featureExtrs, pklFile, protocol=pklProtocol)
284 | pickle.dump(relationLexicon, pklFile, protocol=pklProtocol)
285 | pickle.dump(dataset, pklFile, protocol=pklProtocol)
286 | pickle.dump(goldstandard, pklFile, protocol=pklProtocol)
287 | pklFile.close()
288 | tEnd = time.time()
289 | print "Done (" + str(tEnd - tStart) + "s.)"
--------------------------------------------------------------------------------
/processing/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'admin'
2 |
--------------------------------------------------------------------------------