├── LICENSE.txt ├── README.md ├── data-sample.txt ├── definitions ├── OieExample.py ├── OieFeatures.py ├── __init__.py └── settings.py ├── evaluation ├── OieEvaluation.py └── __init__.py ├── learning ├── NegativeExampleGenerator.py ├── OieData.py ├── OieInduction.py ├── OieModel.py ├── Optimizers.py ├── __init__.py └── models │ ├── __init__.py │ ├── decoders │ ├── Bilinear.py │ ├── BilinearPlusSP.py │ ├── SelectionalPreferences.py │ └── __init__.py │ └── encoders │ ├── RelationClassifier.py │ └── __init__.py └── processing ├── OiePreprocessor.py └── __init__.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # relation-autoencoder 2 | This is the code used in the paper [Discrete-State Variational Autoencoders for Joint Discovery and Factorization of Relations](https://transacl.org/ojs/index.php/tacl/article/viewFile/761/190) by Diego Marcheggiani and Ivan Titov. 3 | 4 | If you use this code, please cite us. 
5 | 6 | Dependencies 7 | ----------- 8 | - [theano](http://deeplearning.net/software/theano/) 9 | - [numpy](http://www.numpy.org/) 10 | - [scipy](https://www.scipy.org/) 11 | - [nltk](http://www.nltk.org/) 12 | 13 | 14 | Data Processing 15 | -------------- 16 | To run the model, the first thing to do is create a dataset. 17 | You need a file like data-sample.txt. 18 | The file must be tab-separated, with the following fields: 19 | 20 | lexicalized dependency path between the arguments (entities) of the relation 21 | first entity 22 | second entity 23 | entity types of the first and second entity 24 | trigger word 25 | id of the sentence 26 | raw sentence 27 | POS tags of the entire sentence 28 | relation between the two entities, if any (used only for evaluation) 29 | 30 | 31 | In order to create the dataset, run the OiePreprocessor.py script once for each dataset partition: train, dev, and test. 32 |

33 | python processing/OiePreprocessor.py --batch-name train data-sample.txt sample.pk 
34 | python processing/OiePreprocessor.py --batch-name dev data-sample.txt sample.pk
35 | python processing/OiePreprocessor.py --batch-name test data-sample.txt sample.pk
36 | 
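The resulting pickle stores four objects in a fixed order, which is the order in which `loadData` (in learning/OieInduction.py and evaluation/OieEvaluation.py) reads them back. Below is a minimal inspection sketch, assuming Python 2 and the repository root on your PYTHONPATH (the pickle contains a FeatureLexicon instance from processing/OiePreprocessor.py, so that module must be importable when unpickling):

```python
# Minimal inspection sketch (assumes Python 2, the repository root on PYTHONPATH,
# and that sample.pk was produced by the three commands above).
import pickle

with open('sample.pk', 'rb') as pklFile:
    featureExtrs = pickle.load(pklFile)      # feature extractors used when the dataset was built
    relationLexicon = pickle.load(pklFile)   # FeatureLexicon: feature string <-> id mappings
    data = pickle.load(pklFile)              # dict of OieExample lists keyed by batch name
    goldStandard = pickle.load(pklFile)      # gold relation labels per partition (evaluation only)

print data.keys()                            # e.g. the batch names added above
print len(data['train']), 'training examples'
```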
37 | 38 | Now your dataset, with all the indexed features, is in sample.pk. 39 | 40 | Training Models 41 | ------------ 42 | To train the model, run the OieInduction.py script with all the required arguments: 43 |

44 | python learning/OieInduction.py --pickled_dataset sample.pk --model_name discrete-autoencoder --model AC --optimization 1 --epochs 10 --batch_size 100 --relations_number 10 --negative_samples_number 5 --l2_regularization 0.1 --alpha 0.1 --seed 2 --embed_size 10 --learning_rate 0.1
45 | 
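During training, each epoch prints the training error and, against the gold relations, the B³ scores (precision, recall, F1, F0.5) of the induced clustering, computed on the training set or on the dev and test sets when those batches are present in the pickle. Note that the output locations in definitions/settings.py are empty strings by default: the trained inducer is pickled to settings.models_path under the --model_name you pass, and per-epoch cluster assignments (and, periodically, the posteriors) are written to settings.clusters_path only when settings.debug is False. A minimal example configuration, where the two directory values are placeholders to adapt to your machine:

```python
# definitions/settings.py -- example values (the two directories are placeholders)
models_path = '/path/to/models/'      # saveModel() pickles the trained inducer here
clusters_path = '/path/to/clusters/'  # cluster assignments and posteriors are written here
debug = False                         # with the default (True), clusterings and posteriors are not saved
```

A saved model can later be reloaded with loadModel from learning/OieInduction.py, using the same --model_name string.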
46 | 47 | 48 | For any questions, please drop me a mail at marcheggiani [at] uva [dot] nl. 49 | -------------------------------------------------------------------------------- /definitions/OieExample.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | class OieExample (object): 4 | 5 | def __init__(self, arg1, arg2, features, trigger, relation=''): 6 | self.features = features 7 | self.arg1 = arg1 8 | self.arg2 = arg2 9 | self.relation = relation 10 | self.trigger = trigger 11 | 12 | def setFeatures(self, features): 13 | self.features = features -------------------------------------------------------------------------------- /definitions/OieFeatures.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import nltk 4 | import re, string 5 | import settings 6 | import pickle 7 | 8 | parsing = 0 9 | entities = 1 10 | trig = 2 11 | sentence = 3 12 | pos = 4 13 | docPath = 5 14 | # ======= Relation features ======= 15 | stopwords_list = nltk.corpus.stopwords.words('english') 16 | _digits = re.compile('\d') 17 | def bow(info, arg1, arg2): 18 | return info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 19 | 20 | def bow_clean(info, arg1, arg2): 21 | bow = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 22 | result = [] 23 | tmp = [] 24 | for word in bow: 25 | for pun in string.punctuation: 26 | word = word.strip(pun) 27 | if word != '': 28 | tmp.append(word.lower()) 29 | for word in tmp: 30 | if word not in stopwords_list and not _digits.search(word) and not word[0].isupper(): 31 | result.append(word) 32 | return result 33 | 34 | def before_arg1(info, arg1, arg2): 35 | before = info[sentence][:info[sentence].find(arg1)] 36 | beforeSplit = before.lower().strip().split(' ') 37 | beforeSplit = [word for word in beforeSplit if word not in string.punctuation] 38 | # print beforeSplit 39 | if len(beforeSplit) > 1: 40 | return [beforeSplit[-2], beforeSplit[-1]] 41 | elif len(beforeSplit) == 1: 42 | if beforeSplit[0] != '': 43 | return [beforeSplit[-1]] 44 | else: 45 | return [] 46 | else: 47 | return [] 48 | 49 | 50 | def after_arg2(info, arg1, arg2): 51 | after = info[sentence][info[sentence].rfind(arg2)+len(arg2):] 52 | afterSplit = after.lower().strip().split(' ') 53 | afterSplit = [word for word in afterSplit if word not in string.punctuation] 54 | if len(afterSplit) > 1: 55 | return [a for a in afterSplit[0: 2]] 56 | elif len(afterSplit) == 1: 57 | if afterSplit[0] != '': 58 | return [afterSplit[0]] 59 | else: 60 | return [] 61 | else: 62 | return [] 63 | 64 | def bigrams(info, arg1, arg2): 65 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 66 | tmp = [] 67 | for word in between: 68 | for pun in string.punctuation: 69 | word = word.strip(pun) 70 | if word != '': 71 | tmp.append(word.lower()) 72 | return [x[0]+'_'+x[1] for x in zip(tmp, tmp[1:])] 73 | 74 | def trigrams(info, arg1, arg2): 75 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 76 | tmp = [] 77 | for word in between: 78 | for pun in string.punctuation: 79 | word = word.strip(pun) 80 | if word != '': 81 | tmp.append(word.lower()) 82 | return [x[0]+'_'+x[1]+'_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])] 83 | 84 | def skiptrigrams(info, arg1, arg2): 85 | between = 
info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 86 | tmp = [] 87 | for word in between: 88 | for pun in string.punctuation: 89 | word = word.strip(pun) 90 | if word != '': 91 | tmp.append(word.lower()) 92 | return [x[0]+'_X_'+x[2] for x in zip(tmp, tmp[1:], tmp[2:])] 93 | 94 | def skipfourgrams(info, arg1, arg2): 95 | between = info[sentence][info[sentence].find(arg1):info[sentence].rfind(arg2)+len(arg2)].split() 96 | tmp = [] 97 | for word in between: 98 | for pun in string.punctuation: 99 | word = word.strip(pun) 100 | if word != '': 101 | tmp.append(word.lower()) 102 | return [x[0]+'_X_'+x[2] + '_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] +\ 103 | [x[0]+'_'+x[1]+'_X_' + x[3] for x in zip(tmp, tmp[1:], tmp[2:], tmp[3:])] 104 | 105 | def trigger(info, arg1, arg2): 106 | return info[trig].replace('TRIGGER:', '') 107 | 108 | def entityTypes(info, arg1, arg2): 109 | return info[entities] 110 | 111 | def entity1Type(info, arg1, arg2): 112 | return info[entities].split('-')[0] 113 | 114 | def entity2Type(info, arg1, arg2): 115 | return info[entities].split('-')[1] 116 | 117 | def arg1(info, arg1, arg2): 118 | return arg1 119 | 120 | def arg1_lower(info, arg1, arg2): 121 | return arg1.lower() 122 | 123 | def arg1unigrams(info, arg1, arg2): 124 | return arg1.lower().split() 125 | 126 | def arg2(info, arg1, arg2): 127 | return arg2 128 | 129 | def arg2_lower(info, arg1, arg2): 130 | return arg2.lower() 131 | 132 | def arg2unigrams(info, arg1, arg2): 133 | return arg2.lower().split() 134 | 135 | def lexicalPattern(info, arg1, arg2): 136 | # return info[parsing] 137 | p = info[parsing].replace('->', ' ').replace('<-', ' ').split() 138 | result = [] 139 | for num, x in enumerate(p): 140 | if num % 2 != 0: 141 | result.append(x) 142 | return '_'.join(result) 143 | 144 | def dependencyParsing(info, arg1, arg2): 145 | return info[parsing] 146 | 147 | 148 | def rightDep(info, arg1, arg2): 149 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split() 150 | return ''.join(p[:3]) 151 | 152 | def leftDep(info, arg1, arg2): 153 | p = info[parsing].replace('->', ' -> ').replace('<-', ' <- ').split() 154 | return ''.join(p[-3:]) 155 | 156 | def posPatternPath(info, arg1, arg2): 157 | words = info[sentence].split() 158 | postags = info[pos].split() 159 | assert len(postags) == len(words), 'error' 160 | a = [] 161 | for w in xrange(len(words)): 162 | a.append((words[w], postags[w])) 163 | # a = info[4].split() 164 | if a: 165 | # print arg1, words 166 | # print [a.index(item) for item in a if item[0] == arg1.split()[-1]],'aaaaaaa' 167 | beginList = [a.index(item) for item in a if item[0] == arg1.split()[-1]] 168 | # print beginList 169 | endList = [a.index(item) for item in a if item[0] == arg2.split()[0]] 170 | # print endList 171 | if len(beginList) > 0 and len(endList) > 0: 172 | # posPattern = [item[1] for item in a if beginList[0] > a.index(item) > endList[0]] 173 | posPattern = [] 174 | for num, item in enumerate(a): 175 | if beginList[0] < num < endList[0]: 176 | posPattern.append(item[1]) 177 | # print posPattern 178 | return '_'.join(posPattern) 179 | else: 180 | return '' 181 | else: 182 | return '' 183 | 184 | 185 | def getBasicCleanFeatures(): 186 | features = [trigger, entityTypes, arg1_lower, arg2_lower, bow_clean, entity1Type, entity2Type, lexicalPattern, 187 | posPatternPath] 188 | return features 189 | 190 | -------------------------------------------------------------------------------- /definitions/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /definitions/settings.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | models_path = '' 4 | clusters_path = '' 5 | 6 | lda_pairs_path = '' 7 | relations2IdDictionary = '' 8 | 9 | external_embeddings_path = '' 10 | debug = True 11 | 12 | elems_to_visualize = 5 13 | 14 | low = -1.e-3 15 | high = 1.e-3 16 | 17 | -------------------------------------------------------------------------------- /evaluation/OieEvaluation.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import pickle 4 | import math 5 | import argparse 6 | import os 7 | import sys 8 | from processing.OiePreprocessor import FeatureLexicon 9 | 10 | class singleLabelClusterEvaluation: 11 | def __init__(self, referencePath, file, validationPath=''): 12 | self.relations = {} 13 | if file: 14 | if validationPath != '': 15 | self.referenceSets, self.assessableElemSet = self.createValidationReferenceSets(referencePath, 16 | validationPath) 17 | else: 18 | self.referenceSets, self.assessableElemSet = self.createReferenceSets(referencePath) 19 | 20 | else: 21 | self.referenceSets, self.assessableElemSet = self.createReferenceSetsFromData(referencePath) 22 | # print self.referenceSets 23 | # print self.assessableElemSet 24 | 25 | def createResponse(self, response): 26 | self.numberOfElements, self.responseSets = self.createResponseSets(response) 27 | # print self.responseSets 28 | 29 | 30 | 31 | def b3precision(self, response_a, reference_a): 32 | # print response_a.intersection(self.assessableElemSet), 'in precision' 33 | return len(response_a.intersection(reference_a)) / float(len(response_a.intersection(self.assessableElemSet))) 34 | 35 | def b3recall(self, response_a, reference_a): 36 | return len(response_a.intersection(reference_a)) / float(len(reference_a)) 37 | 38 | 39 | 40 | def b3TotalElementPrecision(self): 41 | totalPrecision = 0.0 42 | for c in self.responseSets: 43 | for r in self.responseSets[c]: 44 | if r in self.assessableElemSet: 45 | # print r 46 | totalPrecision += self.b3precision(self.responseSets[c], 47 | self.findCluster(r, self.referenceSets)) 48 | 49 | return totalPrecision / float(len(self.assessableElemSet)) 50 | 51 | def b3TotalElementRecall(self): 52 | totalRecall = 0.0 53 | for c in self.responseSets: 54 | for r in self.responseSets[c]: 55 | if r in self.assessableElemSet: 56 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets)) 57 | 58 | return totalRecall / float(len(self.assessableElemSet)) 59 | 60 | 61 | def b3TotalClusterPrecision(self): 62 | totalPrecision = 0.0 63 | for c in self.responseSets: 64 | for r in self.responseSets[c]: 65 | if r in self.assessableElemSet: 66 | totalPrecision += self.b3precision(self.responseSets[c], 67 | self.findCluster(r, self.referenceSets)) / \ 68 | float(len(self.responseSets)*len(self.responseSets[c])) 69 | return totalPrecision 70 | 71 | def b3TotalClusterRecall(self): 72 | totalRecall = 0.0 73 | for c in self.responseSets: 74 | for r in self.responseSets[c]: 75 | if r in self.assessableElemSet: 76 | totalRecall += self.b3recall(self.responseSets[c], self.findCluster(r, self.referenceSets)) / \ 77 | float(len(self.responseSets)*len(self.responseSets[c])) 78 | 79 | return totalRecall 80 | 81 | 82 | def 
createResponseSets(self, response): 83 | responseSets = {} 84 | numElem = 0 85 | for c in response: 86 | if len(response[c]) > 0: 87 | numElem += len(response[c]) 88 | responseSets[c] = set(response[c]) 89 | 90 | return numElem, responseSets 91 | 92 | 93 | 94 | def createReferenceSets(self, referencePath): 95 | with open(referencePath, 'r') as f: 96 | relations = {} 97 | c = 0 98 | for line in f: 99 | lineSplit = line.split('\t') 100 | relations[c] = lineSplit[-1].strip().split(' ') 101 | c += 1 102 | self.relations = relations 103 | referenceSets = {} 104 | assessableElems = set() 105 | for rel in relations: 106 | if relations[rel][0] != '': 107 | assessableElems.add(rel) 108 | if relations[rel][0] in referenceSets: 109 | referenceSets[relations[rel][0]].add(rel) 110 | else: 111 | referenceSets[relations[rel][0]] = set([rel]) 112 | return referenceSets, assessableElems 113 | 114 | def createValidationReferenceSets(self, referencePath, validationPath): 115 | # referencePath is usually the entire training set 116 | with open(referencePath, 'r') as f, open(validationPath, 'r') as f1: 117 | validationSet = {} 118 | for line in f1: 119 | if line not in validationSet: 120 | validationSet[line] = 1 121 | 122 | relations = {} 123 | c = 0 124 | for line in f: 125 | if line in validationSet: 126 | lineSplit = line.split('\t') 127 | relations[c] = lineSplit[-1].strip().split(' ') 128 | else: 129 | relations[c] = [''] 130 | c += 1 131 | # self.relationsValid = relations 132 | referenceSets = {} 133 | assessableElems = set() 134 | for rel in relations: 135 | if relations[rel][0] != '': 136 | assessableElems.add(rel) 137 | if relations[rel][0] in referenceSets: 138 | referenceSets[relations[rel][0]].add(rel) 139 | else: 140 | referenceSets[relations[rel][0]] = set([rel]) 141 | return referenceSets, assessableElems 142 | 143 | 144 | def createReferenceSetsFromData(self, relations): 145 | self.relations = relations 146 | referenceSets = {} 147 | assessableElems = set() 148 | for rel in relations: 149 | if relations[rel][0] != '': 150 | # print 'category', category 151 | assessableElems.add(rel) 152 | if relations[rel][0] in referenceSets: 153 | referenceSets[relations[rel][0]].add(rel) 154 | else: 155 | referenceSets[relations[rel][0]] = set([rel]) 156 | return referenceSets, assessableElems 157 | 158 | def findCluster(self, a, setsDictionary): 159 | foundClusters = [] 160 | for c in setsDictionary: 161 | if a in setsDictionary[c]: 162 | return setsDictionary[c] 163 | # foundClusters.append(setsDictionary[c]) 164 | # return foundClusters 165 | 166 | def muc3Recall(self): 167 | numerator = 0.0 168 | denominator = 0.0 169 | for c in self.referenceSets: 170 | numerator += len(self.referenceSets[c]) - self.overlap(self.referenceSets[c], self.responseSets) 171 | denominator += len(self.referenceSets[c]) - 1 172 | if denominator == 0.0: 173 | return 0.0 174 | else: 175 | return numerator / denominator 176 | 177 | def muc3Precision(self): 178 | numerator = 0.0 179 | denominator = 0.0 180 | for c in self.responseSets: 181 | if len(self.responseSets[c]) > 0: 182 | # print self.lenAssessableResponseCat(self.responseSets[c]), self.overlap(self.responseSets[c], self.referenceSets) 183 | numerator += self.lenAssessableResponseCat(self.responseSets[c]) - self.overlap(self.responseSets[c], self.referenceSets) 184 | lenRespo = self.lenAssessableResponseCat(self.responseSets[c]) 185 | if lenRespo != 0: 186 | denominator += self.lenAssessableResponseCat(self.responseSets[c]) - 1 187 | if denominator == 0.0: 188 | return 
0.0 189 | else: 190 | return numerator / denominator 191 | 192 | def overlap(self, a, setsDictionary): 193 | numberIntersections = 0 194 | for c in setsDictionary: 195 | if len(a.intersection(setsDictionary[c])) > 0: 196 | numberIntersections += 1 197 | return numberIntersections 198 | 199 | 200 | def lenAssessableResponseCat(self, responesSet_c): 201 | length = 0 202 | for r in responesSet_c: 203 | if r in self.assessableElemSet: 204 | length += 1 205 | return length 206 | 207 | def printEvaluation(self, validOrTrain): 208 | 209 | 210 | recB3 = self.b3TotalElementRecall() 211 | precB3 = self.b3TotalElementPrecision() 212 | betasquare = math.pow(0.5, 2) 213 | if recB3 == 0.0 and precB3 == 0.0: 214 | F1B3 = 0.0 215 | F05B3 = 0.0 216 | else: 217 | betasquare = math.pow(0.5, 2) 218 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3) 219 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3) 220 | 221 | print validOrTrain, ' Elementwise B3 F1 =', F1B3, 'F0.5 =', F05B3, 'B3 recall =', recB3, 'B3 precision =', precB3 222 | 223 | 224 | 225 | 226 | def getF05(self): 227 | recB3 = self.b3TotalElementRecall() 228 | precB3 = self.b3TotalElementPrecision() 229 | betasquare = math.pow(0.5, 2) 230 | if recB3 == 0.0 and precB3 == 0.0: 231 | F05B3 = 0.0 232 | else: 233 | F05B3 = ((1+betasquare) * recB3 * precB3)/((betasquare*precB3)+recB3) 234 | return F05B3 235 | 236 | def getF1(self): 237 | recB3 = self.b3TotalElementRecall() 238 | precB3 = self.b3TotalElementPrecision() 239 | 240 | if recB3 == 0.0 and precB3 == 0.0: 241 | F1B3 = 0.0 242 | else: 243 | F1B3 = (2 * recB3 * precB3) / (recB3 + precB3) 244 | return F1B3 245 | 246 | def loadData(pickled_dataset): 247 | 248 | if not os.path.exists(pickled_dataset): 249 | print "Pickled dataset not found" 250 | sys.exit() 251 | 252 | pklFile = open(pickled_dataset, 'rb') 253 | 254 | featureExtrs = pickle.load(pklFile) 255 | 256 | relationLexicon = pickle.load(pklFile) 257 | 258 | data = pickle.load(pklFile) 259 | 260 | goldStandard = pickle.load(pklFile) 261 | 262 | pklFile.close() 263 | 264 | 265 | return goldStandard 266 | 267 | def getCommandArgs(): 268 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model') 269 | 270 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True, 271 | help='the pickled dataset file (produced by OiePreprocessor.py)') 272 | parser.add_argument('--pickled_results', metavar='pickled_results', nargs='?', required=True, 273 | help='the pickled results file (produced by OiePreprocessor.py)') 274 | 275 | 276 | return parser.parse_args() 277 | 278 | 279 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /learning/NegativeExampleGenerator.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import numpy as np 5 | 6 | 7 | class NegativeExampleGenerator(object): 8 | def __init__(self, rand, negSamplingCum): 9 | self._rand = rand 10 | self._negSamplingCum = negSamplingCum 11 | # self._neg2SamplingCum = neg2SamplingCum 12 | # self._negSamplingDistrPower = negSamplingDistrPower 13 | # self._compute_unigram_distribution() 14 | 15 | def _univariate_distr_sample(self, sampleSize=1): 16 | return 
[self._negSamplingCum.searchsorted(self._rand.uniform(0, self._negSamplingCum[-1])) 17 | for i in xrange(0, sampleSize)] 18 | 19 | def generate_random_negative_example(self, positiveArgs, negativeExampleNum): 20 | l = positiveArgs.shape[0] # number of positive instances 21 | n = negativeExampleNum # number of negative examples generated per instance 22 | 23 | negativeArgs = np.zeros((n, l), dtype=np.int32) 24 | for instance_idx in xrange(l): 25 | samples = self._univariate_distr_sample(n) 26 | for negNum_idx in xrange(n): 27 | negativeArgs[negNum_idx, instance_idx] = samples[negNum_idx] 28 | return negativeArgs 29 | -------------------------------------------------------------------------------- /learning/OieData.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import math as m 5 | import numpy as np 6 | import scipy.sparse as sp 7 | import theano 8 | from definitions import settings 9 | import cPickle as pickle 10 | 11 | class MatrixDataSet: 12 | # matrix formatted dataset 13 | def __init__(self, arguments1, arguments2, argFeatures, negArgs1, negArgs2): 14 | self.args1 = arguments1 # (l) 15 | self.args2 = arguments2 # (l) 16 | self.xFeats = argFeatures # (l, h) 17 | self.neg1 = negArgs1 # (n, l) 18 | self.neg2 = negArgs2 # (n, l) 19 | 20 | 21 | class MatrixDataSetNoEncoding: 22 | # matrix formatted dataset 23 | def __init__(self, arguments1, arguments2, realProbs): 24 | self.args1 = arguments1 # (l) 25 | self.args2 = arguments2 # (l) 26 | self.realProbs = realProbs # (l, r) 27 | 28 | 29 | 30 | 31 | 32 | class DataSetManager: 33 | def __init__(self, oieDataset, featureLex, rng, negSamplesNum, relationNum, negSamplingDistrPower=0.75): 34 | 35 | self.negSamplesNum = negSamplesNum # the number of negative samples considered 36 | 37 | self.negSamplingDistrPower = negSamplingDistrPower # the sampling distribution for negative sampling 38 | 39 | self.rng = rng 40 | 41 | self.relationNum = relationNum 42 | 43 | # id2Str, str2Id 44 | self.featureLex = featureLex 45 | 46 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr 47 | self._extractArgsMappings(oieDataset) 48 | 49 | # each examples csr_matrix[exampleNum x getDimensionality()], labels are numpy.array 50 | 51 | 52 | # self.validExs = self._extractExamples(oieDataset['dev']) 53 | 54 | self.trainExs = self._extractExamples(oieDataset['train']) 55 | if 'dev' in oieDataset: 56 | self.validExs = self._extractExamples(oieDataset['dev']) 57 | else: 58 | self.validExs = None 59 | 60 | if 'test' in oieDataset: 61 | self.testExs = self._extractExamples(oieDataset["test"]) 62 | else: 63 | self.testExs = None 64 | 65 | def _sample(self, cutoffs): 66 | idx = cutoffs.searchsorted(self.rng.uniform(0, cutoffs[-1])) 67 | return idx 68 | 69 | 70 | def _sample1(self, distr): 71 | 72 | # check numpy, it should have some efficient ways to sample from multinomials 73 | val = self.rng.uniform() 74 | pos = 0 75 | for idx in xrange(len(distr)): 76 | pos += distr[idx] 77 | if pos > val: 78 | return idx 79 | return len(distr) - 1 80 | 81 | 82 | def _extractExamples(self, oieExamples): 83 | 84 | l = len(oieExamples) 85 | n = self.negSamplesNum 86 | 87 | args1 = np.zeros(l, dtype=np.int32) # 88 | args2 = np.zeros(l, dtype=np.int32) # 89 | 90 | 91 | neg1 = np.zeros((n, l), dtype=np.int32) # 92 | neg2 = np.zeros((n, l), dtype=np.int32) # 93 | 94 | 95 | # print self.featureLex.getDimensionality() 96 | xFeatsDok = sp.dok_matrix((l, self.featureLex.getDimensionality()), 
dtype=theano.config.floatX) 97 | # @UndefinedVariable float32 98 | 99 | for i, oieEx in enumerate(oieExamples): 100 | args1[i] = self.arg2Id[oieEx.arg1] 101 | args2[i] = self.arg2Id[oieEx.arg2] 102 | 103 | for feat in oieEx.features: 104 | xFeatsDok[i, feat] = 1 105 | 106 | # should do it differently (sample random indexes during training), see below 107 | 108 | for k in xrange(n): 109 | neg1[k, i] = self._sample(self.negSamplingCum) 110 | 111 | for k in xrange(n): 112 | neg2[k, i] = self._sample(self.negSamplingCum) 113 | 114 | 115 | 116 | xFeats = sp.csr_matrix(xFeatsDok, dtype="float32") 117 | 118 | return MatrixDataSet(args1, args2, xFeats, neg1, neg2) 119 | 120 | def _indexElements(self, elements): 121 | 122 | idx = 0 123 | id2Elem = {} 124 | elem2Id = {} 125 | for x in elements: 126 | id2Elem[idx] = x 127 | elem2Id[x] = idx 128 | idx += 1 129 | return id2Elem, elem2Id 130 | 131 | def _extractArgsMappings(self, oieDataset): 132 | 133 | # sets id2Arg1, id2Arg2, arg12Id, arg22Id, neg1SamplingDistr, neg2SamplingDistr 134 | argFreqs = {} 135 | for key in oieDataset: 136 | for oieEx in oieDataset[key]: # here it iterates over train, test, dev. 137 | if oieEx.arg1 not in argFreqs: 138 | argFreqs[oieEx.arg1] = 1 139 | else: 140 | argFreqs[oieEx.arg1] += 1 141 | 142 | if oieEx.arg2 not in argFreqs: 143 | argFreqs[oieEx.arg2] = 1 144 | else: 145 | argFreqs[oieEx.arg2] += 1 146 | 147 | 148 | 149 | self.id2Arg, self.arg2Id = self._indexElements(argFreqs) 150 | 151 | 152 | argSampFreqs = [float(argFreqs[self.id2Arg[val]]) for val in xrange(len(self.id2Arg))] 153 | argSampFreqsPowered = map(lambda x: m.pow(x, self.negSamplingDistrPower), argSampFreqs) 154 | norm1 = reduce(lambda x, y: x + y, argSampFreqsPowered) 155 | self.negSamplingDistr = map(lambda x: x / norm1, argSampFreqsPowered) 156 | self.negSamplingCum = np.cumsum(self.negSamplingDistr) 157 | 158 | 159 | 160 | 161 | def getArgVocSize(self): 162 | return len(self.arg2Id) 163 | 164 | 165 | def getDimensionality(self): 166 | return self.featureLex.getDimensionality() 167 | 168 | def getNegNum(self): 169 | return self.negSamplesNum 170 | 171 | def getTrainSet(self): 172 | return self.trainExs 173 | 174 | def getValidSet(self): 175 | return self.validExs 176 | 177 | def getTestSet(self): 178 | return self.testExs 179 | 180 | def getRelationNum(self): 181 | return self.relationNum 182 | 183 | def getExampleFeatures(self, id): 184 | a = [] 185 | for e in self.trainExs.xFeats[id].nonzero()[1]: 186 | feat = self.featureLex.getStrPruned(e) 187 | if (self.featureLex.getStrPruned(e).find('trigger') > -1 or 188 | self.featureLex.getStrPruned(e).find('arg1') > -1 or 189 | self.featureLex.getStrPruned(e).find('arg2') > -1): 190 | a.append(feat) 191 | # else: # only for debugging purposes, should be commented 192 | # a.append(feat) 193 | return a 194 | 195 | def getExampleFeature(self, id, feature): 196 | for e in self.trainExs.xFeats[id].nonzero()[1]: 197 | feat = self.featureLex.getStrPruned(e) 198 | if self.featureLex.getStrPruned(e).find(feature) > -1: 199 | return feat 200 | return None 201 | 202 | def getExampleFeatureValid(self, id, feature): 203 | for e in self.validExs.xFeats[id].nonzero()[1]: 204 | feat = self.featureLex.getStrPruned(e) 205 | if self.featureLex.getStrPruned(e).find(feature) > -1: 206 | return feat 207 | return None 208 | 209 | def getExampleFeatureTest(self, id, feature): 210 | for e in self.testExs.xFeats[id].nonzero()[1]: 211 | feat = self.featureLex.getStrPruned(e) 212 | if self.featureLex.getStrPruned(e).find(feature) > 
-1: 213 | return feat 214 | return None 215 | 216 | def getNegSamplingCum(self): 217 | return self.negSamplingCum 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /learning/OieInduction.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import argparse 4 | import os 5 | 6 | import numpy as np 7 | 8 | import sys 9 | import time 10 | import cPickle as pickle 11 | import operator 12 | from theano import sparse 13 | import theano 14 | import theano.tensor as T 15 | from learning.OieModel import OieModelFunctions 16 | 17 | from learning.OieData import DataSetManager 18 | from learning.OieData import MatrixDataSet 19 | from processing.OiePreprocessor import FeatureLexicon 20 | from evaluation.OieEvaluation import singleLabelClusterEvaluation 21 | import definitions.settings as settings 22 | from learning.NegativeExampleGenerator import NegativeExampleGenerator 23 | from collections import OrderedDict 24 | 25 | class ReconstructInducer(object): 26 | 27 | def __init__(self, data, goldStandard, rand, epochNum, learningRate, batchSize, embedSize, lambdaL1, lambdaL2, 28 | optimization, modelName, model, fixedSampling, extEmb, extendedReg, 29 | frequentEval, alpha): 30 | self.rand = rand 31 | self.data = data 32 | self.goldStandard = goldStandard 33 | self.optimization = optimization 34 | self.modelName = modelName 35 | self.model = model 36 | self.relationNum = data.getRelationNum() 37 | self.extEmb = extEmb 38 | self.extendedReg = extendedReg 39 | self.frequentEval = frequentEval 40 | self.alpha = alpha 41 | 42 | self.modelID = model + '_' + modelName+'_maxepoch'+str(epochNum)+'_lr'+str(learningRate)\ 43 | + '_embedsize' + str(embedSize) + '_l1' + str(lambdaL1) + '_l2' + str(lambdaL2) \ 44 | + '_opt' + str(optimization) + '_rel_num' + str(self.relationNum)+ \ 45 | '_batch' + str(batchSize) + '_negs' + str(data.negSamplesNum) 46 | 47 | self.modelFunc = OieModelFunctions(rand, data.getDimensionality(), embedSize, self.relationNum, 48 | data.getArgVocSize(), model, self.data, self.extEmb, self.extendedReg, 49 | self.alpha) 50 | 51 | self.embedSize = embedSize 52 | self.epochNum = epochNum 53 | self.learningRate = learningRate 54 | self.batchSize = batchSize 55 | self.lambdaL1 = lambdaL1 56 | self.lambdaL2 = lambdaL2 57 | self.fixedSampling = fixedSampling 58 | self.negativeSampler = NegativeExampleGenerator(rand, data.getNegSamplingCum()) 59 | self.accumulator = [] 60 | 61 | 62 | 63 | def _makeShared(self, matrixDataset, borrow=True): 64 | 65 | sharedMatrix = MatrixDataSet( 66 | arguments1=theano.shared(matrixDataset.args1, borrow=borrow), 67 | arguments2=theano.shared(matrixDataset.args2, borrow=borrow), 68 | argFeatures=theano.shared(matrixDataset.xFeats, borrow=borrow), 69 | negArgs1=theano.shared(matrixDataset.neg1, borrow=borrow), 70 | negArgs2=theano.shared(matrixDataset.neg2, borrow=borrow) 71 | ) 72 | return sharedMatrix 73 | 74 | 75 | def compileFunction(self, learningRate, epochNum, batchSize, lambda1, lambda2): 76 | 77 | trainDataNP = self.data.getTrainSet() 78 | trainData = self._makeShared(trainDataNP) 79 | 80 | validDataNP = self.data.getValidSet() 81 | 82 | testDataNP = self.data.getTestSet() 83 | 84 | if validDataNP is not None: 85 | validData = self._makeShared(validDataNP) 86 | 87 | if testDataNP is not None: 88 | testData = self._makeShared(testDataNP) 89 | 90 | # build the symbolic computation 91 | 92 | batchIdx = T.lscalar() # index to a [mini]batch 93 | xFeats 
= sparse.csr_matrix(name='x', dtype='float32') # l, h 94 | 95 | args1 = T.ivector() # l 96 | args2 = T.ivector() # l 97 | neg1 = T.imatrix() # n, l 98 | neg2 = T.imatrix() # n, l 99 | 100 | print "Starting to build train err computation (not compiling it yet)" 101 | adjust = float(batchSize) / float(trainDataNP.args1.shape[0]) 102 | 103 | cost = self.modelFunc.buildTrainErrComputation(batchSize, self.data.getNegNum(), 104 | xFeats, args1, args2, neg1, neg2) + \ 105 | (lambda1 * self.modelFunc.L1 * adjust) + \ 106 | (lambda2 * self.modelFunc.L2 * adjust) 107 | 108 | if self.optimization == 1: 109 | from learning.Optimizers import AdaGrad 110 | ada = AdaGrad(self.modelFunc.params) 111 | updates = ada.update(self.learningRate, self.modelFunc.params, cost) 112 | if False: 113 | adaEncoder = AdaGrad(self.modelFunc.relationClassifiers.params) 114 | updatesEncoder = adaEncoder.update(self.learningRate, self.modelFunc.relationClassifiers.params, cost) 115 | 116 | adaDecoder = AdaGrad(self.modelFunc.argProjector.params) 117 | updatesDecoder = adaDecoder.update(self.learningRate, self.modelFunc.argProjector.params, cost) 118 | 119 | elif self.optimization == 0: 120 | from learning.Optimizers import SGD 121 | sgd = SGD() 122 | updates = sgd.update(self.learningRate, self.modelFunc.params, cost) 123 | 124 | 125 | 126 | print "Compiling train function..." 127 | 128 | 129 | 130 | trainModel = theano.function(inputs=[batchIdx, neg1, neg2], 131 | outputs=cost, 132 | updates=updates, 133 | givens={ 134 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 135 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 136 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 137 | } 138 | ) 139 | if False: 140 | trainEncoder = theano.function(inputs=[batchIdx, neg1, neg2], 141 | outputs=cost, 142 | updates=updatesEncoder, 143 | givens={ 144 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 145 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 146 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 147 | } 148 | ) 149 | trainDecoder = theano.function(inputs=[batchIdx, neg1, neg2], 150 | outputs=cost, 151 | updates=updatesDecoder, 152 | givens={ 153 | xFeats: trainData.xFeats[batchIdx * batchSize: (batchIdx + 1) * batchSize], 154 | args1: trainData.args1[batchIdx * batchSize: (batchIdx + 1) * batchSize], 155 | args2: trainData.args2[batchIdx * batchSize: (batchIdx + 1) * batchSize] 156 | } 157 | ) 158 | 159 | prediction = self.modelFunc.buildLabelComputation(batchSize, xFeats) 160 | 161 | print "Compiling label function (for training)..." 162 | labelTrain = theano.function(inputs=[batchIdx], 163 | outputs=prediction, 164 | updates=[], 165 | givens={ 166 | xFeats: trainData.xFeats[batchIdx * batchSize:(batchIdx + 1) * batchSize]}) 167 | 168 | if validDataNP is not None: 169 | print "Compiling label function (for validation)..." 170 | labelValid = theano.function(inputs=[batchIdx], 171 | outputs=prediction, 172 | updates=[], 173 | givens={xFeats: validData.xFeats[batchIdx * batchSize: 174 | (batchIdx + 1) * batchSize]}) 175 | if testDataNP is not None: 176 | print "Compiling label function (for test)..." 177 | labelTest = theano.function(inputs=[batchIdx], 178 | outputs=prediction, 179 | updates=[], 180 | givens={xFeats: testData.xFeats[batchIdx * batchSize: 181 | (batchIdx + 1) * batchSize]}) 182 | 183 | 184 | print "Done with compiling function." 
185 | if validDataNP is not None and testDataNP is not None: 186 | 187 | return trainModel, labelTest, labelValid 188 | else: 189 | if False: 190 | return trainEncoder, trainDecoder, labelTrain 191 | else: 192 | return trainModel, labelTrain 193 | 194 | def learn(self): 195 | trainDataNP = self.data.getTrainSet() 196 | validDataNP = self.data.getValidSet() 197 | testDataNP = self.data.getTestSet() 198 | 199 | print "Starting to compile functions" 200 | 201 | 202 | if validDataNP is not None and testDataNP is not None: 203 | trainModel, labelTest, labelValid = self.compileFunction(self.learningRate, self.epochNum, 204 | self.batchSize, self.lambdaL1, self.lambdaL2) 205 | else: 206 | if False: 207 | trainEncoder, trainDecoder, labelTrain = self.compileFunction(self.learningRate, self.epochNum, 208 | self.batchSize, self.lambdaL1, self.lambdaL2) 209 | else: 210 | trainModel, labelTrain = self.compileFunction(self.learningRate, self.epochNum, 211 | self.batchSize, self.lambdaL1, self.lambdaL2) 212 | 213 | 214 | ############### 215 | # TRAIN MODEL # 216 | ############### 217 | 218 | # compute number of minibatches for training, validation and testing 219 | trainBatchNum = trainDataNP.args1.shape[0] / self.batchSize 220 | 221 | if validDataNP is not None and testDataNP is not None: 222 | validBatchNum = validDataNP.args1.shape[0] / self.batchSize 223 | validEval = singleLabelClusterEvaluation(self.goldStandard['dev'], False) 224 | 225 | testBatchNum = testDataNP.args1.shape[0] / self.batchSize 226 | testEval = singleLabelClusterEvaluation(self.goldStandard['test'], False) 227 | else: 228 | trainEval = singleLabelClusterEvaluation(self.goldStandard['train'], False) 229 | 230 | print str(trainBatchNum * self.batchSize) + " training examples, " 231 | # print trainDataNP.args1.shape[0], self.batchSize, trainBatchNum 232 | print '... 
training the model' 233 | startTime = time.clock() 234 | 235 | doneLooping = False 236 | epoch = 0 237 | 238 | 239 | while (epoch < self.epochNum) and (not doneLooping): 240 | negativeSamples1 = self.negativeSampler.generate_random_negative_example(trainDataNP.args1, 241 | self.data.getNegNum()) 242 | negativeSamples2 = self.negativeSampler.generate_random_negative_example(trainDataNP.args2, 243 | self.data.getNegNum()) 244 | 245 | err = 0 246 | epochStartTime = time.clock() 247 | 248 | epoch += 1 249 | print '\nEPOCH ' + str(epoch) 250 | for idx in xrange(trainBatchNum): 251 | if not self.fixedSampling: 252 | neg1 = negativeSamples1[:, idx * self.batchSize: (idx + 1) * self.batchSize] 253 | neg2 = negativeSamples2[:, idx * self.batchSize: (idx + 1) * self.batchSize] 254 | else: 255 | neg1 = trainDataNP.neg1[:, idx * self.batchSize: (idx + 1) * self.batchSize] 256 | neg2 = trainDataNP.neg2[:, idx * self.batchSize: (idx + 1) * self.batchSize] 257 | 258 | 259 | ls = trainModel(idx, neg1, neg2) 260 | err += ls 261 | 262 | # self.modelFunc.argProjector.normalize() 263 | # print('.'), 264 | if self.frequentEval: 265 | if validDataNP is not None and testDataNP is not None: 266 | if idx % 1 == 0: 267 | print(str(idx * batchSize)), 268 | print idx, '############################################################' 269 | validCluster = self.getClustersSets(labelValid, validBatchNum) 270 | validEval.createResponse(validCluster) 271 | validEval.printEvaluation('Validation') 272 | 273 | testCluster = self.getClustersSets(labelTest, testBatchNum) 274 | testEval.createResponse(testCluster) 275 | testEval.printEvaluation('Test') 276 | else: 277 | print(str(idx * batchSize)), 278 | print idx, '############################################################' 279 | trainClusters = self.getClustersPopulation(labelTrain, trainBatchNum) 280 | print trainClusters 281 | print 282 | 283 | 284 | epochEndTime = time.clock() 285 | 286 | print 'Training error ', str(err) 287 | print "Epoch time = " + str(epochEndTime - epochStartTime) 288 | 289 | if validDataNP is None or testDataNP is None: 290 | print 'Training Set' 291 | # print labelTrain(1)[1] 292 | trainClusters = self.getClustersSets(labelTrain, trainBatchNum) 293 | posteriorsTrain = [labelTrain(i)[1] for i in xrange(trainBatchNum)] 294 | trainPosteriors = [item for sublist in posteriorsTrain for item in sublist] 295 | # for p, probs in enumerate(predictions): 296 | # print p, probs 297 | trainEval.createResponse(trainClusters) 298 | if self.modelName != 'Test': 299 | trainEval.printEvaluation('Training') 300 | 301 | if self.modelName == 'Test': 302 | self.getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize) 303 | else: 304 | getClustersWithFrequencies(trainClusters, self.data, settings.elems_to_visualize) 305 | if not settings.debug: 306 | pickleClustering(trainClusters, self.modelID+'_epoch'+str(epoch)) 307 | if epoch % 5 == 0 and epoch > 0: 308 | picklePosteriors(trainPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)) 309 | 310 | if validDataNP is not None and testDataNP is not None: 311 | 312 | validCluster = self.getClustersSets(labelValid, validBatchNum) 313 | posteriorsValid = [labelValid(i)[1] for i in xrange(validBatchNum)] 314 | validPosteriors = [item for sublist in posteriorsValid for item in sublist] 315 | validEval.createResponse(validCluster) 316 | validEval.printEvaluation('Validation') 317 | getClustersWithFrequenciesValid(validCluster, self.data, settings.elems_to_visualize) 318 | if not settings.debug: 319 | 
pickleClustering(validCluster, self.modelID+'_epoch'+str(epoch)+'_valid') 320 | if epoch % 5 == 0 and epoch > 0: 321 | picklePosteriors(validPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_valid') 322 | 323 | testCluster = self.getClustersSets(labelTest, testBatchNum) 324 | posteriorsTest = [labelTest(i)[1] for i in xrange(testBatchNum)] 325 | testPosteriors = [item for sublist in posteriorsTest for item in sublist] 326 | testEval.createResponse(testCluster) 327 | testEval.printEvaluation('Test') 328 | getClustersWithFrequenciesTest(testCluster, self.data, settings.elems_to_visualize) 329 | if not settings.debug: 330 | pickleClustering(testCluster, self.modelID+'_epoch'+str(epoch)+'_test') 331 | if epoch % 5 == 0 and epoch > 0: 332 | picklePosteriors(testPosteriors, self.modelID+'_Posteriors_epoch'+str(epoch)+'_test') 333 | 334 | 335 | endTime = time.clock() 336 | print 'Optimization complete' 337 | print 'The code run for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (endTime - startTime)) 338 | print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + 339 | ' ran for %.1fs' % ((endTime - startTime))) 340 | 341 | 342 | 343 | 344 | def getClustersSets(self, labelTrain, trainBatchNum): 345 | clusters = {} 346 | for i in xrange(self.relationNum): 347 | clusters[i] = set() 348 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 349 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 350 | for j in xrange(len(predictions)): 351 | clusters[predictions[j]].add(j) 352 | return clusters 353 | 354 | def getClustersPopulation(self, labelTrain, trainBatchNum): 355 | clusters = {} 356 | for i in xrange(self.relationNum): 357 | clusters[i] = 0 358 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 359 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 360 | for j in xrange(len(predictions)): 361 | clusters[predictions[j]] += 1 362 | return clusters 363 | 364 | def getClusters(self, labelTrain, trainBatchNum, train_dev): 365 | clusters = {} 366 | for i in xrange(self.relationNum): 367 | clusters[i] = [] 368 | predictionsTrain = [labelTrain(i)[0] for i in xrange(trainBatchNum)] 369 | predictions = [item for sublist in predictionsTrain for item in sublist] # returns the flatten() list 370 | for j in xrange(len(predictions)): 371 | clusters[predictions[j]].append(self.data.getExampleRelation(j, train_dev)) 372 | return clusters 373 | 374 | 375 | def getClusteredFreq(self, clusters): 376 | clustFreq = {} 377 | for i in xrange(self.relationNum): 378 | clustFreq[i] = {} 379 | j = 0 380 | for c in clusters: 381 | for feat in clusters[c]: 382 | if feat in clustFreq[j]: 383 | clustFreq[j][feat] += 1 384 | else: 385 | clustFreq[j][feat] = 1 386 | clustFreq[j] = sorted(clustFreq[j].iteritems(), key=operator.itemgetter(1), reverse=True) 387 | j += 1 388 | return clustFreq 389 | 390 | def printFirstK(self, k, clusterFreq): 391 | for c in clusterFreq: 392 | print clusterFreq[c][:k] 393 | 394 | 395 | def getClustersWithFrequencies(self, clusterSets, data, threshold): 396 | for c in clusterSets: 397 | frequency = {} 398 | print c, 399 | for elem in clusterSets[c]: 400 | trig = self.goldStandard['train'][elem][0] 401 | if trig in frequency: 402 | frequency[trig] += 1 403 | else: 404 | frequency[trig] = 1 405 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 406 | if len(sorted_freq) < threshold: 407 | for el in 
sorted_freq: 408 | print el, 409 | else: 410 | count = 0 411 | for el in sorted_freq: 412 | if count > threshold: 413 | break 414 | else: 415 | print el, 416 | count += 1 417 | print '' 418 | 419 | 420 | def saveModel(model, name): 421 | pklProtocol = 2 422 | pklFile = open(settings.models_path + name, 'wb') 423 | pickle.dump(model, pklFile, protocol=pklProtocol) 424 | 425 | def loadModel(name): 426 | pklFile = open(settings.models_path + name, 'rb') 427 | return pickle.load(pklFile) 428 | 429 | def loadData(args, rng, negativeSamples, relationNum, modelType): 430 | 431 | if not os.path.exists(args.pickled_dataset): 432 | print "Pickled dataset not found" 433 | sys.exit() 434 | 435 | tStart = time.time() 436 | print "Found existing pickled dataset, loading...", 437 | 438 | pklFile = open(args.pickled_dataset, 'rb') 439 | 440 | featureExtrs = pickle.load(pklFile) 441 | 442 | relationLexicon = pickle.load(pklFile) 443 | 444 | data = pickle.load(pklFile) 445 | 446 | goldStandard = pickle.load(pklFile) 447 | 448 | pklFile.close() 449 | tEnd = time.time() 450 | print "Done (" + str(tEnd - tStart) + "s.)" 451 | 452 | trigs = False 453 | 454 | 455 | indexedDataset = DataSetManager(data, relationLexicon, rng, negativeSamples, relationNum, trigs) 456 | 457 | print "Produced indexed dataset" 458 | 459 | return indexedDataset, goldStandard 460 | 461 | def pickleClustering(clustering, clusteringName): 462 | pklProtocol = 2 463 | pklFile = open(settings.clusters_path + clusteringName, 'wb') 464 | pickle.dump(clustering, pklFile, protocol=pklProtocol) 465 | 466 | 467 | def picklePosteriors(posteriors, posteriorsName): 468 | pklProtocol = 2 469 | pklFile = open(settings.clusters_path + posteriorsName, 'wb') 470 | pickle.dump(posteriors, pklFile, protocol=pklProtocol) 471 | 472 | def getClustersWithInfo(clusterSets, data, threshold): 473 | for c in clusterSets: 474 | print c, 475 | if len(clusterSets[c]) < threshold: 476 | for elem in clusterSets[c]: 477 | print elem, data.getExampleFeatures(elem), 478 | else: 479 | count = 0 480 | for elem in clusterSets[c]: 481 | if count > threshold: 482 | break 483 | else: 484 | print elem, data.getExampleFeatures(elem), 485 | count += 1 486 | print '' 487 | 488 | 489 | def getClustersWithFrequencies(clusterSets, data, threshold): 490 | for c in clusterSets: 491 | frequency = {} 492 | print c, 493 | for elem in clusterSets[c]: 494 | trig = data.getExampleFeature(elem, 'trigger') 495 | if trig is not None: 496 | trig = trig.replace('trigger#', '') 497 | if trig in frequency: 498 | frequency[trig] += 1 499 | else: 500 | frequency[trig] = 1 501 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 502 | if len(sorted_freq) < threshold: 503 | for el in sorted_freq: 504 | print el, 505 | else: 506 | count = 0 507 | for el in sorted_freq: 508 | if count > threshold: 509 | break 510 | else: 511 | print el, 512 | count += 1 513 | print '' 514 | 515 | 516 | def getClustersWithFrequenciesValid(clusterSets, data, threshold): 517 | for c in clusterSets: 518 | frequency = {} 519 | print c, 520 | for elem in clusterSets[c]: 521 | trig = data.getExampleFeatureValid(elem, 'trigger') 522 | if trig is not None: 523 | trig = trig.replace('trigger#', '') 524 | if trig in frequency: 525 | frequency[trig] += 1 526 | else: 527 | frequency[trig] = 1 528 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 529 | if len(sorted_freq) < threshold: 530 | for el in sorted_freq: 531 | print el, 532 | else: 533 | count = 0 534 | for el in 
sorted_freq: 535 | if count > threshold: 536 | break 537 | else: 538 | print el, 539 | count += 1 540 | print '' 541 | 542 | 543 | def getClustersWithFrequenciesTest(clusterSets, data, threshold): 544 | for c in clusterSets: 545 | frequency = {} 546 | print c, 547 | for elem in clusterSets[c]: 548 | trig = data.getExampleFeatureTest(elem, 'trigger') 549 | if trig is not None: 550 | trig = trig.replace('trigger#', '') 551 | if trig in frequency: 552 | frequency[trig] += 1 553 | else: 554 | frequency[trig] = 1 555 | sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True) 556 | if len(sorted_freq) < threshold: 557 | for el in sorted_freq: 558 | print el, 559 | else: 560 | count = 0 561 | for el in sorted_freq: 562 | if count > threshold: 563 | break 564 | else: 565 | print el, 566 | count += 1 567 | print '' 568 | 569 | def getClustersWithRelationLabels(clusterSets, data, evaluation, threshold): 570 | for c in clusterSets: 571 | print c, 572 | if len(clusterSets[c]) < threshold: 573 | for elem in clusterSets[c]: 574 | if evaluation.relations[elem][0] != '': 575 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem], 576 | else: 577 | count = 0 578 | for elem in clusterSets[c]: 579 | if count > threshold: 580 | break 581 | else: 582 | if evaluation.relations[elem][0] != '': 583 | print elem, data.getExampleFeatures(elem), evaluation.relations[elem], 584 | count += 1 585 | print '' 586 | 587 | 588 | def getCommandArgs(): 589 | parser = argparse.ArgumentParser(description='Trains a basic Open Information Extraction Model') 590 | 591 | parser.add_argument('--pickled_dataset', metavar='pickled_dataset', nargs='?', required=True, 592 | help='the pickled dataset file (produced by OiePreprocessor.py)') 593 | 594 | parser.add_argument('--epochs', metavar='epochs', nargs='?', type=int, default=100, 595 | help='maximum number of epochs') 596 | 597 | parser.add_argument('--learning_rate', metavar='learning_rate', nargs='?', type=float, default=0.1, 598 | help='initial learning rate') 599 | 600 | parser.add_argument('--batch_size', metavar='batch_size', nargs='?', type=int, default=50, 601 | help='size of the minibatches') 602 | 603 | parser.add_argument('--embed_size', metavar='embed_size', nargs='?', type=int, default=30, 604 | help='size of the embeddings') 605 | 606 | parser.add_argument('--relations_number', metavar='relations_number', type=int, nargs='?', default=3, 607 | help='number of relations to induce') 608 | 609 | parser.add_argument('--negative_samples_number', metavar='negative_samples_number', nargs='?', type=int, default=5, 610 | help='number of negative samples') 611 | 612 | parser.add_argument('--l1_regularization', metavar='l1_regularization', nargs='?', type=float, default=0.0, 613 | help='lambda value of L1 regularization') 614 | 615 | parser.add_argument('--l2_regularization', metavar='l2_regularization', nargs='?', type=float, default=0.0, 616 | help='lambda value of L2 regularization') 617 | 618 | parser.add_argument('--optimization', metavar='optimization', nargs='?', type=int, default='0', 619 | help='optimization algorithm 0 SGD, 1 ADAGrad, 2 ADADelta. 
Default SGD.') 620 | 621 | parser.add_argument('--model_name', metavar='model_name', nargs='?', required=True, type=str, 622 | help='Name or ID of the model') 623 | 624 | parser.add_argument('--model', metavar='model', nargs='?', type=str, required=True, 625 | help='Model type: choose among A, C, AC.') 626 | 627 | parser.add_argument('--fixed_sampling', metavar='fixed_sampling', nargs='?', default='False', 628 | help='fixed/dynamic sampling switch, default False') 629 | 630 | parser.add_argument('--ext_emb', metavar='ext_emb', nargs='?', default='False', 631 | help='external embeddings, default False') 632 | 633 | parser.add_argument('--extended_reg', metavar='extended_reg', nargs='?', default='False', 634 | help='extended regularization on reconstruction parameters, default False') 635 | 636 | parser.add_argument('--frequent_eval', metavar='frequent_eval', nargs='?', default='False', 637 | help='using frequent evaluation, default False') 638 | 639 | parser.add_argument('--seed', metavar='seed', nargs='?', type=int, default=2, 640 | help='random seed, default 2') 641 | 642 | parser.add_argument('--alpha', metavar='alpha', nargs='?', type=float, default=1.0, 643 | help='alpha coefficient for scaling the entropy term') 644 | 645 | 646 | return parser.parse_args() 647 | 648 | 649 | 650 | 651 | 652 | if __name__ == '__main__': 653 | print "Relation Learner" 654 | 655 | args = getCommandArgs() 656 | print args 657 | rseed = args.seed 658 | rand = np.random.RandomState(seed=rseed) 659 | 660 | 661 | negativeSamples = args.negative_samples_number 662 | numberRelations = args.relations_number 663 | indexedData, goldStandard = loadData(args, rand, negativeSamples, numberRelations, args.model) 664 | 665 | 666 | maxEpochs = args.epochs 667 | learningRate = args.learning_rate 668 | batchSize = args.batch_size 669 | embedSize = args.embed_size 670 | lambdaL1 = args.l1_regularization 671 | lambdaL2 = args.l2_regularization 672 | optimization = args.optimization 673 | modelName = args.model_name 674 | model = args.model 675 | fixedSampling = eval(args.fixed_sampling) 676 | extEmb = eval(args.ext_emb) 677 | extendedReg = eval(args.extended_reg) 678 | frequentEval = eval(args.frequent_eval) 679 | alpha = args.alpha 680 | inducer = ReconstructInducer(indexedData, goldStandard, rand, maxEpochs, learningRate, 681 | batchSize, embedSize, lambdaL1, lambdaL2, optimization, modelName, 682 | model, fixedSampling, extEmb, extendedReg, 683 | frequentEval, alpha) 684 | 685 | 686 | 687 | inducer.learn() 688 | 689 | saveModel(inducer, inducer.modelName) 690 | 691 | -------------------------------------------------------------------------------- /learning/OieModel.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | 4 | import theano.tensor as T 5 | import theano 6 | from models.encoders.RelationClassifier import IndependentRelationClassifiers 7 | 8 | class OieModelFunctions(object): 9 | 10 | def __init__(self, rng, featureDim, embedSize, relationNum, argVocSize, model, 11 | data, extEmb, extendedReg, alpha): 12 | self.rng = rng 13 | 14 | self.h = featureDim 15 | self.k = embedSize 16 | self.r = relationNum 17 | 18 | self.a = argVocSize 19 | self.model = model 20 | self.relationClassifiers = IndependentRelationClassifiers(rng, featureDim, relationNum) 21 | self.params = self.relationClassifiers.params 22 | self.alpha = alpha 23 | print 'Feature space size =', self.h 24 | print 'Argument vocabulary size =', argVocSize 25 | 26 | self.L1 = 
T.sum(abs(self.relationClassifiers.W)) 27 | 28 | self.L2 = T.sum(T.sqr(self.relationClassifiers.W)) # + T.sum(T.sqr(self.relationClassifiers.Wb)) 29 | 30 | if self.model == 'A': 31 | print 'Bilinear Model' 32 | from models.decoders.Bilinear import Bilinear 33 | 34 | self.argProjector = Bilinear(rng, embedSize, relationNum, self.a, data, extEmb) 35 | self.params += self.argProjector.params 36 | if extendedReg: 37 | self.L1 += T.sum(abs(self.argProjector.C)) 38 | self.L2 += T.sum(T.sqr(self.argProjector.C)) 39 | 40 | elif self.model == 'AC': 41 | print 'Bilinear + Selectional Preferences Model' 42 | from models.decoders.BilinearPlusSP import BilinearPlusSP 43 | 44 | self.argProjector = BilinearPlusSP(rng, embedSize, relationNum, self.a, data, extEmb) 45 | self.params += self.argProjector.params 46 | if extendedReg: 47 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2)) + T.sum(abs(self.argProjector.C)) 48 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2)) + T.sum(T.sqr(self.argProjector.C)) 49 | 50 | 51 | elif self.model == 'C': 52 | print 'Selectional Preferences' 53 | from models.decoders.SelectionalPreferences import SelectionalPreferences 54 | 55 | self.argProjector = SelectionalPreferences(rng, embedSize, relationNum, self.a, data, extEmb) 56 | self.params += self.argProjector.params 57 | if extendedReg: 58 | self.L1 += T.sum(abs(self.argProjector.C1)) + T.sum(abs(self.argProjector.C2)) 59 | self.L2 += T.sum(T.sqr(self.argProjector.C1)) + T.sum(T.sqr(self.argProjector.C2)) 60 | 61 | 62 | 63 | def buildTrainErrComputation(self, batchSize, negNum, xFeats, args1, args2, neg1, neg2): 64 | l = batchSize 65 | n = negNum 66 | 67 | # print xFeats 68 | print "Relation classifiers..." 69 | # relationLabeler.output are probabilities of relations assignment arranged in a tensor [l, r] 70 | relationProbs = self.relationClassifiers.compRelationProbsFunc(xFeats=xFeats) 71 | print "Arg projection..." 72 | 73 | entropy = self.alpha * -T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l] 74 | 75 | if self.model == 'A': 76 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 77 | 78 | 79 | elif self.model == 'AC': 80 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 81 | 82 | 83 | elif self.model == 'C': 84 | allScores = self.argProjector.getScores(args1, args2, l, n, relationProbs, neg1, neg2, entropy) 85 | 86 | 87 | resError = -T.mean(allScores) 88 | print "Done with building the graph..." 
89 | # resError = theano.printing.Print("resError ")(resError) 90 | return resError 91 | 92 | 93 | 94 | 95 | def buildLabelComputation(self, batchSize, xFeats): 96 | # xFeats [ l * e, h ] matrix 97 | return self.relationClassifiers.labelFunct(batchSize, xFeats) 98 | 99 | 100 | def buildRelationProbComputation(self, batchSize, xFeats): 101 | return self.relationClassifiers.compRelationProbsFunc(xFeats) 102 | 103 | -------------------------------------------------------------------------------- /learning/Optimizers.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | 7 | 8 | 9 | class AdaGrad(object): 10 | def __init__(self, params): 11 | self.accumulator = [] 12 | for para_i in params: 13 | eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) 14 | self.accumulator.append(theano.shared(eps_p, borrow=True)) 15 | 16 | def update(self, learningRate, params, cost): 17 | print 'AdaGrad takes the floor' 18 | grads = T.grad(cost, params) 19 | updates = [] 20 | for param_i, grad_i, acc_i in zip(params, grads, self.accumulator): 21 | acc = acc_i + T.sqr(grad_i) 22 | updates.append((param_i, param_i - learningRate * grad_i / (T.sqrt(acc)+1e-6))) 23 | updates.append((acc_i, acc)) 24 | return updates 25 | 26 | 27 | class SGD(object): 28 | def update(self, learningRate, params, cost): 29 | print 'SGD takes the floor' 30 | grads = T.grad(cost, params) 31 | updates = [] 32 | for param_i, grad_i in zip(params, grads): 33 | updates.append((param_i, param_i - learningRate * grad_i)) 34 | return updates 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /learning/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | -------------------------------------------------------------------------------- /learning/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /learning/models/decoders/Bilinear.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | from collections import OrderedDict 9 | import cPickle as pickle 10 | 11 | class Bilinear(object): 12 | 13 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb): 14 | 15 | self.k = embedSize 16 | self.r = relationNum 17 | self.a = argVocSize 18 | 19 | a = self.a 20 | k = self.k 21 | r = self.r 22 | 23 | 24 | 25 | # KxK matrix for each argument-argument for each relation 26 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX) 27 | 28 | 29 | self.C = theano.shared(value=CNP, name='C') 30 | # self.C = theano.printing.Print("C = ")(self.C) 31 | # argument embeddings 32 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) 33 | 34 | if ex_emb: 35 | import gensim 36 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 37 | for idArg in xrange(self.a): 38 | arg = data.id2Arg[idArg].lower().split(' ') 39 | new = np.zeros(k, dtype=theano.config.floatX) 40 | size = 0 41 | for ar in arg: 42 | if ar in external_embeddings: 43 | new += 
external_embeddings[ar] 44 | size += 1 45 | if size > 0: 46 | ANP[idArg] = new/size 47 | 48 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 49 | 50 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 51 | name='Ab', borrow=True) 52 | 53 | self.updates = OrderedDict({self.A: self.A / T.sqrt(T.sum(T.sqr(self.A), axis=0))}) 54 | self.normalize = theano.function([], [], updates=self.updates) 55 | 56 | # self.params = [self.C, self.A] 57 | self.params = [self.C, self.A, self.Ab] 58 | 59 | 60 | 61 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC): 62 | 63 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 64 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 65 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l] 66 | # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1) # [l,r] * [l,r] = [l] 67 | return Asecond 68 | 69 | def negFactorization1(self, batchSize, negEmbA, argsEmbB, wC): 70 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 71 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n] 72 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n] 73 | return Asecond 74 | 75 | def negFactorization2(self, batchSize, argsEmbA, negEmbB, wC): 76 | # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) # [l,r] * [k,k,r] = [l, k, k] 77 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 78 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n] 79 | return Asecond 80 | 81 | 82 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 83 | argembed1 = self.A[args1] 84 | argembed2 = self.A[args2] 85 | 86 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) 87 | one = self.factorization(batchSize=l, 88 | argsEmbA=argembed1, 89 | argsEmbB=argembed2, 90 | wC=weightedC) # [l,n] 91 | 92 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 93 | 94 | logScoresP = T.log(T.nnet.sigmoid(u)) 95 | 96 | allScores = logScoresP 97 | allScores = T.concatenate([allScores, entropy, entropy]) 98 | 99 | 100 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 101 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 102 | negOne = self.negFactorization1(batchSize=l, 103 | negEmbA=negembed1, 104 | argsEmbB=argembed2, 105 | wC=weightedC) 106 | 107 | negTwo = self.negFactorization2(batchSize=l, 108 | argsEmbA=argembed1, 109 | negEmbB=negembed2, 110 | wC=weightedC) 111 | 112 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0), 113 | negTwo + self.Ab[neg2].dimshuffle(1, 0)]) 114 | logScores = T.log(T.nnet.sigmoid(-g)) 115 | allScores = T.concatenate([allScores, logScores.flatten()]) 116 | return allScores 117 | 118 | 119 | -------------------------------------------------------------------------------- /learning/models/decoders/BilinearPlusSP.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | import cPickle as pickle 9 | 10 | class BilinearPlusSP(object): 11 | 12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb, ): 13 | 14 | 
self.k = embedSize 15 | self.r = relationNum 16 | self.a = argVocSize 17 | 18 | a = self.a 19 | k = self.k 20 | r = self.r 21 | 22 | 23 | # KxK matrix for each argument-argument for each relation 24 | CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX) 25 | # @UndefinedVariable 26 | self.C = theano.shared(value=CNP, name='C') 27 | # self.C = theano.printing.Print("C = ")(self.C) 28 | 29 | # Selectional Preferences 30 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 31 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 32 | self.C1 = theano.shared(value=Ca1NP, name='C1') 33 | self.C2 = theano.shared(value=Ca2NP, name='C2') 34 | # argument embeddings 35 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable 36 | 37 | if ex_emb: 38 | import gensim 39 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 40 | 41 | for idArg in xrange(self.a): 42 | arg = data.id2Arg[idArg].lower().split(' ') 43 | new = np.zeros(k, dtype=theano.config.floatX) 44 | size = 0 45 | for ar in arg: 46 | if ar in external_embeddings: 47 | new += external_embeddings[ar] 48 | size += 1 49 | if size > 0: 50 | ANP[idArg] = new/size 51 | 52 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 53 | 54 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 55 | name='Ab', borrow=True) 56 | 57 | self.params = [self.C, self.A, self.Ab, self.C1, self.C2] 58 | 59 | 60 | 61 | 62 | def factorization(self, batchSize, argsEmbA, argsEmbB, wC, wC1, wC2): 63 | # l = batchSize 64 | # k = self.k # embed size 65 | # r = self.r # relation number 66 | 67 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # + self.Cb # [l, k, k] * [l, k] = [l, k] 68 | Asecond = T.batched_dot(Afirst, argsEmbB) # [l, k] * [l, k] = [l] 69 | spFirst = T.batched_dot(wC1, argsEmbA) 70 | spSecond = T.batched_dot(wC2, argsEmbB) 71 | return Asecond + spFirst + spSecond 72 | 73 | 74 | 75 | def negLeftFactorization(self, batchSize, negEmbA, argsEmbB, wC, wC1, wC2): 76 | # l = batchSize 77 | # k = self.k # embed size 78 | # r = self.r # relation number 79 | 80 | Afirst = T.batched_tensordot(wC, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k, k] * [n, l, k] = [l, k, n] 81 | Asecond = T.batched_tensordot(Afirst, argsEmbB, axes=[[1], [1]]) # [l, k, n] * [l, k] = [l, n] 82 | 83 | spAfirst = T.batched_tensordot(wC1, negEmbA.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 84 | 85 | spSecond = T.batched_dot(wC2, argsEmbB) 86 | 87 | return Asecond + spAfirst + spSecond.reshape((batchSize, 1)) 88 | 89 | def negRightFactorization(self, batchSize, argsEmbA, negEmbB, wC, wC1, wC2): 90 | Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]]) # [l, k, k] * [l, k] = [l, k] 91 | Asecond = T.batched_tensordot(Afirst, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l, k] * [l, k, n] = [l, n] 92 | spFirst = T.batched_dot(wC1, argsEmbA) 93 | spAsecond = T.batched_tensordot(wC2, negEmbB.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 94 | return Asecond + spAsecond + spFirst.reshape((batchSize, 1)) 95 | 96 | 97 | 98 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 99 | weightedC1 = T.dot(relationProbs, self.C1.dimshuffle(1, 0)) 100 | weightedC2 = T.dot(relationProbs, self.C2.dimshuffle(1, 0)) 101 | weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]]) 
102 | 103 | 104 | argembed1 = self.A[args1] 105 | argembed2 = self.A[args2] 106 | 107 | one = self.factorization(batchSize=l, 108 | argsEmbA=argembed1, 109 | argsEmbB=argembed2, 110 | wC=weightedC, 111 | wC1=weightedC1, 112 | wC2=weightedC2) 113 | 114 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 115 | logScoresP = T.log(T.nnet.sigmoid(u)) 116 | 117 | allScores = logScoresP 118 | allScores = T.concatenate([allScores, entropy, entropy]) 119 | 120 | 121 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 122 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 123 | negOne = self.negLeftFactorization(batchSize=l, 124 | negEmbA=negembed1, 125 | argsEmbB=argembed2, 126 | wC=weightedC, 127 | wC1=weightedC1, 128 | wC2=weightedC2) 129 | 130 | negTwo = self.negRightFactorization(batchSize=l, 131 | argsEmbA=argembed1, 132 | negEmbB=negembed2, 133 | wC=weightedC, 134 | wC1=weightedC1, 135 | wC2=weightedC2) 136 | g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0), 137 | negTwo + self.Ab[neg2].dimshuffle(1, 0)]) 138 | logScores = T.log(T.nnet.sigmoid(-g)) 139 | allScores = T.concatenate([allScores, logScores.flatten()]) 140 | 141 | return allScores 142 | 143 | -------------------------------------------------------------------------------- /learning/models/decoders/SelectionalPreferences.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | 3 | import math 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | import cPickle as pickle 9 | 10 | class SelectionalPreferences(object): 11 | 12 | def __init__(self, rng, embedSize, relationNum, argVocSize, data, ex_emb): 13 | 14 | self.k = embedSize 15 | self.r = relationNum 16 | self.a = argVocSize 17 | 18 | a = self.a 19 | k = self.k 20 | r = self.r 21 | 22 | 23 | # Selectional Preferences 24 | Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 25 | Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX) 26 | self.C1 = theano.shared(value=Ca1NP, name='C1') 27 | self.C2 = theano.shared(value=Ca2NP, name='C2') 28 | 29 | # argument embeddings 30 | ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX) # @UndefinedVariable 31 | 32 | if ex_emb: 33 | import gensim 34 | external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path) 35 | 36 | for idArg in xrange(self.a): 37 | arg = data.id2Arg[idArg].lower().split(' ') 38 | new = np.zeros(k, dtype=theano.config.floatX) 39 | size = 0 40 | for ar in arg: 41 | if ar in external_embeddings: 42 | new += external_embeddings[ar] 43 | size += 1 44 | if size > 0: 45 | ANP[idArg] = new/size 46 | 47 | self.A = theano.shared(value=ANP, name='A') # (a1, k) 48 | 49 | self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX), # @UndefinedVariable 50 | name='Ab', borrow=True) 51 | 52 | self.params = [self.A, self.C1, self.C2, self.Ab] 53 | 54 | 55 | 56 | 57 | 58 | def leftMostFactorization(self, batchSize, args, wC1): 59 | l = batchSize 60 | k = self.k # embed size 61 | r = self.r # relation number 62 | argEmbeds = self.A[args.flatten()] 63 | Afirst = T.batched_dot(wC1, argEmbeds) 64 | return Afirst 65 | 66 | def rightMostFactorization(self, batchSize, args, wC2): 67 | l = batchSize 68 | k = self.k # embed size 69 | r = self.r # relation number 70 | argEmbeds2 = self.A[args.flatten()] 71 | Asecond = T.batched_dot(wC2, argEmbeds2) 72 | 
return Asecond 73 | 74 | 75 | 76 | def negLeftMostFactorization(self, batchSize, negEmbed, wC1): 77 | # l = batchSize 78 | # k = self.k # embed size 79 | # r = self.r # relation number 80 | Afirst = T.batched_tensordot(wC1, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 81 | return Afirst 82 | 83 | def negRightMostFactorization(self, batchSize, negEmbed, wC2): 84 | # l = batchSize 85 | # k = self.k # embed size 86 | # r = self.r # relation number 87 | Asecond = T.batched_tensordot(wC2, negEmbed.dimshuffle(1, 2, 0), axes=[[1], [1]]) # [l,k] [l,k,n] = [l,n] 88 | return Asecond 89 | 90 | 91 | 92 | def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy): 93 | weightedC1= T.dot(relationProbs, self.C1.dimshuffle(1, 0)) 94 | weightedC2= T.dot(relationProbs, self.C2.dimshuffle(1, 0)) 95 | 96 | left1 = self.leftMostFactorization(batchSize=l, args=args1, wC1=weightedC1) 97 | right1 = self.rightMostFactorization(batchSize=l, args=args2, wC2=weightedC2) 98 | one = left1 + right1 99 | 100 | u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]]) 101 | logScoresP = T.log(T.nnet.sigmoid(u)) 102 | allScores = logScoresP 103 | allScores = T.concatenate([allScores, entropy, entropy]) 104 | 105 | negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k)) 106 | negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k)) 107 | negative1 = self.negLeftMostFactorization(batchSize=l, 108 | negEmbed=negembed1, 109 | wC1=weightedC1) 110 | negative2 = self.negRightMostFactorization(batchSize=l, 111 | negEmbed=negembed2, 112 | wC2=weightedC2) 113 | 114 | negOne = negative1.dimshuffle(1, 0) + right1 115 | negTwo = negative2.dimshuffle(1, 0) + left1 116 | g = T.concatenate([negOne + self.Ab[neg1], negTwo + self.Ab[neg2]]) 117 | logScores = T.log(T.nnet.sigmoid(-g)) 118 | allScores = T.concatenate([allScores, logScores.flatten()]) 119 | 120 | return allScores 121 | 122 | 123 | -------------------------------------------------------------------------------- /learning/models/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /learning/models/encoders/RelationClassifier.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | from theano import sparse 4 | import theano 5 | from definitions import settings 6 | import numpy as np 7 | import theano.tensor as T 8 | 9 | class IndependentRelationClassifiers(object): 10 | # rng is a random generator, 11 | # featureDim is the dimension of the feature space 12 | # relationNum is the number of possible relations (classes of relations) 13 | 14 | def __init__(self, rng, featureDim, relationNum): 15 | 16 | # dimensionality of feature space 17 | self.h = featureDim 18 | # relation num 19 | self.r = relationNum 20 | # print str(np.sqrt(6. 
/ (self.h + self.r))) 21 | # w_bound = np.sqrt(self.h * self.r) 22 | 23 | # print str(1.0 / w_bound) 24 | print 'low bound =', settings.low, 'high bound =', settings.high 25 | self.W = theano.shared(np.asarray(rng.uniform( 26 | low=settings.low, 27 | high=settings.high, 28 | size=(self.h, self.r)), dtype=theano.config.floatX), # @UndefinedVariable 29 | name='W', borrow=True) 30 | # npW = np.zeros((3,3),dtype=theano.config.floatX) 31 | # npW[0,0] = 1.e+40 32 | # npW[1,1] = 1.e+40 33 | # npW[2,2] = 1.e+40 34 | 35 | # @UndefinedVariable 36 | # self.W = theano.shared(value=np.asarray(npW)) 37 | 38 | self.Wb = theano.shared(value=np.zeros(self.r, 39 | dtype=theano.config.floatX), # @UndefinedVariable 40 | name='Wb', borrow=True) 41 | 42 | self.params = [self.W, self.Wb] 43 | # self.params = [self.Wb] 44 | # self.params = [] 45 | 46 | def compRelationProbsFunc(self, xFeats): 47 | # xFeats [l, h] matrix 48 | # xFeats = theano.printing.Print("xFeats")(xFeats) 49 | # self.Wb = theano.printing.Print("Wb ") (self.Wb) 50 | # self.W = theano.printing.Print("W ") (self.W) 51 | # scores of each role by a classifier 52 | relationScores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r] 53 | #relationScores = theano.printing.Print("relationScores=")(relationScores) 54 | 55 | # convert it to probabilities 56 | relationProbs = T.nnet.softmax(relationScores) 57 | #relationProbs = theano.printing.Print("relationProbs = ")(relationProbs) 58 | 59 | 60 | return relationProbs # [l, r] 61 | 62 | 63 | def labelFunct(self, batchSize, xFeats): 64 | # xFeats [l, h] 65 | # l = batchSize 66 | # self.W = theano.printing.Print("W ") (self.W) 67 | # self.Wb = theano.printing.Print("Wb ") (self.Wb) 68 | scores = sparse.dot(xFeats, self.W) + self.Wb # [l, h] x [h, r] => [l, r] 69 | relationProbs = T.nnet.softmax(scores) 70 | # scores = theano.printing.Print("scores ") (scores) 71 | labels = T.argmax(scores, axis=1) # [l, r] => [l] 72 | # labels = theano.printing.Print("labels ") (labels) 73 | return (labels, relationProbs) -------------------------------------------------------------------------------- /learning/models/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'enfry' 2 | -------------------------------------------------------------------------------- /processing/OiePreprocessor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'diego' 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import time 7 | from definitions import OieFeatures 8 | from definitions import OieExample 9 | print sys.path 10 | import cPickle as pickle 11 | 12 | 13 | class FeatureLexicon: 14 | 15 | def __init__(self): 16 | self.nextId = 0 17 | self.id2Str = {} 18 | self.str2Id = {} 19 | self.id2freq = {} 20 | self.nextIdPruned = 0 21 | self.id2StrPruned = {} 22 | self.str2IdPruned = {} 23 | 24 | def getOrAdd(self, s): 25 | if s not in self.str2Id: 26 | self.id2Str[self.nextId] = s 27 | self.str2Id[s] = self.nextId 28 | self.id2freq[self.nextId] = 1 29 | self.nextId += 1 30 | else: 31 | self.id2freq[self.str2Id[s]] += 1 32 | return self.str2Id[s] 33 | 34 | 35 | def getOrAddPruned(self, s): 36 | if s not in self.str2IdPruned: 37 | self.id2StrPruned[self.nextIdPruned] = s 38 | self.str2IdPruned[s] = self.nextIdPruned 39 | self.nextIdPruned += 1 40 | return self.str2IdPruned[s] 41 | 42 | def getId(self, s): 43 | if s not in self.str2Id: 44 | return None 45 | return self.str2Id[s] 46 | 47 | def getStr(self, 
idx): 48 | if idx not in self.id2Str: 49 | return None 50 | else: 51 | return self.id2Str[idx] 52 | 53 | def getStrPruned(self, idx): 54 | if idx not in self.id2StrPruned: 55 | return None 56 | else: 57 | return self.id2StrPruned[idx] 58 | 59 | def getFreq(self, idx): 60 | if idx not in self.id2freq: 61 | return None 62 | return self.id2freq[idx] 63 | 64 | 65 | def getDimensionality(self): 66 | return self.nextIdPruned 67 | # return self.nextId 68 | 69 | 70 | def getFeatures(lexicon, featureExs, info, arg1=None, arg2=None, expand=False): 71 | feats = [] 72 | for f in featureExs: 73 | res = f(info, arg1, arg2) 74 | if res is not None: 75 | if type(res) == list: 76 | for el in res: 77 | featStrId = f.__name__ + "#" + el 78 | if expand: 79 | feats.append(lexicon.getOrAdd(featStrId)) 80 | else: 81 | featId = lexicon.getId(featStrId) 82 | if featId is not None: 83 | feats.append(featId) 84 | else: 85 | featStrId = f.__name__ + "#" + res 86 | if expand: 87 | feats.append(lexicon.getOrAdd(featStrId)) 88 | else: 89 | featId = lexicon.getId(featStrId) 90 | if featId is not None: 91 | feats.append(featId) 92 | 93 | return feats 94 | 95 | def getFeaturesThreshold(lexicon, featureExs, info, arg1=None, arg2=None, expand=False, threshold=0): 96 | feats = [] 97 | for f in featureExs: 98 | res = f(info, arg1, arg2) 99 | if res is not None: 100 | if type(res) == list: 101 | for el in res: 102 | featStrId = f.__name__ + "#" + el 103 | if expand: 104 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold: 105 | feats.append(lexicon.getOrAddPruned(featStrId)) 106 | else: 107 | featId = lexicon.getId(featStrId) 108 | if featId is not None: 109 | if lexicon.id2freq[featId] > threshold: 110 | feats.append(lexicon.getOrAddPruned(featStrId)) 111 | else: 112 | featStrId = f.__name__ + "#" + res 113 | if expand: 114 | if lexicon.id2freq[lexicon.getId(featStrId)] > threshold: 115 | feats.append(lexicon.getOrAddPruned(featStrId)) 116 | else: 117 | featId = lexicon.getId(featStrId) 118 | if featId is not None: 119 | if lexicon.id2freq[featId] > threshold: 120 | feats.append(lexicon.getOrAddPruned(featStrId)) 121 | 122 | return feats 123 | 124 | def prepareArgParser(): 125 | parser = argparse.ArgumentParser(description='Processes an Oie file and adds its representations ' 126 | 'to a Python pickled file.') 127 | 128 | parser.add_argument('input_file', metavar='input-file', help='input file in the Yao format') 129 | 130 | parser.add_argument('pickled_dataset', metavar='pickled-dataset', help='pickle file to be used to store output ' 131 | '(created if empty)') 132 | 133 | parser.add_argument('--batch-name', default="train", nargs="?", help='name used as a reference in the pickled file') 134 | 135 | parser.add_argument('--features', default="basic", nargs="?", help='features (basic vs ?)') 136 | parser.add_argument('--threshold', default="0", nargs="?", type=int, help='minimum feature frequency') 137 | 138 | 139 | 140 | parser.add_argument('--test-mode', action='store_true', 141 | help='used for test files ' 142 | '(the feature space is not expanded to include previously unseen features)') 143 | 144 | 145 | return parser 146 | 147 | def loadExamples(fileName): 148 | count = 0 149 | with open(fileName, 'r') as fp: 150 | relationExamples = [] 151 | for line in fp: 152 | line = line.strip() 153 | if len(line) == 0 or len(line.split()) == 0: 154 | raise IOError 155 | 156 | else: 157 | fields = line.split('\t') 158 | assert len(fields) == 9, "a problem with the file format (# fields is wrong) len is " \ 159 | + str(len(fields)) + 
"instead of 9" 160 | # this will be 10 161 | relationExamples.append([str(count)] + fields) 162 | count += 1 163 | 164 | return relationExamples 165 | 166 | # if __name__ == '__main__': 167 | # examples = loadExamples('/Users/admin/isti/amsterdam/data/candidate-100.txt') 168 | # print "Using basic features" 169 | # argFeatureExtrs = OieFeatures.getBasicFeatures() 170 | # ex = examples[0] 171 | # print ex 172 | # features = argFeatureExtrs 173 | # 174 | # s = [] 175 | # for f in features: 176 | # res = f([ex[1], ex[4], ex[5], ex[7]], ex[2], ex[3]) 177 | # if res is not None: 178 | # s.append(f.__name__ + "#" + res) 179 | # 180 | # print s, 'dd' 181 | 182 | if __name__ == '__main__': 183 | 184 | tStart = time.time() 185 | 186 | print "Parameters: " + str(sys.argv[1::]) 187 | parser = prepareArgParser() 188 | args = parser.parse_args() 189 | 190 | print "Parsed params: " + str(args) 191 | 192 | print "Loading sentences...", 193 | relationExamples = loadExamples(args.input_file) 194 | 195 | tEnd = time.time() 196 | print "Done (" + str(tEnd - tStart) + "s.)" 197 | 198 | # predFeatureExtrs = definitions.SrlFeatures.getJohanssonPredDisFeatures() 199 | # 200 | featureExtrs = None 201 | if args.features == "basic": 202 | print "Using rich features" 203 | featureExtrs = OieFeatures.getBasicCleanFeatures() 204 | 205 | relationLexicon = FeatureLexicon() 206 | 207 | dataset = {} 208 | goldstandard = {} 209 | 210 | if os.path.exists(args.pickled_dataset): 211 | tStart = time.time() 212 | print "Found existing pickled dataset, loading...", 213 | 214 | pklFile = open(args.pickled_dataset, 'rb') 215 | 216 | featureExtrs = pickle.load(pklFile) 217 | relationLexicon = pickle.load(pklFile) 218 | dataset = pickle.load(pklFile) 219 | goldstandard = pickle.load(pklFile) 220 | 221 | pklFile.close() 222 | tEnd = time.time() 223 | print "Done (" + str(tEnd - tStart) + "s.)" 224 | 225 | tStart = time.time() 226 | print "Processing relation Examples", 227 | 228 | examples = [] 229 | relationLabels = {} 230 | if args.batch_name in dataset: 231 | examples = dataset[args.batch_name] 232 | relationLabels = goldstandard[args.batch_name] 233 | else: 234 | dataset[args.batch_name] = examples 235 | goldstandard[args.batch_name] = relationLabels 236 | 237 | reIdx = 0 238 | c = 0 239 | for re in relationExamples: 240 | getFeatures(relationLexicon, featureExtrs, [re[1], re[4], re[5], re[7], re[8], re[6]], 241 | re[2], re[3], True) 242 | for re in relationExamples: 243 | reIdx += 1 244 | if reIdx % 1000 == 0: 245 | print ".", 246 | if reIdx % 10000 == 0: 247 | print reIdx, 248 | 249 | 250 | relationE = '' 251 | if re[9] != '': 252 | relationE = re[9] 253 | # print re[9] 254 | # if re[10] != '': 255 | # if relationE != '': 256 | # relationE += ' '+re[10] 257 | # else: 258 | # relationE = re[10] 259 | 260 | ex = OieExample.OieExample(re[2], re[3], getFeaturesThreshold(relationLexicon, 261 | featureExtrs, 262 | [re[1], re[4], re[5], re[7], re[8], re[6]], 263 | # [re[1], re[4], re[5], re[7]], 264 | re[2], re[3], True, threshold=args.threshold), re[5] 265 | ,relation=relationE 266 | ) 267 | relationLabels[c] = re[-1].strip().split(' ') 268 | c += 1 269 | 270 | examples.append(ex) 271 | 272 | 273 | tEnd = time.time() 274 | print "Done (" + str(tEnd - tStart) + "s.), processed " + str(len(examples)) 275 | 276 | tStart = time.time() 277 | print "Pickling the dataset...", 278 | 279 | pklFile = open(args.pickled_dataset, 'wb') 280 | #pklFile = gzip.GzipFile(args.pickled_dataset, 'wb') 281 | 282 | pklProtocol = 2 283 | 
pickle.dump(featureExtrs, pklFile, protocol=pklProtocol) 284 | pickle.dump(relationLexicon, pklFile, protocol=pklProtocol) 285 | pickle.dump(dataset, pklFile, protocol=pklProtocol) 286 | pickle.dump(goldstandard, pklFile, protocol=pklProtocol) 287 | 288 | tEnd = time.time() 289 | print "Done (" + str(tEnd - tStart) + "s.)" -------------------------------------------------------------------------------- /processing/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'admin' 2 | --------------------------------------------------------------------------------
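For reference, the following standalone NumPy sketch (not part of the repository code) illustrates the tensor shapes referred to by the comments in learning/models/encoders/RelationClassifier.py and learning/models/decoders/Bilinear.py: the encoder maps a batch of feature vectors [l, h] to relation posteriors [l, r], and the decoder scores an argument pair with the expected bilinear form a1^T (sum_r p_r C_r) a2, to which OieModel.py adds the alpha-scaled entropy term. The toy dimensions, random inputs, and softmax/sigmoid helpers below are illustrative assumptions only.

# Illustrative NumPy-only sketch of the model's shapes (hypothetical toy sizes; not repository code)
import numpy as np

rng = np.random.RandomState(2)
l, h, r, k, a = 4, 10, 3, 5, 20   # batch, feature dim, relations, embed size, argument vocabulary
alpha = 1.0

xFeats = rng.rand(l, h)                      # dense stand-in for the sparse feature matrix [l, h]
W, Wb = rng.randn(h, r) * 0.01, np.zeros(r)  # classifier weights and biases
C = rng.randn(k, k, r) * 0.1                 # one k x k bilinear slice per relation
A, Ab = rng.randn(a, k) * 0.01, np.zeros(a)  # argument embeddings and biases
args1, args2 = rng.randint(a, size=l), rng.randint(a, size=l)

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Encoder: [l, h] x [h, r] -> [l, r] relation posteriors
relationProbs = softmax(xFeats.dot(W) + Wb)

# Decoder: weight the bilinear tensor by the posteriors, [l, r] * [k, k, r] -> [l, k, k]
weightedC = np.einsum('lr,pqr->lpq', relationProbs, C)
emb1, emb2 = A[args1], A[args2]              # [l, k] argument embeddings
pairScore = np.einsum('lp,lpq,lq->l', emb1, weightedC, emb2)   # a1^T (sum_r p_r C_r) a2

# Positive log-scores with argument biases, plus the entropy regularizer
u = np.concatenate([pairScore + Ab[args1], pairScore + Ab[args2]])
posLogScores = np.log(sigmoid(u))
entropy = alpha * -(relationProbs * np.log(relationProbs)).sum(axis=1)
print(np.mean(np.concatenate([posLogScores, entropy, entropy])))

The n negative argument samples per position are scored with the same weighted bilinear form and enter the objective as log(sigmoid(-g)); OieModel.buildTrainErrComputation then minimises the negative mean of all concatenated scores.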