├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── data_example ├── Lexicon-example.tsv ├── README.txt ├── SRL-example.all.lemma.tags └── SRL-example.frame.elements └── simpleFrameId ├── __init__.py ├── check.py ├── classifier.py ├── config.py ├── data.py ├── evaluation.py ├── extras.py ├── globals.py ├── graph.py ├── main.py ├── reporting.py ├── representation.py └── resources.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2017 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | 6 | ------------------------------------------------------------------------------- 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | 20 | ------------------------------------------------------------------------------- 21 | Third party libraries licensing information: 22 | 23 | BSD License: 24 | - scikit-learn 25 | - networkx 26 | 27 | BSD-new License: 28 | - numpy 29 | 30 | MIT License: 31 | - keras 32 | 33 | Apache License v. 2.0 34 | - lightfm -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Out-of-domain FrameNet Semantic Role Labeling 2 | 3 | This code is an implementation of a simple frame identification approach (SimpleFrameId) described in the paper "Out-of-domain FrameNet Semantic Role Labeling". 4 | Please use the following citation: 5 | 6 | ``` 7 | @inproceedings{TUD-CS-2017-0011, 8 | title = {Out-of-domain FrameNet Semantic Role Labeling}, 9 | author = {Hartmann, Silvana and Kuznetsov, Ilia and Martin, Teresa and Gurevych, Iryna}, 10 | publisher = {Association for Computational Linguistics}, 11 | booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017)}, 12 | pages = {to appear}, 13 | month = apr, 14 | year = {2017}, 15 | location = {Valencia, Spain}, 16 | } 17 | ``` 18 | 19 | > **Abstract:** 20 | Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. 21 | 22 | Contact persons: Teresa Martin, martin@aiphes.tu-darmstadt.de; Ilia Kuznetsov, kuznetsov@ukp.informatik.tu-darmstadt.de 23 | 24 | https://www.ukp.tu-darmstadt.de/ 25 | 26 | https://www.tu-darmstadt.de/ 27 | 28 | 29 | Don't hesitate to send us an e-mail or report an issue, if something is broken (and it shouldn't be) or if you have further questions. 
30 | 31 | > This repository contains experimental software and is published for the sole purpose of giving additional background details on the respective publication. 32 | 33 | ## Project structure 34 | The implementation is a single package. The most important modules are: 35 | 36 | * `main.py` -- the entry point for experiments 37 | * `globals.py` -- global variables used in experiments 38 | * `classifier.py` -- the classifiers 39 | * `representation.py` -- representation builders 40 | 41 | The system requires a specific folder structure where the data is stored: 42 | * `ROOT` -- your project root (just a folder somewhere on your disk) 43 | * `ROOT/srl_data` -- source data 44 | * `ROOT/srl_data/corpora` -- input corpora 45 | * `ROOT/srl_data/embeddings` -- external VSMs 46 | * `ROOT/srl_data/lexicons` -- external lexicons 47 | * `ROOT/out` -- experiment results are stored here 48 | 49 | ## Requirements 50 | 51 | * Python 2.7 52 | * Python dependencies: keras, lightfm, sklearn, numpy, networkx 53 | 54 | ## Installation 55 | 56 | Install the dependencies, adjust the paths in `main.py` and `globals.py` accordingly, and run via `python main.py`. 57 | 58 | ### Parameter description 59 | 60 | * to define in `globals.py`: filenames for 61 | * pretrained embeddings, e.g. Levy dependency embeddings 62 | * FrameNet lexicon 63 | * train data 64 | * test data 65 | * to define in `main.py`: 66 | * `vsms` -- vector space model to use 67 | * `lexicons` -- lexicon to use (mind the `all_unknown` setting!) 68 | * `multiword_averaging` -- treatment of multiword predicates: false = use the head embedding, true = average over all predicate tokens 69 | * `all_unknown` -- makes the lexicon treat all LUs as unknown, corresponds to the no-lex setting 70 | * `num_components` -- for the WSABIE classifier: dimension of the learned latent representations 71 | * `max_sampled` -- for the WSABIE classifier: maximum number of negative samples used during WARP fitting 72 | * `num_epochs` -- for the WSABIE classifier: number of epochs to train the model 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /data_example/Lexicon-example.tsv: -------------------------------------------------------------------------------- 1 | Dominate_competitor dominate.v 2 | Dominate_competitor domination.n 3 | Dominate_competitor dominant.a 4 | Dominate_competitor strongman.n 5 | Intentionally_act action.n 6 | Intentionally_act do.v 7 | Intentionally_act step.n 8 | Intentionally_act act.v -------------------------------------------------------------------------------- /data_example/README.txt: -------------------------------------------------------------------------------- 1 | The system requires three kinds of input: 2 | 1. SRL data 3 | 2. Lexicon 4 | 3. VSM Lookup 5 | 6 | 1. SRL Data 7 | The default format for SRL data uses two types of files: sentence files and annotation files. 8 | *Sentence files* are tab-separated, one sentence per line, with POS tags, lemmas and dependency relations. 9 | The format is similar to CoNLL-2009 or MaltTab, with all columns merged into a single line: 10 | 11 | [# tokens][tokens][POS tags][dependency labels][dependency heads][O][lemmas] 12 | 13 | *Frame element files* are tab-separated with the following column semantics: 14 | 15 | [optional] 16 | [optional] 17 | [# of roles] 18 | [frame name] 19 | [lemma.pos] 20 | [position of the FEE in the sentence] 21 | [FEE string] 22 | [line# in the sentence file (incl. 0)] 23 | [role1] 24 | [position1] 25 | [role2] 26 | [position2] 27 | etc.
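As an illustration, the first line of SRL-example.frame.elements (included below),

0 0 3 Assistance help.v 6 helping 0 Benefited_party 7 Goal 8:9

maps onto these columns as follows: two optional leading fields (0, 0), the role count field (3), the frame name (Assistance), the lemma.pos of the frame-evoking element (help.v), the 0-based position of the FEE in the sentence (6, i.e. the token "helping"), the FEE string (helping), the line number of the sentence in the .all.lemma.tags file (0), followed by role/position pairs: Benefited_party on token 7 ("them") and Goal on the span 8:9 ("find jobs").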
28 | 29 | 2. Lexicon data 30 | Lexicon files are simple lists of frames and predicates that can evoke them, tab-separated, one pair per line. 31 | 32 | 3. VSM data 33 | We use the standard word embeddings format, where each line corresponds to a word followed by its vector representation. -------------------------------------------------------------------------------- /data_example/SRL-example.all.lemma.tags: -------------------------------------------------------------------------------- 1 | 26 In addition to that , by helping them find jobs , Goodwill reduced the state 's Public Support tab by an estimated $ 4 million . IN NN TO DT , IN VBG PRP VBP NNS , NNP VBD DT NN POS NNP NNP NN IN DT VBN $ CD CD . prep pobj prep pobj punct prep pcomp nsubj ccomp dobj punct nsubj ROOT det poss possessive nn nn dobj prep dep amod pobj number number punct 13 1 2 3 13 13 6 9 7 9 13 13 0 15 19 15 19 19 13 13 23 23 20 23 23 13 O O O O O O O O O O O O O O O O O O O O O O O O O O in addition to that , by help them find job , goodwill reduce the state ' public support tab by an estimate $ 4 million . -------------------------------------------------------------------------------- /data_example/SRL-example.frame.elements: -------------------------------------------------------------------------------- 1 | 0 0 3 Assistance help.v 6 helping 0 Benefited_party 7 Goal 8:9 2 | 0 0 3 Locating find.v 8 find 0 Perceiver 7 Sought_entity 9 3 | -------------------------------------------------------------------------------- /simpleFrameId/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UKPLab/eacl2017-oodFrameNetSRL/d30e23d724c911d001cc5ca8f28bdac86eee1ce4/simpleFrameId/__init__.py -------------------------------------------------------------------------------- /simpleFrameId/check.py: -------------------------------------------------------------------------------- 1 | from globals import * 2 | from data import get_graphs 3 | from resources import ResourceManager 4 | from reporting import ConllReporter 5 | 6 | def check_corpora_read_ok(sources, out): 7 | print "Checking datasets" 8 | 9 | # set corpora to test gere 10 | for corpus in [CORPUS_YAGS_TEST, CORPUS_DAS_TRAIN, CORPUS_DAS_TEST, 11 | CORPUS_YAGS_TEST, CORPUS_MASC_TEST, CORPUS_TW_G_TEST, COPRUS_TW_M_TEST, CORPUS_TW_S_TEST]: 12 | g = get_graphs(*sources.get_corpus(corpus), verbose=False) 13 | reporter = ConllReporter(out+corpus+".conll") 14 | reporter.report(g) 15 | 16 | 17 | if __name__ == "__main__": 18 | src = "your/path/here" 19 | root = ResourceManager(src) 20 | check_corpora_read_ok(root, "your/path/here/tmp") -------------------------------------------------------------------------------- /simpleFrameId/classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense 4 | from keras.utils.np_utils import to_categorical 5 | from collections import Counter 6 | 7 | from lightfm import LightFM 8 | # LightFM source code had to be hacked as it is buggy and does not say with which python version it actually should work 9 | # aMatrix.tocsr() --> sp.csr_matrix(aMatrix) 10 | # aMatrix.tocoo() --> sp.coo_matrix(aMatrix) 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | 14 | # Generic classifier, doesn't do much 15 | class Classifier: 16 | def __init__(self, lexicon, all_unknown=False, num_components=False, max_sampled=False, num_epochs=False 
): 17 | self.clf = None 18 | self.lexicon = lexicon 19 | self.all_unknown = all_unknown 20 | self.num_components = num_components 21 | self.max_sampled = max_sampled 22 | self.num_epochs = num_epochs 23 | 24 | def train(self, X, y, lemmapos): 25 | raise NotImplementedError("Not implemented, use child classes") 26 | def predict(self, X, lemmapos): 27 | raise NotImplementedError("Not implemented, use child classes") 28 | 29 | 30 | # Data-driven majority baseline 31 | class DataMajorityBaseline(Classifier): 32 | def train(self, X, y, lemmapos_list): 33 | self.majorityClasses = {} 34 | total_y = [] 35 | # get frame by LU counts from DATA. Not seen in data = doesn't exist 36 | for X_i, y_i, lemmapos_i in zip(X, y, lemmapos_list): 37 | self.majorityClasses[lemmapos_i] = self.majorityClasses.get(lemmapos_i, []) + [y_i] 38 | total_y += [y_i] 39 | 40 | uninformed_majority = Counter(total_y).most_common(1)[0][0] # uninformed majority for lemmas not seen in data 41 | 42 | # get top frame for each LU 43 | for lemmapos in self.majorityClasses: 44 | if len(self.majorityClasses.get(lemmapos, [])) == 0: 45 | self.majorityClasses[lemmapos] = uninformed_majority 46 | else: 47 | self.majorityClasses[lemmapos] = Counter(self.majorityClasses[lemmapos]).most_common(1)[0][0] 48 | 49 | self.majorityClasses["__UNKNOWN__"] = uninformed_majority 50 | print self.majorityClasses 51 | print "Majority baseline extracted, uninformed majority class is", uninformed_majority, ":", self.lexicon.idToFrame[uninformed_majority] 52 | 53 | def predict(self, X, lemmapos): 54 | if self.all_unknown: 55 | return self.majorityClasses["__UNKNOWN__"] 56 | return self.majorityClasses.get(lemmapos, self.majorityClasses["__UNKNOWN__"]) 57 | 58 | 59 | # Lexicon-driven majority baseline 60 | class LexiconMajorityBaseline(DataMajorityBaseline): 61 | def train(self, X, y, lemmapos_list): 62 | frame_counts = [] 63 | for y_i, lemmapos_i in zip(y, lemmapos_list): # collect TOTAL frame counts from data 64 | frame_counts += [y_i] 65 | 66 | frame_counts = Counter(frame_counts) 67 | 68 | self.majorityClasses = {} 69 | uninformed_majority = frame_counts.most_common(1)[0][0] 70 | self.majorityClasses["__UNKNOWN__"] = uninformed_majority 71 | 72 | for lemmapos in self.lexicon.frameLexicon: # for each lemma in LEXICON, determine most frequent frame among available, based on data 73 | available_frames = self.lexicon.get_available_frame_ids(lemmapos) 74 | available_frame_counts = Counter({f:frame_counts.get(f, 0) for f in available_frames}) # no frame in data - count set to 0 75 | self.majorityClasses[lemmapos] = available_frame_counts.most_common(1)[0][0] 76 | 77 | print "Majority baseline extracted, uninformed majority class is", uninformed_majority, ":", self.lexicon.idToFrame[uninformed_majority] 78 | 79 | 80 | # A simple NN-based classifier 81 | class SharingDNNClassifier(Classifier): 82 | def train(self, X, y, lemmapos_list): 83 | self.clf = Sequential() 84 | self.clf.add(Dense(256, input_dim=len(X[0]), activation='relu')) 85 | self.clf.add(Dense(100, activation='relu')) 86 | self.clf.add(Dense(output_dim=np.max(y)+1, activation='softmax')) # np.max()+1 because frames are 0-indexed 87 | 88 | self.clf.compile(optimizer='adagrad', 89 | loss='categorical_crossentropy', 90 | metrics=['accuracy']) 91 | 92 | self.clf.fit(X, to_categorical(y, np.max(y)+1), verbose=1, nb_epoch=100) 93 | 94 | def predict(self, X, lemmapos): 95 | available_frames = self.lexicon.get_available_frame_ids(lemmapos) # get available frames from lexicon 96 | ambig = 
self.lexicon.is_ambiguous(lemmapos) 97 | unknown = self.lexicon.is_unknown(lemmapos) # unknown = not in lexicon 98 | 99 | if unknown or self.all_unknown: # the all_unknown setting renders all lemma.pos unknown! 100 | available_frames = self.lexicon.get_all_frame_ids() # if the lemma.pos is unknown, search in all frames 101 | else: 102 | # if the LU is known and has only one frame, just return it. Even if there is no data for this LU (!) 103 | if not ambig: 104 | return available_frames[0] 105 | 106 | y = self.clf.predict(X.reshape((-1, len(X))))[0] 107 | # pick the best-scoring frame among available 108 | bestScore = None 109 | bestClass = None 110 | for cl in available_frames: 111 | score = y[cl] 112 | if bestScore is None or score >= bestScore: 113 | bestScore = score 114 | bestClass = cl 115 | return bestClass 116 | 117 | 118 | # classification with WSABIE latent representations 119 | class WsabieClassifier(Classifier): 120 | def train(self, X, y, lemmapos_list): 121 | 122 | # MODEL 123 | self.clf = LightFM(no_components = self.num_components, learning_schedule = 'adagrad', loss = 'warp', \ 124 | learning_rate = 0.05, epsilon = 1e-06, item_alpha = 0.0, user_alpha = 1e-6, \ 125 | max_sampled = self.max_sampled, random_state = None) 126 | 127 | # DATA 128 | # training data 129 | # X: list of vectors 130 | # each vector is the initial representation for a sentence (more precisely, for a predicate with context) 131 | # --> these are the user features in the training set 132 | # y: list of IDs for frames 133 | # the frame IDs are the labels for the representations 134 | # --> these are used to create the interaction matrix for the training set such that LightFM can deal with it 135 | # y_interactionLabels: interaction matrix is of size (num sentences in y) x (num frames) with 1 indicating the frame label for a predicate in its context sentence 136 | y_interactionLabels = self.createInteractionMatrix(y) 137 | 138 | # FIT 139 | self.clf = self.clf.fit(interactions = y_interactionLabels, user_features = X, item_features = None, \ 140 | sample_weight = None, epochs = self.num_epochs, num_threads = 2, verbose = True) 141 | 142 | def predict(self, X, lemmapos): 143 | # DATA 144 | # test data 145 | # X: list of vectors 146 | # each vector is the initial representation for a sentence (more precisely, for a predicate with context) 147 | # --> these are the user features in the test set 148 | X_reshape = X.reshape((-1, len(X))) 149 | 150 | # get projection matrices from trained MODEL 151 | user_embeddings_fromTraining = self.clf.user_embeddings 152 | item_embeddings_fromTraining = self.clf.item_embeddings 153 | 154 | # PREDICT 155 | # do the prediction for this new user via the dot product of the user feature X and the projection matrix user embeddings obtained during training 156 | embeddedNewUser = np.dot(X_reshape, user_embeddings_fromTraining) # now in the same space as the item embeddings obtained during training 157 | # use cosine similarity as similarity measure between the embedded test sentence and all the embeddings corresponding to frames 158 | similarity_to_all_frames = cosine_similarity(embeddedNewUser, item_embeddings_fromTraining)[0] 159 | 160 | available_frame_IDs = self.lexicon.get_available_frame_ids(lemmapos) # get available frame IDs for this lemma.pos from lexicon 161 | ambig = self.lexicon.is_ambiguous(lemmapos) # amiguous = can evoke more than one frame 162 | unknown = self.lexicon.is_unknown(lemmapos) # unknown = not in lexicon 163 | 164 | if unknown or self.all_unknown: # the 
all_unknown setting renders all lemma.pos unknown! 165 | available_frame_IDs = self.lexicon.get_all_frame_ids() # if the lemma.pos is unknown, search in all frames 166 | else: 167 | # if the lemma.pos is known and has only one frame, just return it. Even if there is no data for this lemma.pos. 168 | if not ambig: 169 | return available_frame_IDs[0] 170 | 171 | # pick the best-scoring frameID among available frameIDs 172 | bestScore = None 173 | best_frame_ID = None 174 | for frame_ID in available_frame_IDs: 175 | score = similarity_to_all_frames[frame_ID] 176 | if bestScore is None or score >= bestScore: 177 | bestScore = score 178 | best_frame_ID = frame_ID 179 | return best_frame_ID 180 | 181 | 182 | def createInteractionMatrix(self, y_ID): 183 | # interactionMatrix is of size (num sentences in y_ID) x (num frames) with 1 indicating the frame label for a predicate in its context sentence 184 | 185 | numSentInY = len(y_ID) 186 | numFrames = len(self.lexicon.get_all_frame_ids()) 187 | y_interactionLabels = np.zeros([numSentInY, numFrames], dtype = np.float32) 188 | 189 | for i in range(numSentInY): 190 | y_interactionLabels[i, y_ID[i]] = 1. 191 | 192 | return y_interactionLabels -------------------------------------------------------------------------------- /simpleFrameId/config.py: -------------------------------------------------------------------------------- 1 | class Config: # Container class for configurations 2 | def __init__(self, clf, feature_extractor, lexicon, vsm, multiword_averaging, 3 | all_unknown, num_components, max_sampled, num_epochs): 4 | self.clf = clf 5 | self.feat_extractor = feature_extractor 6 | self.lexicon = lexicon 7 | self.vsm = vsm 8 | self.multiword_averaging = multiword_averaging 9 | self.all_unknown = all_unknown 10 | self.num_components = num_components 11 | self.max_sampled = max_sampled 12 | self.num_epochs = num_epochs 13 | 14 | def get_clf(self): 15 | return self.clf 16 | 17 | def get_feat_extractor(self): 18 | return self.feat_extractor 19 | 20 | def get_lexicon(self): 21 | return self.lexicon 22 | 23 | def get_vsm(self): 24 | return self.vsm 25 | 26 | def get_multiword_averaging(self): 27 | return self.multiword_averaging 28 | 29 | def get_all_unknown(self): 30 | return self.all_unknown 31 | 32 | def get_num_components(self): 33 | return self.num_components 34 | 35 | def get_max_sampled(self): 36 | return self.max_sampled 37 | 38 | def get_num_epochs(self): 39 | return self.num_epochs 40 | 41 | def __str__(self): 42 | return "c_"+self.clf.__name__+"__"+"f_"+self.feat_extractor.__name__+"__"+\ 43 | "l_"+(self.lexicon if self.lexicon is not None else "NA") +"__"+"vsm_"+\ 44 | (self.vsm if self.vsm is not None else "NA")+\ 45 | "__"+"MWA_"+str(self.multiword_averaging)+"__unk_"+str(self.all_unknown)+\ 46 | "__comp_"+str(self.num_components)+"__samp_"+str(self.max_sampled)+"__ep_"+str(self.num_epochs) 47 | -------------------------------------------------------------------------------- /simpleFrameId/data.py: -------------------------------------------------------------------------------- 1 | import codecs, sys 2 | from graph import DependencyGraph 3 | 4 | # Data management routines 5 | 6 | 7 | def fix_tid(src_tid, sep): # fixes and unrolls the offsets 8 | if sep not in src_tid: 9 | fixed_span = [str(int(src_tid)+1)] 10 | else: 11 | vals = src_tid.split(sep) 12 | fixed_span = [str(int(val)+1) for val in vals] 13 | 14 | unrolled_span = [] # unroll spans, e.g. 
2:5 -> [2,3,4,5]; 6_7_9 -> [6,7,8,9] 15 | if len(fixed_span) <= 1: 16 | return tuple([int(i) for i in fixed_span]) 17 | else: 18 | for x in range(len(fixed_span)-1): 19 | for y in range(int(fixed_span[x]), int(fixed_span[x+1])+1): 20 | unrolled_span += [y] 21 | return tuple(set(sorted([int(i) for i in unrolled_span]))) 22 | 23 | 24 | def collect_srl_data(in_fes): # load SRL data (~frame.elements). All the offsets are shifted by 1! 25 | srl_data = {} # {sentence_id: {fe_id: [[fee_frame, fee_lemmapos, {role: role_span}], [fee_frame2, {role: role_span}], ...]} 26 | for line in in_fes: 27 | line = line.strip().split("\t") 28 | fee_tid = fix_tid(line[5], "_") # predicate offsets are given as tid_tid_tid_tid 29 | fee_frame = line[3] 30 | fee_lemmapos = line[4].lower() 31 | sid = int(line[7]) 32 | role_info = line[8:] 33 | srl_data[sid] = srl_data.get(sid, {}) 34 | srl_data[sid][fee_tid] = srl_data[sid].get(fee_tid, []) 35 | fee_info = [] # ugly but so is the data! Multiple fee possible on single span 36 | fee_info += [fee_frame] 37 | fee_info += [fee_lemmapos] 38 | 39 | role_dict = {} 40 | for x in range(0, len(role_info), 2): 41 | role_dict[role_info[0]] = fix_tid(role_info[1], ":") # role offsets are given as start:end 42 | fee_info += [role_dict] 43 | srl_data[sid][fee_tid] += [fee_info] 44 | return srl_data 45 | 46 | 47 | def collect_sentence_data(in_sentences): # load parse data (~all.lemma.tags) 48 | sid = 0 49 | sentences = {} 50 | for line in in_sentences: 51 | line = line.strip() 52 | if line: 53 | line = line.split("\t") 54 | num_tok = int(line[0]) 55 | line = line[1:] 56 | data = [line[x*num_tok:x*num_tok+num_tok] for x in range(0, len(line)/num_tok)] # TODO list comprehension ninja required here 57 | sentences[sid] = {} 58 | try: 59 | tid = 1 60 | for form, pos, dep, head, _, lemma in zip(*data): 61 | sentences[sid][tid] = {} 62 | sentences[sid][tid]["form"] = form 63 | sentences[sid][tid]["pos"] = pos 64 | sentences[sid][tid]["dep"] = dep 65 | sentences[sid][tid]["head"] = int(head) 66 | sentences[sid][tid]["lemma"] = lemma 67 | tid += 1 68 | except Exception: 69 | print "Malformed parse data in sentence", sid 70 | sentences[sid] = None 71 | finally: 72 | sid += 1 73 | return sentences 74 | 75 | 76 | def merge_to_graph(srl_data, sentences, verbose=False): # zip sentence and SRL data together and turn them into a graph 77 | for sid in sentences: 78 | if sid in srl_data: 79 | sentence = sentences[sid] 80 | if sentence is not None: 81 | nodes = {tid: sentence[tid]["form"] for tid in sentence} 82 | edges = [(sentence[tid]["head"], tid, sentence[tid]["dep"]) for tid in sentence] 83 | srl = srl_data[sid] 84 | for pred_tid in srl: 85 | for pred_info in srl[pred_tid]: 86 | g = DependencyGraph(nodes, edges) 87 | frame, lemmapos, roles = pred_info 88 | roles_by_tid = {} 89 | for (x, y) in roles.items(): 90 | for role_tid in y: 91 | roles_by_tid[int(role_tid)] = x 92 | try: 93 | g.add_srl((pred_tid, frame, lemmapos), roles_by_tid) 94 | yield g 95 | except Exception: 96 | print "SRL data error in sentence", sid, sys.exc_info()[0] 97 | if verbose: 98 | print "pred:", pred_tid, frame, lemmapos 99 | print roles_by_tid 100 | print g.pretty() 101 | 102 | 103 | # This is the method you are looking for 104 | def get_graphs(src_sentences, src_fes, verbose=False): # files in, graphs out 105 | i = 0 106 | with codecs.open(src_sentences, "r", "utf-8") as in_sentences: 107 | with codecs.open(src_fes, "r", "utf-8") as in_fes: 108 | srl_data = collect_srl_data(in_fes) 109 | sentences = 
collect_sentence_data(in_sentences) 110 | graphs = [x for x in merge_to_graph(srl_data, sentences, verbose)] 111 | print src_sentences.split("/")[-1], src_fes.split("/")[-1], "labeled:", len(srl_data), "parsed:", len(sentences), "graphs:", len(graphs) 112 | for graph in graphs: 113 | graph.gid = i 114 | i += 1 115 | return graphs 116 | 117 | 118 | -------------------------------------------------------------------------------- /simpleFrameId/evaluation.py: -------------------------------------------------------------------------------- 1 | # Evaluation routines 2 | 3 | 4 | def acc(correct, total): 5 | return 1.0 * correct / total if total != 0 else 0 6 | 7 | 8 | class Score: 9 | def __init__(self, skip_unknown_frames=True): 10 | self.total = 0 11 | self.correct = 0 12 | self.total_ambig = 0 13 | self.correct_ambig = 0 14 | self.total_unambig = 0 15 | self.correct_unambig = 0 16 | 17 | self.total_unknown = 0 18 | self.correct_unknown = 0 19 | 20 | # if the frame is missing in the lexicon AND in the training data, there is no system that will predict it. 21 | self.skip_unknown_frames = skip_unknown_frames 22 | 23 | def consume(self, correct, ambig, unknown, gold_frame): 24 | if self.skip_unknown_frames and gold_frame == -1: 25 | pass 26 | else: 27 | self.total += 1 28 | self.correct += int(correct) 29 | 30 | self.total_ambig += int(ambig) 31 | self.correct_ambig += int(ambig & correct) 32 | 33 | self.total_unambig += int(not ambig) 34 | self.correct_unambig += int(correct & (not ambig)) 35 | 36 | self.total_unknown += int(unknown) 37 | self.correct_unknown += int(unknown & correct) 38 | 39 | def report_accuracies(self): 40 | print "Acc", acc(self.correct, self.total) 41 | print "Ambig", acc(self.correct_ambig, self.total_ambig) 42 | print "Unambig", acc(self.correct_unambig, self.total_unambig) 43 | print "Unknown", acc(self.correct_unknown, self.total_unknown) 44 | 45 | def report_counts(self): 46 | print "Total", self.total 47 | print "Correct", self.correct 48 | print "Total_ambig", self.total_ambig 49 | print "Correct_ambig", self.correct_ambig 50 | print "Total_unambig", self.total_unambig 51 | print "Correct_unambig", self.correct_unambig 52 | print "Total_unknown", self.total_unknown 53 | print "Correct_unknown", self.correct_unknown 54 | 55 | def report(self): 56 | print "==========================" 57 | self.report_accuracies() 58 | self.report_counts() 59 | print "==========================" 60 | 61 | -------------------------------------------------------------------------------- /simpleFrameId/extras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | # Extra classes for managing external resources 5 | 6 | 7 | class Lexicon: # Lexicon manager. 
Stores information about lemma.pos -> frame mappings 8 | def __init__(self): 9 | self.frameLexicon = {} 10 | self.frameToId = {} 11 | self.idToFrame = {} 12 | self.source = "NA" 13 | 14 | def get_id(self, frame): 15 | if frame not in self.frameToId: 16 | print "Unknown frame", frame, "assigning id=-1" 17 | return self.frameToId.get(frame, -1) 18 | 19 | def get_available_frame_ids(self, lemmapos): 20 | return [self.frameToId[x] for x in self.frameLexicon.get(lemmapos, [])] 21 | 22 | def get_all_frame_ids(self): 23 | return list(self.idToFrame.keys()) 24 | 25 | def get_frame(self, id): 26 | return self.idToFrame.get(id, "UNKNOWN_FRAME") 27 | 28 | # Load from pre-defined lexicon in format [frame \t lemmapos] 29 | def load_from_list(self, src): 30 | with codecs.open(src, "r", "utf-8") as f: 31 | frames = [] 32 | for line in f: 33 | frame, lemmapos = line.strip().rstrip().split("\t") 34 | self.frameLexicon[lemmapos] = self.frameLexicon.get(lemmapos, []) + [frame] 35 | frames += [frame] 36 | frames = list(set(frames)) 37 | self.frameToId = {frames[i]:i for i in range(len(frames))} 38 | self.idToFrame = {y:x for (x,y) in self.frameToId.items()} 39 | self.source = src.split("/")[-1] 40 | 41 | def is_unknown(self, lemmapos): 42 | return lemmapos not in self.frameLexicon 43 | 44 | def is_ambiguous(self, lemmapos): 45 | return len(self.frameLexicon.get(lemmapos, []))>1 46 | 47 | # Load from training data 48 | def load_from_graphs(self, g_train): 49 | frames = [] 50 | for g in g_train: 51 | predicate = g.get_predicate_head() 52 | lemmapos = predicate["lemmapos"] 53 | frame = predicate["frame"] 54 | self.frameLexicon[lemmapos] = self.frameLexicon.get(lemmapos, []) + [frame] 55 | frames += [frame] 56 | frames = list(set(frames)) 57 | self.frameToId = {frames[i]: i for i in range(len(frames))} 58 | self.idToFrame = {y: x for (x, y) in self.frameToId.items()} 59 | self.source = "training_data" 60 | 61 | 62 | class VSM: 63 | def __init__(self, src): 64 | self.map = {} 65 | self.dim = None 66 | self.source = src.split("/")[-1] if src is not None else "NA" 67 | # create dictionary for mapping from word to its embedding 68 | if src is not None: 69 | with open(src) as f: 70 | i = 0 71 | for line in f: 72 | word = line.split()[0] 73 | embedding = line.split()[1:] 74 | self.map[word] = np.array(embedding, dtype=np.float32) 75 | i += 1 76 | self.dim = len(embedding) 77 | else: 78 | self.dim = 1 79 | 80 | def get(self, word): 81 | word = word.lower() 82 | if word in self.map: 83 | return self.map[word] 84 | else: 85 | return np.zeros(self.dim, dtype=np.float32) -------------------------------------------------------------------------------- /simpleFrameId/globals.py: -------------------------------------------------------------------------------- 1 | # pretrained embeddings 2 | EMBEDDINGS_LEVY_DEPS_300 = 'deps.words.txt' # 174.015 words, 300 dim 3 | 4 | # lexicons 5 | LEXICON_FULL_BRACKETS_FIX = "fn1.5_full_lexicon_expanded" 6 | 7 | # corpora 8 | # full training sets 9 | CORPUS_DAS_TRAIN = "train-das" 10 | CORPORA_TRAIN = [CORPUS_DAS_TRAIN] 11 | 12 | #test sets 13 | CORPUS_DAS_TEST = "test-das" 14 | CORPUS_YAGS_TEST = "test-yags" 15 | CORPUS_YAGS_POSFIX_SPELL_TEST = "test-yags-posfix-spell" 16 | CORPUS_YAGS_POSFIX_TEST = "test-yags-posfix" 17 | CORPUS_MASC_TEST = "test-masc" 18 | CORPUS_TW_G_TEST = "test-tw-g" 19 | COPRUS_TW_M_TEST = "test-tw-m" 20 | CORPUS_TW_S_TEST = "test-tw-s" 21 | CORPORA_TEST = [CORPUS_DAS_TEST, CORPUS_YAGS_POSFIX_SPELL_TEST, CORPUS_YAGS_POSFIX_TEST, CORPUS_YAGS_TEST, 22 | CORPUS_MASC_TEST, 
CORPUS_TW_G_TEST, COPRUS_TW_M_TEST, CORPUS_TW_S_TEST] 23 | 24 | CORPORA_ALL = CORPORA_TRAIN + CORPORA_TEST 25 | -------------------------------------------------------------------------------- /simpleFrameId/graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | class DependencyGraph: 4 | def __init__(self, nodes, edges): 5 | """ Initialize a dependency graph from a list of nodes and a list of edges 6 | Nodes are represented as a dictionary {node_id:word, ...} 7 | Edges are a list of triples [(src_id, tgt_id, label), ...] """ 8 | self.G = nx.DiGraph() 9 | for node_id in nodes: 10 | self.G.add_node(node_id, word=nodes[node_id]) 11 | self.G.add_node(0, word="ROOT") 12 | for edge in edges: 13 | label = edge[2] 14 | # add prepositions to labels 15 | if label == 'prep': 16 | label += "_" + self.G.node[edge[1]]["word"].lower() 17 | self.G.add_edge(edge[0], edge[1], label=label) 18 | 19 | self.predicate_head = None 20 | self.predicate_nodes = None 21 | self.roles = None 22 | self.sent = " ".join(nodes[nid] for nid in sorted(list(nodes.keys()))) 23 | self.gid = None 24 | 25 | def add_srl(self, predicate_node, role_nodes): 26 | """ Add SRL information to the graph 27 | Predicate is specified as a tuple (node_ids, sense, lemmapos) 28 | Roles are specified as a dictionary {node_id:role, ...} 29 | This can be done only once, since only one predicate-argument structure at a time is considered """ 30 | self.predicate_nodes = [] 31 | if (self.predicate_head is not None) or (self.roles is not None): 32 | raise Exception("Each graph must contain only one predicate-argument structure") 33 | for x in predicate_node[0]: 34 | self.G.node[int(x)]["frame"] = predicate_node[1] 35 | self.G.node[int(x)]["lemmapos"] = predicate_node[2] 36 | self.predicate_nodes += [int(x)] 37 | self.predicate_head = predicate_node[0][0] 38 | self.roles = [] 39 | node_groups = {} #group nodes by role 40 | for node_id in role_nodes: 41 | node_groups[role_nodes[node_id]] = node_groups.get(role_nodes[node_id], []) + [node_id] 42 | for role in node_groups: 43 | head = self.get_head(node_groups[role]) 44 | self.G.node[head]["role"] = role 45 | self.roles += [head] 46 | 47 | def pretty(self): 48 | """ Pretty-print the graph """ 49 | s = "" 50 | for n in self.G.nodes(): 51 | if self.G.node[n] != {}: 52 | gid = str(self.gid) if self.gid!=None else "NOID" 53 | word = self.G.node[n]["word"] 54 | head = self.G.predecessors(n)[0] if len(self.G.predecessors(n)) > 0 else "_" 55 | dep_label = self.G[head][n]["label"] if len(self.G.predecessors(n)) > 0 else "_" 56 | role = self.G.node[n].get("role", "_") 57 | pred = self.G.node[n].get("frame", "_") 58 | s += "\t".join([x for x in [str(gid), str(n), word, str(head), dep_label, role, pred]])+"\n" 59 | return s 60 | 61 | def get_predicate_head(self): 62 | return self.G.node[self.predicate_head] 63 | 64 | def get_predicate_node_words(self): 65 | return [self.G.node[x]["word"].lower() for x in self.predicate_nodes] 66 | 67 | def get_direct_dependents(self, node): 68 | """ Get direct dependents of a node """ 69 | return self.G.successors(node) 70 | 71 | def get_path(self, src, tgt): 72 | """ Get path from the source node (id) to the target node (id) 73 | Path is represented as a list of dependency relations concatenated by "->" """ 74 | edges = None 75 | if tgt in self.G.predecessors(src) and tgt!=0: # don't want the ROOT 76 | return "-1" # the parent relation 77 | try: 78 | edges = nx.shortest_path(self.G, src, tgt) 79 | except 
nx.exception.NetworkXNoPath: 80 | edges = None 81 | finally: 82 | if edges is not None: 83 | dep_labels = [self.G[edges[n]][edges[n + 1]]["label"] for n in range(len(edges) - 1)] 84 | return "->".join(dep_labels) 85 | else: 86 | return None 87 | 88 | def create_pathmap(self): 89 | """ Internal function that calculates paths between all possible node pairs in the graph """ 90 | self.pathmap = {} 91 | self.all_paths = [] 92 | for n1 in self.G.nodes(): 93 | self.pathmap[n1] = {} 94 | for n2 in self.G.nodes(): 95 | if n1 != n2: 96 | path = self.get_path(n1, n2) 97 | if path is not None: 98 | p = self.get_path(n1, n2) 99 | self.pathmap[n1][n2] = p 100 | self.all_paths += [p] 101 | self.all_paths = set(self.all_paths) 102 | 103 | def find_node(self, src, path): 104 | """ Find node in a graph given the source and the path """ 105 | res = [] 106 | if path == '': 107 | return [src] 108 | if path not in self.all_paths: 109 | return None 110 | for tgt in self.G.nodes(): 111 | if tgt != src: 112 | if self.pathmap[src] is not None: 113 | if tgt in self.pathmap[src]: 114 | if self.pathmap[src][tgt] == path: 115 | res += [tgt] 116 | return res if len(res) > 0 else None 117 | 118 | def get_node_label(self, 119 | node_id): 120 | """ Get node label given the node id 121 | If it's a preposition, take the noun it points to! """ 122 | in_rel = self.G.in_edges(node_id) 123 | if in_rel is not None and len(in_rel)>0: 124 | label = self.G[in_rel[0][0]][in_rel[0][1]]["label"] # check the label 125 | if label.startswith("prep"): 126 | succ = self.G.successors(in_rel[0][1]) 127 | if succ is None or len(succ) == 0: 128 | return "#ERR" # no successor? That's weird! 129 | else: 130 | pobj = self.G.successors(in_rel[0][1])[0] # here we assume that a preposition has only one successor, the pobj 131 | return self.G.node[pobj]["word"] 132 | return self.G.node[node_id]["word"] 133 | 134 | def get_head(self, nodes): 135 | """ Get the head node for a role span. 136 | First, try to find a node with outgoing arc. 137 | If none found, pick the node with most dependents inside the span """ 138 | head = None # leftmost node is default 139 | if len(nodes) == 1: 140 | head = nodes[0] 141 | else: 142 | for node_id in nodes: 143 | parent = self.G.predecessors(node_id)[0] 144 | if parent not in nodes: 145 | head = node_id 146 | break 147 | return head 148 | -------------------------------------------------------------------------------- /simpleFrameId/main.py: -------------------------------------------------------------------------------- 1 | from globals import * 2 | from data import get_graphs 3 | from extras import Lexicon, VSM 4 | from representation import DependentsBowMapper, SentenceBowMapper, DummyMapper 5 | from classifier import SharingDNNClassifier, DataMajorityBaseline, LexiconMajorityBaseline, WsabieClassifier 6 | from evaluation import Score 7 | from reporting import ReportManager 8 | from config import Config 9 | from resources import ResourceManager 10 | import time 11 | from numpy import random 12 | 13 | HOME = "/home/local/UKP/martin/repos/frameID/" # adjust accordingly 14 | 15 | if __name__ == "__main__": 16 | 17 | random.seed(4) # fix the random seed 18 | 19 | vsms = [EMBEDDINGS_LEVY_DEPS_300] # vector space model to use 20 | lexicons = [LEXICON_FULL_BRACKETS_FIX] # lexicon to use (mind the all_unknown setting!) 
21 | multiword_averaging = [False] # treatment of multiword predicates, false - use head embedding, true - use avg 22 | all_unknown = [False, True] # makes the lexicon treat all LU as unknown, corresponds to the no-lex setting 23 | 24 | # WSABIE params 25 | num_components = [1500] 26 | max_sampled = [10] # maximum number of negative samples used during WARP fitting 'warp' 27 | num_epochs = [500] 28 | 29 | configs = [] 30 | for lexicon in lexicons: 31 | for all_unk in all_unknown: 32 | # DummyMapper doesn't do anything 33 | configs += [Config(DataMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)] 34 | configs += [Config(LexiconMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)] 35 | 36 | # Add configurations for NN classifiers 37 | for lexicon in lexicons: 38 | for vsm in vsms: 39 | for mwa in multiword_averaging: 40 | for all_unk in all_unknown: 41 | configs += [Config(SharingDNNClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)] 42 | configs += [Config(SharingDNNClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)] 43 | 44 | # Add configurations for WSABIE classifiers 45 | for lexicon in lexicons: 46 | for vsm in vsms: 47 | for mwa in multiword_averaging: 48 | for all_unk in all_unknown: 49 | for num_comp in num_components: 50 | for max_sampl in max_sampled: 51 | for num_ep in num_epochs: 52 | configs += [Config(WsabieClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)] 53 | configs += [Config(WsabieClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)] 54 | 55 | print "Starting resource manager" 56 | sources = ResourceManager(HOME) 57 | 58 | print "Initializing reporters" 59 | reports = ReportManager(sources.out) 60 | 61 | print "Running the experiments!" 
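    # Note (added comment): with the settings above as given (one VSM, one lexicon,
    # two values for all_unknown, and a single value for each WSABIE parameter),
    # the loops build 4 baseline + 4 DNN + 4 WSABIE configurations = 12 configurations;
    # combined with 1 training corpus and 8 test corpora this amounts to
    # 12 * 1 * 8 = 96 runs, which is what the following lines compute and print.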
62 | runs = len(configs)*len(CORPORA_TRAIN)*len(CORPORA_TEST) 63 | print len(configs), "configurations, ", len(CORPORA_TRAIN)*len(CORPORA_TEST), " train-test pairs -> ", \ 64 | runs, " runs" 65 | 66 | current_train = 0 67 | current_config = 0 68 | current_test = 0 69 | for corpus_train in CORPORA_TRAIN: 70 | current_train += 1 71 | current_config = 0 72 | 73 | g_train = get_graphs(*sources.get_corpus(corpus_train)) 74 | reports.conll_reporter_train.report(g_train) 75 | 76 | for conf in configs: 77 | current_config += 1 78 | start_time = time.time() 79 | 80 | lexicon = Lexicon() 81 | # go to configuration, check which lexicon is needed, locate the lexicon in FS, load the lexicon 82 | lexicon.load_from_list(sources.get_lexicon(conf.get_lexicon())) 83 | reports.lexicon_reporter.report(lexicon) 84 | 85 | # same for VSM 86 | vsm = VSM(sources.get_vsm(conf.get_vsm())) 87 | mapper = conf.get_feat_extractor()(vsm, lexicon) 88 | 89 | # prepare the data 90 | X_train, y_train, lemmapos_train, gid_train = mapper.get_matrix(g_train) 91 | 92 | # train the model 93 | clf = conf.get_clf()(lexicon, conf.get_all_unknown(), conf.get_num_components(), conf.get_max_sampled(), 94 | conf.get_num_epochs()) 95 | clf.train(X_train, y_train, lemmapos_train) 96 | 97 | current_test = 0 98 | for corpus_test in CORPORA_TEST: 99 | score = Score() # storage for scores 100 | score_v = Score() # storage for verb-only scores 101 | score_known = Score() # storage for known lemma-only scores 102 | 103 | start_time = time.time() 104 | 105 | reports.set_config(conf, corpus_train, corpus_test) 106 | 107 | current_test += 1 108 | 109 | # prepare test data 110 | g_test = get_graphs(*sources.get_corpus(corpus_test)) 111 | reports.conll_reporter_test.report(g_test) 112 | X_test, y_test, lemmapos_test, gid_test = mapper.get_matrix(g_test) 113 | 114 | # predict and compare 115 | for x, y_true, lemmapos, gid, g in zip(X_test, y_test, lemmapos_test, gid_test, g_test): 116 | y_predicted = clf.predict(x, lemmapos) 117 | correct = y_true == y_predicted 118 | 119 | score.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 120 | if lemmapos.endswith(".v"): 121 | score_v.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 122 | if not lexicon.is_unknown(lemmapos): 123 | score_known.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 124 | 125 | reports.result_reporter.report(gid, g, lemmapos, y_predicted, y_true, lexicon) 126 | reports.summary_reporter.report(corpus_train, corpus_test, conf, score, time.time() - start_time) 127 | reports.summary_reporter_v.report(corpus_train, corpus_test, conf, score_v, time.time() - start_time) 128 | reports.summary_reporter_known.report(corpus_train, corpus_test, conf, score_known, time.time() - start_time) 129 | 130 | print "============ STATUS: - train", current_train, "/", len(CORPORA_TRAIN), \ 131 | "conf", current_config, "/", len(configs),\ 132 | "test", current_test, "/", len(CORPORA_TEST) 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /simpleFrameId/reporting.py: -------------------------------------------------------------------------------- 1 | import codecs, os, shutil 2 | from evaluation import acc 3 | 4 | # Reporting classes 5 | 6 | class ReportManager: 7 | def __init__(self, report_folder): 8 | if os.path.exists(report_folder): 9 | shutil.rmtree(report_folder) 10 | os.makedirs(report_folder) 11 | 
self.report_folder = report_folder 12 | self.result_reporter = ResultReporter(os.path.join(self.report_folder, "results")) 13 | self.lexicon_reporter = LexiconReporter(os.path.join(self.report_folder, "lexicon")) 14 | self.conll_reporter_train = ConllReporter(os.path.join(self.report_folder, "train.conll")) 15 | self.conll_reporter_test = ConllReporter(os.path.join(self.report_folder, "test.conll")) 16 | self.summary_reporter = ResultSummaryReporter(os.path.join(self.report_folder, "summary")) 17 | self.summary_reporter_v = ResultSummaryReporter(os.path.join(self.report_folder, "summary_v")) 18 | self.summary_reporter_known = ResultSummaryReporter(os.path.join(self.report_folder, "summary_known")) 19 | 20 | def set_config(self, config, train, test): 21 | self.result_reporter = ResultReporter(os.path.join(self.report_folder, "results_"+train+"_"+test+"_"+str(config))) 22 | self.lexicon_reporter = LexiconReporter(os.path.join(self.report_folder, "lexicon_"+config.lexicon if config.lexicon is not None else "NA")) 23 | 24 | 25 | class Reporter(object): 26 | def __init__(self, out_path): 27 | self.out = codecs.open(out_path, "w", "utf-8") 28 | if hasattr(self, 'columns'): 29 | self.write_header() 30 | 31 | def write_header(self): 32 | self.out.write("\t".join(self.columns)+"\n") 33 | def close(self): 34 | self.out.close() 35 | 36 | 37 | class ResultReporter(Reporter): 38 | def __init__(self, out_path): 39 | self.columns = ["gid", "sent", "lemmapos", "pos", "predicted_id", "true_id", "predicted_frame", "true_frame", "ambig", "unknown"] 40 | super(self.__class__, self).__init__(out_path) 41 | 42 | def report(self, instance_id, g, lemmapos, predicted, true, lexicon): 43 | self.out.write("\t".join([str(instance_id), g.sent, 44 | lemmapos, lemmapos.split(".")[1], 45 | str(predicted), str(true), lexicon.get_frame(predicted), lexicon.get_frame(true), 46 | str(lexicon.is_ambiguous(lemmapos)), str(lexicon.is_unknown(lemmapos))])+"\n") 47 | 48 | 49 | class ResultSummaryReporter(Reporter): 50 | def __init__(self, out_path): 51 | self.columns = ["train", "test", "clf", "feats", "lex", "vsm", "MWE_avg", "all_unk", "num_components", "max_sampled", "num_epochs", "total", "correct", "ambig", "ambig_correct", "unambig", "unambig_correct", "unk", "unk_correct", 52 | "total_acc", "ambig_acc", "unambig_acc", "unk_acc", "time"] 53 | super(self.__class__, self).__init__(out_path) 54 | 55 | def report(self, train, test, config, score, time_delta): 56 | self.out.write( 57 | "\t".join([train, test, config.clf.__name__, config.feat_extractor.__name__, config.lexicon if config.lexicon is not None else "NA", 58 | config.vsm if config.vsm is not None else "NA", str(config.multiword_averaging), str(config.all_unknown), 59 | str(config.num_components) if config.num_components is not None else "NA", 60 | str(config.max_sampled) if config.max_sampled is not None else "NA", 61 | str(config.num_epochs) if config.num_epochs is not None else "NA", 62 | str(score.total), str(score.correct), str(score.total_ambig), str(score.correct_ambig), str(score.total_unambig), 63 | str(score.correct_unambig), str(score.total_unknown), str(score.correct_unknown), 64 | str(acc(score.correct, score.total)), str(acc(score.correct_ambig, score.total_ambig)), 65 | str(acc(score.correct_unambig, score.total_unambig)), str(acc(score.correct_unknown, score.total_unknown)), 66 | str(time_delta)])+"\n" 67 | ) 68 | 69 | 70 | class LexiconReporter(Reporter): 71 | def __init__(self, out_path): 72 | self.columns = ["lemma", "frames"] 73 | 
super(self.__class__, self).__init__(out_path) 74 | 75 | def report(self, lexicon): 76 | for lemma in lexicon.frameLexicon: 77 | self.out.write("\t".join([lemma, ", ".join([str(lexicon.get_id(frame))+": "+frame for frame in lexicon.frameLexicon[lemma]])]) + "\n") 78 | 79 | 80 | class ConllReporter(Reporter): 81 | def report(self, graphs): 82 | for g in graphs: 83 | self.out.write(g.pretty() + "\n") -------------------------------------------------------------------------------- /simpleFrameId/representation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Feature mappers convert graphs into matrices given lexicon and vsm 4 | 5 | 6 | class FeatureMapper: 7 | def __init__(self, vsm, lexicon, multiword_averaging=False): 8 | self.vsm = vsm 9 | self.lexicon = lexicon 10 | self.multiword_averaging = multiword_averaging 11 | 12 | def get_repr(self, graph): 13 | raise NotImplementedError("Not implemented") 14 | 15 | def get_repr_sent(self, words, predicate_id): 16 | raise NotImplementedError("Not implemented") 17 | 18 | def get_matrix(self, graph_list): 19 | X = [] 20 | y = [] 21 | lemmapos = [] 22 | gid = [] 23 | for g in graph_list: 24 | X += [self.get_repr(g)] 25 | frame = g.get_predicate_head()["frame"] 26 | y += [self.lexicon.get_id(frame)] 27 | lemmapos += [g.get_predicate_head()["lemmapos"]] 28 | gid += [g.gid] 29 | X = np.vstack(X) 30 | y = np.array(y, dtype=np.int) 31 | return X, y, lemmapos, gid 32 | 33 | 34 | class DummyMapper(FeatureMapper): # Dummy mapper for cases where no features are needed, e.g. for majority baselines 35 | def get_repr(self, graph): 36 | return np.zeros(self.vsm.dim) 37 | 38 | 39 | def avg_embedding(wordlist, emb): 40 | res = [] 41 | for word in wordlist: 42 | word = word.lower() 43 | res += [emb.get(word)] 44 | return np.mean(res, axis=0) 45 | 46 | 47 | class SentenceBowMapper(FeatureMapper): 48 | def get_repr(self, graph): 49 | words = graph.sent.split(" ") 50 | if not self.multiword_averaging: 51 | predicate_head = graph.get_predicate_head() 52 | tgt_w = [predicate_head["word"].lower(), ] 53 | else: 54 | tgt_w = graph.get_predicate_node_words() 55 | return self.get_repr_sent(words, tgt_w) 56 | 57 | def get_repr_sent(self, words, tgt_w): 58 | return np.concatenate((avg_embedding(words, self.vsm), avg_embedding(tgt_w, self.vsm)), axis=0) 59 | 60 | 61 | class DependentsBowMapper(FeatureMapper): 62 | def get_repr(self, graph): 63 | predicate_head = graph.get_predicate_head() 64 | deps = graph.get_direct_dependents(graph.predicate_head) 65 | parent = graph.G.predecessors(graph.predicate_head) 66 | if parent is not None and len(parent)>0: 67 | deps += [parent[0]] 68 | words = [graph.G.node[n]["word"].lower() for n in deps] 69 | if not self.multiword_averaging: 70 | tgt_w = [predicate_head["word"].lower(), ] 71 | else: 72 | tgt_w = graph.get_predicate_node_words() 73 | return np.concatenate((avg_embedding(words, self.vsm), avg_embedding(tgt_w, self.vsm)), axis=0) -------------------------------------------------------------------------------- /simpleFrameId/resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Some basic resource management 4 | # Required folder structure: 5 | # - project_root 6 | # - out results 7 | # - srl_data data 8 | # - embeddings VSMs 9 | # - corpora training and test data 10 | # - lexicons lexicon lists 11 | 12 | class ResourceManager: 13 | def __init__(self, root): 14 | self.root = root 15 | self.out = 
os.path.join(self.root, "out") 16 | self.data = os.path.join(self.root, "srl_data") 17 | self.vsm_folder = os.path.join(self.data, "embeddings") 18 | self.corpora = os.path.join(self.data, "corpora") 19 | self.lexicons = os.path.join(self.data, "lexicons") 20 | 21 | def get_corpus(self, corpus_name): 22 | return (os.path.join(self.corpora, corpus_name+x) for x in [".all.lemma.tags", ".frame.elements"]) 23 | 24 | def get_lexicon(self, lexicon_name): 25 | return os.path.join(self.lexicons, lexicon_name) if lexicon_name is not None else None 26 | 27 | def get_vsm(self, vsm_name): 28 | return os.path.join(self.vsm_folder, vsm_name) if vsm_name is not None else None --------------------------------------------------------------------------------
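A minimal usage sketch (not part of the repository) showing how the pieces above fit together. It assumes a `ROOT` folder laid out as described in the README, with a corpus pair named `train-das` under `ROOT/srl_data/corpora`; the path is a placeholder:

```python
from resources import ResourceManager   # locates corpora, lexicons and embeddings under ROOT
from data import get_graphs             # reads a corpus pair into DependencyGraph objects
from extras import Lexicon              # lemma.pos -> frame mapping

sources = ResourceManager("/path/to/ROOT")              # placeholder project root
graphs = get_graphs(*sources.get_corpus("train-das"))   # *.all.lemma.tags + *.frame.elements
lexicon = Lexicon()
lexicon.load_from_graphs(graphs)                        # or: lexicon.load_from_list(sources.get_lexicon(...))
print lexicon.is_ambiguous("help.v"), lexicon.get_available_frame_ids("help.v")
```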