├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── data_example ├── Lexicon-example.tsv ├── README.txt ├── SRL-example.all.lemma.tags └── SRL-example.frame.elements └── simpleFrameId ├── __init__.py ├── check.py ├── classifier.py ├── config.py ├── data.py ├── evaluation.py ├── extras.py ├── globals.py ├── graph.py ├── main.py ├── reporting.py ├── representation.py └── resources.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2017 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | 6 | ------------------------------------------------------------------------------- 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | 20 | ------------------------------------------------------------------------------- 21 | Third party libraries licensing information: 22 | 23 | BSD License: 24 | - scikit-learn 25 | - networkx 26 | 27 | BSD-new License: 28 | - numpy 29 | 30 | MIT License: 31 | - keras 32 | 33 | Apache License v. 2.0 34 | - lightfm -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Out-of-domain FrameNet Semantic Role Labeling 2 | 3 | This code is an implementation of a simple frame identification approach (SimpleFrameId) described in the paper "Out-of-domain FrameNet Semantic Role Labeling". 4 | Please use the following citation: 5 | 6 | ``` 7 | @inproceedings{TUD-CS-2017-0011, 8 | title = {Out-of-domain FrameNet Semantic Role Labeling}, 9 | author = {Hartmann, Silvana and Kuznetsov, Ilia and Martin, Teresa and Gurevych, Iryna}, 10 | publisher = {Association for Computational Linguistics}, 11 | booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017)}, 12 | pages = {to appear}, 13 | month = apr, 14 | year = {2017}, 15 | location = {Valencia, Spain}, 16 | } 17 | ``` 18 | 19 | > **Abstract:** 20 | Domain dependence of NLP systems is one of the major obstacles to their application in large-scale text analysis, also restricting the applicability of FrameNet semantic role labeling (SRL) systems. Yet, current FrameNet SRL systems are still only evaluated on a single in-domain test set. For the first time, we study the domain dependence of FrameNet SRL on a wide range of benchmark sets. We create a novel test set for FrameNet SRL based on user-generated web text and find that the major bottleneck for out-of-domain FrameNet SRL is the frame identification step. To address this problem, we develop a simple, yet efficient system based on distributed word representations. Our system closely approaches the state-of-the-art in-domain while outperforming the best available frame identification system out-of-domain. 21 | 22 | Contact persons: Teresa Martin, martin@aiphes.tu-darmstadt.de; Ilia Kuznetsov, kuznetsov@ukp.informatik.tu-darmstadt.de 23 | 24 | https://www.ukp.tu-darmstadt.de/ 25 | 26 | https://www.tu-darmstadt.de/ 27 | 28 | 29 | Don't hesitate to send us an e-mail or report an issue, if something is broken (and it shouldn't be) or if you have further questions. 
30 | 31 | > This repository contains experimental software and is published for the sole purpose of giving additional background details on the respective publication. 32 | 33 | ## Project structure 34 | The implementation is a single package. The most important modules are: 35 | 36 | * `main.py` -- the entry point for experiments 37 | * `globals.py` -- global variables used in experiments 38 | * `classifier.py` -- the classifiers 39 | * `representation.py` -- representation builders 40 | 41 | The system requires a specific folder structure where the data is stored: 42 | * `ROOT` -- your project root (just a folder somewhere on your disk) 43 | * `ROOT/srl_data` -- source data 44 | * `ROOT/srl_data/corpora` -- input corpora 45 | * `ROOT/srl_data/embeddings` -- external VSMs 46 | * `ROOT/srl_data/lexicons` -- external lexicons 47 | * `ROOT/out` -- experiment results are stored here 48 | 49 | ## Requirements 50 | 51 | * Python 2.7 52 | * Python dependencies: keras, lightfm, sklearn, numpy, networkx 53 | 54 | ## Installation 55 | 56 | Install the dependencies, adjust the paths in `main.py` and `globals.py` accordingly, and run via `python main.py`. 57 | 58 | ### Parameter description 59 | 60 | * to define in `globals.py`: filenames for 61 | * pretrained embeddings, e.g. Levy dependency embeddings 62 | * FrameNet lexicon 63 | * train data 64 | * test data 65 | * to define in `main.py`: 66 | * `vsms` -- vector space model to use 67 | * `lexicons` -- lexicon to use (mind the `all_unknown` setting!) 68 | * `multiword_averaging` -- treatment of multiword predicates: false = use the head embedding, true = average over all predicate tokens 69 | * `all_unknown` -- makes the lexicon treat all LUs as unknown, corresponds to the no-lex setting 70 | * `num_components` -- for the WSABIE classifier: dimension of the learned latent representations 71 | * `max_sampled` -- for the WSABIE classifier: maximum number of negative samples used during WARP fitting 72 | * `num_epochs` -- for the WSABIE classifier: number of epochs to train the model 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /data_example/Lexicon-example.tsv: -------------------------------------------------------------------------------- 1 | Dominate_competitor dominate.v 2 | Dominate_competitor domination.n 3 | Dominate_competitor dominant.a 4 | Dominate_competitor strongman.n 5 | Intentionally_act action.n 6 | Intentionally_act do.v 7 | Intentionally_act step.n 8 | Intentionally_act act.v -------------------------------------------------------------------------------- /data_example/README.txt: -------------------------------------------------------------------------------- 1 | The system requires three kinds of input: 2 | 1. SRL data 3 | 2. Lexicon 4 | 3. VSM Lookup 5 | 6 | 1. SRL Data 7 | The default format for SRL data uses two types of files: sentence files and annotation files. 8 | *Sentence files* are tab-separated, one sentence per line, with POS tags, lemmas and dependency relations. 9 | The format is similar to CoNLL-2009 or MaltTab, with all columns merged into a single line: 10 | 11 | [# tokens][tokens][POS tags][dependency labels][dependency heads][O][lemmas] 12 | 13 | *Frame element files* are tab-separated with the following column semantics: 14 | 15 | [optional] 16 | [optional] 17 | [# of roles] 18 | [frame name] 19 | [lemma.pos] 20 | [position of the FEE in the sentence] 21 | [FEE string] 22 | [line# in the sentence file (incl. 0)] 23 | [role1] 24 | [position1] 25 | [role2] 26 | [position2] 27 | etc.
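As an illustration, the first line of SRL-example.frame.elements (included below),

0 0 3 Assistance help.v 6 helping 0 Benefited_party 7 Goal 8:9

maps onto these columns as follows: two optional leading fields (0, 0), the role count field (3), the frame name (Assistance), the lemma.pos of the frame-evoking element (help.v), the 0-based position of the FEE in the sentence (6, i.e. the token "helping"), the FEE string (helping), the line number of the sentence in the .all.lemma.tags file (0), followed by role/position pairs: Benefited_party on token 7 ("them") and Goal on the span 8:9 ("find jobs").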
28 | 29 | 2. Lexicon data 30 | Lexicon files are simple lists of frames and predicates that can evoke them, tab-separated, one pair per line. 31 | 32 | 3. VSM data 33 | We use the standard word embeddings format, where each line corresponds to a word followed by its vector representation. -------------------------------------------------------------------------------- /data_example/SRL-example.all.lemma.tags: -------------------------------------------------------------------------------- 1 | 26 In addition to that , by helping them find jobs , Goodwill reduced the state 's Public Support tab by an estimated $ 4 million . IN NN TO DT , IN VBG PRP VBP NNS , NNP VBD DT NN POS NNP NNP NN IN DT VBN $ CD CD . prep pobj prep pobj punct prep pcomp nsubj ccomp dobj punct nsubj ROOT det poss possessive nn nn dobj prep dep amod pobj number number punct 13 1 2 3 13 13 6 9 7 9 13 13 0 15 19 15 19 19 13 13 23 23 20 23 23 13 O O O O O O O O O O O O O O O O O O O O O O O O O O in addition to that , by help them find job , goodwill reduce the state ' public support tab by an estimate $ 4 million . -------------------------------------------------------------------------------- /data_example/SRL-example.frame.elements: -------------------------------------------------------------------------------- 1 | 0 0 3 Assistance help.v 6 helping 0 Benefited_party 7 Goal 8:9 2 | 0 0 3 Locating find.v 8 find 0 Perceiver 7 Sought_entity 9 3 | -------------------------------------------------------------------------------- /simpleFrameId/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UKPLab/eacl2017-oodFrameNetSRL/d30e23d724c911d001cc5ca8f28bdac86eee1ce4/simpleFrameId/__init__.py -------------------------------------------------------------------------------- /simpleFrameId/check.py: -------------------------------------------------------------------------------- 1 | from globals import * 2 | from data import get_graphs 3 | from resources import ResourceManager 4 | from reporting import ConllReporter 5 | 6 | def check_corpora_read_ok(sources, out): 7 | print "Checking datasets" 8 | 9 | # set corpora to test gere 10 | for corpus in [CORPUS_YAGS_TEST, CORPUS_DAS_TRAIN, CORPUS_DAS_TEST, 11 | CORPUS_YAGS_TEST, CORPUS_MASC_TEST, CORPUS_TW_G_TEST, COPRUS_TW_M_TEST, CORPUS_TW_S_TEST]: 12 | g = get_graphs(*sources.get_corpus(corpus), verbose=False) 13 | reporter = ConllReporter(out+corpus+".conll") 14 | reporter.report(g) 15 | 16 | 17 | if __name__ == "__main__": 18 | src = "your/path/here" 19 | root = ResourceManager(src) 20 | check_corpora_read_ok(root, "your/path/here/tmp") -------------------------------------------------------------------------------- /simpleFrameId/classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense 4 | from keras.utils.np_utils import to_categorical 5 | from collections import Counter 6 | 7 | from lightfm import LightFM 8 | # LightFM source code had to be hacked as it is buggy and does not say with which python version it actually should work 9 | # aMatrix.tocsr() --> sp.csr_matrix(aMatrix) 10 | # aMatrix.tocoo() --> sp.coo_matrix(aMatrix) 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | 14 | # Generic classifier, doesn't do much 15 | class Classifier: 16 | def __init__(self, lexicon, all_unknown=False, num_components=False, max_sampled=False, num_epochs=False 
): 17 | self.clf = None 18 | self.lexicon = lexicon 19 | self.all_unknown = all_unknown 20 | self.num_components = num_components 21 | self.max_sampled = max_sampled 22 | self.num_epochs = num_epochs 23 | 24 | def train(self, X, y, lemmapos): 25 | raise NotImplementedError("Not implemented, use child classes") 26 | def predict(self, X, lemmapos): 27 | raise NotImplementedError("Not implemented, use child classes") 28 | 29 | 30 | # Data-driven majority baseline 31 | class DataMajorityBaseline(Classifier): 32 | def train(self, X, y, lemmapos_list): 33 | self.majorityClasses = {} 34 | total_y = [] 35 | # get frame by LU counts from DATA. Not seen in data = doesn't exist 36 | for X_i, y_i, lemmapos_i in zip(X, y, lemmapos_list): 37 | self.majorityClasses[lemmapos_i] = self.majorityClasses.get(lemmapos_i, []) + [y_i] 38 | total_y += [y_i] 39 | 40 | uninformed_majority = Counter(total_y).most_common(1)[0][0] # uninformed majority for lemmas not seen in data 41 | 42 | # get top frame for each LU 43 | for lemmapos in self.majorityClasses: 44 | if len(self.majorityClasses.get(lemmapos, [])) == 0: 45 | self.majorityClasses[lemmapos] = uninformed_majority 46 | else: 47 | self.majorityClasses[lemmapos] = Counter(self.majorityClasses[lemmapos]).most_common(1)[0][0] 48 | 49 | self.majorityClasses["__UNKNOWN__"] = uninformed_majority 50 | print self.majorityClasses 51 | print "Majority baseline extracted, uninformed majority class is", uninformed_majority, ":", self.lexicon.idToFrame[uninformed_majority] 52 | 53 | def predict(self, X, lemmapos): 54 | if self.all_unknown: 55 | return self.majorityClasses["__UNKNOWN__"] 56 | return self.majorityClasses.get(lemmapos, self.majorityClasses["__UNKNOWN__"]) 57 | 58 | 59 | # Lexicon-driven majority baseline 60 | class LexiconMajorityBaseline(DataMajorityBaseline): 61 | def train(self, X, y, lemmapos_list): 62 | frame_counts = [] 63 | for y_i, lemmapos_i in zip(y, lemmapos_list): # collect TOTAL frame counts from data 64 | frame_counts += [y_i] 65 | 66 | frame_counts = Counter(frame_counts) 67 | 68 | self.majorityClasses = {} 69 | uninformed_majority = frame_counts.most_common(1)[0][0] 70 | self.majorityClasses["__UNKNOWN__"] = uninformed_majority 71 | 72 | for lemmapos in self.lexicon.frameLexicon: # for each lemma in LEXICON, determine most frequent frame among available, based on data 73 | available_frames = self.lexicon.get_available_frame_ids(lemmapos) 74 | available_frame_counts = Counter({f:frame_counts.get(f, 0) for f in available_frames}) # no frame in data - count set to 0 75 | self.majorityClasses[lemmapos] = available_frame_counts.most_common(1)[0][0] 76 | 77 | print "Majority baseline extracted, uninformed majority class is", uninformed_majority, ":", self.lexicon.idToFrame[uninformed_majority] 78 | 79 | 80 | # A simple NN-based classifier 81 | class SharingDNNClassifier(Classifier): 82 | def train(self, X, y, lemmapos_list): 83 | self.clf = Sequential() 84 | self.clf.add(Dense(256, input_dim=len(X[0]), activation='relu')) 85 | self.clf.add(Dense(100, activation='relu')) 86 | self.clf.add(Dense(output_dim=np.max(y)+1, activation='softmax')) # np.max()+1 because frames are 0-indexed 87 | 88 | self.clf.compile(optimizer='adagrad', 89 | loss='categorical_crossentropy', 90 | metrics=['accuracy']) 91 | 92 | self.clf.fit(X, to_categorical(y, np.max(y)+1), verbose=1, nb_epoch=100) 93 | 94 | def predict(self, X, lemmapos): 95 | available_frames = self.lexicon.get_available_frame_ids(lemmapos) # get available frames from lexicon 96 | ambig = 
self.lexicon.is_ambiguous(lemmapos) 97 | unknown = self.lexicon.is_unknown(lemmapos) # unknown = not in lexicon 98 | 99 | if unknown or self.all_unknown: # the all_unknown setting renders all lemma.pos unknown! 100 | available_frames = self.lexicon.get_all_frame_ids() # if the lemma.pos is unknown, search in all frames 101 | else: 102 | # if the LU is known and has only one frame, just return it. Even if there is no data for this LU (!) 103 | if not ambig: 104 | return available_frames[0] 105 | 106 | y = self.clf.predict(X.reshape((-1, len(X))))[0] 107 | # pick the best-scoring frame among available 108 | bestScore = None 109 | bestClass = None 110 | for cl in available_frames: 111 | score = y[cl] 112 | if bestScore is None or score >= bestScore: 113 | bestScore = score 114 | bestClass = cl 115 | return bestClass 116 | 117 | 118 | # classification with WSABIE latent representations 119 | class WsabieClassifier(Classifier): 120 | def train(self, X, y, lemmapos_list): 121 | 122 | # MODEL 123 | self.clf = LightFM(no_components = self.num_components, learning_schedule = 'adagrad', loss = 'warp', \ 124 | learning_rate = 0.05, epsilon = 1e-06, item_alpha = 0.0, user_alpha = 1e-6, \ 125 | max_sampled = self.max_sampled, random_state = None) 126 | 127 | # DATA 128 | # training data 129 | # X: list of vectors 130 | # each vector is the initial representation for a sentence (more precisely, for a predicate with context) 131 | # --> these are the user features in the training set 132 | # y: list of IDs for frames 133 | # the frame IDs are the labels for the representations 134 | # --> these are used to create the interaction matrix for the training set such that LightFM can deal with it 135 | # y_interactionLabels: interaction matrix is of size (num sentences in y) x (num frames) with 1 indicating the frame label for a predicate in its context sentence 136 | y_interactionLabels = self.createInteractionMatrix(y) 137 | 138 | # FIT 139 | self.clf = self.clf.fit(interactions = y_interactionLabels, user_features = X, item_features = None, \ 140 | sample_weight = None, epochs = self.num_epochs, num_threads = 2, verbose = True) 141 | 142 | def predict(self, X, lemmapos): 143 | # DATA 144 | # test data 145 | # X: list of vectors 146 | # each vector is the initial representation for a sentence (more precisely, for a predicate with context) 147 | # --> these are the user features in the test set 148 | X_reshape = X.reshape((-1, len(X))) 149 | 150 | # get projection matrices from trained MODEL 151 | user_embeddings_fromTraining = self.clf.user_embeddings 152 | item_embeddings_fromTraining = self.clf.item_embeddings 153 | 154 | # PREDICT 155 | # do the prediction for this new user via the dot product of the user feature X and the projection matrix user embeddings obtained during training 156 | embeddedNewUser = np.dot(X_reshape, user_embeddings_fromTraining) # now in the same space as the item embeddings obtained during training 157 | # use cosine similarity as similarity measure between the embedded test sentence and all the embeddings corresponding to frames 158 | similarity_to_all_frames = cosine_similarity(embeddedNewUser, item_embeddings_fromTraining)[0] 159 | 160 | available_frame_IDs = self.lexicon.get_available_frame_ids(lemmapos) # get available frame IDs for this lemma.pos from lexicon 161 | ambig = self.lexicon.is_ambiguous(lemmapos) # amiguous = can evoke more than one frame 162 | unknown = self.lexicon.is_unknown(lemmapos) # unknown = not in lexicon 163 | 164 | if unknown or self.all_unknown: # the 
all_unknown setting renders all lemma.pos unknown! 165 | available_frame_IDs = self.lexicon.get_all_frame_ids() # if the lemma.pos is unknown, search in all frames 166 | else: 167 | # if the lemma.pos is known and has only one frame, just return it. Even if there is no data for this lemma.pos. 168 | if not ambig: 169 | return available_frame_IDs[0] 170 | 171 | # pick the best-scoring frameID among available frameIDs 172 | bestScore = None 173 | best_frame_ID = None 174 | for frame_ID in available_frame_IDs: 175 | score = similarity_to_all_frames[frame_ID] 176 | if bestScore is None or score >= bestScore: 177 | bestScore = score 178 | best_frame_ID = frame_ID 179 | return best_frame_ID 180 | 181 | 182 | def createInteractionMatrix(self, y_ID): 183 | # interactionMatrix is of size (num sentences in y_ID) x (num frames) with 1 indicating the frame label for a predicate in its context sentence 184 | 185 | numSentInY = len(y_ID) 186 | numFrames = len(self.lexicon.get_all_frame_ids()) 187 | y_interactionLabels = np.zeros([numSentInY, numFrames], dtype = np.float32) 188 | 189 | for i in range(numSentInY): 190 | y_interactionLabels[i, y_ID[i]] = 1. 191 | 192 | return y_interactionLabels -------------------------------------------------------------------------------- /simpleFrameId/config.py: -------------------------------------------------------------------------------- 1 | class Config: # Container class for configurations 2 | def __init__(self, clf, feature_extractor, lexicon, vsm, multiword_averaging, 3 | all_unknown, num_components, max_sampled, num_epochs): 4 | self.clf = clf 5 | self.feat_extractor = feature_extractor 6 | self.lexicon = lexicon 7 | self.vsm = vsm 8 | self.multiword_averaging = multiword_averaging 9 | self.all_unknown = all_unknown 10 | self.num_components = num_components 11 | self.max_sampled = max_sampled 12 | self.num_epochs = num_epochs 13 | 14 | def get_clf(self): 15 | return self.clf 16 | 17 | def get_feat_extractor(self): 18 | return self.feat_extractor 19 | 20 | def get_lexicon(self): 21 | return self.lexicon 22 | 23 | def get_vsm(self): 24 | return self.vsm 25 | 26 | def get_multiword_averaging(self): 27 | return self.multiword_averaging 28 | 29 | def get_all_unknown(self): 30 | return self.all_unknown 31 | 32 | def get_num_components(self): 33 | return self.num_components 34 | 35 | def get_max_sampled(self): 36 | return self.max_sampled 37 | 38 | def get_num_epochs(self): 39 | return self.num_epochs 40 | 41 | def __str__(self): 42 | return "c_"+self.clf.__name__+"__"+"f_"+self.feat_extractor.__name__+"__"+\ 43 | "l_"+(self.lexicon if self.lexicon is not None else "NA") +"__"+"vsm_"+\ 44 | (self.vsm if self.vsm is not None else "NA")+\ 45 | "__"+"MWA_"+str(self.multiword_averaging)+"__unk_"+str(self.all_unknown)+\ 46 | "__comp_"+str(self.num_components)+"__samp_"+str(self.max_sampled)+"__ep_"+str(self.num_epochs) 47 | -------------------------------------------------------------------------------- /simpleFrameId/data.py: -------------------------------------------------------------------------------- 1 | import codecs, sys 2 | from graph import DependencyGraph 3 | 4 | # Data management routines 5 | 6 | 7 | def fix_tid(src_tid, sep): # fixes and unrolls the offsets 8 | if sep not in src_tid: 9 | fixed_span = [str(int(src_tid)+1)] 10 | else: 11 | vals = src_tid.split(sep) 12 | fixed_span = [str(int(val)+1) for val in vals] 13 | 14 | unrolled_span = [] # unroll spans, e.g. 
2:5 -> [2,3,4,5]; 6_7_9 -> [6,7,8,9] 15 | if len(fixed_span) <= 1: 16 | return tuple([int(i) for i in fixed_span]) 17 | else: 18 | for x in range(len(fixed_span)-1): 19 | for y in range(int(fixed_span[x]), int(fixed_span[x+1])+1): 20 | unrolled_span += [y] 21 | return tuple(set(sorted([int(i) for i in unrolled_span]))) 22 | 23 | 24 | def collect_srl_data(in_fes): # load SRL data (~frame.elements). All the offsets are shifted by 1! 25 | srl_data = {} # {sentence_id: {fe_id: [[fee_frame, fee_lemmapos, {role: role_span}], [fee_frame2, {role: role_span}], ...]} 26 | for line in in_fes: 27 | line = line.strip().split("\t") 28 | fee_tid = fix_tid(line[5], "_") # predicate offsets are given as tid_tid_tid_tid 29 | fee_frame = line[3] 30 | fee_lemmapos = line[4].lower() 31 | sid = int(line[7]) 32 | role_info = line[8:] 33 | srl_data[sid] = srl_data.get(sid, {}) 34 | srl_data[sid][fee_tid] = srl_data[sid].get(fee_tid, []) 35 | fee_info = [] # ugly but so is the data! Multiple fee possible on single span 36 | fee_info += [fee_frame] 37 | fee_info += [fee_lemmapos] 38 | 39 | role_dict = {} 40 | for x in range(0, len(role_info), 2): 41 | role_dict[role_info[0]] = fix_tid(role_info[1], ":") # role offsets are given as start:end 42 | fee_info += [role_dict] 43 | srl_data[sid][fee_tid] += [fee_info] 44 | return srl_data 45 | 46 | 47 | def collect_sentence_data(in_sentences): # load parse data (~all.lemma.tags) 48 | sid = 0 49 | sentences = {} 50 | for line in in_sentences: 51 | line = line.strip() 52 | if line: 53 | line = line.split("\t") 54 | num_tok = int(line[0]) 55 | line = line[1:] 56 | data = [line[x*num_tok:x*num_tok+num_tok] for x in range(0, len(line)/num_tok)] # TODO list comprehension ninja required here 57 | sentences[sid] = {} 58 | try: 59 | tid = 1 60 | for form, pos, dep, head, _, lemma in zip(*data): 61 | sentences[sid][tid] = {} 62 | sentences[sid][tid]["form"] = form 63 | sentences[sid][tid]["pos"] = pos 64 | sentences[sid][tid]["dep"] = dep 65 | sentences[sid][tid]["head"] = int(head) 66 | sentences[sid][tid]["lemma"] = lemma 67 | tid += 1 68 | except Exception: 69 | print "Malformed parse data in sentence", sid 70 | sentences[sid] = None 71 | finally: 72 | sid += 1 73 | return sentences 74 | 75 | 76 | def merge_to_graph(srl_data, sentences, verbose=False): # zip sentence and SRL data together and turn them into a graph 77 | for sid in sentences: 78 | if sid in srl_data: 79 | sentence = sentences[sid] 80 | if sentence is not None: 81 | nodes = {tid: sentence[tid]["form"] for tid in sentence} 82 | edges = [(sentence[tid]["head"], tid, sentence[tid]["dep"]) for tid in sentence] 83 | srl = srl_data[sid] 84 | for pred_tid in srl: 85 | for pred_info in srl[pred_tid]: 86 | g = DependencyGraph(nodes, edges) 87 | frame, lemmapos, roles = pred_info 88 | roles_by_tid = {} 89 | for (x, y) in roles.items(): 90 | for role_tid in y: 91 | roles_by_tid[int(role_tid)] = x 92 | try: 93 | g.add_srl((pred_tid, frame, lemmapos), roles_by_tid) 94 | yield g 95 | except Exception: 96 | print "SRL data error in sentence", sid, sys.exc_info()[0] 97 | if verbose: 98 | print "pred:", pred_tid, frame, lemmapos 99 | print roles_by_tid 100 | print g.pretty() 101 | 102 | 103 | # This is the method you are looking for 104 | def get_graphs(src_sentences, src_fes, verbose=False): # files in, graphs out 105 | i = 0 106 | with codecs.open(src_sentences, "r", "utf-8") as in_sentences: 107 | with codecs.open(src_fes, "r", "utf-8") as in_fes: 108 | srl_data = collect_srl_data(in_fes) 109 | sentences = 
collect_sentence_data(in_sentences) 110 | graphs = [x for x in merge_to_graph(srl_data, sentences, verbose)] 111 | print src_sentences.split("/")[-1], src_fes.split("/")[-1], "labeled:", len(srl_data), "parsed:", len(sentences), "graphs:", len(graphs) 112 | for graph in graphs: 113 | graph.gid = i 114 | i += 1 115 | return graphs 116 | 117 | 118 | -------------------------------------------------------------------------------- /simpleFrameId/evaluation.py: -------------------------------------------------------------------------------- 1 | # Evaluation routines 2 | 3 | 4 | def acc(correct, total): 5 | return 1.0 * correct / total if total != 0 else 0 6 | 7 | 8 | class Score: 9 | def __init__(self, skip_unknown_frames=True): 10 | self.total = 0 11 | self.correct = 0 12 | self.total_ambig = 0 13 | self.correct_ambig = 0 14 | self.total_unambig = 0 15 | self.correct_unambig = 0 16 | 17 | self.total_unknown = 0 18 | self.correct_unknown = 0 19 | 20 | # if the frame is missing in the lexicon AND in the training data, there is no system that will predict it. 21 | self.skip_unknown_frames = skip_unknown_frames 22 | 23 | def consume(self, correct, ambig, unknown, gold_frame): 24 | if self.skip_unknown_frames and gold_frame == -1: 25 | pass 26 | else: 27 | self.total += 1 28 | self.correct += int(correct) 29 | 30 | self.total_ambig += int(ambig) 31 | self.correct_ambig += int(ambig & correct) 32 | 33 | self.total_unambig += int(not ambig) 34 | self.correct_unambig += int(correct & (not ambig)) 35 | 36 | self.total_unknown += int(unknown) 37 | self.correct_unknown += int(unknown & correct) 38 | 39 | def report_accuracies(self): 40 | print "Acc", acc(self.correct, self.total) 41 | print "Ambig", acc(self.correct_ambig, self.total_ambig) 42 | print "Unambig", acc(self.correct_unambig, self.total_unambig) 43 | print "Unknown", acc(self.correct_unknown, self.total_unknown) 44 | 45 | def report_counts(self): 46 | print "Total", self.total 47 | print "Correct", self.correct 48 | print "Total_ambig", self.total_ambig 49 | print "Correct_ambig", self.correct_ambig 50 | print "Total_unambig", self.total_unambig 51 | print "Correct_unambig", self.correct_unambig 52 | print "Total_unknown", self.total_unknown 53 | print "Correct_unknown", self.correct_unknown 54 | 55 | def report(self): 56 | print "==========================" 57 | self.report_accuracies() 58 | self.report_counts() 59 | print "==========================" 60 | 61 | -------------------------------------------------------------------------------- /simpleFrameId/extras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | # Extra classes for managing external resources 5 | 6 | 7 | class Lexicon: # Lexicon manager. 
Stores information about lemma.pos -> frame mappings 8 | def __init__(self): 9 | self.frameLexicon = {} 10 | self.frameToId = {} 11 | self.idToFrame = {} 12 | self.source = "NA" 13 | 14 | def get_id(self, frame): 15 | if frame not in self.frameToId: 16 | print "Unknown frame", frame, "assigning id=-1" 17 | return self.frameToId.get(frame, -1) 18 | 19 | def get_available_frame_ids(self, lemmapos): 20 | return [self.frameToId[x] for x in self.frameLexicon.get(lemmapos, [])] 21 | 22 | def get_all_frame_ids(self): 23 | return list(self.idToFrame.keys()) 24 | 25 | def get_frame(self, id): 26 | return self.idToFrame.get(id, "UNKNOWN_FRAME") 27 | 28 | # Load from pre-defined lexicon in format [frame \t lemmapos] 29 | def load_from_list(self, src): 30 | with codecs.open(src, "r", "utf-8") as f: 31 | frames = [] 32 | for line in f: 33 | frame, lemmapos = line.strip().rstrip().split("\t") 34 | self.frameLexicon[lemmapos] = self.frameLexicon.get(lemmapos, []) + [frame] 35 | frames += [frame] 36 | frames = list(set(frames)) 37 | self.frameToId = {frames[i]:i for i in range(len(frames))} 38 | self.idToFrame = {y:x for (x,y) in self.frameToId.items()} 39 | self.source = src.split("/")[-1] 40 | 41 | def is_unknown(self, lemmapos): 42 | return lemmapos not in self.frameLexicon 43 | 44 | def is_ambiguous(self, lemmapos): 45 | return len(self.frameLexicon.get(lemmapos, []))>1 46 | 47 | # Load from training data 48 | def load_from_graphs(self, g_train): 49 | frames = [] 50 | for g in g_train: 51 | predicate = g.get_predicate_head() 52 | lemmapos = predicate["lemmapos"] 53 | frame = predicate["frame"] 54 | self.frameLexicon[lemmapos] = self.frameLexicon.get(lemmapos, []) + [frame] 55 | frames += [frame] 56 | frames = list(set(frames)) 57 | self.frameToId = {frames[i]: i for i in range(len(frames))} 58 | self.idToFrame = {y: x for (x, y) in self.frameToId.items()} 59 | self.source = "training_data" 60 | 61 | 62 | class VSM: 63 | def __init__(self, src): 64 | self.map = {} 65 | self.dim = None 66 | self.source = src.split("/")[-1] if src is not None else "NA" 67 | # create dictionary for mapping from word to its embedding 68 | if src is not None: 69 | with open(src) as f: 70 | i = 0 71 | for line in f: 72 | word = line.split()[0] 73 | embedding = line.split()[1:] 74 | self.map[word] = np.array(embedding, dtype=np.float32) 75 | i += 1 76 | self.dim = len(embedding) 77 | else: 78 | self.dim = 1 79 | 80 | def get(self, word): 81 | word = word.lower() 82 | if word in self.map: 83 | return self.map[word] 84 | else: 85 | return np.zeros(self.dim, dtype=np.float32) -------------------------------------------------------------------------------- /simpleFrameId/globals.py: -------------------------------------------------------------------------------- 1 | # pretrained embeddings 2 | EMBEDDINGS_LEVY_DEPS_300 = 'deps.words.txt' # 174.015 words, 300 dim 3 | 4 | # lexicons 5 | LEXICON_FULL_BRACKETS_FIX = "fn1.5_full_lexicon_expanded" 6 | 7 | # corpora 8 | # full training sets 9 | CORPUS_DAS_TRAIN = "train-das" 10 | CORPORA_TRAIN = [CORPUS_DAS_TRAIN] 11 | 12 | #test sets 13 | CORPUS_DAS_TEST = "test-das" 14 | CORPUS_YAGS_TEST = "test-yags" 15 | CORPUS_YAGS_POSFIX_SPELL_TEST = "test-yags-posfix-spell" 16 | CORPUS_YAGS_POSFIX_TEST = "test-yags-posfix" 17 | CORPUS_MASC_TEST = "test-masc" 18 | CORPUS_TW_G_TEST = "test-tw-g" 19 | COPRUS_TW_M_TEST = "test-tw-m" 20 | CORPUS_TW_S_TEST = "test-tw-s" 21 | CORPORA_TEST = [CORPUS_DAS_TEST, CORPUS_YAGS_POSFIX_SPELL_TEST, CORPUS_YAGS_POSFIX_TEST, CORPUS_YAGS_TEST, 22 | CORPUS_MASC_TEST, 
CORPUS_TW_G_TEST, COPRUS_TW_M_TEST, CORPUS_TW_S_TEST] 23 | 24 | CORPORA_ALL = CORPORA_TRAIN + CORPORA_TEST 25 | -------------------------------------------------------------------------------- /simpleFrameId/graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | class DependencyGraph: 4 | def __init__(self, nodes, edges): 5 | """ Initialize a dependency graph from a list of nodes and a list of edges 6 | Nodes are represented as a dictionary {node_id:word, ...} 7 | Edges are a list of triples [(src_id, tgt_id, label), ...] """ 8 | self.G = nx.DiGraph() 9 | for node_id in nodes: 10 | self.G.add_node(node_id, word=nodes[node_id]) 11 | self.G.add_node(0, word="ROOT") 12 | for edge in edges: 13 | label = edge[2] 14 | # add prepositions to labels 15 | if label == 'prep': 16 | label += "_" + self.G.node[edge[1]]["word"].lower() 17 | self.G.add_edge(edge[0], edge[1], label=label) 18 | 19 | self.predicate_head = None 20 | self.predicate_nodes = None 21 | self.roles = None 22 | self.sent = " ".join(nodes[nid] for nid in sorted(list(nodes.keys()))) 23 | self.gid = None 24 | 25 | def add_srl(self, predicate_node, role_nodes): 26 | """ Add SRL information to the graph 27 | Predicate is specified as a tuple (node_ids, sense, lemmapos) 28 | Roles are specified as a dictionary {node_id:role, ...} 29 | This can be done only once, since only one predicate-argument structure at a time is considered """ 30 | self.predicate_nodes = [] 31 | if (self.predicate_head is not None) or (self.roles is not None): 32 | raise Exception("Each graph must contain only one predicate-argument structure") 33 | for x in predicate_node[0]: 34 | self.G.node[int(x)]["frame"] = predicate_node[1] 35 | self.G.node[int(x)]["lemmapos"] = predicate_node[2] 36 | self.predicate_nodes += [int(x)] 37 | self.predicate_head = predicate_node[0][0] 38 | self.roles = [] 39 | node_groups = {} #group nodes by role 40 | for node_id in role_nodes: 41 | node_groups[role_nodes[node_id]] = node_groups.get(role_nodes[node_id], []) + [node_id] 42 | for role in node_groups: 43 | head = self.get_head(node_groups[role]) 44 | self.G.node[head]["role"] = role 45 | self.roles += [head] 46 | 47 | def pretty(self): 48 | """ Pretty-print the graph """ 49 | s = "" 50 | for n in self.G.nodes(): 51 | if self.G.node[n] != {}: 52 | gid = str(self.gid) if self.gid!=None else "NOID" 53 | word = self.G.node[n]["word"] 54 | head = self.G.predecessors(n)[0] if len(self.G.predecessors(n)) > 0 else "_" 55 | dep_label = self.G[head][n]["label"] if len(self.G.predecessors(n)) > 0 else "_" 56 | role = self.G.node[n].get("role", "_") 57 | pred = self.G.node[n].get("frame", "_") 58 | s += "\t".join([x for x in [str(gid), str(n), word, str(head), dep_label, role, pred]])+"\n" 59 | return s 60 | 61 | def get_predicate_head(self): 62 | return self.G.node[self.predicate_head] 63 | 64 | def get_predicate_node_words(self): 65 | return [self.G.node[x]["word"].lower() for x in self.predicate_nodes] 66 | 67 | def get_direct_dependents(self, node): 68 | """ Get direct dependents of a node """ 69 | return self.G.successors(node) 70 | 71 | def get_path(self, src, tgt): 72 | """ Get path from the source node (id) to the target node (id) 73 | Path is represented as a list of dependency relations concatenated by "->" """ 74 | edges = None 75 | if tgt in self.G.predecessors(src) and tgt!=0: # don't want the ROOT 76 | return "-1" # the parent relation 77 | try: 78 | edges = nx.shortest_path(self.G, src, tgt) 79 | except 
nx.exception.NetworkXNoPath: 80 | edges = None 81 | finally: 82 | if edges is not None: 83 | dep_labels = [self.G[edges[n]][edges[n + 1]]["label"] for n in range(len(edges) - 1)] 84 | return "->".join(dep_labels) 85 | else: 86 | return None 87 | 88 | def create_pathmap(self): 89 | """ Internal function that calculates paths between all possible node pairs in the graph """ 90 | self.pathmap = {} 91 | self.all_paths = [] 92 | for n1 in self.G.nodes(): 93 | self.pathmap[n1] = {} 94 | for n2 in self.G.nodes(): 95 | if n1 != n2: 96 | path = self.get_path(n1, n2) 97 | if path is not None: 98 | p = self.get_path(n1, n2) 99 | self.pathmap[n1][n2] = p 100 | self.all_paths += [p] 101 | self.all_paths = set(self.all_paths) 102 | 103 | def find_node(self, src, path): 104 | """ Find node in a graph given the source and the path """ 105 | res = [] 106 | if path == '': 107 | return [src] 108 | if path not in self.all_paths: 109 | return None 110 | for tgt in self.G.nodes(): 111 | if tgt != src: 112 | if self.pathmap[src] is not None: 113 | if tgt in self.pathmap[src]: 114 | if self.pathmap[src][tgt] == path: 115 | res += [tgt] 116 | return res if len(res) > 0 else None 117 | 118 | def get_node_label(self, 119 | node_id): 120 | """ Get node label given the node id 121 | If it's a preposition, take the noun it points to! """ 122 | in_rel = self.G.in_edges(node_id) 123 | if in_rel is not None and len(in_rel)>0: 124 | label = self.G[in_rel[0][0]][in_rel[0][1]]["label"] # check the label 125 | if label.startswith("prep"): 126 | succ = self.G.successors(in_rel[0][1]) 127 | if succ is None or len(succ) == 0: 128 | return "#ERR" # no successor? That's weird! 129 | else: 130 | pobj = self.G.successors(in_rel[0][1])[0] # here we assume that a preposition has only one successor, the pobj 131 | return self.G.node[pobj]["word"] 132 | return self.G.node[node_id]["word"] 133 | 134 | def get_head(self, nodes): 135 | """ Get the head node for a role span. 136 | First, try to find a node with outgoing arc. 137 | If none found, pick the node with most dependents inside the span """ 138 | head = None # leftmost node is default 139 | if len(nodes) == 1: 140 | head = nodes[0] 141 | else: 142 | for node_id in nodes: 143 | parent = self.G.predecessors(node_id)[0] 144 | if parent not in nodes: 145 | head = node_id 146 | break 147 | return head 148 | -------------------------------------------------------------------------------- /simpleFrameId/main.py: -------------------------------------------------------------------------------- 1 | from globals import * 2 | from data import get_graphs 3 | from extras import Lexicon, VSM 4 | from representation import DependentsBowMapper, SentenceBowMapper, DummyMapper 5 | from classifier import SharingDNNClassifier, DataMajorityBaseline, LexiconMajorityBaseline, WsabieClassifier 6 | from evaluation import Score 7 | from reporting import ReportManager 8 | from config import Config 9 | from resources import ResourceManager 10 | import time 11 | from numpy import random 12 | 13 | HOME = "/home/local/UKP/martin/repos/frameID/" # adjust accordingly 14 | 15 | if __name__ == "__main__": 16 | 17 | random.seed(4) # fix the random seed 18 | 19 | vsms = [EMBEDDINGS_LEVY_DEPS_300] # vector space model to use 20 | lexicons = [LEXICON_FULL_BRACKETS_FIX] # lexicon to use (mind the all_unknown setting!) 
21 | multiword_averaging = [False] # treatment of multiword predicates, false - use head embedding, true - use avg 22 | all_unknown = [False, True] # makes the lexicon treat all LU as unknown, corresponds to the no-lex setting 23 | 24 | # WSABIE params 25 | num_components = [1500] 26 | max_sampled = [10] # maximum number of negative samples used during WARP fitting 'warp' 27 | num_epochs = [500] 28 | 29 | configs = [] 30 | for lexicon in lexicons: 31 | for all_unk in all_unknown: 32 | # DummyMapper doesn't do anything 33 | configs += [Config(DataMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)] 34 | configs += [Config(LexiconMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)] 35 | 36 | # Add configurations for NN classifiers 37 | for lexicon in lexicons: 38 | for vsm in vsms: 39 | for mwa in multiword_averaging: 40 | for all_unk in all_unknown: 41 | configs += [Config(SharingDNNClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)] 42 | configs += [Config(SharingDNNClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)] 43 | 44 | # Add configurations for WSABIE classifiers 45 | for lexicon in lexicons: 46 | for vsm in vsms: 47 | for mwa in multiword_averaging: 48 | for all_unk in all_unknown: 49 | for num_comp in num_components: 50 | for max_sampl in max_sampled: 51 | for num_ep in num_epochs: 52 | configs += [Config(WsabieClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)] 53 | configs += [Config(WsabieClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)] 54 | 55 | print "Starting resource manager" 56 | sources = ResourceManager(HOME) 57 | 58 | print "Initializing reporters" 59 | reports = ReportManager(sources.out) 60 | 61 | print "Running the experiments!" 
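    # Note (added comment): with the settings above as given (one VSM, one lexicon,
    # two values for all_unknown, and a single value for each WSABIE parameter),
    # the loops build 4 baseline + 4 DNN + 4 WSABIE configurations = 12 configurations;
    # combined with 1 training corpus and 8 test corpora this amounts to
    # 12 * 1 * 8 = 96 runs, which is what the following lines compute and print.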
62 | runs = len(configs)*len(CORPORA_TRAIN)*len(CORPORA_TEST) 63 | print len(configs), "configurations, ", len(CORPORA_TRAIN)*len(CORPORA_TEST), " train-test pairs -> ", \ 64 | runs, " runs" 65 | 66 | current_train = 0 67 | current_config = 0 68 | current_test = 0 69 | for corpus_train in CORPORA_TRAIN: 70 | current_train += 1 71 | current_config = 0 72 | 73 | g_train = get_graphs(*sources.get_corpus(corpus_train)) 74 | reports.conll_reporter_train.report(g_train) 75 | 76 | for conf in configs: 77 | current_config += 1 78 | start_time = time.time() 79 | 80 | lexicon = Lexicon() 81 | # go to configuration, check which lexicon is needed, locate the lexicon in FS, load the lexicon 82 | lexicon.load_from_list(sources.get_lexicon(conf.get_lexicon())) 83 | reports.lexicon_reporter.report(lexicon) 84 | 85 | # same for VSM 86 | vsm = VSM(sources.get_vsm(conf.get_vsm())) 87 | mapper = conf.get_feat_extractor()(vsm, lexicon) 88 | 89 | # prepare the data 90 | X_train, y_train, lemmapos_train, gid_train = mapper.get_matrix(g_train) 91 | 92 | # train the model 93 | clf = conf.get_clf()(lexicon, conf.get_all_unknown(), conf.get_num_components(), conf.get_max_sampled(), 94 | conf.get_num_epochs()) 95 | clf.train(X_train, y_train, lemmapos_train) 96 | 97 | current_test = 0 98 | for corpus_test in CORPORA_TEST: 99 | score = Score() # storage for scores 100 | score_v = Score() # storage for verb-only scores 101 | score_known = Score() # storage for known lemma-only scores 102 | 103 | start_time = time.time() 104 | 105 | reports.set_config(conf, corpus_train, corpus_test) 106 | 107 | current_test += 1 108 | 109 | # prepare test data 110 | g_test = get_graphs(*sources.get_corpus(corpus_test)) 111 | reports.conll_reporter_test.report(g_test) 112 | X_test, y_test, lemmapos_test, gid_test = mapper.get_matrix(g_test) 113 | 114 | # predict and compare 115 | for x, y_true, lemmapos, gid, g in zip(X_test, y_test, lemmapos_test, gid_test, g_test): 116 | y_predicted = clf.predict(x, lemmapos) 117 | correct = y_true == y_predicted 118 | 119 | score.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 120 | if lemmapos.endswith(".v"): 121 | score_v.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 122 | if not lexicon.is_unknown(lemmapos): 123 | score_known.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true) 124 | 125 | reports.result_reporter.report(gid, g, lemmapos, y_predicted, y_true, lexicon) 126 | reports.summary_reporter.report(corpus_train, corpus_test, conf, score, time.time() - start_time) 127 | reports.summary_reporter_v.report(corpus_train, corpus_test, conf, score_v, time.time() - start_time) 128 | reports.summary_reporter_known.report(corpus_train, corpus_test, conf, score_known, time.time() - start_time) 129 | 130 | print "============ STATUS: - train", current_train, "/", len(CORPORA_TRAIN), \ 131 | "conf", current_config, "/", len(configs),\ 132 | "test", current_test, "/", len(CORPORA_TEST) 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /simpleFrameId/reporting.py: -------------------------------------------------------------------------------- 1 | import codecs, os, shutil 2 | from evaluation import acc 3 | 4 | # Reporting classes 5 | 6 | class ReportManager: 7 | def __init__(self, report_folder): 8 | if os.path.exists(report_folder): 9 | shutil.rmtree(report_folder) 10 | os.makedirs(report_folder) 11 | 
self.report_folder = report_folder 12 | self.result_reporter = ResultReporter(os.path.join(self.report_folder, "results")) 13 | self.lexicon_reporter = LexiconReporter(os.path.join(self.report_folder, "lexicon")) 14 | self.conll_reporter_train = ConllReporter(os.path.join(self.report_folder, "train.conll")) 15 | self.conll_reporter_test = ConllReporter(os.path.join(self.report_folder, "test.conll")) 16 | self.summary_reporter = ResultSummaryReporter(os.path.join(self.report_folder, "summary")) 17 | self.summary_reporter_v = ResultSummaryReporter(os.path.join(self.report_folder, "summary_v")) 18 | self.summary_reporter_known = ResultSummaryReporter(os.path.join(self.report_folder, "summary_known")) 19 | 20 | def set_config(self, config, train, test): 21 | self.result_reporter = ResultReporter(os.path.join(self.report_folder, "results_"+train+"_"+test+"_"+str(config))) 22 | self.lexicon_reporter = LexiconReporter(os.path.join(self.report_folder, "lexicon_"+config.lexicon if config.lexicon is not None else "NA")) 23 | 24 | 25 | class Reporter(object): 26 | def __init__(self, out_path): 27 | self.out = codecs.open(out_path, "w", "utf-8") 28 | if hasattr(self, 'columns'): 29 | self.write_header() 30 | 31 | def write_header(self): 32 | self.out.write("\t".join(self.columns)+"\n") 33 | def close(self): 34 | self.out.close() 35 | 36 | 37 | class ResultReporter(Reporter): 38 | def __init__(self, out_path): 39 | self.columns = ["gid", "sent", "lemmapos", "pos", "predicted_id", "true_id", "predicted_frame", "true_frame", "ambig", "unknown"] 40 | super(self.__class__, self).__init__(out_path) 41 | 42 | def report(self, instance_id, g, lemmapos, predicted, true, lexicon): 43 | self.out.write("\t".join([str(instance_id), g.sent, 44 | lemmapos, lemmapos.split(".")[1], 45 | str(predicted), str(true), lexicon.get_frame(predicted), lexicon.get_frame(true), 46 | str(lexicon.is_ambiguous(lemmapos)), str(lexicon.is_unknown(lemmapos))])+"\n") 47 | 48 | 49 | class ResultSummaryReporter(Reporter): 50 | def __init__(self, out_path): 51 | self.columns = ["train", "test", "clf", "feats", "lex", "vsm", "MWE_avg", "all_unk", "num_components", "max_sampled", "num_epochs", "total", "correct", "ambig", "ambig_correct", "unambig", "unambig_correct", "unk", "unk_correct", 52 | "total_acc", "ambig_acc", "unambig_acc", "unk_acc", "time"] 53 | super(self.__class__, self).__init__(out_path) 54 | 55 | def report(self, train, test, config, score, time_delta): 56 | self.out.write( 57 | "\t".join([train, test, config.clf.__name__, config.feat_extractor.__name__, config.lexicon if config.lexicon is not None else "NA", 58 | config.vsm if config.vsm is not None else "NA", str(config.multiword_averaging), str(config.all_unknown), 59 | str(config.num_components) if config.num_components is not None else "NA", 60 | str(config.max_sampled) if config.max_sampled is not None else "NA", 61 | str(config.num_epochs) if config.num_epochs is not None else "NA", 62 | str(score.total), str(score.correct), str(score.total_ambig), str(score.correct_ambig), str(score.total_unambig), 63 | str(score.correct_unambig), str(score.total_unknown), str(score.correct_unknown), 64 | str(acc(score.correct, score.total)), str(acc(score.correct_ambig, score.total_ambig)), 65 | str(acc(score.correct_unambig, score.total_unambig)), str(acc(score.correct_unknown, score.total_unknown)), 66 | str(time_delta)])+"\n" 67 | ) 68 | 69 | 70 | class LexiconReporter(Reporter): 71 | def __init__(self, out_path): 72 | self.columns = ["lemma", "frames"] 73 | 
super(self.__class__, self).__init__(out_path) 74 | 75 | def report(self, lexicon): 76 | for lemma in lexicon.frameLexicon: 77 | self.out.write("\t".join([lemma, ", ".join([str(lexicon.get_id(frame))+": "+frame for frame in lexicon.frameLexicon[lemma]])]) + "\n") 78 | 79 | 80 | class ConllReporter(Reporter): 81 | def report(self, graphs): 82 | for g in graphs: 83 | self.out.write(g.pretty() + "\n") -------------------------------------------------------------------------------- /simpleFrameId/representation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Feature mappers convert graphs into matrices given lexicon and vsm 4 | 5 | 6 | class FeatureMapper: 7 | def __init__(self, vsm, lexicon, multiword_averaging=False): 8 | self.vsm = vsm 9 | self.lexicon = lexicon 10 | self.multiword_averaging = multiword_averaging 11 | 12 | def get_repr(self, graph): 13 | raise NotImplementedError("Not implemented") 14 | 15 | def get_repr_sent(self, words, predicate_id): 16 | raise NotImplementedError("Not implemented") 17 | 18 | def get_matrix(self, graph_list): 19 | X = [] 20 | y = [] 21 | lemmapos = [] 22 | gid = [] 23 | for g in graph_list: 24 | X += [self.get_repr(g)] 25 | frame = g.get_predicate_head()["frame"] 26 | y += [self.lexicon.get_id(frame)] 27 | lemmapos += [g.get_predicate_head()["lemmapos"]] 28 | gid += [g.gid] 29 | X = np.vstack(X) 30 | y = np.array(y, dtype=np.int) 31 | return X, y, lemmapos, gid 32 | 33 | 34 | class DummyMapper(FeatureMapper): # Dummy mapper for cases where no features are needed, e.g. for majority baselines 35 | def get_repr(self, graph): 36 | return np.zeros(self.vsm.dim) 37 | 38 | 39 | def avg_embedding(wordlist, emb): 40 | res = [] 41 | for word in wordlist: 42 | word = word.lower() 43 | res += [emb.get(word)] 44 | return np.mean(res, axis=0) 45 | 46 | 47 | class SentenceBowMapper(FeatureMapper): 48 | def get_repr(self, graph): 49 | words = graph.sent.split(" ") 50 | if not self.multiword_averaging: 51 | predicate_head = graph.get_predicate_head() 52 | tgt_w = [predicate_head["word"].lower(), ] 53 | else: 54 | tgt_w = graph.get_predicate_node_words() 55 | return self.get_repr_sent(words, tgt_w) 56 | 57 | def get_repr_sent(self, words, tgt_w): 58 | return np.concatenate((avg_embedding(words, self.vsm), avg_embedding(tgt_w, self.vsm)), axis=0) 59 | 60 | 61 | class DependentsBowMapper(FeatureMapper): 62 | def get_repr(self, graph): 63 | predicate_head = graph.get_predicate_head() 64 | deps = graph.get_direct_dependents(graph.predicate_head) 65 | parent = graph.G.predecessors(graph.predicate_head) 66 | if parent is not None and len(parent)>0: 67 | deps += [parent[0]] 68 | words = [graph.G.node[n]["word"].lower() for n in deps] 69 | if not self.multiword_averaging: 70 | tgt_w = [predicate_head["word"].lower(), ] 71 | else: 72 | tgt_w = graph.get_predicate_node_words() 73 | return np.concatenate((avg_embedding(words, self.vsm), avg_embedding(tgt_w, self.vsm)), axis=0) -------------------------------------------------------------------------------- /simpleFrameId/resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Some basic resource management 4 | # Required folder structure: 5 | # - project_root 6 | # - out results 7 | # - srl_data data 8 | # - embeddings VSMs 9 | # - corpora training and test data 10 | # - lexicons lexicon lists 11 | 12 | class ResourceManager: 13 | def __init__(self, root): 14 | self.root = root 15 | self.out = 
os.path.join(self.root, "out") 16 | self.data = os.path.join(self.root, "srl_data") 17 | self.vsm_folder = os.path.join(self.data, "embeddings") 18 | self.corpora = os.path.join(self.data, "corpora") 19 | self.lexicons = os.path.join(self.data, "lexicons") 20 | 21 | def get_corpus(self, corpus_name): 22 | return (os.path.join(self.corpora, corpus_name+x) for x in [".all.lemma.tags", ".frame.elements"]) 23 | 24 | def get_lexicon(self, lexicon_name): 25 | return os.path.join(self.lexicons, lexicon_name) if lexicon_name is not None else None 26 | 27 | def get_vsm(self, vsm_name): 28 | return os.path.join(self.vsm_folder, vsm_name) if vsm_name is not None else None --------------------------------------------------------------------------------
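A minimal usage sketch (not part of the repository) showing how the pieces above fit together. It assumes a `ROOT` folder laid out as described in the README, with a corpus pair named `train-das` under `ROOT/srl_data/corpora`; the path is a placeholder:

```python
from resources import ResourceManager   # locates corpora, lexicons and embeddings under ROOT
from data import get_graphs             # reads a corpus pair into DependencyGraph objects
from extras import Lexicon              # lemma.pos -> frame mapping

sources = ResourceManager("/path/to/ROOT")              # placeholder project root
graphs = get_graphs(*sources.get_corpus("train-das"))   # *.all.lemma.tags + *.frame.elements
lexicon = Lexicon()
lexicon.load_from_graphs(graphs)                        # or: lexicon.load_from_list(sources.get_lexicon(...))
print lexicon.is_ambiguous("help.v"), lexicon.get_available_frame_ids("help.v")
```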