├── LICENSE
├── README.md
└── src
    ├── easylstm.py
    ├── parser.py
    └── utils.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Easy-first Parser
 2 | ## Easy-first dependency parser based on Hierarchical Tree LSTMs
 3 | 
 4 | The techniques behind the parser are described in the paper [Easy-First Dependency Parsing with Hierarchical Tree LSTMs](https://www.transacl.org/ojs/index.php/tacl/article/viewFile/798/208). Further materials could be found [here](http://elki.cc/#/article/Easy-First%20Dependency%20Parsing%20with%20Hierarchical%20Tree%20LSTMs). 
 5 | 
 6 | #### Required software
 7 | 
 8 |  * Python 2.7 interpreter
 9 |  * [PyCNN library](https://github.com/clab/cnn-v1/tree/master/pycnn)
10 | 
11 | #### Train a parsing model
12 | 
13 | The software requires having a `training.conll` and `development.conll` files formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat).
14 | 
15 | To train a parsing model with for either parsing architecture type the following at the command prompt:
16 | 
17 |     python src/parser.py --outdir [results directory] --train training.conll --dev development.conll [--extrn path_to_external_embeddings_file]
18 | 
19 | We use the same external embedding used in [Transition-Based Dependency Parsing with Stack Long Short-Term Memory](http://arxiv.org/abs/1505.08075) which can be downloaded from the authors [github repository](https://github.com/clab/lstm-parser/) and [directly here](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing).
20 | 
21 | Note 1: The reported test result is the one matching the highest development score.
22 | 
23 | Note 2: The parser calculates (after each iteration) the accuracies excluding punctuation symbols by running the `eval.pl` script from the CoNLL-X Shared Task and stores the results in directory specified by the `--outdir`.
24 | 
25 | Note 3: The external embeddings parameter is optional and could be omitted.
26 | 
27 | #### Parse data with your parsing model
28 | 
29 | The command for parsing a `test.conll` file formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat) with a previously trained model is:
30 | 
31 |     python src/parser.py --predict --outdir [results directory] --test test.conll [--extrn extrn.vectors] --model [trained model file] --params [param file generate during training]
32 | 
33 | The parser will store the resulting conll file in the out directory (`--outdir`).
34 | 
35 | #### Citation
36 | 
37 | If you make use of this software for research purposes, we'll appreciate citing the following:
38 | 
39 |     @article{DBLP:journals/tacl/KiperwasserG16a,
40 |         author    = {Eliyahu Kiperwasser and
41 |                     Yoav Goldberg},
42 |         title     = {Easy-First Dependency Parsing with Hierarchical Tree LSTMs},
43 |         journal   = {{TACL}},
44 |         volume    = {4},
45 |         pages     = {445--461},
46 |         year      = {2016},
47 |         url       = {https://transacl.org/ojs/index.php/tacl/article/view/798},
48 |         timestamp = {Tue, 09 Aug 2016 14:51:09 +0200},
49 |         biburl    = {http://dblp.uni-trier.de/rec/bib/journals/tacl/KiperwasserG16a},
50 |         bibsource = {dblp computer science bibliography, http://dblp.org}
51 |     }
52 | 
53 | #### License
54 | 
55 | This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
56 | 
57 | #### Contact
58 | 
59 | For questions and usage issues, please contact elikip@gmail.com
60 | 
61 | #### Credits
62 | 
63 | [Eliyahu Kiperwasser](http://elki.cc)
64 | 
65 | [Yoav Goldberg](https://www.cs.bgu.ac.il/~yoavg/uni/)
66 | 
67 | 


--------------------------------------------------------------------------------
/src/easylstm.py:
--------------------------------------------------------------------------------
  1 | from pycnn import *
  2 | from utils import ParseForest, read_conll, write_conll
  3 | import utils, time, random
  4 | import numpy as np
  5 | 
  6 | 
  7 | class EasyFirstLSTM:
  8 |     def __init__(self, words, pos, rels, w2i, options):
  9 |         random.seed(1)
 10 |         self.model = Model()
 11 |         self.trainer = AdamTrainer(self.model)
 12 | 
 13 |         self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
 14 |         self.activation = self.activations[options.activation]
 15 | 
 16 |         self.k = options.window
 17 |         self.ldims = options.lstm_dims
 18 |         self.wdims = options.wembedding_dims
 19 |         self.pdims = options.pembedding_dims
 20 |         self.rdims = options.rembedding_dims
 21 |         self.oracle = options.oracle
 22 |         self.layers = options.lstm_layers
 23 |         self.wordsCount = words
 24 |         self.vocab = {word: ind+3 for word, ind in w2i.iteritems()}
 25 |         self.pos = {word: ind+3 for ind, word in enumerate(pos)}
 26 |         self.rels = {word: ind for ind, word in enumerate(rels)}
 27 |         self.irels = rels
 28 | 
 29 |         self.builders = [LSTMBuilder(self.layers, self.ldims, self.ldims, self.model), LSTMBuilder(self.layers, self.ldims, self.ldims, self.model)]
 30 | 
 31 |         self.blstmFlag = options.blstmFlag
 32 |         if self.blstmFlag:
 33 |             self.surfaceBuilders = [LSTMBuilder(self.layers, self.ldims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, self.ldims, self.ldims * 0.5, self.model)]
 34 |         self.hidden_units = options.hidden_units
 35 |         self.hidden2_units = options.hidden2_units
 36 | 
 37 |         self.external_embedding = None
 38 |         if options.external_embedding is not None:
 39 |             external_embedding_fp = open(options.external_embedding,'r')
 40 |             external_embedding_fp.readline()
 41 |             self.external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] for line in external_embedding_fp}
 42 |             external_embedding_fp.close()
 43 | 
 44 | 	    self.edim = len(self.external_embedding.values()[0])
 45 |             self.noextrn = [0.0 for _ in xrange(self.edim)]
 46 |             self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
 47 |             self.model.add_lookup_parameters("extrn-lookup", (len(self.external_embedding) + 3, self.edim))
 48 |             for word, i in self.extrnd.iteritems():
 49 |                 self.model["extrn-lookup"].init_row(i, self.external_embedding[word])
 50 |             self.extrnd['*PAD*'] = 1
 51 |             self.extrnd['*INITIAL*'] = 2
 52 | 
 53 | 	    print 'Load external embedding. Vector dimensions', self.edim
 54 | 
 55 |         self.vocab['*PAD*'] = 1
 56 |         self.pos['*PAD*'] = 1
 57 | 
 58 |         self.vocab['*INITIAL*'] = 2
 59 |         self.pos['*INITIAL*'] = 2
 60 | 
 61 |         self.model.add_lookup_parameters("word-lookup", (len(words) + 3, self.wdims))
 62 |         self.model.add_lookup_parameters("pos-lookup", (len(pos) + 3, self.pdims))
 63 |         self.model.add_lookup_parameters("rels-lookup", (len(rels), self.rdims))
 64 | 
 65 |         self.nnvecs = 2
 66 | 
 67 |         self.model.add_parameters("word-to-lstm", (self.ldims, self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0)))
 68 |         self.model.add_parameters("word-to-lstm-bias", (self.ldims))
 69 |         self.model.add_parameters("lstm-to-lstm", (self.ldims, self.ldims * self.nnvecs + self.rdims))
 70 |         self.model.add_parameters("lstm-to-lstm-bias", (self.ldims))
 71 | 
 72 |         self.model.add_parameters("hidden-layer", (self.hidden_units, self.ldims * self.nnvecs * ((self.k + 1) * 2)))
 73 |         self.model.add_parameters("hidden-bias", (self.hidden_units))
 74 | 
 75 |         self.model.add_parameters("hidden2-layer", (self.hidden2_units, self.hidden_units))
 76 |         self.model.add_parameters("hidden2-bias", (self.hidden2_units))
 77 | 
 78 |         self.model.add_parameters("output-layer", (2, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
 79 |         self.model.add_parameters("output-bias", (2))
 80 | 
 81 |         self.model.add_parameters("rhidden-layer", (self.hidden_units, self.ldims * self.nnvecs * ((self.k + 1) * 2)))
 82 |         self.model.add_parameters("rhidden-bias", (self.hidden_units))
 83 | 
 84 |         self.model.add_parameters("rhidden2-layer", (self.hidden2_units, self.hidden_units))
 85 |         self.model.add_parameters("rhidden2-bias", (self.hidden2_units))
 86 | 
 87 |         self.model.add_parameters("routput-layer", (2 * (len(self.irels) + 0), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
 88 |         self.model.add_parameters("routput-bias", (2 * (len(self.irels) + 0)))
 89 | 
 90 | 
 91 |     def  __getExpr(self, forest, i, train):
 92 |         roots = forest.roots
 93 |         nRoots = len(roots)
 94 | 
 95 |         if self.builders is None:
 96 |             input = concatenate([ concatenate(roots[j].lstms) if j>=0 and j<nRoots else self.empty for j in xrange(i-self.k, i+self.k+2) ])
 97 |         else:
 98 |             input = concatenate([ concatenate([roots[j].lstms[0].output(), roots[j].lstms[1].output()])
 99 |                                   if j>=0 and j<nRoots else self.empty for j in xrange(i-self.k, i+self.k+2) ])
100 | 
101 |         if self.hidden2_units > 0:
102 |             routput = (self.routLayer * self.activation(self.rhid2Bias + self.rhid2Layer * self.activation(self.rhidLayer * input + self.rhidBias)) + self.routBias)
103 |         else:
104 |             routput = (self.routLayer * self.activation(self.rhidLayer * input + self.rhidBias) + self.routBias)
105 | 
106 |         if self.hidden2_units > 0:
107 |             output = (self.outLayer * self.activation(self.hid2Bias + self.hid2Layer * self.activation(self.hidLayer * input + self.hidBias)) + self.outBias)
108 |         else:
109 |             output = (self.outLayer * self.activation(self.hidLayer * input + self.hidBias) + self.outBias)
110 | 
111 |         return routput, output
112 | 
113 | 
114 |     def __evaluate(self, forest, train):
115 |         nRoots = len(forest.roots)
116 |         nRels = len(self.irels)
117 |         for i in xrange(nRoots - 1):
118 |             if forest.roots[i].scores is None:
119 |                 output, uoutput = self.__getExpr(forest, i, train)
120 |                 scrs = output.value()
121 |                 uscrs = uoutput.value()
122 |                 forest.roots[i].exprs = [(pick(output, j * 2) + pick(uoutput, 0), pick(output, j * 2 + 1) + pick(uoutput, 1)) for j in xrange(len(self.irels))]
123 |                 forest.roots[i].scores = [(scrs[j * 2] + uscrs[0], scrs[j * 2 + 1] + uscrs[1]) for j in xrange(len(self.irels))]
124 | 
125 | 
126 |     def Save(self, filename):
127 |         self.model.save(filename)
128 | 
129 | 
130 |     def Load(self, filename):
131 |         self.model.load(filename)
132 | 
133 | 
134 |     def Init(self):
135 |         self.word2lstm = parameter(self.model["word-to-lstm"])
136 |         self.lstm2lstm = parameter(self.model["lstm-to-lstm"])
137 | 
138 |         self.word2lstmbias = parameter(self.model["word-to-lstm-bias"])
139 |         self.lstm2lstmbias = parameter(self.model["lstm-to-lstm-bias"])
140 | 
141 |         self.hid2Layer = parameter(self.model["hidden2-layer"])
142 |         self.hidLayer = parameter(self.model["hidden-layer"])
143 |         self.outLayer = parameter(self.model["output-layer"])
144 | 
145 |         self.hid2Bias = parameter(self.model["hidden2-bias"])
146 |         self.hidBias = parameter(self.model["hidden-bias"])
147 |         self.outBias = parameter(self.model["output-bias"])
148 | 
149 |         self.rhid2Layer = parameter(self.model["rhidden2-layer"])
150 |         self.rhidLayer = parameter(self.model["rhidden-layer"])
151 |         self.routLayer = parameter(self.model["routput-layer"])
152 | 
153 |         self.rhid2Bias = parameter(self.model["rhidden2-bias"])
154 |         self.rhidBias = parameter(self.model["rhidden-bias"])
155 |         self.routBias = parameter(self.model["routput-bias"])
156 | 
157 |         evec = lookup(self.model["extrn-lookup"], 1) if self.external_embedding is not None else None
158 |         paddingWordVec = lookup(self.model["word-lookup"], 1)
159 |         paddingPosVec = lookup(self.model["pos-lookup"], 1) if self.pdims > 0 else None
160 | 
161 |         paddingVec = tanh(self.word2lstm * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec])) + self.word2lstmbias )
162 | 	self.empty = (concatenate([self.builders[0].initial_state().add_input(paddingVec).output(), self.builders[1].initial_state().add_input(paddingVec).output()]))
163 | 
164 | 
165 |     def getWordEmbeddings(self, forest, train):
166 |         for root in forest.roots:
167 |             c = float(self.wordsCount.get(root.norm, 0))
168 |             root.wordvec = lookup(self.model["word-lookup"], int(self.vocab.get(root.norm, 0)) if not train or (random.random() < (c/(0.25+c))) else 0)
169 |             root.posvec = lookup(self.model["pos-lookup"], int(self.pos[root.pos])) if self.pdims > 0 else None
170 | 
171 |             if self.external_embedding is not None:
172 |                 if root.form in self.external_embedding:
173 |                     root.evec = lookup(self.model["extrn-lookup"], self.extrnd[root.form] )
174 |                 elif root.norm in self.external_embedding:
175 |                     root.evec = lookup(self.model["extrn-lookup"], self.extrnd[root.norm] )
176 |                 else:
177 |                     root.evec = lookup(self.model["extrn-lookup"], 0)
178 |             else:
179 |                 root.evec = None
180 | 
181 |             root.ivec = (self.word2lstm * concatenate(filter(None, [root.wordvec, root.posvec, root.evec]))) + self.word2lstmbias
182 | 
183 |         if self.blstmFlag:
184 |             forward  = self.surfaceBuilders[0].initial_state()
185 |             backward = self.surfaceBuilders[1].initial_state()
186 | 
187 |             for froot, rroot in zip(forest.roots, reversed(forest.roots)):
188 |                 forward = forward.add_input( froot.ivec )
189 |                 backward = backward.add_input( rroot.ivec )
190 |                 froot.fvec = forward.output()
191 |                 rroot.bvec = backward.output()
192 |             for root in forest.roots:
193 |                 root.vec = concatenate( [root.fvec, root.bvec] )
194 |         else:
195 |             for root in forest.roots:
196 |                 root.vec = tanh( root.ivec )
197 | 
198 | 
199 |     def Predict(self, conll_path):
200 |         with open(conll_path, 'r') as conllFP:
201 |             for iSentence, sentence in enumerate(read_conll(conllFP, False)):
202 |                 self.Init()
203 |                 forest = ParseForest(sentence)
204 |                 self.getWordEmbeddings(forest, False)
205 | 
206 |                 for root in forest.roots:
207 |                     root.lstms = [self.builders[0].initial_state().add_input(root.vec),
208 |                                   self.builders[1].initial_state().add_input(root.vec)]
209 | 
210 |                 while len(forest.roots) > 1:
211 | 
212 |                     self.__evaluate(forest, False)
213 |                     bestParent, bestChild, bestScore = None, None, float("-inf")
214 |                     bestIndex, bestOp = None, None
215 |                     roots = forest.roots
216 | 
217 |                     for i in xrange(len(forest.roots) - 1):
218 |                         for irel, rel in enumerate(self.irels):
219 |                             for op in xrange(2):
220 |                                 if bestScore < roots[i].scores[irel][op] and (i + (1 - op)) > 0:
221 |                                     bestParent, bestChild = i + op, i + (1 - op)
222 |                                     bestScore = roots[i].scores[irel][op]
223 |                                     bestIndex, bestOp = i, op
224 |                                     bestRelation, bestIRelation = rel, irel
225 | 
226 |                     for j in xrange(max(0, bestIndex - self.k - 1), min(len(forest.roots), bestIndex + self.k + 2)):
227 |                         roots[j].scores = None
228 | 
229 |                     roots[bestChild].pred_parent_id = forest.roots[bestParent].id
230 |                     roots[bestChild].pred_relation = bestRelation
231 | 
232 |                     roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[bestOp].add_input((self.activation(self.lstm2lstmbias + self.lstm2lstm *
233 |                         	concatenate([roots[bestChild].lstms[0].output(), lookup(self.model["rels-lookup"], bestIRelation), roots[bestChild].lstms[1].output()]))))
234 | 
235 |                     forest.Attach(bestParent, bestChild)
236 | 
237 |                 renew_cg()
238 |                 yield sentence
239 | 
240 | 
241 |     def Train(self, conll_path):
242 |         mloss = 0.0
243 |         errors = 0
244 |         batch = 0
245 |         eloss = 0.0
246 |         eerrors = 0
247 |         lerrors = 0
248 |         etotal = 0
249 |         ltotal = 0
250 | 
251 |         start = time.time()
252 | 
253 |         with open(conll_path, 'r') as conllFP:
254 |             shuffledData = list(read_conll(conllFP, True))
255 |             random.shuffle(shuffledData)
256 | 
257 |             errs = []
258 |             eeloss = 0.0
259 | 
260 |             self.Init()
261 | 
262 |             for iSentence, sentence in enumerate(shuffledData):
263 |                 if iSentence % 100 == 0 and iSentence != 0:
264 |                     print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
265 |                     start = time.time()
266 |                     eerrors = 0
267 |                     eloss = 0.0
268 |                     etotal = 0
269 |                     lerrors = 0
270 |                     ltotal = 0
271 | 
272 |                 forest = ParseForest(sentence)
273 |                 self.getWordEmbeddings(forest, True)
274 | 
275 |                 for root in forest.roots:
276 |                     root.lstms = [self.builders[0].initial_state().add_input(root.vec),
277 |                         	  self.builders[1].initial_state().add_input(root.vec)]
278 | 
279 |                 unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence}
280 | 
281 |                 while len(forest.roots) > 1:
282 |                     self.__evaluate(forest, True)
283 |                     bestValidOp, bestValidScore = None, float("-inf")
284 |                     bestWrongOp, bestWrongScore = None, float("-inf")
285 | 
286 |                     bestValidParent, bestValidChild = None, None
287 |                     bestValidIndex, bestWrongIndex = None, None
288 |                     roots = forest.roots
289 | 
290 |                     rootsIds = set([root.id for root in roots])
291 | 
292 |                     for i in xrange(len(forest.roots) - 1):
293 |                         for irel, rel in enumerate(self.irels):
294 |                             for op in xrange(2):
295 |                                 child = i + (1 - op)
296 |                                 parent = i + op
297 | 
298 |                                 oracleCost = unassigned[roots[child].id] + (0 if roots[child].parent_id not in rootsIds or roots[child].parent_id  == roots[parent].id else 1)
299 | 
300 |                                 if oracleCost == 0 and (roots[child].parent_id != roots[parent].id or roots[child].relation == rel):
301 |                                     if bestValidScore < forest.roots[i].scores[irel][op]:
302 |                                         bestValidScore = forest.roots[i].scores[irel][op]
303 |                                         bestValidOp = op
304 |                                         bestValidParent, bestValidChild = parent, child
305 |                                         bestValidIndex = i
306 |                                         bestValidIRel, bestValidRel = irel, rel
307 |                                         bestValidExpr = roots[bestValidIndex].exprs[bestValidIRel][bestValidOp]
308 |                                 elif bestWrongScore < forest.roots[i].scores[irel][op]:
309 |                                     bestWrongScore = forest.roots[i].scores[irel][op]
310 |                                     bestWrongParent, bestWrongChild = parent, child
311 |                                     bestWrongOp = op
312 |                                     bestWrongIndex = i
313 |                                     bestWrongIRel, bestWrongRel = irel, rel
314 |                                     bestWrongExpr = roots[bestWrongIndex].exprs[bestWrongIRel][bestWrongOp]
315 | 
316 |                     if bestValidScore < bestWrongScore + 1.0:
317 |                         loss = bestWrongExpr - bestValidExpr
318 |                         mloss += 1.0 + bestWrongScore - bestValidScore
319 |                         eloss += 1.0 + bestWrongScore - bestValidScore
320 |                         errs.append(loss)
321 | 
322 |                     if not self.oracle or bestValidScore - bestWrongScore > 1.0 or (bestValidScore > bestWrongScore and random.random() > 0.1): 
323 |                         selectedOp = bestValidOp
324 |                         selectedParent = bestValidParent
325 |                         selectedChild = bestValidChild
326 |                         selectedIndex = bestValidIndex
327 |                         selectedIRel, selectedRel = bestValidIRel, bestValidRel
328 |                     else:
329 |                         selectedOp = bestWrongOp
330 |                         selectedParent = bestWrongParent
331 |                         selectedChild = bestWrongChild
332 |                         selectedIndex = bestWrongIndex
333 |                         selectedIRel, selectedRel = bestWrongIRel, bestWrongRel
334 | 
335 |                     if roots[selectedChild].parent_id  != roots[selectedParent].id or selectedRel != roots[selectedChild].relation:
336 |                         lerrors += 1
337 |                         if roots[selectedChild].parent_id  != roots[selectedParent].id:
338 |                             errors += 1
339 |                             eerrors += 1
340 | 
341 |                     etotal += 1
342 | 
343 |                     for j in xrange(max(0, selectedIndex - self.k - 1), min(len(forest.roots), selectedIndex + self.k + 2)):
344 |                         roots[j].scores = None
345 | 
346 |                     unassigned[roots[selectedChild].parent_id] -= 1
347 | 
348 |                     roots[selectedParent].lstms[selectedOp] = roots[selectedParent].lstms[selectedOp].add_input(
349 |                         	    self.activation( self.lstm2lstm *
350 |                                 	noise(concatenate([roots[selectedChild].lstms[0].output(), lookup(self.model["rels-lookup"], selectedIRel),
351 |                                         	           roots[selectedChild].lstms[1].output()]), 0.0) + self.lstm2lstmbias))
352 | 
353 |                     forest.Attach(selectedParent, selectedChild)
354 | 
355 |                 if len(errs) > 50.0:
356 |                     eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
357 |                     scalar_loss = eerrs.scalar_value()
358 |                     eerrs.backward()
359 |                     self.trainer.update()
360 |                     errs = []
361 |                     lerrs = []
362 | 
363 |                     renew_cg()
364 |                     self.Init()
365 | 
366 |         if len(errs) > 0:
367 |             eerrs = (esum(errs)) * (1.0/(float(len(errs))))
368 |             eerrs.scalar_value()
369 |             eerrs.backward()
370 |             self.trainer.update()
371 | 
372 |             errs = []
373 |             lerrs = []
374 | 
375 |             renew_cg()
376 | 
377 |         self.trainer.update_epoch()
378 |         print "Loss: ", mloss/iSentence
379 | 


--------------------------------------------------------------------------------
/src/parser.py:
--------------------------------------------------------------------------------
 1 | from optparse import OptionParser
 2 | import json, utils, easylstm, os, pickle, time
 3 | 
 4 | if __name__ == '__main__':
 5 |     parser = OptionParser()
 6 |     parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="data/PTB_SD_3_3_0/train.conll")
 7 |     parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="data/PTB_SD_3_3_0/dev.conll")
 8 |     parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="data/PTB_SD_3_3_0/test.conll")
 9 |     parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
10 |     parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="easyfirst.model")
11 |     parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
12 |     parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
13 |     parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
14 |     parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
15 |     parser.add_option("--epochs", type="int", dest="epochs", default=30)
16 |     parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
17 |     parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
18 |     parser.add_option("--k", type="int", dest="window", default=1)
19 |     parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
20 |     parser.add_option("--outdir", type="string", dest="output", default="results")
21 |     parser.add_option("--activation", type="string", dest="activation", default="tanh")
22 |     parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
23 |     parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=200)
24 |     parser.add_option("--disableoracle", action="store_false", dest="oracle", default=True)
25 |     parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
26 |     parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
27 |     parser.add_option("--cnn-seed", type="int", dest="seed", default=0)
28 | 
29 | 
30 |     (options, args) = parser.parse_args()
31 | 
32 |     print 'Using external embedding:', options.external_embedding
33 | 
34 |     if options.predictFlag:
35 |         with open(options.params, 'r') as paramsfp:
36 |             words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
37 | 
38 |         stored_opt.external_embedding = options.external_embedding
39 | 
40 |         print 'Initializing Hierarchical Tree LSTM parser:'
41 |         parser = easylstm.EasyFirstLSTM(words, pos, rels, w2i, stored_opt) 
42 | 
43 |         parser.Load(options.model)
44 |         tespath = os.path.join(options.output, 'test_pred.conll')
45 |         
46 |         ts = time.time()
47 |         test_res = list(parser.Predict(options.conll_test))
48 |         te = time.time()
49 |         print 'Finished predicting test.', te-ts, 'seconds.'
50 |         utils.write_conll(tespath, test_res)
51 | 
52 |         os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath  + ' > ' + tespath + '.txt')
53 |     else:
54 |         print 'Preparing vocab'
55 |         words, w2i, pos, rels = utils.vocab(options.conll_train)
56 | 
57 |         with open(os.path.join(options.output, options.params), 'w') as paramsfp:
58 |             pickle.dump((words, w2i, pos, rels, options), paramsfp)
59 |         print 'Finished collecting vocab'
60 | 
61 |         print 'Initializing Hierarchical Tree LSTM parser:'
62 |         parser = easylstm.EasyFirstLSTM(words, pos, rels, w2i, options) 
63 | 
64 |         for epoch in xrange(options.epochs):
65 |             print 'Starting epoch', epoch
66 |             parser.Train(options.conll_train)
67 |             devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
68 |             utils.write_conll(devpath, parser.Predict(options.conll_dev))
69 |             parser.Save(os.path.join(options.output, os.path.basename(options.model) + str(epoch+1)))
70 |             os.system('perl src/utils/eval.pl -g ' + options.conll_dev  + ' -s ' + devpath  + ' > ' + devpath + '.txt')
71 | 
72 | 


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | import re
  3 | 
  4 | 
  5 | class ConllEntry:
  6 |     def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
  7 |         self.id = id
  8 |         self.form = form
  9 |         self.norm = normalize(form)
 10 |         self.cpos = cpos.upper()
 11 |         self.pos = pos.upper()
 12 |         self.parent_id = parent_id
 13 |         self.relation = relation
 14 | 
 15 | 
 16 | class ParseForest:
 17 |     def __init__(self, sentence):
 18 |         self.roots = list(sentence)
 19 | 
 20 |         for root in self.roots:
 21 |             root.children = []
 22 |             root.scores = None
 23 |             root.parent = None
 24 |             root.pred_parent_id = None
 25 |             root.pred_relation = None
 26 |             root.vecs = None
 27 |             root.lstms = None
 28 | 
 29 |     def Attach(self, parent_index, child_index):
 30 |         parent = self.roots[parent_index]
 31 |         child = self.roots[child_index]
 32 | 
 33 |         child.pred_parent_id = parent.id
 34 |         del self.roots[child_index]
 35 | 
 36 | 
 37 | def isProj(sentence):
 38 |     forest = ParseForest(sentence)
 39 |     unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence}
 40 | 
 41 |     for _ in xrange(len(sentence)):
 42 |         for i in xrange(len(forest.roots) - 1):
 43 |             if forest.roots[i].parent_id == forest.roots[i+1].id and unassigned[forest.roots[i].id] == 0:
 44 |                 unassigned[forest.roots[i+1].id]-=1
 45 |                 forest.Attach(i+1, i)
 46 |                 break
 47 |             if forest.roots[i+1].parent_id == forest.roots[i].id and unassigned[forest.roots[i+1].id] == 0:
 48 |                 unassigned[forest.roots[i].id]-=1
 49 |                 forest.Attach(i, i+1)
 50 |                 break
 51 | 
 52 |     return len(forest.roots) == 1
 53 | 
 54 | 
 55 | def vocab(conll_path):
 56 |     wordsCount = Counter()
 57 |     posCount = Counter()
 58 |     relCount = Counter()
 59 | 
 60 |     with open(conll_path, 'r') as conllFP:
 61 |         for sentence in read_conll(conllFP, True):
 62 |             wordsCount.update([node.norm for node in sentence])
 63 |             posCount.update([node.pos for node in sentence])
 64 |             relCount.update([node.relation for node in sentence])
 65 | 
 66 |     return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())
 67 | 
 68 | 
 69 | def read_conll(fh, proj):
 70 |     root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
 71 |     tokens = [root]
 72 |     for line in fh:
 73 |         tok = line.strip().split()
 74 |         if not tok:
 75 |             if len(tokens)>1:
 76 |                 if not proj or isProj(tokens):
 77 |                     yield tokens
 78 |                 else:
 79 |                     print 'Non-projetive sentence dropped'
 80 |             tokens = [root]
 81 |         else:
 82 |             tokens.append(ConllEntry(int(tok[0]), tok[1], tok[3], tok[4], int(tok[6]), tok[7]))
 83 |     if len(tokens) > 1:
 84 |         yield tokens
 85 | 
 86 | 
 87 | def write_conll(fn, conll_gen):
 88 |     with open(fn, 'w') as fh:
 89 |         for sentence in conll_gen:
 90 |             for entry in sentence[1:]:
 91 |                 fh.write('\t'.join([str(entry.id), entry.form, '_', entry.pos, entry.cpos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
 92 |                 fh.write('\n')
 93 |             fh.write('\n')
 94 | 
 95 | 
 96 | numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");
 97 | def normalize(word):
 98 |     return 'NUM' if numberRegex.match(word) else word.lower()
 99 | 
100 | 
101 | cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB", 
102 |              "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV", 
103 |              ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET", 
104 | 			 "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS", 
105 | 			 "-LRB-": ".", "-RRB-": "."}
106 | 


--------------------------------------------------------------------------------