├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── README.md
├── examples
    ├── ExampleUtils.kt
    ├── evaluation
    │   ├── CommandLineArguments.kt
    │   └── EvaluateLHR.kt
    ├── pom.xml
    └── training
    │   ├── CommandLineArguments.kt
    │   └── TrainLHR.kt
├── pom.xml
└── src
    └── main
        └── kotlin
            └── com
                └── kotlinnlp
                    └── neuralparser
                        ├── NeuralParser.kt
                        ├── NeuralParserModel.kt
                        ├── helpers
                            ├── Trainer.kt
                            ├── labelerselector
                            │   ├── LabelerSelector.kt
                            │   ├── MorphoSelector.kt
                            │   └── NoFilterSelector.kt
                            ├── preprocessors
                            │   ├── BasePreprocessor.kt
                            │   ├── CoNLLPreprocessor.kt
                            │   ├── MorphoPreprocessor.kt
                            │   └── SentencePreprocessor.kt
                            ├── sentencebuilder
                            │   ├── CompositeTokenHelper.kt
                            │   ├── LabeledMorphoSynBuilder.kt
                            │   └── UnlabeledMorphoSynBuilder.kt
                            ├── statistics
                            │   ├── BaseStatistics.kt
                            │   ├── MetricsCounter.kt
                            │   ├── SentenceMetrics.kt
                            │   └── Statistics.kt
                            └── validator
                            │   ├── CoNLLDependencyParser.kt
                            │   ├── CoNLLFileValidator.kt
                            │   └── Validator.kt
                        ├── language
                            ├── BaseSentence.kt
                            ├── BaseToken.kt
                            ├── CorpusDictionary.kt
                            ├── Extensions.kt
                            ├── ParsingSentence.kt
                            └── ParsingToken.kt
                        ├── parsers
                            └── lhrparser
                            │   ├── LHRModel.kt
                            │   ├── LHRParser.kt
                            │   ├── LHRTrainer.kt
                            │   ├── LHRTransferLearning.kt
                            │   ├── helpers
                            │       ├── CyclesFixer.kt
                            │       ├── GreedyDependencyTreeBuilder.kt
                            │       └── keyextractors
                            │       │   └── PosTagKeyExtractor.kt
                            │   ├── neuralmodules
                            │       ├── PositionalEncoder.kt
                            │       └── labeler
                            │       │   ├── Labeler.kt
                            │       │   ├── LabelerModel.kt
                            │       │   └── utils
                            │       │       ├── HingeLoss.kt
                            │       │       ├── LossCriterion.kt
                            │       │       ├── LossCriterionType.kt
                            │       │       ├── ScoredGrammar.kt
                            │       │       └── Softmax.kt
                            │   └── sentenceconverters
                            │       ├── BaseConverter.kt
                            │       ├── FormConverter.kt
                            │       └── MorphoConverter.kt
                        └── utils
                            └── Extensions.kt


/.gitignore:
--------------------------------------------------------------------------------
 1 | ### Configuration ###
 2 | config/*
 3 | !config/configuration.yaml.example
 4 | 
 5 | ### Intellij ###
 6 | .idea/
 7 | /out/
 8 | 
 9 | ### Intellij Patch ###
10 | *.iml
11 | 
12 | /resources/
13 | 
14 | ### Maven ###
15 | target/
16 | pom.xml.tag
17 | pom.xml.releaseBackup
18 | pom.xml.versionsBackup
19 | pom.xml.next
20 | release.properties
21 | dependency-reduced-pom.xml
22 | buildNumber.properties
23 | .mvn/timing.properties
24 | 
25 | # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
26 | !/.mvn/wrapper/maven-wrapper.jar
27 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: java
 2 | 
 3 | sudo: false
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |       - oracle-java8-installer
 9 | 
10 | os:
11 |   - linux
12 | 
13 | dist: trusty
14 | 
15 | jdk:
16 |   - oraclejdk8
17 | 
18 | install: true
19 | 
20 | script: mvn test compile -B -Dmaven.javadoc.skip=true
21 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | Mozilla Public License Version 2.0
  2 | ==================================
  3 | 
  4 | 1. Definitions
  5 | --------------
  6 | 
  7 | 1.1. "Contributor"
  8 |     means each individual or legal entity that creates, contributes to
  9 |     the creation of, or owns Covered Software.
 10 | 
 11 | 1.2. "Contributor Version"
 12 |     means the combination of the Contributions of others (if any) used
 13 |     by a Contributor and that particular Contributor's Contribution.
 14 | 
 15 | 1.3. "Contribution"
 16 |     means Covered Software of a particular Contributor.
 17 | 
 18 | 1.4. "Covered Software"
 19 |     means Source Code Form to which the initial Contributor has attached
 20 |     the notice in Exhibit A, the Executable Form of such Source Code
 21 |     Form, and Modifications of such Source Code Form, in each case
 22 |     including portions thereof.
 23 | 
 24 | 1.5. "Incompatible With Secondary Licenses"
 25 |     means
 26 | 
 27 |     (a) that the initial Contributor has attached the notice described
 28 |         in Exhibit B to the Covered Software; or
 29 | 
 30 |     (b) that the Covered Software was made available under the terms of
 31 |         version 1.1 or earlier of the License, but not also under the
 32 |         terms of a Secondary License.
 33 | 
 34 | 1.6. "Executable Form"
 35 |     means any form of the work other than Source Code Form.
 36 | 
 37 | 1.7. "Larger Work"
 38 |     means a work that combines Covered Software with other material, in
 39 |     a separate file or files, that is not Covered Software.
 40 | 
 41 | 1.8. "License"
 42 |     means this document.
 43 | 
 44 | 1.9. "Licensable"
 45 |     means having the right to grant, to the maximum extent possible,
 46 |     whether at the time of the initial grant or subsequently, any and
 47 |     all of the rights conveyed by this License.
 48 | 
 49 | 1.10. "Modifications"
 50 |     means any of the following:
 51 | 
 52 |     (a) any file in Source Code Form that results from an addition to,
 53 |         deletion from, or modification of the contents of Covered
 54 |         Software; or
 55 | 
 56 |     (b) any new file in Source Code Form that contains any Covered
 57 |         Software.
 58 | 
 59 | 1.11. "Patent Claims" of a Contributor
 60 |     means any patent claim(s), including without limitation, method,
 61 |     process, and apparatus claims, in any patent Licensable by such
 62 |     Contributor that would be infringed, but for the grant of the
 63 |     License, by the making, using, selling, offering for sale, having
 64 |     made, import, or transfer of either its Contributions or its
 65 |     Contributor Version.
 66 | 
 67 | 1.12. "Secondary License"
 68 |     means either the GNU General Public License, Version 2.0, the GNU
 69 |     Lesser General Public License, Version 2.1, the GNU Affero General
 70 |     Public License, Version 3.0, or any later versions of those
 71 |     licenses.
 72 | 
 73 | 1.13. "Source Code Form"
 74 |     means the form of the work preferred for making modifications.
 75 | 
 76 | 1.14. "You" (or "Your")
 77 |     means an individual or a legal entity exercising rights under this
 78 |     License. For legal entities, "You" includes any entity that
 79 |     controls, is controlled by, or is under common control with You. For
 80 |     purposes of this definition, "control" means (a) the power, direct
 81 |     or indirect, to cause the direction or management of such entity,
 82 |     whether by contract or otherwise, or (b) ownership of more than
 83 |     fifty percent (50%) of the outstanding shares or beneficial
 84 |     ownership of such entity.
 85 | 
 86 | 2. License Grants and Conditions
 87 | --------------------------------
 88 | 
 89 | 2.1. Grants
 90 | 
 91 | Each Contributor hereby grants You a world-wide, royalty-free,
 92 | non-exclusive license:
 93 | 
 94 | (a) under intellectual property rights (other than patent or trademark)
 95 |     Licensable by such Contributor to use, reproduce, make available,
 96 |     modify, display, perform, distribute, and otherwise exploit its
 97 |     Contributions, either on an unmodified basis, with Modifications, or
 98 |     as part of a Larger Work; and
 99 | 
100 | (b) under Patent Claims of such Contributor to make, use, sell, offer
101 |     for sale, have made, import, and otherwise transfer either its
102 |     Contributions or its Contributor Version.
103 | 
104 | 2.2. Effective Date
105 | 
106 | The licenses granted in Section 2.1 with respect to any Contribution
107 | become effective for each Contribution on the date the Contributor first
108 | distributes such Contribution.
109 | 
110 | 2.3. Limitations on Grant Scope
111 | 
112 | The licenses granted in this Section 2 are the only rights granted under
113 | this License. No additional rights or licenses will be implied from the
114 | distribution or licensing of Covered Software under this License.
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
116 | Contributor:
117 | 
118 | (a) for any code that a Contributor has removed from Covered Software;
119 |     or
120 | 
121 | (b) for infringements caused by: (i) Your and any other third party's
122 |     modifications of Covered Software, or (ii) the combination of its
123 |     Contributions with other software (except as part of its Contributor
124 |     Version); or
125 | 
126 | (c) under Patent Claims infringed by Covered Software in the absence of
127 |     its Contributions.
128 | 
129 | This License does not grant any rights in the trademarks, service marks,
130 | or logos of any Contributor (except as may be necessary to comply with
131 | the notice requirements in Section 3.4).
132 | 
133 | 2.4. Subsequent Licenses
134 | 
135 | No Contributor makes additional grants as a result of Your choice to
136 | distribute the Covered Software under a subsequent version of this
137 | License (see Section 10.2) or under the terms of a Secondary License (if
138 | permitted under the terms of Section 3.3).
139 | 
140 | 2.5. Representation
141 | 
142 | Each Contributor represents that the Contributor believes its
143 | Contributions are its original creation(s) or it has sufficient rights
144 | to grant the rights to its Contributions conveyed by this License.
145 | 
146 | 2.6. Fair Use
147 | 
148 | This License is not intended to limit any rights You have under
149 | applicable copyright doctrines of fair use, fair dealing, or other
150 | equivalents.
151 | 
152 | 2.7. Conditions
153 | 
154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155 | in Section 2.1.
156 | 
157 | 3. Responsibilities
158 | -------------------
159 | 
160 | 3.1. Distribution of Source Form
161 | 
162 | All distribution of Covered Software in Source Code Form, including any
163 | Modifications that You create or to which You contribute, must be under
164 | the terms of this License. You must inform recipients that the Source
165 | Code Form of the Covered Software is governed by the terms of this
166 | License, and how they can obtain a copy of this License. You may not
167 | attempt to alter or restrict the recipients' rights in the Source Code
168 | Form.
169 | 
170 | 3.2. Distribution of Executable Form
171 | 
172 | If You distribute Covered Software in Executable Form then:
173 | 
174 | (a) such Covered Software must also be made available in Source Code
175 |     Form, as described in Section 3.1, and You must inform recipients of
176 |     the Executable Form how they can obtain a copy of such Source Code
177 |     Form by reasonable means in a timely manner, at a charge no more
178 |     than the cost of distribution to the recipient; and
179 | 
180 | (b) You may distribute such Executable Form under the terms of this
181 |     License, or sublicense it under different terms, provided that the
182 |     license for the Executable Form does not attempt to limit or alter
183 |     the recipients' rights in the Source Code Form under this License.
184 | 
185 | 3.3. Distribution of a Larger Work
186 | 
187 | You may create and distribute a Larger Work under terms of Your choice,
188 | provided that You also comply with the requirements of this License for
189 | the Covered Software. If the Larger Work is a combination of Covered
190 | Software with a work governed by one or more Secondary Licenses, and the
191 | Covered Software is not Incompatible With Secondary Licenses, this
192 | License permits You to additionally distribute such Covered Software
193 | under the terms of such Secondary License(s), so that the recipient of
194 | the Larger Work may, at their option, further distribute the Covered
195 | Software under the terms of either this License or such Secondary
196 | License(s).
197 | 
198 | 3.4. Notices
199 | 
200 | You may not remove or alter the substance of any license notices
201 | (including copyright notices, patent notices, disclaimers of warranty,
202 | or limitations of liability) contained within the Source Code Form of
203 | the Covered Software, except that You may alter any license notices to
204 | the extent required to remedy known factual inaccuracies.
205 | 
206 | 3.5. Application of Additional Terms
207 | 
208 | You may choose to offer, and to charge a fee for, warranty, support,
209 | indemnity or liability obligations to one or more recipients of Covered
210 | Software. However, You may do so only on Your own behalf, and not on
211 | behalf of any Contributor. You must make it absolutely clear that any
212 | such warranty, support, indemnity, or liability obligation is offered by
213 | You alone, and You hereby agree to indemnify every Contributor for any
214 | liability incurred by such Contributor as a result of warranty, support,
215 | indemnity or liability terms You offer. You may include additional
216 | disclaimers of warranty and limitations of liability specific to any
217 | jurisdiction.
218 | 
219 | 4. Inability to Comply Due to Statute or Regulation
220 | ---------------------------------------------------
221 | 
222 | If it is impossible for You to comply with any of the terms of this
223 | License with respect to some or all of the Covered Software due to
224 | statute, judicial order, or regulation then You must: (a) comply with
225 | the terms of this License to the maximum extent possible; and (b)
226 | describe the limitations and the code they affect. Such description must
227 | be placed in a text file included with all distributions of the Covered
228 | Software under this License. Except to the extent prohibited by statute
229 | or regulation, such description must be sufficiently detailed for a
230 | recipient of ordinary skill to be able to understand it.
231 | 
232 | 5. Termination
233 | --------------
234 | 
235 | 5.1. The rights granted under this License will terminate automatically
236 | if You fail to comply with any of its terms. However, if You become
237 | compliant, then the rights granted under this License from a particular
238 | Contributor are reinstated (a) provisionally, unless and until such
239 | Contributor explicitly and finally terminates Your grants, and (b) on an
240 | ongoing basis, if such Contributor fails to notify You of the
241 | non-compliance by some reasonable means prior to 60 days after You have
242 | come back into compliance. Moreover, Your grants from a particular
243 | Contributor are reinstated on an ongoing basis if such Contributor
244 | notifies You of the non-compliance by some reasonable means, this is the
245 | first time You have received notice of non-compliance with this License
246 | from such Contributor, and You become compliant prior to 30 days after
247 | Your receipt of the notice.
248 | 
249 | 5.2. If You initiate litigation against any entity by asserting a patent
250 | infringement claim (excluding declaratory judgment actions,
251 | counter-claims, and cross-claims) alleging that a Contributor Version
252 | directly or indirectly infringes any patent, then the rights granted to
253 | You by any and all Contributors for the Covered Software under Section
254 | 2.1 of this License shall terminate.
255 | 
256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257 | end user license agreements (excluding distributors and resellers) which
258 | have been validly granted by You or Your distributors under this License
259 | prior to termination shall survive termination.
260 | 
261 | ************************************************************************
262 | *                                                                      *
263 | *  6. Disclaimer of Warranty                                           *
264 | *  -------------------------                                           *
265 | *                                                                      *
266 | *  Covered Software is provided under this License on an "as is"       *
267 | *  basis, without warranty of any kind, either expressed, implied, or  *
268 | *  statutory, including, without limitation, warranties that the       *
269 | *  Covered Software is free of defects, merchantable, fit for a        *
270 | *  particular purpose or non-infringing. The entire risk as to the     *
271 | *  quality and performance of the Covered Software is with You.        *
272 | *  Should any Covered Software prove defective in any respect, You     *
273 | *  (not any Contributor) assume the cost of any necessary servicing,   *
274 | *  repair, or correction. This disclaimer of warranty constitutes an   *
275 | *  essential part of this License. No use of any Covered Software is   *
276 | *  authorized under this License except under this disclaimer.         *
277 | *                                                                      *
278 | ************************************************************************
279 | 
280 | ************************************************************************
281 | *                                                                      *
282 | *  7. Limitation of Liability                                          *
283 | *  --------------------------                                          *
284 | *                                                                      *
285 | *  Under no circumstances and under no legal theory, whether tort      *
286 | *  (including negligence), contract, or otherwise, shall any           *
287 | *  Contributor, or anyone who distributes Covered Software as          *
288 | *  permitted above, be liable to You for any direct, indirect,         *
289 | *  special, incidental, or consequential damages of any character      *
290 | *  including, without limitation, damages for lost profits, loss of    *
291 | *  goodwill, work stoppage, computer failure or malfunction, or any    *
292 | *  and all other commercial damages or losses, even if such party      *
293 | *  shall have been informed of the possibility of such damages. This   *
294 | *  limitation of liability shall not apply to liability for death or   *
295 | *  personal injury resulting from such party's negligence to the       *
296 | *  extent applicable law prohibits such limitation. Some               *
297 | *  jurisdictions do not allow the exclusion or limitation of           *
298 | *  incidental or consequential damages, so this exclusion and          *
299 | *  limitation may not apply to You.                                    *
300 | *                                                                      *
301 | ************************************************************************
302 | 
303 | 8. Litigation
304 | -------------
305 | 
306 | Any litigation relating to this License may be brought only in the
307 | courts of a jurisdiction where the defendant maintains its principal
308 | place of business and such litigation shall be governed by laws of that
309 | jurisdiction, without reference to its conflict-of-law provisions.
310 | Nothing in this Section shall prevent a party's ability to bring
311 | cross-claims or counter-claims.
312 | 
313 | 9. Miscellaneous
314 | ----------------
315 | 
316 | This License represents the complete agreement concerning the subject
317 | matter hereof. If any provision of this License is held to be
318 | unenforceable, such provision shall be reformed only to the extent
319 | necessary to make it enforceable. Any law or regulation which provides
320 | that the language of a contract shall be construed against the drafter
321 | shall not be used to construe this License against a Contributor.
322 | 
323 | 10. Versions of the License
324 | ---------------------------
325 | 
326 | 10.1. New Versions
327 | 
328 | Mozilla Foundation is the license steward. Except as provided in Section
329 | 10.3, no one other than the license steward has the right to modify or
330 | publish new versions of this License. Each version will be given a
331 | distinguishing version number.
332 | 
333 | 10.2. Effect of New Versions
334 | 
335 | You may distribute the Covered Software under the terms of the version
336 | of the License under which You originally received the Covered Software,
337 | or under the terms of any subsequent version published by the license
338 | steward.
339 | 
340 | 10.3. Modified Versions
341 | 
342 | If you create software not governed by this License, and you want to
343 | create a new license for such software, you may create and use a
344 | modified version of this License if you rename the license and remove
345 | any references to the name of the license steward (except to note that
346 | such modified license differs from this License).
347 | 
348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
349 | Licenses
350 | 
351 | If You choose to distribute Source Code Form that is Incompatible With
352 | Secondary Licenses under the terms of this version of the License, the
353 | notice described in Exhibit B of this License must be attached.
354 | 
355 | Exhibit A - Source Code Form License Notice
356 | -------------------------------------------
357 | 
358 |   This Source Code Form is subject to the terms of the Mozilla Public
359 |   License, v. 2.0. If a copy of the MPL was not distributed with this
360 |   file, You can obtain one at http://mozilla.org/MPL/2.0/.
361 | 
362 | If it is not possible or desirable to put the notice in a particular
363 | file, then You may include the notice in a location (such as a LICENSE
364 | file in a relevant directory) where a recipient would be likely to look
365 | for such a notice.
366 | 
367 | You may add additional accurate notices of copyright ownership.
368 | 
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 | 
372 |   This Source Code Form is "Incompatible With Secondary Licenses", as
373 |   defined by the Mozilla Public License, v. 2.0.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NeuralParser [![Maven Central](https://img.shields.io/maven-central/v/com.kotlinnlp/neuralparser.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22com.kotlinnlp%22%20AND%20a:%22neuralparser%22) [![Build Status](https://travis-ci.org/KotlinNLP/NeuralParser.svg?branch=master)](https://travis-ci.org/KotlinNLP/NeuralParser)
 2 | 
 3 | NeuralParser is a very simple to use dependency parser, based on the
 4 | [SimpleDNN](https://github.com/kotlinnlp/SimpleDNN "SimpleDNN on GitHub") library and the
 5 | [SyntaxDecoder](https://github.com/kotlinnlp/SyntaxDecoder "SyntaxDecoder on GitHub") transition systems framework.
 6 | 
 7 | NeuralParser is part of [KotlinNLP](http://kotlinnlp.com/ "KotlinNLP").
 8 | 
 9 | 
10 | ## Getting Started
11 | 
12 | ### Import with Maven
13 | 
14 | ```xml
15 | <dependency>
16 |     <groupId>com.kotlinnlp</groupId>
17 |     <artifactId>neuralparser</artifactId>
18 |     <version>0.6.5</version>
19 | </dependency>
20 | ```
21 | 
22 | ### Examples
23 | 
24 | Try some examples of training and evaluation of NeuralParser by running the files in the `examples` folder.
25 | 
26 | 
27 | ## License
28 | 
29 | This software is released under the terms of the
30 | [Mozilla Public License, v. 2.0](https://mozilla.org/MPL/2.0/ "Mozilla Public License, v. 2.0").
31 | 
32 | 
33 | ## Contributions
34 | 
35 | We greatly appreciate any bug reports and contributions, which can be made by filing an issue or making a pull
36 | request through the [GitHub page](https://github.com/kotlinnlp/NeuralParser "NeuralParser on GitHub").
37 | 


--------------------------------------------------------------------------------
/examples/ExampleUtils.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
 9 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
10 | import com.kotlinnlp.neuralparser.helpers.preprocessors.MorphoPreprocessor
11 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
12 | 
13 | /**
14 |  * Create the [SentencePreprocessor] to use, depending on the availability of a morphology dictionary.
15 |  *
16 |  * @param morphoDictionary the morphology dictionary (can be null)
17 |  *
18 |  * @return a [MorphoPreprocessor] based on the given dictionary if it is not null, otherwise a [BasePreprocessor]
19 |  */
20 | internal fun buildSentencePreprocessor(morphoDictionary: MorphologyDictionary?): SentencePreprocessor =
21 |   if (morphoDictionary != null) MorphoPreprocessor(dictionary = morphoDictionary) else BasePreprocessor()
22 | 


--------------------------------------------------------------------------------
/examples/evaluation/CommandLineArguments.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package evaluation
 9 | 
10 | import com.xenomachina.argparser.ArgParser
11 | import com.xenomachina.argparser.default
12 | 
13 | /**
14 |  * The command line arguments accepted by the evaluation script.
15 |  *
16 |  * @param args the raw command line arguments, as received by the main function
17 |  */
18 | class CommandLineArguments(args: Array<String>) {
19 | 
20 |   /**
21 |    * The argument parser that backs the properties of this class.
22 |    */
23 |   private val argParser = ArgParser(args)
24 | 
25 |   /**
26 |    * The path of the file containing the serialized model.
27 |    */
28 |   val modelPath: String by argParser.storing(
29 |     "-m",
30 |     "--model-path",
31 |     help = "the file path of the serialized model"
32 |   )
33 | 
34 |   /**
35 |    * The path of the file containing the validation set.
36 |    */
37 |   val validationSetPath: String by argParser.storing(
38 |     "-v",
39 |     "--validation-set",
40 |     help = "the file path of the validation set"
41 |   )
42 | 
43 |   /**
44 |    * The path of the file containing the serialized morphology dictionary (null by default).
45 |    */
46 |   val morphoDictionaryPath: String? by argParser.storing(
47 |     "-d",
48 |     "--dictionary",
49 |     help = "the file path of the serialized morphology dictionary"
50 |   ).default { null }
51 | 
52 |   /**
53 |    * Parse all the arguments eagerly (by default each one is parsed only when it is read).
54 |    */
55 |   init {
56 |     argParser.force()
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/examples/evaluation/EvaluateLHR.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package evaluation
 9 | 
10 | import buildSentencePreprocessor
11 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
12 | import com.kotlinnlp.neuralparser.NeuralParser
13 | import com.kotlinnlp.neuralparser.NeuralParserModel
14 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
16 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
17 | import com.kotlinnlp.neuralparser.utils.loadSentences
18 | import com.kotlinnlp.utils.Timer
19 | import com.xenomachina.argparser.mainBody
20 | import java.io.File
21 | import java.io.FileInputStream
22 | 
23 | /**
24 |  * Evaluate the model of an [LHRParser] on a validation set and print the results with the elapsed time.
25 |  *
26 |  * Launch with the '-h' option for help about the command line arguments.
27 |  */
28 | fun main(args: Array<String>) = mainBody {
29 | 
30 |   val parsedArgs = CommandLineArguments(args)
31 | 
32 |   // The input streams are wrapped with 'use' so that they are closed even if the loading fails.
33 |   val parser: NeuralParser<*> = LHRParser(
34 |     model = parsedArgs.modelPath.let { path ->
35 |       println("Loading model from '$path'.")
36 |       FileInputStream(File(path)).use { NeuralParserModel.load(it) as LHRModel }
37 |     })
38 | 
39 |   val validator = Validator(
40 |     neuralParser = parser,
41 |     sentences = loadSentences(
42 |       type = "validation",
43 |       filePath = parsedArgs.validationSetPath,
44 |       maxSentences = null,
45 |       skipNonProjective = false),
46 |     sentencePreprocessor = buildSentencePreprocessor(
47 |       morphoDictionary = parsedArgs.morphoDictionaryPath?.let { path ->
48 |         println("Loading serialized dictionary from '$path'...")
49 |         FileInputStream(File(path)).use { MorphologyDictionary.load(it) }
50 |       }
51 |     ))
52 | 
53 |   val timer = Timer()
54 |   val evaluation = validator.evaluate()
55 | 
56 |   println("\n$evaluation")
57 |   println("\nElapsed time: ${timer.formatElapsedTime()}")
58 | }
59 | 


--------------------------------------------------------------------------------
/examples/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build configuration of the NeuralParser examples.
  It compiles the Kotlin sources of this directory and packages two jars with dependencies:
  one with the training entry point and one with the evaluation entry point.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <groupId>com.kotlinnlp</groupId>
    <artifactId>neuralparser-examples</artifactId>
    <version>0.1.0</version>
    <packaging>jar</packaging>

    <!-- Versions of the plugins and of the dependencies, referenced below as ${...} placeholders. -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <assembly-plugin.version>2.6</assembly-plugin.version>
        <kotlin.version>1.3.31</kotlin.version>
        <xenocom.version>0.0.5</xenocom.version>
        <argparser.version>2.0.7</argparser.version>
        <klaxon.version>5.2</klaxon.version>
        <!-- The version of the NeuralParser library the examples are built against. -->
        <neuralparser.version>0.6.5</neuralparser.version>
    </properties>

    <!-- Repository from which the dependencies are resolved. -->
    <repositories>
        <repository>
            <id>jcenter</id>
            <url>https://jcenter.bintray.com/</url>
        </repository>
    </repositories>

    <!-- Repository from which the build plugins are resolved. -->
    <pluginRepositories>
        <pluginRepository>
            <id>jcenter</id>
            <name>JCenter</name>
            <url>https://jcenter.bintray.com/</url>
        </pluginRepository>
    </pluginRepositories>

    <build>
        <!-- The sources are located directly in the directory of this POM. -->
        <sourceDirectory>.</sourceDirectory>

        <plugins>
            <!-- Kotlin compiler: the 'compile' goal is bound to the 'process-sources' phase, targeting JVM 1.8. -->
            <plugin>
                <groupId>org.jetbrains.kotlin</groupId>
                <artifactId>kotlin-maven-plugin</artifactId>
                <version>${kotlin.version}</version>
                <configuration>
                    <jvmTarget>1.8</jvmTarget>
                </configuration>
                <executions>
                    <execution>
                        <id>compile</id>
                        <phase>process-sources</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- Assembly: one execution per entry point, each one run during the 'package' phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>${assembly-plugin.version}</version>
                <executions>
                    <!-- Jar with dependencies of the training script (main class: training.TrainLHRKt). -->
                    <execution>
                        <id>lhr-training-assembly</id>
                        <phase>package</phase>
                        <goals> <goal>single</goal> </goals>
                        <configuration>
                            <finalName>lhr-parser-train</finalName>
                            <archive>
                                <manifest>
                                    <addClasspath>true</addClasspath>
                                    <mainClass>training.TrainLHRKt</mainClass>
                                </manifest>
                            </archive>
                            <descriptorRefs>
                                <descriptorRef>jar-with-dependencies</descriptorRef>
                            </descriptorRefs>
                        </configuration>
                    </execution>
                    <!-- Jar with dependencies of the evaluation script (main class: evaluation.EvaluateLHRKt). -->
                    <execution>
                        <id>lhr-evaluation-assembly</id>
                        <phase>package</phase>
                        <goals> <goal>single</goal> </goals>
                        <configuration>
                            <finalName>lhr-parser-eval</finalName>
                            <archive>
                                <manifest>
                                    <addClasspath>true</addClasspath>
                                    <mainClass>evaluation.EvaluateLHRKt</mainClass>
                                </manifest>
                            </archive>
                            <descriptorRefs>
                                <descriptorRef>jar-with-dependencies</descriptorRef>
                            </descriptorRefs>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Kotlin standard library and reflection, aligned with the version of the compiler plugin. -->
        <dependency>
            <groupId>org.jetbrains.kotlin</groupId>
            <artifactId>kotlin-stdlib-jdk8</artifactId>
            <version>${kotlin.version}</version>
        </dependency>

        <dependency>
            <groupId>org.jetbrains.kotlin</groupId>
            <artifactId>kotlin-reflect</artifactId>
            <version>${kotlin.version}</version>
        </dependency>

        <dependency>
            <groupId>com.beust</groupId>
            <artifactId>klaxon</artifactId>
            <version>${klaxon.version}</version>
        </dependency>

        <dependency>
            <groupId>com.xenomachina</groupId>
            <artifactId>xenocom</artifactId>
            <version>${xenocom.version}</version>
        </dependency>

        <!-- Command line arguments parser used by the CommandLineArguments classes of the examples. -->
        <dependency>
            <groupId>com.xenomachina</groupId>
            <artifactId>kotlin-argparser</artifactId>
            <version>${argparser.version}</version>
        </dependency>

        <!-- The NeuralParser library itself. -->
        <dependency>
            <groupId>com.kotlinnlp</groupId>
            <artifactId>neuralparser</artifactId>
            <version>${neuralparser.version}</version>
        </dependency>
    </dependencies>
</project>
142 | 


--------------------------------------------------------------------------------
/examples/training/CommandLineArguments.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package training
  9 | 
 10 | import com.xenomachina.argparser.ArgParser
 11 | import com.xenomachina.argparser.InvalidArgumentException
 12 | import com.xenomachina.argparser.default
 13 | 
 14 | /**
 15 |  * The interpreter of command line arguments for the training script.
 16 |  *
 17 |  * @param args the array of command line arguments
 18 |  */
 19 | class CommandLineArguments(args: Array<String>) {
 20 | 
 21 |   /**
 22 |    * The type of tokens encoding.
 23 |    *
 24 |    * TODO: AMBIGUOUS_POS
 25 |    */
 26 |   enum class TokensEncodingType {
 27 |     WORD_EMBEDDINGS,
 28 |     WORD_AND_POS_EMBEDDINGS,
 29 |     WORD_AND_EXT_AND_POS_EMBEDDINGS,
 30 |     MORPHO_FEATURES,
 31 |     CHARLM
 32 |   }
 33 | 
 34 |   /**
 35 |    * The parser of the string arguments.
 36 |    */
 37 |   private val parser = ArgParser(args)
 38 | 
 39 |   /**
 40 |    * The language code
 41 |    */
 42 |   val langCode: String by parser.storing(
 43 |     "-l",
 44 |     "--language",
 45 |     help="the language ISO 639-1 code"
 46 |   )
 47 | 
 48 |   /**
 49 |    * The number of training epochs (default = 10).
 50 |    */
 51 |   val epochs: Int by parser.storing(
 52 |     "-e",
 53 |     "--epochs",
 54 |     help="the number of training epochs (default = 10)"
 55 |   ) { toInt() }.default(10)
 56 | 
 57 |   /**
 58 |    * The size of the batches of sentences (default = 1).
 59 |    */
 60 |   val batchSize: Int by parser.storing(
 61 |     "-b",
 62 |     "--batch-size",
 63 |     help="the size of the batches of sentences (default = 1)"
 64 |   ) { toInt() }.default(1)
 65 | 
 66 |   /**
 67 |    * The maximum number of sentences to load for training (default unlimited)
 68 |    */
 69 |   val maxSentences: Int? by parser.storing(
 70 |     "-s",
 71 |     "--max-sentences",
 72 |     help="the maximum number of sentences to load for training (default unlimited)"
 73 |   ) { toInt() }.default { null }
 74 | 
 75 |   /**
 76 |    * The file path of the training set.
 77 |    */
 78 |   val trainingSetPath: String by parser.storing(
 79 |     "-t",
 80 |     "--training-set",
 81 |     help="the file path of the training set"
 82 |   )
 83 | 
 84 |   /**
 85 |    * The file path of the gold-POS training set.
 86 |    * TODO: Re-enable for LHR transfer learning.
 87 |    */
 88 | //  val goldPosSetPath: String? by parser.storing(
 89 | //    "-p",
 90 | //    "--pos-set",
 91 | //    help="the file path of the gold-POS training set"
 92 | //  ).default { null }
 93 | 
 94 |   /**
 95 |    * The file path of the validation set.
 96 |    */
 97 |   val validationSetPath: String by parser.storing(
 98 |     "-v",
 99 |     "--validation-set",
100 |     help="the file path of the validation set"
101 |   )
102 | 
103 |   /**
104 |    * The path of the file in which to save the serialized model.
105 |    */
106 |   val modelPath: String by parser.storing(
107 |     "-m",
108 |     "--model-path",
109 |     help="the path of the file in which to save the serialized model"
110 |   )
111 | 
112 |   /**
113 |    * The file path of the pre-trained word embeddings.
114 |    */
115 |   val embeddingsPath: String? by parser.storing(
116 |     "-w",
117 |     "--trained-word-emb-path",
118 |     help="the file path of the pre-trained word embeddings"
119 |   ).default { null }
120 | 
121 |   /**
122 |    * The number of stacked BiRNNs of the context encoder (default 2).
123 |    */
124 |   val numOfContextLayers: Int by parser.storing(
125 |     "-c",
126 |     "--context-layers",
127 |     help="the number of stacked BiRNNs of the context encoder (default 2)"
128 |   ){ toInt() }
129 |     .default(2)
130 |     .addValidator { if (value < 1) throw InvalidArgumentException( "The number of context-layers must >= 1") }
131 | 
132 |   /**
133 |    * The size of the word embedding vectors.
134 |    */
135 |   val wordEmbeddingSize: Int by parser.storing(
136 |     "--word-emb-size",
137 |     help="the size of the word embedding vectors (default 150)"
138 |   ){ toInt() }.default(150)
139 | 
140 |   /**
141 |    * The word embeddings dropout coefficient.
142 |    */
143 |   val wordDropoutCoefficient: Double by parser.storing(
144 |     "--word-dropout",
145 |     help="the word embeddings dropout coefficient (default 0.25)"
146 |   ){ toDouble() }.default(0.25)
147 | 
148 |   /**
149 |    * The size of the part-of-speech embedding vectors.
150 |    */
151 |   val posEmbeddingSize: Int by parser.storing(
152 |     "--pos-emb-size",
153 |     help="the size of the part-of-speech embedding vectors (default 50)"
154 |   ){ toInt() }.default(50)
155 | 
156 |   /**
157 |    * The part-of-speech embeddings dropout coefficient.
158 |    */
159 |   val posDropoutCoefficient: Double by parser.storing(
160 |     "--pos-dropout",
161 |     help="the part-of-speech embeddings dropout coefficient (default 0.0)"
162 |   ){ toDouble() }.default(0.0)
163 | 
164 |   /**
165 |    * Whether to skip non-projective sentences.
166 |    */
167 |   val skipNonProjective: Boolean by parser.flagging(
168 |     "--skip-non-projective",
169 |     help="whether to skip non-projective sentences"
170 |   )
171 | 
172 |   /**
173 |    * Whether to do not consider punctuation errors.
174 |    */
175 |   val skipPunctuationErrors: Boolean by parser.flagging(
176 |     "--skip-punct-err",
177 |     help="whether to do not consider punctuation errors"
178 |   )
179 | 
180 |   /**
181 |    * Whether to do not use the labeler.
182 |    */
183 |   val noLabeler: Boolean by parser.flagging(
184 |     "--no-labeler",
185 |     help="whether to do not use the labeler"
186 |   )
187 | 
188 |   /**
189 |    * Whether to do not predict the POS tags.
190 |    */
191 |   val noPosPrediction: Boolean by parser.flagging(
192 |     "--no-pos",
193 |     help="whether to do not predict the POS tags"
194 |   )
195 | 
196 |   /**
197 |    * The file path of the serialized morphology dictionary.
198 |    */
199 |   val morphoDictionaryPath: String? by parser.storing(
200 |     "-d",
201 |     "--dictionary",
202 |     help="the file path of the serialized morphology dictionary"
203 |   ).default { null }
204 | 
205 |   /**
206 |    * The file path of the lexicon dictionary.
207 |    */
208 |   val lexiconDictionaryPath: String? by parser.storing(
209 |     "-x",
210 |     "--lexicon",
211 |     help="the file path of the lexicon dictionary"
212 |   ).default { null }
213 | 
214 |   /**
215 |    * The file path of the serialized characters language model.
216 |    */
217 |   val charLMModelPath: String? by parser.storing(
218 |     "--charlm",
219 |     help="the file path of the serialized characters language model"
220 |   ).default { null }
221 | 
222 |   /**
223 |    * The file path of the serialized characters language model for reverse encodings.
224 |    */
225 |   val charLMRevModelPath: String? by parser.storing(
226 |     "--charlm-rev",
227 |     help="the file path of the serialized characters language model for reverse encodings"
228 |   ).default { null }
229 | 
230 |   /**
231 |    * The type of morphology encoding.
232 |    */
233 |   val tokensEncodingType: TokensEncodingType by parser.mapping(
234 |     "--tokens-word-emb"  to TokensEncodingType.WORD_EMBEDDINGS,
235 |     "--tokens-word-pos-emb" to TokensEncodingType.WORD_AND_POS_EMBEDDINGS,
236 |     "--tokens-word-ext-pos-emb" to TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS,
237 |     "--tokens-morpho" to TokensEncodingType.MORPHO_FEATURES,
238 |     "--tokens-charlm" to TokensEncodingType.CHARLM,
239 |     help = "the type of morphology encoding (default --tokens-word-pos-emb)"
240 |   ).default { TokensEncodingType.WORD_AND_POS_EMBEDDINGS }
241 | 
242 |   /**
243 |    * Whether to do not show details about the training.
244 |    */
245 |   val quiet: Boolean by parser.flagging(
246 |     "-q",
247 |     "--quiet",
248 |     help="whether to do not show details about the training "
249 |   )
250 | 
251 |   /**
252 |    * Force parsing all arguments (only read ones are parsed by default).
253 |    * Check the dependencies between more arguments.
254 |    */
255 |   init {
256 | 
257 |     parser.force()
258 | 
259 |     this.checkDependencies()
260 |   }
261 | 
262 |   /**
263 |    * Check the dependencies between more arguments.
264 |    */
265 |   private fun checkDependencies() {
266 | 
267 |     if (this.tokensEncodingType == TokensEncodingType.CHARLM) {
268 |       this.charLMModelPath ?: throw RuntimeException("Missing characters language model path")
269 |       this.charLMRevModelPath ?: throw RuntimeException("Missing reverse characters language model path")
270 |     }
271 |   }
272 | }
273 | 


--------------------------------------------------------------------------------
/examples/training/TrainLHR.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package training
  9 | 
 10 | import buildSentencePreprocessor
 11 | import com.kotlinnlp.languagemodel.CharLM
 12 | import com.kotlinnlp.linguisticdescription.language.getLanguageByIso
 13 | import com.kotlinnlp.linguisticdescription.lexicon.LexiconDictionary
 14 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
 15 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
 16 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
 17 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
 18 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
 19 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
 20 | import com.kotlinnlp.lssencoder.LSSModel
 21 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer
 22 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
 23 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
 24 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
 25 | import com.kotlinnlp.simplednn.core.layers.LayerType
 26 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
 27 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
 28 | import com.kotlinnlp.neuralparser.language.*
 29 | import com.kotlinnlp.simplednn.deeplearning.birnn.BiRNNConfig
 30 | import com.kotlinnlp.tokensencoder.embeddings.EmbeddingsEncoderModel
 31 | import com.xenomachina.argparser.mainBody
 32 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
 33 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
 34 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRTrainer
 35 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
 36 | import com.kotlinnlp.tokensencoder.wrapper.MirrorConverter
 37 | import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.MorphoConverter
 38 | import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors.PosTagKeyExtractor
 39 | import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.FormConverter
 40 | import com.kotlinnlp.tokensencoder.embeddings.keyextractor.NormWordKeyExtractor
 41 | import com.kotlinnlp.neuralparser.utils.loadSentences
 42 | import com.kotlinnlp.simplednn.core.embeddings.EmbeddingsMap
 43 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
 44 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge
 45 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.ConcatMerge
 46 | import com.kotlinnlp.tokensencoder.charlm.CharLMEncoderModel
 47 | import com.kotlinnlp.tokensencoder.ensemble.EnsembleTokensEncoderModel
 48 | import com.kotlinnlp.tokensencoder.morpho.FeaturesCollector
 49 | import com.kotlinnlp.tokensencoder.morpho.MorphoEncoderModel
 50 | import com.kotlinnlp.tokensencoder.wrapper.TokensEncoderWrapperModel
 51 | import java.io.File
 52 | import java.io.FileInputStream
 53 | 
 54 | /**
 55 |  * Train the [LHRParser].
 56 |  *
 57 |  * Launch with the '-h' option for help about the command line arguments.
 58 |  */
 59 | fun main(args: Array<String>) = mainBody {
 60 | 
 61 |   val parsedArgs = CommandLineArguments(args)
 62 | 
 63 |   val trainingSentences: List<CoNLLSentence> = loadSentences(
 64 |     type = "training",
 65 |     filePath = parsedArgs.trainingSetPath,
 66 |     maxSentences = parsedArgs.maxSentences,
 67 |     skipNonProjective = parsedArgs.skipNonProjective)
 68 | 
 69 |   val corpus: CorpusDictionary = trainingSentences.let {
 70 |     println("Creating corpus dictionary...")
 71 |     CorpusDictionary(it)
 72 |   }
 73 | 
 74 |   val morphologyDictionary: MorphologyDictionary? = parsedArgs.morphoDictionaryPath?.let {
 75 |     println("Loading serialized dictionary from '$it'...")
 76 |     MorphologyDictionary.load(FileInputStream(File(it)))
 77 |   }
 78 | 
 79 |   val parser: LHRParser = buildParser(
 80 |     parsedArgs = parsedArgs,
 81 |     tokensEncoderWrapperModel = buildTokensEncoderWrapperModel(
 82 |       parsedArgs = parsedArgs,
 83 |       sentences = trainingSentences,
 84 |       corpus = corpus,
 85 |       morphologyDictionary = morphologyDictionary),
 86 |     corpus = corpus)
 87 | 
 88 |   val trainer = buildTrainer(parser = parser, parsedArgs = parsedArgs, morphologyDictionary = morphologyDictionary)
 89 | 
 90 |   println("\n-- MODEL")
 91 |   println(parser.model)
 92 | 
 93 |   println("\n-- START TRAINING ON %d SENTENCES".format(trainingSentences.size))
 94 |   println(trainer)
 95 | 
 96 |   trainer.train(trainingSentences = trainingSentences)
 97 | }
 98 | 
 99 | /**
100 |  * Build the LHR Parser.
101 |  *
102 |  * @param parsedArgs the parsed command line arguments
103 |  * @param tokensEncoderWrapperModel the tokens-encoder wrapper model
104 |  * @param corpus the corpus dictionary
105 |  *
106 |  * @return a new parser
107 |  */
108 | private fun buildParser(
109 |   parsedArgs: CommandLineArguments,
110 |   tokensEncoderWrapperModel: TokensEncoderWrapperModel<ParsingToken, ParsingSentence, *, *>,
111 |   corpus: CorpusDictionary
112 | ): LHRParser = LHRParser(model = LHRModel(
113 |   corpusDictionary = corpus,
114 |   lssModel = LSSModel(
115 |     language = getLanguageByIso(parsedArgs.langCode),
116 |     tokensEncoderWrapperModel = tokensEncoderWrapperModel,
117 |     contextBiRNNConfig = BiRNNConfig(
118 |       connectionType = LayerType.Connection.LSTM,
119 |       hiddenActivation = Tanh,
120 |       numberOfLayers = parsedArgs.numOfContextLayers),
121 |     headsBiRNNConfig = BiRNNConfig(
122 |       connectionType = LayerType.Connection.LSTM,
123 |       hiddenActivation = Tanh)
124 |   ),
125 |   useLabeler = !parsedArgs.noLabeler,
126 |   lossCriterionType = LossCriterionType.Softmax,
127 |   predictPosTags = !parsedArgs.noPosPrediction))
128 | 
/**
 * Build a tokens-encoder wrapper model, depending on the tokens encoding type chosen on the command line.
 *
 * Some encoding types need optional resources (pre-trained embeddings, characters language models, morphology
 * dictionary): when one of them is missing an [IllegalArgumentException] with an explicit message is raised.
 *
 * @param parsedArgs the parsed command line arguments
 * @param sentences the training sentences (used to collect the features of the morpho encoder)
 * @param corpus the corpus dictionary
 * @param morphologyDictionary a morphology dictionary (required by the morpho encoder only)
 *
 * @throws IllegalArgumentException if a resource required by the chosen encoding type is missing
 *
 * @return a new tokens-encoder wrapper model
 */
private fun buildTokensEncoderWrapperModel(
  parsedArgs: CommandLineArguments,
  sentences: List<CoNLLSentence>,
  corpus: CorpusDictionary,
  morphologyDictionary: MorphologyDictionary?
): TokensEncoderWrapperModel<ParsingToken, ParsingSentence, *, *> =

  when (parsedArgs.tokensEncodingType) {

    CommandLineArguments.TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder

      // Fail before allocating any embeddings if the external ones are not available.
      val preEmbeddingsPath: String = requireNotNull(parsedArgs.embeddingsPath) {
        "The path of the pre-trained word embeddings is required by the word + external + POS embeddings encoding"
      }

      val embeddingsMap = EmbeddingsMap.fromSet(
        size = parsedArgs.wordEmbeddingSize,
        elements = corpus.words.getElementsReversedSet())

      val preEmbeddingsMap = preEmbeddingsPath.let {
        println("Loading pre-trained word embeddings from '$it'...")
        EmbeddingsMap.load(filename = it)
      }

      val posEmbeddingsMap = EmbeddingsMap.fromSet(
        size = parsedArgs.posEmbeddingSize,
        elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet())

      TokensEncoderWrapperModel(
        model = EnsembleTokensEncoderModel(
          components = listOf(
            // Pre-trained word embeddings.
            EnsembleTokensEncoderModel.ComponentModel(
              model = TokensEncoderWrapperModel(
                model = EmbeddingsEncoderModel.Base(
                  embeddingsMap = preEmbeddingsMap,
                  embeddingKeyExtractor = NormWordKeyExtractor(),
                  dropout = parsedArgs.wordDropoutCoefficient),
                converter = FormConverter()),
              trainable = true),
            // Word embeddings initialized from the words of the corpus.
            EnsembleTokensEncoderModel.ComponentModel(
              model = TokensEncoderWrapperModel(
                model = EmbeddingsEncoderModel.Base(
                  embeddingsMap = embeddingsMap,
                  embeddingKeyExtractor = NormWordKeyExtractor(),
                  frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
                  dropout = parsedArgs.wordDropoutCoefficient),
                converter = FormConverter()),
              trainable = true),
            // POS embeddings (every POS tag is given the same frequency).
            EnsembleTokensEncoderModel.ComponentModel(
              model = TokensEncoderWrapperModel(
                model = EmbeddingsEncoderModel.Base(
                  embeddingsMap = posEmbeddingsMap,
                  embeddingKeyExtractor = PosTagKeyExtractor,
                  frequencyDictionary = corpus.grammaticalConfigurations.getElements()
                    .mapNotNull { it.posToString }
                    .associateWith { 1 },
                  dropout = parsedArgs.posDropoutCoefficient),
                converter = MirrorConverter()),
              trainable = true)
          ),
          outputMergeConfiguration = AffineMerge(
            outputSize = 100, // TODO
            activationFunction = null)),
        converter = MirrorConverter()
      )
    }

    CommandLineArguments.TokensEncodingType.WORD_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder

      val embeddingsMap = EmbeddingsMap.fromSet(
        size = parsedArgs.wordEmbeddingSize,
        elements = corpus.words.getElementsReversedSet())

      val posEmbeddingsMap = EmbeddingsMap.fromSet(
        size = parsedArgs.posEmbeddingSize,
        elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet())

      TokensEncoderWrapperModel(
        model = EnsembleTokensEncoderModel(
          components = listOf(
            // Word embeddings initialized from the words of the corpus.
            EnsembleTokensEncoderModel.ComponentModel(
              model = TokensEncoderWrapperModel(
                model = EmbeddingsEncoderModel.Base(
                  embeddingsMap = embeddingsMap,
                  embeddingKeyExtractor = NormWordKeyExtractor(),
                  frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
                  dropout = parsedArgs.wordDropoutCoefficient),
                converter = FormConverter()),
              trainable = true),
            // POS embeddings (every POS tag is given the same frequency).
            EnsembleTokensEncoderModel.ComponentModel(
              model = TokensEncoderWrapperModel(
                model = EmbeddingsEncoderModel.Base(
                  embeddingsMap = posEmbeddingsMap,
                  embeddingKeyExtractor = PosTagKeyExtractor,
                  frequencyDictionary = corpus.grammaticalConfigurations.getElements()
                    .mapNotNull { it.posToString }
                    .associateWith { 1 },
                  dropout = parsedArgs.posDropoutCoefficient),
                converter = MirrorConverter()),
              trainable = true)
          ),
          outputMergeConfiguration = ConcatMerge()),
        converter = MirrorConverter()
      )
    }

    CommandLineArguments.TokensEncodingType.WORD_EMBEDDINGS -> { // TODO: separate with a dedicated builder

      val embeddingsMap = EmbeddingsMap.fromSet(
        size = parsedArgs.wordEmbeddingSize,
        elements = corpus.words.getElementsReversedSet())

      TokensEncoderWrapperModel(
        model = EmbeddingsEncoderModel.Base(
          embeddingsMap = embeddingsMap,
          embeddingKeyExtractor = NormWordKeyExtractor(),
          frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
          dropout = parsedArgs.wordDropoutCoefficient),
        converter = FormConverter()
      )
    }

    CommandLineArguments.TokensEncodingType.CHARLM -> { // TODO: separate with a dedicated builder

      val dirModelPath: String = requireNotNull(parsedArgs.charLMModelPath) {
        "The path of the characters language model is required by the CharLM encoding"
      }

      val revModelPath: String = requireNotNull(parsedArgs.charLMRevModelPath) {
        "The path of the reverse characters language model is required by the CharLM encoding"
      }

      TokensEncoderWrapperModel(
        model = CharLMEncoderModel(
          // `use` closes each stream as soon as the related model has been loaded.
          dirCharLM = File(dirModelPath).inputStream().use { CharLM.load(it) },
          revCharLM = File(revModelPath).inputStream().use { CharLM.load(it) },
          outputMergeConfiguration = AffineMerge(
            outputSize = 100, // TODO
            activationFunction = Tanh)),
        converter = FormConverter()
      )
    }

    CommandLineArguments.TokensEncodingType.MORPHO_FEATURES -> {

      val analyzer = MorphologicalAnalyzer(
        dictionary = requireNotNull(morphologyDictionary) {
          "The morphology dictionary is required by the morpho features encoding"
        })

      // The lexicon is optional.
      val lexiconDictionary = parsedArgs.lexiconDictionaryPath?.let {
        println("Loading lexicon from '$it'...")
        LexiconDictionary.load(it)
      }

      // The features are collected from the morphological analysis of all the training sentences.
      val featuresDictionary = FeaturesCollector(
        lexicalDictionary = lexiconDictionary,
        sentences = sentences.mapIndexed { i, it -> it.toMorphoSentence(index = i, analyzer = analyzer)}
      ).collect()

      TokensEncoderWrapperModel(
        model = MorphoEncoderModel(
          lexiconDictionary = lexiconDictionary,
          featuresDictionary = featuresDictionary,
          tokenEncodingSize = parsedArgs.wordEmbeddingSize,
          activation = null),
        converter = MorphoConverter()
      )
    }
  }
291 | 
292 | 
/**
 * Convert this [CoNLLSentence] into a [MorphoSentence], attaching the morphological analysis of its tokens.
 *
 * The analysis is computed on a [BaseSentence] built with the base tokens of this sentence, whose position spans from
 * the start of the first token to the end of the last one.
 *
 * @param index the position index of this sentence
 * @param analyzer a morphological analyzer
 *
 * @return a new morpho sentence
 */
private fun CoNLLSentence.toMorphoSentence(index: Int, analyzer: MorphologicalAnalyzer): MorphoSentence<FormToken> {

  val conllSentence = this
  val parsingTokens = conllSentence.tokens.toBaseTokens()

  @Suppress("UNCHECKED_CAST")
  val realSentence = BaseSentence(
    id = index,
    position = Position(
      index = index,
      start = parsingTokens.first().position.start,
      end = parsingTokens.last().position.end),
    tokens = parsingTokens
  ) as RealSentence<RealToken>

  val tokensAnalysis = analyzer.analyze(realSentence)

  return object : MorphoSentence<FormToken> {
    override val tokens: List<FormToken> = conllSentence.tokens
    override val morphoAnalysis: MorphologicalAnalysis? = tokensAnalysis
  }
}
318 | 
/**
 * Create the trainer of a given [LHRParser], together with the validator used during the training.
 *
 * The same sentence preprocessor is shared between the trainer and the validator. The whole validation set is
 * loaded, without any limit and without skipping the non-projective sentences.
 *
 * @param parser an LHR parser
 * @param parsedArgs the parsed command line arguments
 * @param morphologyDictionary a morphology dictionary
 *
 * @return a trainer for the given [parser]
 */
private fun buildTrainer(parser: LHRParser,
                         parsedArgs: CommandLineArguments,
                         morphologyDictionary: MorphologyDictionary?): LHRTrainer {

  val sharedPreprocessor: SentencePreprocessor = buildSentencePreprocessor(morphologyDictionary)

  val validationSentences = loadSentences(
    type = "validation",
    filePath = parsedArgs.validationSetPath,
    maxSentences = null,
    skipNonProjective = false)

  val validator = Validator(
    neuralParser = parser,
    sentences = validationSentences,
    sentencePreprocessor = sharedPreprocessor)

  return LHRTrainer(
    parser = parser,
    epochs = parsedArgs.epochs,
    batchSize = parsedArgs.batchSize,
    validator = validator,
    modelFilename = parsedArgs.modelPath,
    skipPunctuationErrors = parsedArgs.skipPunctuationErrors,
    usePositionalEncodingErrors = false,
    updateMethod = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999),
    sentencePreprocessor = sharedPreprocessor,
    verbose = !parsedArgs.quiet)
}
353 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 | 
  6 |     <modelVersion>4.0.0</modelVersion>
  7 | 
  8 |     <groupId>com.kotlinnlp</groupId>
  9 |     <artifactId>neuralparser</artifactId>
 10 |     <version>0.6.5</version>
 11 |     <packaging>jar</packaging>
 12 | 
 13 |     <name>${project.groupId}:${project.artifactId}</name>
 14 |     <description>
 15 |         NeuralParser is a very simple to use dependency parser, based on the SimpleDNN library and
 16 |         the SyntaxDecoder transition systems framework.
 17 |     </description>
 18 |     <url>http://github.com/kotlinnlp/neuralparser</url>
 19 | 
 20 |     <scm>
 21 |         <connection>scm:git:git://github.com/kotlinnlp/neuralparser.git</connection>
 22 |         <developerConnection>scm:git:ssh://github.com:kotlinnlp/neuralparser.git</developerConnection>
 23 |         <url>http://github.com/kotlinnlp/neuralparser/tree/master</url>
 24 |     </scm>
 25 | 
 26 |     <developers>
 27 |         <developer>
 28 |             <name>KotlinNLP Authors</name>
 29 |             <email>github@kotlinnlp.com</email>
 30 |             <organization>KotlinNLP</organization>
 31 |             <organizationUrl>http://www.kotlinnlp.com</organizationUrl>
 32 |         </developer>
 33 |     </developers>
 34 | 
 35 |     <licenses>
 36 |         <license>
 37 |             <name>Apache License, Version 2.0</name>
 38 |             <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
 39 |             <distribution>repo</distribution>
 40 |         </license>
 41 |     </licenses>
 42 | 
 43 |     <distributionManagement>
 44 |         <snapshotRepository>
 45 |             <id>ossrh</id>
 46 |             <url>https://oss.sonatype.org/content/repositories/snapshots</url>
 47 |         </snapshotRepository>
 48 |         <repository>
 49 |             <id>ossrh</id>
 50 |             <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
 51 |         </repository>
 52 |     </distributionManagement>
 53 | 
 54 |     <properties>
 55 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 56 |         <nexus-staging-plugin.version>1.6.7</nexus-staging-plugin.version> <!-- nexus-staging-maven-plugin -->
 57 |         <maven-source-plugin.version>3.0.0</maven-source-plugin.version> <!-- maven-source-plugin -->
 58 |         <maven-gpg-plugin.version>1.6</maven-gpg-plugin.version> <!-- maven-gpg-plugin -->
 59 |         <oss.server.id>oss_kotlinnlp</oss.server.id> <!-- serverId of the nexus-staging-maven-plugin -->
 60 |         <gpg.keyname>C73F18F0</gpg.keyname> <!-- keyname and passphraseServerId of the sign-artifacts execution -->
 61 |         <kotlin.version>1.3.31</kotlin.version> <!-- kotlin-maven-plugin, kotlin-stdlib-jdk8, kotlin-reflect, kotlin-test -->
 62 |         <klaxon.version>5.2</klaxon.version> <!-- com.beust:klaxon -->
 63 |         <dokka.version>0.9.16</dokka.version> <!-- dokka-maven-plugin -->
 64 |         <dependencytree.version>0.5.2</dependencytree.version> <!-- com.kotlinnlp:dependencytree -->
 65 |         <lssencoder.version>0.2.3</lssencoder.version> <!-- com.kotlinnlp:lssencoder -->
 66 |     </properties>
 67 | 
 68 |     <repositories>
 69 |         <repository>
 70 |             <id>jcenter</id>
 71 |             <url>https://jcenter.bintray.com/</url>
 72 |         </repository>
 73 |     </repositories>
 74 | 
 75 |     <pluginRepositories>
 76 |         <pluginRepository>
 77 |             <id>jcenter</id>
 78 |             <name>JCenter</name>
 79 |             <url>https://jcenter.bintray.com/</url>
 80 |         </pluginRepository>
 81 |     </pluginRepositories>
 82 | 
 83 |     <build>
 84 |         <sourceDirectory>src/main/kotlin</sourceDirectory>
 85 | 
 86 |         <plugins>
 87 |             <plugin>
 88 |                 <groupId>org.jetbrains.kotlin</groupId>
 89 |                 <artifactId>kotlin-maven-plugin</artifactId>
 90 |                 <version>${kotlin.version}</version>
 91 |                 <configuration>
 92 |                     <jvmTarget>1.8</jvmTarget>
 93 |                 </configuration>
 94 |                 <executions>
 95 |                     <execution>
 96 |                         <id>compile</id>
 97 |                         <phase>compile</phase>
 98 |                         <goals>
 99 |                             <goal>compile</goal>
100 |                         </goals>
101 |                     </execution>
102 |                     <execution>
103 |                         <id>test-compile</id>
104 |                         <phase>test-compile</phase>
105 |                         <goals>
106 |                             <goal>test-compile</goal>
107 |                         </goals>
108 |                     </execution>
109 |                 </executions>
110 |             </plugin>
111 | 
112 |             <plugin>
113 |                 <groupId>org.jetbrains.dokka</groupId>
114 |                 <artifactId>dokka-maven-plugin</artifactId>
115 |                 <version>${dokka.version}</version>
116 |                 <configuration>
117 |                     <skip>true</skip>
118 |                 </configuration>
119 |                 <executions>
120 |                     <execution>
121 |                         <phase>prepare-package</phase>
122 |                         <goals>
123 |                             <goal>dokka</goal>
124 |                             <goal>javadoc</goal>
125 |                             <goal>javadocJar</goal>
126 |                         </goals>
127 |                         <configuration>
128 |                             <includes>
129 |                                 <file>packages.md</file>
130 |                             </includes>
131 |                         </configuration>
132 |                     </execution>
133 |                 </executions>
134 |             </plugin>
135 | 
136 |             <plugin>
137 |                 <groupId>org.sonatype.plugins</groupId>
138 |                 <artifactId>nexus-staging-maven-plugin</artifactId>
139 |                 <version>${nexus-staging-plugin.version}</version>
140 |                 <extensions>true</extensions>
141 |                 <configuration>
142 |                     <serverId>${oss.server.id}</serverId>
143 |                     <nexusUrl>https://oss.sonatype.org/</nexusUrl>
144 |                     <autoReleaseAfterClose>true</autoReleaseAfterClose>
145 |                 </configuration>
146 |             </plugin>
147 | 
148 |             <plugin>
149 |                 <groupId>org.apache.maven.plugins</groupId>
150 |                 <artifactId>maven-source-plugin</artifactId>
151 |                 <version>${maven-source-plugin.version}</version>
152 |                 <executions>
153 |                     <execution>
154 |                         <id>attach-sources</id>
155 |                         <goals>
156 |                             <goal>jar-no-fork</goal>
157 |                         </goals>
158 |                     </execution>
159 |                 </executions>
160 |             </plugin>
161 | 
162 |             <plugin>
163 |                 <groupId>org.apache.maven.plugins</groupId>
164 |                 <artifactId>maven-gpg-plugin</artifactId>
165 |                 <version>${maven-gpg-plugin.version}</version>
166 |                 <configuration>
167 |                     <skip>true</skip>
168 |                 </configuration>
169 |                 <executions>
170 |                     <execution>
171 |                         <id>sign-artifacts</id>
172 |                         <phase>verify</phase>
173 |                         <goals>
174 |                             <goal>sign</goal>
175 |                         </goals>
176 |                         <configuration>
177 |                             <keyname>${gpg.keyname}</keyname>
178 |                             <passphraseServerId>${gpg.keyname}</passphraseServerId>
179 |                         </configuration>
180 |                     </execution>
181 |                 </executions>
182 |             </plugin>
183 |         </plugins>
184 |     </build>
185 | 
186 |     <dependencies>
187 |         <dependency>
188 |             <groupId>org.jetbrains.kotlin</groupId>
189 |             <artifactId>kotlin-stdlib-jdk8</artifactId>
190 |             <version>${kotlin.version}</version>
191 |         </dependency>
192 | 
193 |         <dependency>
194 |             <groupId>org.jetbrains.kotlin</groupId>
195 |             <artifactId>kotlin-reflect</artifactId>
196 |             <version>${kotlin.version}</version>
197 |         </dependency>
198 | 
199 |         <dependency>
200 |             <groupId>org.jetbrains.kotlin</groupId>
201 |             <artifactId>kotlin-test</artifactId>
202 |             <version>${kotlin.version}</version>
203 |             <scope>test</scope>
204 |         </dependency>
205 | 
206 |         <dependency>
207 |             <groupId>com.beust</groupId>
208 |             <artifactId>klaxon</artifactId>
209 |             <version>${klaxon.version}</version>
210 |         </dependency>
211 | 
212 |         <dependency>
213 |             <groupId>com.kotlinnlp</groupId>
214 |             <artifactId>lssencoder</artifactId>
215 |             <version>${lssencoder.version}</version>
216 |         </dependency>
217 | 
218 |         <dependency>
219 |             <groupId>com.kotlinnlp</groupId>
220 |             <artifactId>dependencytree</artifactId>
221 |             <version>${dependencytree.version}</version>
222 |         </dependency>
223 |     </dependencies>
224 | </project>
225 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParser.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | 
14 | /**
15 |  * A Neural Parser, which exposes its [model] and parses a [ParsingSentence] into a [MorphoSynSentence].
16 |  */
17 | interface NeuralParser<out ModelType: NeuralParserModel> {
18 | 
19 |   /**
20 |    * The model of this neural parser (a subtype of [NeuralParserModel]).
21 |    */
22 |   val model: ModelType
23 | 
24 |   /**
25 |    * Whether this parser executes the morpho-syntactic labelling.
26 |    */
27 |   val labellingEnabled: Boolean
28 | 
29 |   /**
30 |    * Parse a sentence.
31 |    *
32 |    * @param sentence the [ParsingSentence] to parse
33 |    *
34 |    * @return the [MorphoSynSentence] obtained by parsing the given [sentence]
35 |    */
36 |   fun parse(sentence: ParsingSentence): MorphoSynSentence
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParserModel.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser
 9 | 
10 | import com.kotlinnlp.linguisticdescription.language.Language
11 | import com.kotlinnlp.utils.Serializer
12 | import java.io.InputStream
13 | import java.io.OutputStream
14 | import java.io.Serializable
15 | 
16 | /**
17 |  * The serializable model of a [NeuralParser], loaded and dumped through the [Serializer].
18 |  *
19 |  * @property language the language in which the parser works
20 |  */
21 | abstract class NeuralParserModel(val language: Language) : Serializable {
22 | 
23 |   companion object {
24 | 
25 |     /**
26 |      * Private val used to serialize the class (needed by Serializable).
27 |      */
28 |     @Suppress("unused")
29 |     private const val serialVersionUID: Long = 1L
30 | 
31 |     /**
32 |      * Read a serialized [NeuralParserModel] from an input stream and decode it with [Serializer.deserialize].
33 |      *
34 |      * @param inputStream the [InputStream] from which to read the serialized [NeuralParserModel]
35 |      *
36 |      * @return the [NeuralParserModel] read from [inputStream] and decoded
37 |      */
38 |     fun load(inputStream: InputStream): NeuralParserModel = Serializer.deserialize(inputStream)
39 |   }
40 | 
41 |   /**
42 |    * Serialize this [NeuralParserModel] with [Serializer.serialize], writing it to an output stream.
43 |    *
44 |    * @param outputStream the [OutputStream] in which to write this serialized [NeuralParserModel]
45 |    */
46 |   fun dump(outputStream: OutputStream) = Serializer.serialize(this, outputStream)
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/Trainer.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.helpers
  9 | 
 10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
 11 | import com.kotlinnlp.dependencytree.DependencyTree
 12 | import com.kotlinnlp.neuralparser.NeuralParser
 13 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
 14 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
 15 | import com.kotlinnlp.neuralparser.helpers.statistics.Statistics
 16 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
 17 | import com.kotlinnlp.neuralparser.language.BaseSentence
 18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 19 | import com.kotlinnlp.utils.ShuffledIndices
 20 | import com.kotlinnlp.utils.Shuffler
 21 | import com.kotlinnlp.utils.Timer
 22 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
 23 | import java.io.File
 24 | import java.io.FileOutputStream
 25 | 
 26 | /**
 27 |  * The training helper of the [NeuralParser].
 28 |  *
 29 |  * @param neuralParser a neural parser
 30 |  * @param batchSize the number of sentences that compose a batch
 31 |  * @param epochs the number of training epochs
 32 |  * @param validator the validation helper (if it is null no validation is done and the model is never saved)
 33 |  * @param modelFilename the name of the file in which to save the best trained model
 34 |  * @param minRelevantErrorsCountToUpdate the min count of relevant errors needed to update the neural parser (default 1)
 35 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 36 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 37 |  */
 38 | abstract class Trainer(
 39 |   private val neuralParser: NeuralParser<*>,
 40 |   private val batchSize: Int,
 41 |   private val epochs: Int,
 42 |   private val validator: Validator?,
 43 |   private val modelFilename: String,
 44 |   private val minRelevantErrorsCountToUpdate: Int = 1,
 45 |   private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
 46 |   private val verbose: Boolean = true
 47 | ) {
 48 | 
 49 |   /**
 50 |    * A timer to track the elapsed time (it is reset when the training and the validation start, if verbose).
 51 |    */
 52 |   private val timer = Timer()
 53 | 
 54 |   /**
 55 |    * The best accuracy reached during the training.
 56 |    */
 57 |   private var bestAccuracy: Double = -1.0 // -1 used as init value (all accuracy values are in the range [0.0, 1.0])
 58 | 
 59 |   /**
 60 |    * Check requirements.
 61 |    */
 62 |   init {
 63 |     require(this.epochs > 0) { "The number of epochs must be > 0" }
 64 |     require(this.batchSize > 0) { "The size of the batch must be > 0" }
 65 |     require(this.minRelevantErrorsCountToUpdate > 0) { "minRelevantErrorsCountToUpdate must be > 0" }
 66 |   }
 67 | 
 68 |   /**
 69 |    * Train the [neuralParser] with the given sentences.
 70 |    *
 71 |    * @param trainingSentences the sentences used to train the parser
 72 |    * @param shuffler a shuffler to shuffle the sentences at each epoch (can be null)
 73 |    */
 74 |   fun train(trainingSentences: List<CoNLLSentence>,
 75 |             shuffler: Shuffler? = Shuffler(enablePseudoRandom = true, seed = 743)) {
 76 | 
 77 |     (0 until this.epochs).forEach { i ->
 78 | 
 79 |       this.logTrainingStart(epochIndex = i)
 80 | 
 81 |       this.newEpoch()
 82 |       this.trainEpoch(trainingSentences = trainingSentences, shuffler = shuffler)
 83 | 
 84 |       this.logTrainingEnd()
 85 | 
 86 |       this.validator?.apply {
 87 |         logValidationStart()
 88 |         validateAndSaveModel()
 89 |         logValidationEnd()
 90 |       }
 91 |     }
 92 |   }
 93 | 
 94 |   /**
 95 |    * Train the parser for an epoch.
 96 |    *
 97 |    * @param trainingSentences the training sentences
 98 |    * @param shuffler a shuffler to shuffle the sentences at each epoch (can be null)
 99 |    */
100 |   private fun trainEpoch(trainingSentences: List<CoNLLSentence>,
101 |                          shuffler: Shuffler?) {
102 | 
103 |     val progress = ProgressIndicatorBar(trainingSentences.size)
104 | 
105 |     this.newBatch()
106 | 
107 |     ShuffledIndices(trainingSentences.size, shuffler = shuffler).forEachIndexed { i, sentenceIndex ->
108 | 
109 |       val endOfBatch: Boolean = (i + 1) % this.batchSize == 0 || i == trainingSentences.lastIndex
110 | 
111 |       progress.tick()
112 | 
113 |       val sentence: CoNLLSentence = trainingSentences[sentenceIndex]
114 | 
115 |       require(sentence.hasAnnotatedHeads()) {
116 |         "The gold dependency tree of a sentence cannot be null during the training."
117 |       }
118 | 
119 |       this.trainSentence(
120 |         sentence = this.sentencePreprocessor.convert(BaseSentence.fromCoNLL(sentence, index = sentenceIndex)),
121 |         goldTree = DependencyTree.Labeled(sentence))
122 |       // Update only at the end of a batch, and only if enough relevant errors have been counted.
123 |       if (endOfBatch && this.getRelevantErrorsCount() >= this.minRelevantErrorsCountToUpdate) {
124 |         this.update()
125 |         this.newBatch()
126 |       }
127 |     }
128 |   }
129 | 
130 |   /**
131 |    * Evaluate the [neuralParser] with the [validator] (required to be not null), print the statistics and save
132 |    * the model when their `noPunctuation.uas.perc` value exceeds the [bestAccuracy] reached so far.
133 |    */
134 |   private fun validateAndSaveModel() {
135 | 
136 |     val stats: Statistics = this.validator!!.evaluate()
137 | 
138 |     println("\n$stats")
139 | 
140 |     if (stats.noPunctuation.uas.perc > this.bestAccuracy) {
141 | 
142 |       this.saveModel()
143 | 
144 |       this.bestAccuracy = stats.noPunctuation.uas.perc
145 |     }
146 |   }
147 | 
148 |   /**
149 |    * Save the model to [modelFilename].
150 |    */
151 |   private fun saveModel() {
152 |     // `use` closes the file stream when the dump ends, even if it fails.
153 |     FileOutputStream(File(this.modelFilename)).use { this.neuralParser.model.dump(it) }
154 | 
155 |     println("\nNEW BEST ACCURACY! Model saved to \"${this.modelFilename}\"")
156 |   }
157 | 
158 |   /**
159 |    * Log when training starts.
160 |    *
161 |    * @param epochIndex the current epoch index
162 |    */
163 |   private fun logTrainingStart(epochIndex: Int) {
164 | 
165 |     if (this.verbose) {
166 | 
167 |       this.timer.reset()
168 | 
169 |       println("\nEpoch ${epochIndex + 1} of ${this.epochs}")
170 |       println("\nStart training...")
171 |     }
172 |   }
173 | 
174 |   /**
175 |    * Log when training ends.
176 |    */
177 |   private fun logTrainingEnd() {
178 | 
179 |     if (this.verbose) {
180 |       println("Elapsed time: %s".format(this.timer.formatElapsedTime()))
181 |     }
182 |   }
183 | 
184 |   /**
185 |    * Log when validation starts.
186 |    */
187 |   private fun logValidationStart() {
188 | 
189 |     if (this.verbose) {
190 |       this.timer.reset()
191 |       println() // new line
192 |     }
193 |   }
194 | 
195 |   /**
196 |    * Log when validation ends.
197 |    */
198 |   private fun logValidationEnd() {
199 | 
200 |     if (this.verbose) {
201 |       println("Elapsed time: %s".format(this.timer.formatElapsedTime()))
202 |     }
203 |   }
204 | 
205 |   /**
206 |    * Hook called when a new batch starts (it does nothing by default).
207 |    */
208 |   protected open fun newBatch() = Unit
209 | 
210 |   /**
211 |    * Hook called when a new epoch starts (it does nothing by default).
212 |    */
213 |   protected open fun newEpoch() = Unit
214 | 
215 |   /**
216 |    * Update the [neuralParser].
217 |    */
218 |   protected abstract fun update()
219 | 
220 |   /**
221 |    * Train the parser with the given [sentence] and [goldTree].
222 |    *
223 |    * @param sentence a sentence
224 |    * @param goldTree the gold dependency tree
225 |    */
226 |   protected abstract fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled)
227 | 
228 |   /**
229 |    * @return the count of the relevant errors
230 |    */
231 |   protected abstract fun getRelevantErrorsCount(): Int
232 | }
233 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/LabelerSelector.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
 9 | 
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
14 | import java.io.Serializable
15 | 
16 | /**
17 |  * The selector of the valid grammatical configurations of the labeler and of the morphologies compatible with them.
18 |  */
19 | interface LabelerSelector : Serializable {
20 | 
21 |   /**
22 |    * Get the list of scored grammatical configurations that are valid for a given attachment.
23 |    *
24 |    * @param configurations the list of grammatical configurations, sorted by descending score
25 |    * @param sentence the input sentence
26 |    * @param tokenIndex the index of the token to which the grammatical configuration must be assigned
27 |    * @param headIndex the index of the head of the token (can be null)
28 |    *
29 |    * @return the valid grammatical configurations for the given attachment
30 |    */
31 |   fun getValidConfigurations(configurations: List<ScoredGrammar>,
32 |                              sentence: ParsingSentence,
33 |                              tokenIndex: Int,
34 |                              headIndex: Int?): List<ScoredGrammar>
35 | 
36 |   /**
37 |    * Get the morphologies of a given token that are compatible with the given grammatical configuration.
38 |    *
39 |    * @param sentence the input sentence
40 |    * @param tokenIndex the index of a token of the sentence
41 |    * @param configuration the grammatical configuration of the token
42 |    *
43 |    * @return the morphologies of the token compatible with the given [configuration]
44 |    */
45 |   fun getValidMorphologies(sentence: ParsingSentence,
46 |                            tokenIndex: Int,
47 |                            configuration: GrammaticalConfiguration): Morphologies
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/MorphoSelector.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
  9 | 
 10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
 11 | import com.kotlinnlp.linguisticdescription.POSTag
 12 | import com.kotlinnlp.linguisticdescription.morphology.*
 13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
 14 | import com.kotlinnlp.linguisticdescription.syntax.dependencies.Unknown
 15 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 16 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
 17 | import com.kotlinnlp.utils.notEmptyOr
 18 | 
 19 | /**
 20 |  * The [LabelerSelector] to use when the labeler predictions are combinations of POS and syntactic types in the
 21 |  * Base format.
 22 |  */
 23 | object MorphoSelector : LabelerSelector {
 24 | 
 25 |   /**
 26 |    * Private val used to serialize the class (needed by Serializable).
 27 |    */
 28 |   @Suppress("unused")
 29 |   private const val serialVersionUID: Long = 1L
 30 | 
 31 |   /**
 32 |    * Select, among the scored grammatical configurations, the ones that are valid for a given attachment.
 33 |    *
 34 |    * @param configurations the list of grammatical configurations, sorted by descending score
 35 |    * @param sentence the input sentence
 36 |    * @param tokenIndex the index of the token to which the deprel must be assigned
 37 |    * @param headIndex the index of the token head (can be null)
 38 |    *
 39 |    * @return the valid grammatical configurations for the given attachment
 40 |    */
 41 |   override fun getValidConfigurations(configurations: List<ScoredGrammar>,
 42 |                                       sentence: ParsingSentence,
 43 |                                       tokenIndex: Int,
 44 |                                       headIndex: Int?): List<ScoredGrammar> {
 45 | 
 46 |     val tokenMorphologies: Morphologies = sentence.morphoAnalysis!!.allMorphologies[tokenIndex]
 47 |     val direction = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex)
 48 |     val directed: List<ScoredGrammar> = configurations.filter { it.config.direction == direction }
 49 |     val fallbackScore: Double = configurations.last().score // the score given to the fallback configurations
 50 | 
 51 |     return when {
 52 |       // The token has morphologies: keep the configurations compatible with the sentence.
 53 |       tokenMorphologies.isNotEmpty() -> directed
 54 |         .filter { candidate -> sentence.areConfigurationCompatible(c = candidate.config, tokenIndex = tokenIndex) }
 55 |         .notEmptyOr {
 56 |           val unknownConfig = tokenMorphologies.first().buildUnknownConfig(direction)
 57 |           listOf(ScoredGrammar(config = unknownConfig, score = fallbackScore))
 58 |         }
 59 | 
 60 |       // The token has no morphologies: keep the single content word configurations.
 61 |       else -> directed.filter { candidate -> candidate.config.isSingleContentWord() }.notEmptyOr {
 62 |         val unknownNoun = GrammaticalConfiguration.Component(
 63 |           syntacticDependency = Unknown(direction), pos = POSTag.Base(POS.Noun))
 64 |         listOf(ScoredGrammar(config = GrammaticalConfiguration(unknownNoun), score = fallbackScore))
 65 |       }
 66 |     }
 67 |   }
 68 | 
 69 |   /**
 70 |    * Select the morphologies of a given token that are compatible with the given grammatical configuration.
 71 |    *
 72 |    * @param sentence the input sentence
 73 |    * @param tokenIndex the index of a token of the sentence
 74 |    * @param configuration the grammatical configuration of the token
 75 |    *
 76 |    * @return the morphologies compatible with the given configuration
 77 |    */
 78 |   override fun getValidMorphologies(sentence: ParsingSentence,
 79 |                                     tokenIndex: Int,
 80 |                                     configuration: GrammaticalConfiguration): Morphologies {
 81 | 
 82 |     val compatible: Morphologies =
 83 |       sentence.getCompatibleMorphologies(c = configuration, tokenIndex = tokenIndex)
 84 | 
 85 |     // The compatible morphologies found in the sentence are returned as they are.
 86 |     if (compatible.isNotEmpty()) return compatible
 87 | 
 88 |     // Without compatible morphologies, only a configuration of type Single yields a result.
 89 |     if (configuration.type != GrammaticalConfiguration.Type.Single) return Morphologies()
 90 | 
 91 |     // The safe cast gives null (so the check fails) also when the POS is not a POSTag.Base.
 92 |     val basePos: POSTag.Base = checkNotNull(configuration.components.single().pos as? POSTag.Base) {
 93 |       "The POS cannot be null."
 94 |     }
 95 | 
 96 |     require(basePos.type.isContentWord) {
 97 |       "The grammatical configuration of tokens without morphological analysis must define a content word."
 98 |     }
 99 | 
100 |     // A single morphology is built, using the form of the token as lemma.
101 |     val defaultMorphology = SingleMorphology(
102 |       lemma = sentence.tokens[tokenIndex].form,
103 |       pos = basePos.type,
104 |       allowDefaultValues = true)
105 | 
106 |     return Morphologies(Morphology(defaultMorphology))
107 |   }
108 | }
109 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/NoFilterSelector.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
 9 | 
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
12 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
15 | 
16 | /**
17 |  * The [LabelerSelector] that filters by attachment direction and by components count only.
18 |  */
19 | object NoFilterSelector : LabelerSelector {
20 | 
21 |   /**
22 |    * Private val used to serialize the class (needed by Serializable).
23 |    */
24 |   @Suppress("unused")
25 |   private const val serialVersionUID: Long = 1L
26 | 
27 |   /**
28 |    * Select the scored grammatical configurations whose direction is the one of a given attachment.
29 |    *
30 |    * @param configurations the list of grammatical configurations, sorted by descending score
31 |    * @param sentence the input sentence
32 |    * @param tokenIndex the index of the token to which the deprel must be assigned
33 |    * @param headIndex the index of the token head (can be null)
34 |    *
35 |    * @return the grammatical configurations that have the direction of the given attachment
36 |    */
37 |   override fun getValidConfigurations(configurations: List<ScoredGrammar>,
38 |                                       sentence: ParsingSentence,
39 |                                       tokenIndex: Int,
40 |                                       headIndex: Int?): List<ScoredGrammar> {
41 | 
42 |     val attachmentDirection = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex)
43 | 
44 |     return configurations.filter { scored -> scored.config.direction == attachmentDirection }
45 |   }
46 | 
47 |   /**
48 |    * Select the morphologies of a token that have as many components as the given grammatical configuration.
49 |    *
50 |    * @param sentence the input sentence
51 |    * @param tokenIndex the index of a token of the sentence
52 |    * @param configuration the grammatical configuration of the token
53 |    *
54 |    * @return the selected morphologies (empty if the sentence has no morphological analysis)
55 |    */
56 |   override fun getValidMorphologies(sentence: ParsingSentence,
57 |                                     tokenIndex: Int,
58 |                                     configuration: GrammaticalConfiguration): Morphologies {
59 |     val candidates = sentence.morphoAnalysis?.allMorphologies?.get(tokenIndex)
60 | 
61 |     return Morphologies(candidates?.filter { it.components.size == configuration.components.size } ?: emptyList())
62 |   }
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/BasePreprocessor.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
 9 | 
10 | import com.kotlinnlp.neuralparser.language.BaseSentence
11 | import com.kotlinnlp.neuralparser.language.ParsingSentence
12 | import com.kotlinnlp.neuralparser.language.ParsingToken
13 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector
14 | 
/**
 * The basic sentence pre-processor: it copies the tokens of the input sentence without adding further information.
 */
class BasePreprocessor : SentencePreprocessor {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a [BaseSentence] to a [ParsingSentence].
   *
   * Each token keeps its id, form and position. The resulting sentence uses the [NoFilterSelector] as labeler
   * selector.
   *
   * @param sentence a base sentence
   *
   * @return a sentence ready to be parsed
   */
  override fun convert(sentence: BaseSentence): ParsingSentence {

    val parsingTokens: List<ParsingToken> = sentence.tokens.map { token ->
      ParsingToken(id = token.id, form = token.form, position = token.position)
    }

    return ParsingSentence(
      tokens = parsingTokens,
      labelerSelector = NoFilterSelector,
      position = sentence.position
    )
  }
}
42 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/CoNLLPreprocessor.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
 9 | 
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.conllio.Token as CoNLLToken
12 | import com.kotlinnlp.neuralparser.language.BaseSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.language.ParsingToken
15 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector
16 | 
/**
 * Pre-process a sentence that has been built from a [CoNLLSentence], adding to its tokens the POS tags read from the
 * CoNLL annotation.
 *
 * @param conllSentences the list of CoNLL sentences from which the input base sentences are built
 */
class CoNLLPreprocessor(private val conllSentences: List<CoNLLSentence>) : SentencePreprocessor {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a [BaseSentence] to a [ParsingSentence].
   *
   * The CoNLL sentence that corresponds to the input one is looked up using the index of the sentence position.
   * The tokens of the two sentences are aligned by their index in the tokens list.
   *
   * @param sentence a base sentence
   *
   * @return a sentence ready to be parsed
   */
  override fun convert(sentence: BaseSentence): ParsingSentence {

    val sourceTokens: List<CoNLLToken> = this.conllSentences[sentence.position.index].tokens

    // The i-th CoNLL token provides the POS list of the i-th base token.
    val parsingTokens: List<ParsingToken> = sentence.tokens.mapIndexed { index, token ->
      ParsingToken(id = token.id, form = token.form, position = token.position, pos = sourceTokens[index].posList)
    }

    return ParsingSentence(
      tokens = parsingTokens,
      labelerSelector = NoFilterSelector,
      position = sentence.position
    )
  }
}
58 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/MorphoPreprocessor.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
 9 | 
10 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
11 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
13 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer
14 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
15 | import com.kotlinnlp.neuralparser.language.BaseSentence
16 | import com.kotlinnlp.neuralparser.language.ParsingSentence
17 | import com.kotlinnlp.neuralparser.language.ParsingToken
18 | import com.kotlinnlp.neuralparser.helpers.labelerselector.MorphoSelector
19 | 
/**
 * Pre-process a sentence with a morphological analyzer, before the parsing starts.
 *
 * @param dictionary a morphologies dictionary
 */
class MorphoPreprocessor(private val dictionary: MorphologyDictionary) : SentencePreprocessor {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * The morphological analyzer, kept as transient property and created on first use by [getOrInitAnalyzer].
   */
  @kotlin.jvm.Transient private var morphologicalAnalyzer: MorphologicalAnalyzer? = null

  /**
   * Convert a [BaseSentence] to a [ParsingSentence].
   *
   * The sentence is analyzed with the morphological analyzer and the resulting analysis is attached to the parsing
   * sentence, which uses the [MorphoSelector] as labeler selector.
   *
   * @param sentence a base sentence
   *
   * @return a sentence ready to be parsed
   */
  override fun convert(sentence: BaseSentence): ParsingSentence {

    @Suppress("UNCHECKED_CAST")
    val morphoAnalysis: MorphologicalAnalysis = this.getOrInitAnalyzer().analyze(sentence as RealSentence<RealToken>)

    return ParsingSentence(
      tokens = sentence.tokens.map {
        ParsingToken(
          id = it.id,
          form = it.form,
          position = it.position
        )
      },
      morphoAnalysis = morphoAnalysis,
      labelerSelector = MorphoSelector,
      position = sentence.position
    )
  }

  /**
   * Get the [MorphologicalAnalyzer] of this preprocessor, initializing it if needed (e.g. when this class has just
   * been deserialized and the transient property is null).
   *
   * @return the morphological analyzer of this preprocessor
   */
  private fun getOrInitAnalyzer(): MorphologicalAnalyzer =
    this.morphologicalAnalyzer
      ?: MorphologicalAnalyzer(this.dictionary).also { this.morphologicalAnalyzer = it } // cache for the next calls
}
81 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/SentencePreprocessor.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
 9 | 
10 | import com.kotlinnlp.neuralparser.language.ParsingSentence
11 | import com.kotlinnlp.neuralparser.language.BaseSentence
12 | import com.kotlinnlp.neuralparser.language.BaseToken
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 | 
/**
 * Pre-process a sentence before the parsing starts.
 *
 * It is a [SentenceConverter] specialized on the sentence types of the parser: its implementations convert a
 * [BaseSentence] (composed by [BaseToken]s) into a [ParsingSentence] (composed by [ParsingToken]s).
 */
interface SentencePreprocessor : SentenceConverter<BaseToken, BaseSentence, ParsingToken, ParsingSentence>
20 | 
21 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/CompositeTokenHelper.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
 9 | 
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.linguisticdescription.morphology.POS
13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticType
14 | 
/**
 * A helper that finds the governors of the components of composite tokens (e.g. PREP + ART or VERB + PRON), looking
 * at the heads and the grammatical configurations of a labeled dependency tree.
 *
 * @param dependencyTree the dependency tree
 */
internal class CompositeTokenHelper(private val dependencyTree: DependencyTree.Labeled) {

  /**
   * Get the ID of the governor of a component of a composite token.
   *
   * The first component always takes the governor of the token. For the following components:
   *  - in a PREP + ART that is not the continuation of a multi-word, the governor of the token is used;
   *  - in a PREP + ART that continues a multi-word, the governor of the multi-word is used;
   *  - in a VERB + PRON the [prevComponentId] is used (it must not be null);
   *  - in all the other cases there is no governor (null).
   *
   * @param tokenId the ID of a parsing token
   * @param componentIndex the index of a component of the token
   * @param prevComponentId the ID assigned to the precedent component (null at the first component)
   *
   * @return the ID of the governor of the given component
   */
  fun getComponentGovernorId(tokenId: Int,
                             componentIndex: Int,
                             prevComponentId: Int?): Int? {

    val tokenGovernorId: Int? = this.dependencyTree.getHead(tokenId)
    val tokenConfig: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)

    val continuesMultiWord: Boolean = tokenConfig.isContin()
    val prepWithArticle: Boolean = tokenConfig.isPrepArt()
    val verbWithEnclitics: Boolean = tokenConfig.isVerbEnclitic()

    if (componentIndex == 0) return tokenGovernorId

    if (prepWithArticle)
      return if (continuesMultiWord) this.dependencyTree.getMultiWordGovernorId(tokenId) else tokenGovernorId

    return if (verbWithEnclitics) prevComponentId!! else null
  }

  /**
   * Get the governor ID of a multi-word, given one of its tokens and going back through its ancestors in the dependency
   * tree.
   * Note: the governor of a multi-word is the governor of its first token.
   *
   * The ancestors are followed until a token that is not a continuation is found: it is considered the start of the
   * multi-word. Each token visited on the way is required to have a head.
   *
   * @param tokenId the id of a token that is part of a multi-word
   *
   * @return the governor id of the multi-word of which the given token is part of
   */
  private fun DependencyTree.Labeled.getMultiWordGovernorId(tokenId: Int): Int? {

    var currentId: Int = this.getHead(tokenId)!!

    while (this.getConfiguration(currentId).isContin()) {
      currentId = this.getHead(currentId)!!
    }

    return this.getHead(currentId)
  }

  /**
   * @return true if this configuration defines the continuation of a multi-word, otherwise false
   */
  private fun GrammaticalConfiguration.isContin(): Boolean = this.components.any { component ->
    component.syntacticDependency.isSubTypeOf(SyntacticType.Contin)
  }

  /**
   * @return true if this configuration defines a composite PREP + ART, otherwise false
   */
  private fun GrammaticalConfiguration.isPrepArt(): Boolean {

    if (this.components.size != 2) return false

    val (first, second) = this.components

    return first.pos?.isSubTypeOf(POS.Prep) == true && second.pos?.isSubTypeOf(POS.Art) == true
  }

  /**
   * @return true if this configuration defines a composite VERB + PRON, otherwise false
   */
  private fun GrammaticalConfiguration.isVerbEnclitic(): Boolean {

    if (this.components.size < 2) return false

    val head = this.components.first()
    val enclitics = this.components.drop(1)

    return head.pos?.isSubTypeOf(POS.Verb) == true && enclitics.all { it.pos?.isSubTypeOf(POS.Pron) == true }
  }
}
91 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/LabeledMorphoSynBuilder.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
  9 | 
 10 | import com.kotlinnlp.dependencytree.DependencyTree
 11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
 12 | import com.kotlinnlp.linguisticdescription.morphology.ScoredMorphology
 13 | import com.kotlinnlp.linguisticdescription.morphology.ScoredSingleMorphology
 14 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
 15 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
 16 | import com.kotlinnlp.linguisticdescription.sentence.token.Word
 17 | import com.kotlinnlp.linguisticdescription.sentence.token.WordTrace
 18 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.SyntacticRelation
 19 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 20 | import com.kotlinnlp.neuralparser.language.ParsingToken
 21 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector
 22 | 
 23 | /**
 24 |  * A helper class that builds a [MorphoSynSentence] from a [ParsingSentence] and a [DependencyTree].
 25 |  *
 26 |  * @param parsingSentence a parsing sentence
 27 |  * @param dependencyTree the tree that represents the dependencies and the grammatical configuration of the sentence
 28 |  */
 29 | internal class LabeledMorphoSynBuilder(
 30 |   private val parsingSentence: ParsingSentence,
 31 |   private val dependencyTree: DependencyTree.Labeled
 32 | ) {
 33 | 
 34 |   /**
 35 |    * The next id that can be assigned to a new token of the sentence, used in case a new single component has to be
 36 |    * created.
 37 |    */
 38 |   private var nextAvailableId: Int = this.parsingSentence.tokens.asSequence().map { it.id }.max()!! + 1
 39 | 
 40 |   /**
 41 |    * Build the morpho-syntactic sentence using a [LabelerSelector] to select the valid morphologies.
 42 |    *
 43 |    * @return a new morpho-syntactic sentence built from the given [parsingSentence] and [dependencyTree]
 44 |    */
 45 |   fun buildSentence(): MorphoSynSentence = MorphoSynSentence(
 46 |     id = 0,
 47 |     confidence = 0.0,
 48 |     tokens = this.parsingSentence.tokens.mapIndexed { i, token ->
 49 | 
 50 |       val attachmentScore: Double = this.dependencyTree.getAttachmentScore(token.id)
 51 | 
 52 |       val morphologies: List<ScoredMorphology> = this.parsingSentence.getValidMorphologies(
 53 |         tokenIndex = i,
 54 |         configuration = this.dependencyTree.getConfiguration(token.id)
 55 |       ).map { morpho ->
 56 |         ScoredMorphology(components = morpho.components, score = attachmentScore)
 57 |       }
 58 | 
 59 |       this.buildToken(tokenId = token.id, morphologies = morphologies)
 60 |     },
 61 |     position = this.parsingSentence.position
 62 |   )
 63 | 
 64 |   /**
 65 |    * @param tokenId the id of a parsing token
 66 |    * @param morphologies the possible morphologies of the token
 67 |    *
 68 |    * @return a new morpho-syntactic token build from the given parsing token
 69 |    */
 70 |   private fun buildToken(tokenId: Int, morphologies: List<ScoredMorphology>): MorphoSynToken {
 71 | 
 72 |     val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)
 73 | 
 74 |     require(morphologies.all { it.components.size == config.components.size }) {
 75 |       "The given morphologies must have the same number of components of the given grammatical configuration."
 76 |     }
 77 | 
 78 |     return if (config.components.size == 1)
 79 |       this.buildSingleToken(
 80 |         tokenId = tokenId,
 81 |         governorId = this.dependencyTree.getHead(tokenId),
 82 |         grammaticalComponent = config.components.single(),
 83 |         morphologies = morphologies.map { it.toSingle() })
 84 |     else
 85 |       this.buildCompositeToken(tokenId = tokenId, morphologies = morphologies)
 86 |   }
 87 | 
 88 |   /**
 89 |    * @param tokenId the id of the new token
 90 |    * @param morphologies the list of possible scored morphologies of the token
 91 |    *
 92 |    * @return a new composite token
 93 |    */
 94 |   private fun buildCompositeToken(tokenId: Int, morphologies: List<ScoredMorphology>): MorphoSynToken.Composite {
 95 | 
 96 |     val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)
 97 |     val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)
 98 |     val compositeTokenHandler = CompositeTokenHelper(this.dependencyTree)
 99 | 
100 |     val newToken = MorphoSynToken.Composite(
101 |       id = parsingToken.id,
102 |       form = parsingToken.form,
103 |       position = checkNotNull(parsingToken.position) { "Composite words must have a position." },
104 |       components = config.components.mapIndexed { i, component ->
105 |         this.buildSingleToken(
106 |           tokenId = tokenId,
107 |           componentId = this.nextAvailableId + i,
108 |           governorId = compositeTokenHandler.getComponentGovernorId(
109 |             tokenId = tokenId,
110 |             componentIndex = i,
111 |             prevComponentId = if (i > 0) this.nextAvailableId else null),
112 |           grammaticalComponent = component,
113 |           morphologies = morphologies.map { ScoredSingleMorphology(value = it.components[i], score = it.score) }
114 |         ) as Word
115 |       }
116 |     )
117 | 
118 |     // Attention: the nextAvailableId must be set after the token has been created in order to calculate the
119 |     // components governors correctly.
120 |     this.nextAvailableId += config.components.size
121 | 
122 |     return newToken
123 |   }
124 | 
125 |   /**
126 |    * @param tokenId the id of the original token
127 |    * @param componentId the id of the token in case it is a component (otherwise null)
128 |    * @param governorId the id of the governor (null if it is the top)
129 |    * @param grammaticalComponent the grammatical configuration of the token as single component
130 |    * @param morphologies the list of possible scored morphologies of the token
131 |    *
132 |    * @return a new single token
133 |    */
134 |   private fun buildSingleToken(tokenId: Int,
135 |                                componentId: Int? = null,
136 |                                governorId: Int?,
137 |                                grammaticalComponent: GrammaticalConfiguration.Component,
138 |                                morphologies: List<ScoredSingleMorphology>): MorphoSynToken.Single {
139 | 
140 |     val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)
141 |     val syntacticRelation = SyntacticRelation(
142 |       governor = governorId,
143 |       attachmentScore = this.dependencyTree.getAttachmentScore(tokenId),
144 |       dependency = grammaticalComponent.syntacticDependency)
145 | 
146 |     // Unique morphologies by lemma and POS.
147 |     val uniqueMorphologies: List<ScoredSingleMorphology> =
148 |       morphologies.associateBy { Pair(it.value.lemma, it.value.pos) }.values.toList()
149 | 
150 |     return if (parsingToken.position != null)
151 |       Word(
152 |         id = componentId ?: tokenId,
153 |         form = parsingToken.form,
154 |         position = parsingToken.position,
155 |         pos = grammaticalComponent.pos,
156 |         morphologies = uniqueMorphologies,
157 |         contextMorphologies = listOf(), // TODO: set it
158 |         syntacticRelation = syntacticRelation,
159 |         coReferences = null, // TODO: set it
160 |         semanticRelations = null) // TODO: set it
161 |     else
162 |       WordTrace(
163 |         id = componentId ?: tokenId,
164 |         form = parsingToken.form,
165 |         pos = grammaticalComponent.pos,
166 |         morphologies = uniqueMorphologies,
167 |         contextMorphologies = listOf(), // TODO: set it
168 |         syntacticRelation = syntacticRelation,
169 |         coReferences = null, // TODO: set it
170 |         semanticRelations = null)
171 |   }
172 | }
173 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/UnlabeledMorphoSynBuilder.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
 9 | 
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
13 | import com.kotlinnlp.linguisticdescription.sentence.token.Word
14 | import com.kotlinnlp.linguisticdescription.sentence.token.WordTrace
15 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.SyntacticRelation
16 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
17 | import com.kotlinnlp.linguisticdescription.syntax.dependencies.Undefined
18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
19 | import com.kotlinnlp.neuralparser.language.ParsingToken
20 | 
/**
 * A helper class that builds a [MorphoSynSentence] from a [ParsingSentence] and an unlabeled [DependencyTree].
 *
 * The tokens built have no POS and no morphologies, and their syntactic dependency is [Undefined].
 *
 * @param parsingSentence a parsing sentence
 * @param dependencyTree the tree that represents the dependencies of the sentence
 */
internal class UnlabeledMorphoSynBuilder(
  private val parsingSentence: ParsingSentence,
  private val dependencyTree: DependencyTree.Unlabeled
) {

  /**
   * @return a new morpho-syntactic sentence built from the given [parsingSentence] and [dependencyTree]
   */
  fun buildSentence(): MorphoSynSentence {

    val morphoSynTokens = this.parsingSentence.tokens.map { token ->
      this.buildSingleToken(tokenId = token.id, governorId = this.dependencyTree.getHead(token.id))
    }

    return MorphoSynSentence(
      id = 0,
      confidence = 0.0,
      tokens = morphoSynTokens,
      position = this.parsingSentence.position
    )
  }

  /**
   * Build a single token: a [Word] if the parsing token has a position, otherwise a [WordTrace].
   *
   * @param tokenId the token id
   * @param governorId the governor id or null if it is the top
   *
   * @return a new single morpho-syntactic token built from the given parsing token
   */
  private fun buildSingleToken(tokenId: Int, governorId: Int?): MorphoSynToken.Single {

    val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)

    // Without labels the dependency type is unknown: only the governor and the attachment score are set.
    val relation = SyntacticRelation(
      governor = governorId,
      attachmentScore = this.dependencyTree.getAttachmentScore(tokenId),
      dependency = Undefined(direction = SyntacticDependency.Direction.NULL))

    val tokenPosition = parsingToken.position

    return if (tokenPosition == null)
      WordTrace(
        id = tokenId,
        form = parsingToken.form,
        pos = null,
        morphologies = listOf(),
        contextMorphologies = listOf(), // TODO: set it
        syntacticRelation = relation,
        coReferences = null, // TODO: set it
        semanticRelations = null) // TODO: set it
    else
      Word(
        id = tokenId,
        form = parsingToken.form,
        position = tokenPosition,
        pos = null,
        morphologies = listOf(),
        contextMorphologies = listOf(), // TODO: set it
        syntacticRelation = relation,
        coReferences = null, // TODO: set it
        semanticRelations = null) // TODO: set it
  }
}
81 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/BaseStatistics.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.statistics
 9 | 
10 | import com.kotlinnlp.utils.stats.StatMetric
11 | 
/**
 * The base statistics of a parsing evaluation.
 *
 * @property las labeled attachment score
 * @property uas unlabeled attachment score
 * @property ps POS tag accuracy score
 * @property ds deprel accuracy score
 * @property slas sentence labeled attachment score
 * @property suas sentence unlabeled attachment score
 */
open class BaseStatistics(
  val las: StatMetric,
  val uas: StatMetric,
  val ps: StatMetric,
  val ds: StatMetric,
  val slas: StatMetric,
  val suas: StatMetric
) {

  /**
   * Format the statistics as a list with one metric per line (the deprel score is reported before the POS one).
   *
   * @return this statistics formatted into a string
   */
  override fun toString(): String =
    // The blank first and last lines of the template are removed by trimIndent() itself.
    """
    - Labeled   attachment score:          ${this.las}
    - Unlabeled attachment score:          ${this.uas}
    - Deprel  accuracy score:              ${this.ds}
    - POS tag accuracy score:              ${this.ps}
    - Sentence labeled   attachment score: ${this.slas}
    - Sentence unlabeled attachment score: ${this.suas}
    """.trimIndent()
}
44 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/MetricsCounter.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.statistics
 9 | 
10 | import com.kotlinnlp.utils.stats.StatMetric
11 | 
/**
 * A mutable counter of the values needed to calculate the statistic metrics.
 * All the counters start from zero.
 *
 * @property labeledAttachments the counter of labeled attachments
 * @property unlabeledAttachments the counter of unlabeled attachments
 * @property correctPOSTags the counter of correct POS tags
 * @property correctDeprels the counter of correct deprels
 * @property correctLabeledSentences the counter of correct labeled sentences
 * @property correctUnlabeledSentences the counter of correct unlabeled sentences
 * @property totalSentences the total amount of sentences
 * @property totalTokens the total amount of tokens
 */
data class MetricsCounter(
  var labeledAttachments: Int = 0,
  var unlabeledAttachments: Int = 0,
  var correctPOSTags: Int = 0,
  var correctDeprels: Int = 0,
  var correctLabeledSentences: Int = 0,
  var correctUnlabeledSentences: Int = 0,
  var totalSentences: Int = 0,
  var totalTokens: Int = 0
) {

  /**
   * Convert the counters into metrics: the token-level counters are related to the [totalTokens], the sentence-level
   * ones to the [totalSentences].
   *
   * @return the base statistics
   */
  fun toStatistics(): BaseStatistics = BaseStatistics(
    las = StatMetric(count = this.labeledAttachments, total = this.totalTokens),
    uas = StatMetric(count = this.unlabeledAttachments, total = this.totalTokens),
    ps = StatMetric(count = this.correctPOSTags, total = this.totalTokens),
    ds = StatMetric(count = this.correctDeprels, total = this.totalTokens),
    slas = StatMetric(count = this.correctLabeledSentences, total = this.totalSentences),
    suas = StatMetric(count = this.correctUnlabeledSentences, total = this.totalSentences)
  )
}
48 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/SentenceMetrics.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.statistics
 9 | 
/**
 * The metrics of a sentence.
 *
 * All the flags are mutable and true by default.
 * NOTE(review): presumably a flag is set to false as soon as a wrong attachment is found while evaluating the
 * sentence. Verify against the validator that uses this class.
 *
 * @property correctLabeled if the parsed sentence has all correct attachments, including deprel labels
 * @property correctUnlabeled if the parsed sentence has all correct attachments, excluding deprel labels
 * @property correctLabeledNoPunct same as [correctLabeled], without considering the punctuation tokens
 * @property correctUnlabeledNoPunct same as [correctUnlabeled], without considering the punctuation tokens
 */
internal data class SentenceMetrics(
  var correctLabeled: Boolean = true,
  var correctUnlabeled: Boolean = true,
  var correctLabeledNoPunct: Boolean = true,
  var correctUnlabeledNoPunct: Boolean = true
)
24 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/Statistics.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.statistics
 9 | 
10 | import com.kotlinnlp.utils.stats.StatMetric
11 | 
12 | /**
13 |  * The statistics of a parsing evaluation, calculated both considering and ignoring the punctuation tokens.
14 |  *
15 |  * @property las labeled attachment score
16 |  * @property uas unlabeled attachment score
17 |  * @property ps POS tag accuracy score
18 |  * @property ds deprel accuracy score
19 |  * @property slas sentence labeled attachment score
20 |  * @property suas sentence unlabeled attachment score
21 |  * @property noPunctuation the same statistics, calculated without considering punctuation tokens
22 |  */
23 | class Statistics(
24 |   las: StatMetric,
25 |   uas: StatMetric,
26 |   ps: StatMetric,
27 |   ds: StatMetric,
28 |   slas: StatMetric,
29 |   suas: StatMetric,
30 |   val noPunctuation: BaseStatistics
31 | ) : BaseStatistics(las = las, uas = uas, ps = ps, ds = ds, slas = slas, suas = suas) {
32 | 
33 |   /**
34 |    * @return the base statistics formatted into a string, followed by the ones calculated without punctuation
35 |    */
36 |   override fun toString(): String = """
37 |     Evaluation stats:
38 |     %s
39 | 
40 |     Evaluation stats without considering punctuation:
41 |     %s
42 |     """
43 |     .trimIndent() // this also removes the first and the last lines of the raw string, being blank
44 |     .format(
45 |       // first placeholder: the statistics as formatted by the base class
46 |       super.toString(),
47 |       // second placeholder: the statistics calculated without considering punctuation
48 |       this.noPunctuation.toString())
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/CoNLLDependencyParser.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.helpers.validator
 9 | 
10 | import com.kotlinnlp.linguisticdescription.POSTag
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Token as CoNLLToken
13 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
14 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
15 | import com.kotlinnlp.neuralparser.NeuralParser
16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
17 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
18 | import com.kotlinnlp.neuralparser.language.BaseSentence
19 | import com.kotlinnlp.utils.notEmptyOr
20 | 
21 | /**
22 |  * A helper that wraps a generic [NeuralParser] to make it work on CoNLL sentences.
23 |  *
24 |  * @property neuralParser a generic neural parser to use it with input/output sentences in CoNLL format
25 |  * @property sentencePreprocessor the preprocessor applied to each sentence before parsing it
26 |  */
27 | class CoNLLDependencyParser(
28 |   private val neuralParser: NeuralParser<*>,
29 |   private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor()
30 | ) {
31 | 
32 |   /**
33 |    * Parse a CoNLL sentence and copy the result of the parsing into a new CoNLL sentence.
34 |    *
35 |    * @param sentence the sentence to parse, in CoNLL format
36 |    * @param index the index of the sentence within the list of sentences of the input dataset
37 |    *
38 |    * @return a copy of the given sentence, whose tokens have the head, the POS and the dependencies just parsed
39 |    */
40 |   fun parse(sentence: CoNLLSentence, index: Int): CoNLLSentence {
41 | 
42 |     val baseSentence: BaseSentence = BaseSentence.fromCoNLL(sentence, index = index)
43 |     val parsed: MorphoSynSentence = this.neuralParser.parse(this.sentencePreprocessor.convert(baseSentence))
44 | 
45 |     val parsedTokens: List<CoNLLToken> = sentence.tokens.map { token ->
46 |       val parsedToken: MorphoSynToken = parsed.getTokenById(token.id)
47 | 
48 |       token.copy(
49 |         head = parsedToken.syntacticRelation.governor ?: 0, // Note: the CoNLL root ID is 0
50 |         posList = parsedToken.flatPOS.notEmptyOr { listOf(POSTag(CoNLLToken.EMPTY_FILLER)) },
51 |         syntacticDependencies = parsedToken.flatSyntacticRelations.map { it.dependency })
52 |     }
53 | 
54 |     return sentence.copy(tokens = parsedTokens)
55 |   }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/CoNLLFileValidator.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.helpers.validator
  9 | 
 10 | import com.kotlinnlp.conllio.CoNLLUEvaluator
 11 | import com.kotlinnlp.conllio.CoNLLWriter
 12 | import com.kotlinnlp.conllio.CoNLLXEvaluator
 13 | import com.kotlinnlp.conllio.Token as CoNLLToken
 14 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
 15 | import com.kotlinnlp.neuralparser.NeuralParser
 16 | import com.kotlinnlp.neuralparser.utils.loadSentences
 17 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
 18 | import java.io.File
 19 | 
 20 | /**
 21 |  * Validate a system output CoNLL file comparing it to a gold CoNLL file.
 22 |  *
 23 |  * @param neuralParser a neural parser
 24 |  * @param goldFilePath the path of the file containing the gold tree-bank, in CoNLL format
 25 |  * @param outputFilePath the file path of the output CoNLL corpus (default = null -> a temporary file is used)
 26 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 27 |  */
 28 | class CoNLLFileValidator(
 29 |   neuralParser: NeuralParser<*>,
 30 |   private val goldFilePath: String,
 31 |   private val outputFilePath: String? = null,
 32 |   private val verbose: Boolean = true
 33 | ) {
 34 | 
 35 |   /**
 36 |    * The path of a new temporary file, created at each access. Only the name of the gold file is used as prefix,
 37 |    * because a prefix with path separators makes `File.createTempFile` fail on some JDK versions.
 38 |    */
 39 |   private val defaultOutputPath: String
 40 |     get() = File.createTempFile("${File(this.goldFilePath).name}_output", ".conll").path
 41 | 
 42 |   /**
 43 |    * The CoNLL evaluator: [CoNLLUEvaluator] if the gold file path ends with ".conllu", otherwise [CoNLLXEvaluator].
 44 |    */
 45 |   private val conllEvaluator = if (this.goldFilePath.endsWith(".conllu")) CoNLLUEvaluator else CoNLLXEvaluator
 46 | 
 47 |   /**
 48 |    * The parser wrapper to parse sentences in CoNLL format.
 49 |    */
 50 |   private val conllParser = CoNLLDependencyParser(neuralParser)
 51 | 
 52 |   /**
 53 |    * Parse the sentences of the gold file and print the output of the official CoNLL evaluation script.
 54 |    *
 55 |    * Nothing is returned: the result of the evaluation is only printed to the standard output.
 56 |    */
 57 |   fun evaluate() {
 58 | 
 59 |     val parsedSentences: List<CoNLLSentence> = this.parseSentences(sentences = loadSentences(
 60 |       type = "validation",
 61 |       filePath = this.goldFilePath,
 62 |       maxSentences = null,
 63 |       skipNonProjective = false))
 64 | 
 65 |     print("\nCoNLL official script evaluation:\n%s".format(this.evaluateWithCoNLLScript(parsedSentences)))
 66 |   }
 67 | 
 68 |   /**
 69 |    * Parse the given validation [sentences], ticking a progress bar if the verbose mode is enabled.
 70 |    *
 71 |    * @return the list of parsed CoNLL sentences
 72 |    */
 73 |   private fun parseSentences(sentences: List<CoNLLSentence>): List<CoNLLSentence> {
 74 | 
 75 |     val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(sentences.size) else null
 76 | 
 77 |     if (this.verbose) println("Start parsing of %d sentences:".format(sentences.size))
 78 | 
 79 |     return sentences.mapIndexed { i, sentence ->
 80 | 
 81 |       progress?.tick()
 82 | 
 83 |       this.conllParser.parse(sentence, index = i)
 84 |     }
 85 |   }
 86 | 
 87 |   /**
 88 |    * Write the parsed sentences to the output file and evaluate it against the gold file with the CoNLL evaluator.
 89 |    *
 90 |    * @param parsedSentences a list of parsed sentences, parallel to the gold sentences
 91 |    *
 92 |    * @return the output of the official CoNLL evaluation script
 93 |    */
 94 |   private fun evaluateWithCoNLLScript(parsedSentences: List<CoNLLSentence>): String? {
 95 | 
 96 |     // The temporary file is created only if no output path has been given.
 97 |     val outputPath: String = this.outputFilePath ?: this.defaultOutputPath
 98 |     CoNLLWriter.toFile(sentences = parsedSentences, writeComments = true, outputFilePath = outputPath)
 99 | 
100 |     return this.conllEvaluator.evaluate(systemFilePath = outputPath, goldFilePath = this.goldFilePath)
101 |   }
102 | }
103 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/Validator.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.helpers.validator
  9 | 
 10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
 11 | import com.kotlinnlp.conllio.Token as CoNLLToken
 12 | import com.kotlinnlp.dependencytree.DependencyTree
 13 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
 14 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
 15 | import com.kotlinnlp.neuralparser.NeuralParser
 16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
 17 | import com.kotlinnlp.neuralparser.helpers.statistics.MetricsCounter
 18 | import com.kotlinnlp.neuralparser.helpers.statistics.SentenceMetrics
 19 | import com.kotlinnlp.neuralparser.helpers.statistics.Statistics
 20 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
 21 | 
 22 | /**
 23 |  * The Validator, that evaluates the parsing accuracy of a neural parser on sentences with gold annotations.
 24 |  *
 25 |  * @param neuralParser the neural parser
 26 |  * @property sentences the sentences to parse containing the gold annotation
 27 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 28 |  * @property verbose a Boolean indicating if the verbose mode is enabled (default = true)
 29 |  */
 30 | class Validator(
 31 |   private val neuralParser: NeuralParser<*>,
 32 |   val sentences: List<CoNLLSentence>,
 33 |   sentencePreprocessor: SentencePreprocessor,
 34 |   private val verbose: Boolean = true
 35 | ) {
 36 | 
 37 |   companion object {
 38 | 
 39 |     /**
 40 |      * The regular expression to match punctuation forms.
 41 |      */
 42 |     val punctuationRegex = Regex("^[-!\"#%&'()*,./:;?@\\[\\]_{}]+$")
 43 |   }
 44 | 
 45 |   init {
 46 |     require(sentences.all { it.hasAnnotatedHeads() }) {
 47 |       "A gold sentence must have a dependency tree with all heads annotated."
 48 |     }
 49 |   }
 50 | 
 51 |   /**
 52 |    * A counter of statistic metrics (re-created at each evaluation).
 53 |    */
 54 |   private lateinit var counter: MetricsCounter
 55 | 
 56 |   /**
 57 |    * A counter of statistic metrics, without considering punctuation (re-created at each evaluation).
 58 |    */
 59 |   private lateinit var counterNoPunct: MetricsCounter
 60 | 
 61 |   /**
 62 |    * The metrics of the sentence that is currently being evaluated.
 63 |    */
 64 |   private lateinit var sentenceMetrics: SentenceMetrics
 65 | 
 66 |   /**
 67 |    * The parser wrapper to parse sentences in CoNLL format.
 68 |    */
 69 |   private val conllParser = CoNLLDependencyParser(
 70 |     neuralParser = this.neuralParser,
 71 |     sentencePreprocessor = sentencePreprocessor)
 72 | 
 73 |   /**
 74 |    * Get statistics about the evaluation of the parsing accuracy on the given [sentences].
 75 |    *
 76 |    * @return the statistics of the parsing accuracy
 77 |    */
 78 |   fun evaluate(): Statistics {
 79 | 
 80 |     val parsedSentences: List<CoNLLSentence> = this.parseSentences()
 81 | 
 82 |     this.initCounters(parsedSentences)
 83 | 
 84 |     this.sentences.zip(parsedSentences).forEach { (goldSentence, parsedSentence) ->
 85 | 
 86 |       val goldTree: DependencyTree = this.buildTree(goldSentence)
 87 |       val parsedTree: DependencyTree = this.buildTree(parsedSentence, allowCycles = true)
 88 | 
 89 |       require(parsedTree.size == goldTree.size) { "The dependency tree and its gold haven't the same size" }
 90 | 
 91 |       this.sentenceMetrics = SentenceMetrics()
 92 | 
 93 |       goldSentence.tokens.forEach { this.addTokenMetrics(token = it, parsedTree = parsedTree, goldTree = goldTree) }
 94 | 
 95 |       this.updateCorrectSentences()
 96 |     }
 97 | 
 98 |     return this.buildStats()
 99 |   }
100 | 
101 |   /**
102 |    * @param sentence a CoNLL sentence
103 |    * @param allowCycles if true it allows to create cycles when building the tree
104 |    *
105 |    * @return a new dependency tree based on the given sentence (labeled only if the parser has the labelling enabled)
106 |    */
107 |   private fun buildTree(sentence: CoNLLSentence, allowCycles: Boolean = false): DependencyTree =
108 |     if (this.neuralParser.labellingEnabled)
109 |       DependencyTree.Labeled(sentence = sentence, allowCycles = allowCycles)
110 |     else
111 |       DependencyTree.Unlabeled(sentence = sentence, allowCycles = allowCycles)
112 | 
113 |   /**
114 |    * Parse the validation CoNLL sentences.
115 |    *
116 |    * @return the list of parsed CoNLL sentences
117 |    */
118 |   private fun parseSentences(): List<CoNLLSentence> {
119 | 
120 |     val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(this.sentences.size) else null
121 | 
122 |     if (this.verbose) println("Start parsing of %d sentences:".format(this.sentences.size))
123 | 
124 |     return this.sentences.mapIndexed { i, sentence ->
125 | 
126 |       progress?.tick()
127 | 
128 |       this.conllParser.parse(sentence, index = i)
129 |     }
130 |   }
131 | 
132 |   /**
133 |    * Initialize the metrics counters.
134 |    *
135 |    * @param parsedSentences a list of parsed sentences
136 |    */
137 |   private fun initCounters(parsedSentences: List<CoNLLSentence>) {
138 | 
139 |     this.counter = MetricsCounter()
140 |     this.counterNoPunct = MetricsCounter()
141 | 
142 |     this.counter.totalSentences = parsedSentences.size
143 |     this.counterNoPunct.totalSentences = parsedSentences.size
144 |     this.counter.totalTokens = parsedSentences.sumBy { it.tokens.count() }
145 |   }
146 | 
147 |   /**
148 |    * Add the statistic metrics of a given [token].
149 |    *
150 |    * @param token a token of a sentence
151 |    * @param parsedTree the dependency tree of the parsed sentence
152 |    * @param goldTree the gold dependency tree of the parsed sentence
153 |    */
154 |   private fun addTokenMetrics(token: CoNLLToken, parsedTree: DependencyTree, goldTree: DependencyTree) {
155 | 
156 |     val isNotPunct: Boolean = !punctuationRegex.matches(token.form)
157 |     val parsedConfig: GrammaticalConfiguration? = (parsedTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
158 |     val goldConfig: GrammaticalConfiguration? = (goldTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
159 |     val parsedDependencies: List<SyntacticDependency>? = parsedConfig?.components?.map { it.syntacticDependency }
160 |     val goldDependencies: List<SyntacticDependency>? = goldConfig?.components?.map { it.syntacticDependency }
161 | 
162 |     // The total of non-punctuation tokens is counted here, the one of all tokens when initializing the counters.
163 |     if (isNotPunct) this.counterNoPunct.totalTokens++
164 | 
165 |     if (parsedTree.getHead(token.id) == goldTree.getHead(token.id)) {
166 | 
167 |       this.addCorrectAttachment(isNotPunct)
168 | 
169 |       if (parsedDependencies == goldDependencies)
170 |         this.addCorrectLabeledAttachment(isNotPunct)
171 |       else
172 |         this.addUncorrectLabeledAttachment(isNotPunct)
173 | 
174 |     } else {
175 |       this.addUncorrectAttachment(isNotPunct)
176 |       this.addUncorrectLabeledAttachment(isNotPunct)
177 |     }
178 | 
179 |     if (parsedConfig?.components?.map { it.pos } == goldConfig?.components?.map { it.pos })
180 |       this.addCorrectPOSTag(isNotPunct)
181 | 
182 |     // Note: 'zip' truncates to the shortest list, so the sizes must be compared explicitly.
183 |     val softEqualDependencies: Boolean = (parsedDependencies != null && goldDependencies != null
184 |       && parsedDependencies.size == goldDependencies.size
185 |       && parsedDependencies.zip(goldDependencies).all { (parsed, gold) -> parsed.softEquals(gold) })
186 | 
187 |     if (softEqualDependencies || parsedDependencies == goldDependencies) this.addCorrectDeprel(isNotPunct)
188 |   }
189 | 
190 |   /**
191 |    * Add a correct attachment to the current statistic metrics.
192 |    *
193 |    * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
194 |    */
195 |   private fun addCorrectAttachment(isNotPunct: Boolean) {
196 | 
197 |     this.counter.unlabeledAttachments++
198 | 
199 |     if (isNotPunct) this.counterNoPunct.unlabeledAttachments++
200 |   }
201 | 
202 |   /**
203 |    * Register an incorrect attachment, marking the current sentence as not completely correct (unlabeled).
204 |    *
205 |    * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
206 |    */
207 |   private fun addUncorrectAttachment(isNotPunct: Boolean) {
208 | 
209 |     this.sentenceMetrics.correctUnlabeled = false
210 | 
211 |     if (isNotPunct) this.sentenceMetrics.correctUnlabeledNoPunct = false
212 |   }
213 | 
214 |   /**
215 |    * Add a correct labeled attachment to the current statistic metrics.
216 |    *
217 |    * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
218 |    */
219 |   private fun addCorrectLabeledAttachment(isNotPunct: Boolean) {
220 | 
221 |     this.counter.labeledAttachments++
222 | 
223 |     if (isNotPunct) this.counterNoPunct.labeledAttachments++
224 |   }
225 | 
226 |   /**
227 |    * Register an incorrect labeled attachment, marking the current sentence as not completely correct (labeled).
228 |    *
229 |    * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
230 |    */
231 |   private fun addUncorrectLabeledAttachment(isNotPunct: Boolean) {
232 | 
233 |     this.sentenceMetrics.correctLabeled = false
234 | 
235 |     if (isNotPunct) this.sentenceMetrics.correctLabeledNoPunct = false
236 |   }
237 | 
238 |   /**
239 |    * Add a correct POS tag to the current statistic metrics.
240 |    *
241 |    * @param isNotPunct a Boolean indicating if the POS tag is related to a non-punctuation token
242 |    */
243 |   private fun addCorrectPOSTag(isNotPunct: Boolean) {
244 | 
245 |     this.counter.correctPOSTags++
246 | 
247 |     if (isNotPunct) this.counterNoPunct.correctPOSTags++
248 |   }
249 | 
250 |   /**
251 |    * Add a correct deprel to the current statistic metrics.
252 |    *
253 |    * @param isNotPunct a Boolean indicating if the deprel is related to a non-punctuation token
254 |    */
255 |   private fun addCorrectDeprel(isNotPunct: Boolean) {
256 | 
257 |     this.counter.correctDeprels++
258 | 
259 |     if (isNotPunct) this.counterNoPunct.correctDeprels++
260 |   }
261 | 
262 |   /**
263 |    * Update the counters of correct sentences with the current [sentenceMetrics].
264 |    */
265 |   private fun updateCorrectSentences() {
266 | 
267 |     if (this.sentenceMetrics.correctLabeled) this.counter.correctLabeledSentences++
268 |     if (this.sentenceMetrics.correctUnlabeled) this.counter.correctUnlabeledSentences++
269 |     if (this.sentenceMetrics.correctLabeledNoPunct) this.counterNoPunct.correctLabeledSentences++
270 |     if (this.sentenceMetrics.correctUnlabeledNoPunct) this.counterNoPunct.correctUnlabeledSentences++
271 |   }
272 | 
273 |   /**
274 |    * Build the statistics related to the current counted metrics.
275 |    */
276 |   private fun buildStats(): Statistics {
277 | 
278 |     val punctStats = this.counter.toStatistics()
279 |     val noPunctStats = this.counterNoPunct.toStatistics()
280 | 
281 |     return Statistics(
282 |       las = punctStats.las,
283 |       uas = punctStats.uas,
284 |       ps = punctStats.ps,
285 |       ds = punctStats.ds,
286 |       slas = punctStats.slas,
287 |       suas = punctStats.suas,
288 |       noPunctuation = noPunctStats)
289 |   }
290 | }
290 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/BaseSentence.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.SentenceIdentificable
13 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
14 | 
15 | /**
16 |  * A base real sentence, composed by [BaseToken]s.
17 |  *
18 |  * @property id the id of the sentence, unique within a list of sentences
19 |  * @property tokens the list of tokens that compose the sentence
20 |  * @property position the position of this sentence in the original text
21 |  */
22 | data class BaseSentence(
23 |   val id: Int,
24 |   override val tokens: List<BaseToken>,
25 |   override val position: Position
26 | ) : RealSentence<BaseToken>, SentenceIdentificable<BaseToken>() {
27 | 
28 |   companion object {
29 | 
30 |     /**
31 |      * Build a [BaseSentence] from a CoNLL sentence, using the given index both as id and as position index.
32 |      *
33 |      * @param sentence a CoNLL sentence
34 |      * @param index the index of the sentence within a list of sentences
35 |      *
36 |      * @return a real sentence of real tokens
37 |      */
38 |     fun fromCoNLL(sentence: CoNLLSentence, index: Int): BaseSentence {
39 | 
40 |       val tokens: List<BaseToken> = sentence.tokens.toBaseTokens()
41 | 
42 |       // The sentence goes from the start of its first token to the end of its last token.
43 |       val sentencePosition = Position(
44 |         index = index,
45 |         start = tokens.first().position.start,
46 |         end = tokens.last().position.end)
47 | 
48 |       return BaseSentence(id = index, tokens = tokens, position = sentencePosition)
49 |     }
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/BaseToken.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
11 | import com.kotlinnlp.linguisticdescription.sentence.token.TokenIdentificable
12 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
13 | 
14 | /**
15 |  * A base real token, implementing [RealToken] and [TokenIdentificable] with plain constructor properties.
16 |  *
17 |  * @property id the token id, documented as an incremental integer starting from 0 within a sentence (TODO confirm)
18 |  * @property form the form of the token
19 |  * @property position the position of the token in the original text
20 |  */
21 | data class BaseToken(
22 |   override val id: Int,
23 |   override val form: String,
24 |   override val position: Position
25 | ) : RealToken, TokenIdentificable
26 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/CorpusDictionary.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.google.common.collect.HashMultimap
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Token as CoNLLToken
13 | import com.kotlinnlp.linguisticdescription.POSTag
14 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
15 | import com.kotlinnlp.utils.DictionarySet
16 | import java.io.Serializable
17 | 
18 | /**
19 |  * The dictionary of the words, the POS tags and the grammatical configurations collected from a corpus.
20 |  */
21 | class CorpusDictionary : Serializable {
22 | 
23 |   companion object {
24 | 
25 |     /**
26 |      * Private val used to serialize the class (needed by Serializable).
27 |      */
28 |     @Suppress("unused")
29 |     private const val serialVersionUID: Long = 1L
30 | 
31 |     /**
32 |      * Build a new dictionary collecting the information of all the tokens of the given [sentences] (words, POS
33 |      * tags and grammatical configurations).
34 |      *
35 |      * @param sentences a list of sentences
36 |      *
37 |      * @return a new corpus dictionary
38 |      */
39 |     operator fun invoke(sentences: List<CoNLLSentence>): CorpusDictionary {
40 | 
41 |       val dictionary = CorpusDictionary()
42 |       for (sentence in sentences) {
43 |         for (token in sentence.tokens) dictionary.addInfo(token)
44 |       }
45 |       return dictionary
46 |     }
47 |   }
48 | 
49 |   /**
50 |    * The set of the normalized forms of the tokens.
51 |    */
52 |   val words = DictionarySet<String>()
53 | 
54 |   /**
55 |    * The map of normalized forms to their possible lists of POS tags.
56 |    */
57 |   val formsToPosTags: HashMultimap<String, List<POSTag>> = HashMultimap.create()
58 | 
59 |   /**
60 |    * The dictionary set of all the grammatical configurations found in the tokens.
61 |    */
62 |   val grammaticalConfigurations = DictionarySet<GrammaticalConfiguration>()
63 | 
64 |   /**
65 |    * Add the info of a given [token] into this dictionary: its form, its POS tags and its configuration.
66 |    *
67 |    * @param token the token of a sentence
68 |    */
69 |   private fun addInfo(token: CoNLLToken) {
70 | 
71 |     val form: String = token.normalizedForm
72 |     val posTags: List<POSTag> = token.posList
73 |     val deps = token.syntacticDependencies
74 | 
75 |     this.words.add(form)
76 |     this.formsToPosTags.put(form, posTags)
77 |     this.grammaticalConfigurations.add(GrammaticalConfiguration(*Array(maxOf(posTags.size, deps.size)) { i ->
78 |       val pos: POSTag = posTags.getOrElse(i) { posTags.single() } // a missing element falls back to the single one
79 |       GrammaticalConfiguration.Component(pos = pos, syntacticDependency = deps.getOrElse(i) { deps.single() })
80 |     }))
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/Extensions.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
11 | import com.kotlinnlp.conllio.Token as CoNLLToken
12 | 
13 | /**
14 |  * Convert CoNLL tokens into base tokens, with char positions calculated as if a single spacing char separated
15 |  * each couple of consecutive forms, starting from the char 0.
16 |  *
17 |  * @return a list of base real tokens
18 |  */
19 | fun List<CoNLLToken>.toBaseTokens(): List<BaseToken> {
20 | 
21 |   val baseTokens = ArrayList<BaseToken>(this.size)
22 |   var start = 0
23 | 
24 |   this.forEachIndexed { i, token ->
25 |     val end: Int = start + token.form.length - 1
26 |     baseTokens.add(BaseToken(id = token.id, form = token.form, position = Position(index = i, start = start, end = end)))
27 |     start = end + 2 // each couple of consecutive tokens is separated by a spacing char
28 |   }
29 | 
30 |   return baseTokens
31 | }
32 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingSentence.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
12 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
13 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
14 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
15 | import com.kotlinnlp.linguisticdescription.sentence.SentenceIdentificable
16 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
17 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector
18 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
19 | 
20 | /**
21 |  * The sentence used as input of the [com.kotlinnlp.neuralparser.NeuralParser].
22 |  *
23 |  * @property tokens the list of tokens of the sentence
24 |  * @property morphoAnalysis the morphological analysis of the tokens (can be null)
25 |  * @property position the position of the sentence in the text
26 |  * @param labelerSelector the labeler selector used to select the grammatical configurations compatible with the sentence
27 |  */
28 | class ParsingSentence(
29 |   override val tokens: List<ParsingToken>,
30 |   override val morphoAnalysis: MorphologicalAnalysis? = null,
31 |   override val position: Position,
32 |   private val labelerSelector: LabelerSelector
33 | ) : MorphoSentence<ParsingToken>, RealSentence<ParsingToken>, SentenceIdentificable<ParsingToken>() {
34 | 
35 |   /**
36 |    * Check whether at least one morphology of a token is compatible with the given configuration [c].
37 |    * Middle multi-words morphologies are compared partially (only with the "CONTIN" components).
38 |    * @param c the grammatical configuration
39 |    * @param tokenIndex the index of a token of the sentence
40 |    * @return true if the morphologies of the token are compatible with the given configuration, otherwise false
41 |    */
42 |   fun areConfigurationCompatible(c: GrammaticalConfiguration, tokenIndex: Int): Boolean {
43 |     val analysis: MorphologicalAnalysis = this.morphoAnalysis!! // it fails if the analysis is missing
44 |     return analysis.startMorphologies[tokenIndex].any { c.isCompatible(it) } ||
45 |       analysis.middleMWMorphologies[tokenIndex].any { c.isPartiallyCompatible(it) }
46 |   }
47 | 
48 |   /**
49 |    * @param c the grammatical configuration
50 |    * @param tokenIndex the index of a token of the sentence
51 |    *
52 |    * @return the token morphologies (including the multi-words) that are compatible with the given configuration
53 |    */
54 |   fun getCompatibleMorphologies(c: GrammaticalConfiguration, tokenIndex: Int) = Morphologies(
55 |     this.morphoAnalysis!!.allMorphologies[tokenIndex].filter {
56 |       c.isCompatible(it) // TODO: || c.isPartiallyCompatible(it)
57 |     })
58 | 
59 |   /**
60 |    * Get the list of scored grammatical configurations that are valid for a given attachment.
61 |    *
62 |    * @param tokenIndex the index of the token to which one of the [configurations] must be assigned
63 |    * @param headIndex the index of the token head (can be null)
64 |    * @param configurations the list of grammatical configurations, sorted by descending score
65 |    *
66 |    * @return the valid grammatical configurations for the given attachment
67 |    */
68 |   fun getValidConfigurations(tokenIndex: Int,
69 |                              headIndex: Int?,
70 |                              configurations: List<ScoredGrammar>): List<ScoredGrammar> =
71 |     this.labelerSelector.getValidConfigurations(
72 |       sentence = this,
73 |       tokenIndex = tokenIndex,
74 |       headIndex = headIndex,
75 |       configurations = configurations)
76 | 
77 |   /**
78 |    * Get the morphologies of a given token that are compatible with the given grammatical configuration.
79 |    *
80 |    * @param tokenIndex the index of a token of the sentence
81 |    * @param configuration the grammatical configuration of the token
82 |    *
83 |    * @return the morphologies compatible with the given grammatical configuration
84 |    */
85 |   fun getValidMorphologies(tokenIndex: Int,
86 |                            configuration: GrammaticalConfiguration): Morphologies =
87 |     this.labelerSelector.getValidMorphologies(
88 |       sentence = this,
89 |       tokenIndex = tokenIndex,
90 |       configuration = configuration)
91 | }
92 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingToken.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.language
 9 | 
10 | import com.kotlinnlp.linguisticdescription.POSTag
11 | import com.kotlinnlp.linguisticdescription.sentence.token.*
12 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
13 | 
14 | /**
15 |  * A token of a [ParsingSentence], identified by its [id] and carrying its [form].
16 |  *
17 |  * @property id the id of the token, unique within its sentence
18 |  * @property form the form of the token
19 |  * @property pos the part-of-speech tags associated to the token (more than one for composite tokens, can be null)
20 |  * @property position the position of the token in the text (null if it is a trace)
21 |  */
22 | data class ParsingToken(
23 |   override val id: Int,
24 |   override val form: String,
25 |   val pos: List<POSTag>? = null, // TODO: find a better solution
26 |   val position: Position?
27 | ) : FormToken, TokenIdentificable
28 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRModel.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
 9 | 
10 | import com.kotlinnlp.lssencoder.LSSModel
11 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.LabelerModel
12 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
13 | import com.kotlinnlp.neuralparser.NeuralParserModel
14 | import com.kotlinnlp.neuralparser.language.CorpusDictionary
15 | import com.kotlinnlp.neuralparser.language.ParsingSentence
16 | import com.kotlinnlp.neuralparser.language.ParsingToken
17 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
18 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge
19 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel
20 | import com.kotlinnlp.utils.Serializer
21 | import java.io.InputStream
22 | 
23 | /**
24 |  * The model of the [LHRParser].
25 |  * The language passed to the [NeuralParserModel] is the one of the [lssModel].
26 |  *
27 |  * @param corpusDictionary the corpus dictionary that provides the grammatical configurations of the labeler
28 |  * @property lssModel the model of the LSS encoder
29 |  * @property useLabeler whether to use the labeler
30 |  * @property lossCriterionType the training mode of the labeler
31 |  * @property predictPosTags whether to predict the POS tags together with the Deprels
32 |  */
33 | class LHRModel(
34 |   corpusDictionary: CorpusDictionary,
35 |   val lssModel: LSSModel<ParsingToken, ParsingSentence>,
36 |   val useLabeler: Boolean,
37 |   val lossCriterionType: LossCriterionType,
38 |   val predictPosTags: Boolean
39 | ) : NeuralParserModel(lssModel.language) {
40 | 
41 |   companion object {
42 | 
43 |     /**
44 |      * Private val used to serialize the class (needed by Serializable).
45 |      */
46 |     @Suppress("unused")
47 |     private const val serialVersionUID: Long = 1L
48 | 
49 |     /**
50 |      * Deserialize a [LHRModel] reading it from an input stream.
51 |      *
52 |      * @param inputStream the [InputStream] from which to read the serialized [LHRModel]
53 |      *
54 |      * @return the [LHRModel] read from [inputStream] and decoded
55 |      */
56 |     fun load(inputStream: InputStream): LHRModel = Serializer.deserialize(inputStream)
57 |   }
58 | 
59 |   /**
60 |    * The model of the Labeler (null if the labeler is not used).
61 |    */
62 |   val labelerModel: LabelerModel? = when {
63 |     this.useLabeler -> LabelerModel(
64 |       contextEncodingSize = this.lssModel.contextVectorsSize,
65 |       grammaticalConfigurations = corpusDictionary.grammaticalConfigurations,
66 |       lossCriterionType = this.lossCriterionType)
67 |     else -> null
68 |   }
69 | 
70 |   /**
71 |    * The model of the pointer network used for the positional encoding.
72 |    */
73 |   val pointerNetworkModel = this.lssModel.contextVectorsSize.let { encodingSize ->
74 |     PointerNetworkModel(
75 |       inputSize = encodingSize,
76 |       vectorSize = encodingSize,
77 |       mergeConfig = AffineMerge(outputSize = 100, activationFunction = Tanh))
78 |   }
79 | 
80 |   /**
81 |    * @return the string representation of this model
82 |    */
83 |   override fun toString(): String {
84 |     val tokensEncoderModel = this.lssModel.tokensEncoderWrapperModel.model
85 |     val rows: List<Pair<String?, Any?>> = listOf(
86 |       tokensEncoderModel::class.simpleName to tokensEncoderModel,
87 |       "Context Encoder" to this.lssModel.contextBiRNNConfig,
88 |       "Heads Encoder" to this.lssModel.headsBiRNNConfig,
89 |       "Labeler training mode" to this.lossCriterionType,
90 |       "Predict POS tags" to this.predictPosTags)
91 | 
92 |     return rows.joinToString(separator = "\n") { (name, value) -> "%-33s : %s".format(name, value) }
93 |   }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRParser.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
 9 | 
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
12 | import com.kotlinnlp.lssencoder.LSSEncoder
13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
14 | import com.kotlinnlp.lssencoder.decoder.CosineDecoder
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
16 | import com.kotlinnlp.neuralparser.NeuralParser
17 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.LabeledMorphoSynBuilder
18 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.UnlabeledMorphoSynBuilder
19 | import com.kotlinnlp.neuralparser.language.ParsingSentence
20 | import com.kotlinnlp.neuralparser.language.ParsingToken
21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.GreedyDependencyTreeBuilder
22 | 
23 | /**
24 |  * The Latent Head Representation (LHR) Parser.
25 |  *
26 |  * The implementation follows the publication:
27 |  *   [Non-Projective Dependency Parsing via Latent Heads Representation (LHR)](https://arxiv.org/abs/1802.02116)
28 |  *
29 |  * @property model the model of the parser
30 |  * @param contextDropout the dropout probability of the context encodings (default 0.0)
31 |  * @param headsDropout the dropout probability of the latent heads encodings (default 0.0)
32 |  */
33 | class LHRParser(
34 |   override val model: LHRModel,
35 |   contextDropout: Double = 0.0,
36 |   headsDropout: Double = 0.0
37 | ) : NeuralParser<LHRModel> {
38 | 
39 |   /**
40 |    * Whether this parser executes the morpho-syntactic labelling (it does when the model uses the labeler).
41 |    */
42 |   override val labellingEnabled: Boolean = this.model.useLabeler
43 | 
44 |   /**
45 |    * The encoder of the Latent Syntactic Structure.
46 |    */
47 |   private val lssEncoder: LSSEncoder<ParsingToken, ParsingSentence> = LSSEncoder(
48 |     model = this.model.lssModel,
49 |     contextDropout = contextDropout,
50 |     headsDropout = headsDropout)
51 | 
52 |   /**
53 |    * The labeler (null if the model has no labeler model).
54 |    */
55 |   private val labeler: Labeler? = this.model.labelerModel?.let { labelerModel -> Labeler(labelerModel) }
56 | 
57 |   /**
58 |    * Parse a sentence, returning it with the syntactic information of the predicted dependency tree.
59 |    * The dependency tree is obtained by decoding a latent syntactic structure.
60 |    * If the labeler is available, the dependency tree could contain grammatical information.
61 |    *
62 |    * @param sentence a parsing sentence
63 |    *
64 |    * @return the morpho-syntactic sentence built from the dependency tree predicted for the given [sentence]
65 |    */
66 |   override fun parse(sentence: ParsingSentence): MorphoSynSentence {
67 |     // Encode the sentence, then decode the scores map of its latent syntactic structure.
68 |     val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.lssEncoder.forward(sentence)
69 |     val scoresMap = CosineDecoder().decode(lss)
70 | 
71 |     // Build the dependency tree (the labeler given to the builder is null when it is not available).
72 |     val treeBuilder = GreedyDependencyTreeBuilder(lss = lss, scoresMap = scoresMap, labeler = this.labeler)
73 |     val tree: DependencyTree = treeBuilder.build()
74 | 
75 |     // The output sentence is built according to the kind of tree obtained.
76 |     return when (tree) {
77 |       is DependencyTree.Labeled ->
78 |         LabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = tree).buildSentence()
79 |       is DependencyTree.Unlabeled ->
80 |         UnlabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = tree).buildSentence()
81 |     }
82 |   }
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTrainer.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
  9 | 
 10 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
 11 | import com.kotlinnlp.dependencytree.DependencyTree
 12 | import com.kotlinnlp.lssencoder.LSSEncoder
 13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
 14 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
 15 | import com.kotlinnlp.neuralparser.helpers.Trainer
 16 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
 17 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
 18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 19 | import com.kotlinnlp.neuralparser.language.ParsingToken
 20 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder
 21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder.Companion.calculateErrors
 22 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator
 23 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod
 24 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
 25 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer
 26 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor
 27 | import com.kotlinnlp.simplednn.simplemath.assignSum
 28 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
 29 | import com.kotlinnlp.simplednn.utils.scheduling.BatchScheduling
 30 | import com.kotlinnlp.simplednn.utils.scheduling.EpochScheduling
 31 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling
 32 | 
 33 | /**
 34 |  * The training helper of the [LHRParser].
 35 |  *
 36 |  * @param parser the LHR parser to train
 37 |  * @param batchSize the size of the batches of sentences
 38 |  * @param epochs the number of training epochs
 39 |  * @param validator the validation helper (if it is null no validation is done after each epoch)
 40 |  * @param modelFilename the name of the file in which to save the best trained model
 41 |  * @param updateMethod the update method shared by all the optimizers (Learning Rate, ADAM, AdaGrad, ...)
 42 |  * @param contextDropout the dropout probability of the context encodings (default 0.25)
 43 |  * @param headsDropout the dropout probability of the latent heads encodings (default 0.25)
 44 |  * @param labelerDropout the dropout probability of the labeler (default 0.25)
 45 |  * @param skipPunctuationErrors whether to ignore the latent heads errors of the comma tokens
 46 |  * @param usePositionalEncodingErrors whether to calculate and propagate the positional encoding errors
 47 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 48 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 49 |  */
 50 | class LHRTrainer(
 51 |   private val parser: LHRParser,
 52 |   private val batchSize: Int,
 53 |   private val epochs: Int,
 54 |   validator: Validator?,
 55 |   modelFilename: String,
 56 |   private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999),
 57 |   contextDropout: Double = 0.25,
 58 |   headsDropout: Double = 0.25,
 59 |   labelerDropout: Double = 0.25,
 60 |   private val skipPunctuationErrors: Boolean,
 61 |   usePositionalEncodingErrors: Boolean,
 62 |   sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
 63 |   verbose: Boolean = true
 64 | ) : Trainer(
 65 |   neuralParser = parser,
 66 |   batchSize = batchSize,
 67 |   epochs = epochs,
 68 |   validator = validator,
 69 |   modelFilename = modelFilename,
 70 |   minRelevantErrorsCountToUpdate = 1,
 71 |   sentencePreprocessor = sentencePreprocessor,
 72 |   verbose = verbose
 73 | ) {
 74 | 
 75 |   /**
 76 |    * The encoder of the Latent Syntactic Structure.
 77 |    */
 78 |   private val lssEncoder: LSSEncoder<ParsingToken, ParsingSentence> =
 79 |     LSSEncoder(model = this.parser.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout)
 80 | 
 81 |   /**
 82 |    * The labeler (null if the model of the parser has no labeler model).
 83 |    */
 84 |   private val labeler: Labeler? =
 85 |     this.parser.model.labelerModel?.let { model -> Labeler(model = model, dropout = labelerDropout) }
 86 | 
 87 |   /**
 88 |    * The positional encoder (null if the positional encoding errors are not used).
 89 |    */
 90 |   private val positionalEncoder: PositionalEncoder? = when {
 91 |     usePositionalEncodingErrors -> PositionalEncoder(this.parser.model.pointerNetworkModel)
 92 |     else -> null
 93 |   }
 94 | 
 95 |   /**
 96 |    * The optimizer of the pointer network.
 97 |    */
 98 |   private val pointerNetworkOptimizer: ParamsOptimizer = ParamsOptimizer(this.updateMethod)
 99 | 
100 |   /**
101 |    * The optimizer of the LSS encoder.
102 |    */
103 |   private val lssEncoderOptimizer: ParamsOptimizer = ParamsOptimizer(this.updateMethod)
104 | 
105 |   /**
106 |    * The optimizer of the labeler (null if the model of the parser has no labeler model).
107 |    */
108 |   private val labelerOptimizer: ParamsOptimizer? =
109 |     if (this.parser.model.labelerModel != null) ParamsOptimizer(this.updateMethod) else null
110 | 
111 |   /**
112 |    * The counter of the epochs.
113 |    */
114 |   private var epochCount: Int = 0
115 | 
116 |   /**
117 |    * The optimizers grouped all together (the one of the labeler can be null).
118 |    */
119 |   private val optimizers: List<ParamsOptimizer?> =
120 |     listOf(this.lssEncoderOptimizer, this.labelerOptimizer, this.pointerNetworkOptimizer)
121 | 
122 |   /**
123 |    * @return a string representation of the configuration of this Trainer
124 |    */
125 |   override fun toString(): String = listOf(
126 |     "Epochs" to this.epochs,
127 |     "Batch size" to this.batchSize,
128 |     "Skip punctuation errors" to this.skipPunctuationErrors
129 |   ).joinToString(separator = "\n") { (name, value) -> "%-33s : %s".format(name, value) }
130 | 
131 |   /**
132 |    * Beat the occurrence of a new batch (only if the update method supports the batch scheduling).
133 |    */
134 |   override fun newBatch() {
135 |     (this.updateMethod as? BatchScheduling)?.newBatch()
136 |   }
137 | 
138 |   /**
139 |    * Beat the occurrence of a new epoch (only if the update method supports it) and count it.
140 |    */
141 |   override fun newEpoch() {
142 |     (this.updateMethod as? EpochScheduling)?.newEpoch()
143 |     this.epochCount += 1
144 |   }
145 | 
146 |   /**
147 |    * Update the model parameters.
148 |    */
149 |   override fun update() {
150 |     for (optimizer in this.optimizers) optimizer?.update()
151 |   }
152 | 
153 |   /**
154 |    * @return the count of the relevant errors
155 |    */
156 |   override fun getRelevantErrorsCount(): Int = 1
157 | 
158 |   /**
159 |    * Method to call before learning a new sentence.
160 |    */
161 |   private fun beforeSentenceLearning() {
162 |     (this.updateMethod as? ExampleScheduling)?.newExample()
163 |   }
164 | 
165 |   /**
166 |    * Train the parser with the given [sentence] and [goldTree].
167 |    *
168 |    * @param sentence the sentence
169 |    * @param goldTree the gold tree of the sentence
170 |    */
171 |   override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) {
172 | 
173 |     this.beforeSentenceLearning()
174 | 
175 |     val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.lssEncoder.forward(sentence)
176 | 
177 |     this.propagateErrors(
178 |       latentHeadsErrors = this.calculateLatentHeadsErrors(lss, goldTree),
179 |       labelerErrors = this.calculateLabelerErrors(lss, goldTree),
180 |       positionalEncoderErrors = this.calculatePositionalEncoderErrors(lss))
181 |   }
182 | 
183 |   /**
184 |    * @param lss the latent syntactic structure
185 |    * @param goldTree the gold tree of the sentence
186 |    *
187 |    * @return the errors of the labeler prediction (null if there is no labeler)
188 |    */
189 |   private fun calculateLabelerErrors(lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
190 |                                      goldTree: DependencyTree.Labeled): List<DenseNDArray>? {
191 |     val activeLabeler: Labeler = this.labeler ?: return null
192 |     val prediction: List<DenseNDArray> = activeLabeler.forward(Labeler.Input(lss, goldTree))
193 |     return this.parser.model.labelerModel?.calculateLoss(prediction, goldTree)
194 |   }
195 | 
196 |   /**
197 |    * @param lss the latent syntactic structure
198 |    *
199 |    * @return the input errors given by the positional encoder (null if there is no positional encoder)
200 |    */
201 |   private fun calculatePositionalEncoderErrors(
202 |     lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>
203 |   ): PointerNetworkProcessor.InputErrors? {
204 |     val encoder: PositionalEncoder = this.positionalEncoder ?: return null
205 |     val encoderErrors = calculateErrors(encoder.forward(lss.contextVectors))
206 |     return encoder.propagateErrors(encoderErrors, this.pointerNetworkOptimizer)
207 |   }
208 | 
209 |   /**
210 |    * @param lss the latent syntactic structure
211 |    * @param goldTree the gold tree of the sentence
212 |    *
213 |    * @return the errors of the latent heads, calculated with the MSE calculator against the expected ones
214 |    */
215 |   private fun calculateLatentHeadsErrors(lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
216 |                                          goldTree: DependencyTree): List<DenseNDArray> {
217 |     val expectedHeads: List<DenseNDArray> = this.getExpectedLatentHeads(lss, goldTree)
218 |     return MSECalculator().calculateErrors(outputSequence = lss.latentHeads, outputGoldSequence = expectedHeads)
219 |   }
220 | 
221 |   /**
222 |    * @param lss the latent syntactic structure
223 |    * @param goldTree the gold tree of the sentence
224 |    *
225 |    * @return the expected latent heads, one for each token of the sentence
226 |    */
227 |   private fun getExpectedLatentHeads(lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
228 |                                      goldTree: DependencyTree): List<DenseNDArray> =
229 |     lss.sentence.tokens.map { token ->
230 |       val goldHeadId: Int? = goldTree.getHead(token.id)
231 |       if (goldHeadId == null)
232 |         lss.virtualRoot // a token without gold head is expected to point to the virtual root
233 |       else if (this.skipPunctuationErrors && token.isComma)
234 |         lss.getLatentHeadById(token.id) // expected = predicted, that means no errors
235 |       else
236 |         lss.getContextVectorById(goldHeadId)
237 |     }
238 | 
239 |   /**
240 |    * Propagate the errors through the encoders.
241 |    *
242 |    * @param latentHeadsErrors the latent heads errors
243 |    * @param labelerErrors the labeler errors (must be not null if the labeler is available)
244 |    * @param positionalEncoderErrors the positional encoder errors (can be null)
245 |    */
246 |   private fun propagateErrors(latentHeadsErrors: List<DenseNDArray>,
247 |                               labelerErrors: List<DenseNDArray>?,
248 |                               positionalEncoderErrors: PointerNetworkProcessor.InputErrors?) {
249 | 
250 |     val contextVectorsErrors: List<DenseNDArray> = latentHeadsErrors.map { it.zerosLike() }
251 | 
252 |     if (positionalEncoderErrors != null) contextVectorsErrors.assignSum(positionalEncoderErrors.inputVectorsErrors)
253 |     val labelerInputErrors = this.labeler?.propagateErrors(labelerErrors!!, this.labelerOptimizer!!, copy = false)
254 |     if (labelerInputErrors != null) {
255 |       contextVectorsErrors.assignSum(labelerInputErrors.contextErrors)
256 |       this.propagateRootErrors(labelerInputErrors.rootErrors)
257 |     }
258 | 
259 |     this.lssEncoder.backward(outputErrors = LSSEncoder.OutputErrors(
260 |       size = latentHeadsErrors.size, contextVectors = contextVectorsErrors, latentHeads = latentHeadsErrors))
261 |     this.lssEncoderOptimizer.accumulate(this.lssEncoder.getParamsErrors(copy = false))
262 |   }
263 | 
264 |   /**
265 |    * Propagate the [errors] through the virtual root embedding, applying the update method directly to it.
266 |    *
267 |    * @param errors the errors
268 |    */
269 |   private fun propagateRootErrors(errors: DenseNDArray) {
270 |     val rootEmbedding = this.parser.model.lssModel.rootEmbedding
271 |     this.updateMethod.update(array = rootEmbedding, errors = errors)
272 |   }
273 | }
274 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTransferLearning.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * ------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
  9 | 
 10 | import com.kotlinnlp.dependencytree.DependencyTree
 11 | import com.kotlinnlp.lssencoder.LSSEncoder
 12 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
 13 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
 14 | import com.kotlinnlp.neuralparser.helpers.Trainer
 15 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
 16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
 17 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 18 | import com.kotlinnlp.neuralparser.language.ParsingToken
 19 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator
 20 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod
 21 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
 22 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer
 23 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
 24 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling
 25 | 
 26 | /**
 27 |  * The training helper that works via transfer learning between two LHR parsers.
 28 |  *
 29 |  * @param referenceParser the neural parser used as reference
 30 |  * @param targetParser the neural parser to train via transfer learning
 31 |  * @param epochs the number of training epochs
 32 |  * @param validator the validation helper (if it is null no validation is done after each epoch)
 33 |  * @param modelFilename the name of the file in which to save the best trained model
 34 |  * @param updateMethod the update method of the target LSS encoder (Learning Rate, ADAM, AdaGrad, ...)
 35 |  * @param contextDropout the dropout probability of the target context encodings (default 0.0)
 36 |  * @param headsDropout the dropout probability of the target latent heads encodings (default 0.0)
 37 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 38 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 39 |  */
 40 | class LHRTransferLearning(
 41 |   private val referenceParser: LHRParser,
 42 |   private val targetParser: LHRParser,
 43 |   private val epochs: Int,
 44 |   validator: Validator?,
 45 |   modelFilename: String,
 46 |   private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999),
 47 |   contextDropout: Double = 0.0,
 48 |   headsDropout: Double = 0.0,
 49 |   sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
 50 |   verbose: Boolean = true
 51 | ) : Trainer(
 52 |   neuralParser = targetParser,
 53 |   batchSize = 1,
 54 |   epochs = epochs,
 55 |   validator = validator,
 56 |   modelFilename = modelFilename,
 57 |   minRelevantErrorsCountToUpdate = 1,
 58 |   sentencePreprocessor = sentencePreprocessor,
 59 |   verbose = verbose
 60 | ) {
 61 | 
 62 |   /**
 63 |    * The [LSSEncoder] of the reference parser.
 64 |    */
 65 |   private val referenceLSSEncoder: LSSEncoder<ParsingToken, ParsingSentence> = LSSEncoder(
 66 |     model = this.referenceParser.model.lssModel)
 67 | 
 68 |   /**
 69 |    * The [LSSEncoder] of the target parser.
 70 |    */
 71 |   private val targetLSSEncoder: LSSEncoder<ParsingToken, ParsingSentence> = LSSEncoder(
 72 |     model = this.targetParser.model.lssModel,
 73 |     contextDropout = contextDropout,
 74 |     headsDropout = headsDropout)
 75 | 
 76 |   /**
 77 |    * The optimizer of the target LSS encoder.
 78 |    */
 79 |   private val targetLSSEncoderOptimizer = ParamsOptimizer(this.updateMethod)
 80 | 
 81 |   /**
 82 |    * Train the [targetParser] with the given [sentence].
 83 |    * The context vectors of the reference LSS encoder are the gold output of the target one.
 84 |    *
 85 |    * @param sentence the input sentence
 86 |    * @param goldTree the gold tree of the sentence (not used here)
 87 |    */
 88 |   override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) {
 89 | 
 90 |     this.beforeSentenceLearning()
 91 | 
 92 |     val targetContextVectors = this.targetLSSEncoder.forward(sentence).contextVectors
 93 |     val referenceContextVectors = this.referenceLSSEncoder.forward(sentence).contextVectors
 94 | 
 95 |     val contextErrors: List<DenseNDArray> = MSECalculator().calculateErrors(
 96 |       outputSequence = targetContextVectors, outputGoldSequence = referenceContextVectors)
 97 |     val outputErrors = LSSEncoder.OutputErrors(size = sentence.tokens.size, contextVectors = contextErrors)
 98 | 
 99 |     this.targetLSSEncoder.backward(outputErrors)
100 |     this.targetLSSEncoderOptimizer.accumulate(this.targetLSSEncoder.getParamsErrors())
101 |   }
102 | 
103 |   /**
104 |    * Method to call before learning a new sentence.
105 |    *
106 |    * NOTE(review): unlike the LHRTrainer, no batch or epoch scheduling is beaten in this class - confirm it is wanted.
107 |    */
108 |   private fun beforeSentenceLearning() {
109 |     (this.updateMethod as? ExampleScheduling)?.newExample()
110 |   }
111 | 
112 |   /**
113 |    * Update the parameters of the target LSS encoder.
114 |    */
115 |   override fun update() {
116 |     this.targetLSSEncoderOptimizer.update()
117 |   }
118 | 
119 |   /**
120 |    * @return the count of the relevant errors
121 |    */
122 |   override fun getRelevantErrorsCount(): Int = 1
123 | 
124 |   /**
125 |    * @return a string representation of the configuration of this Trainer
126 |    */
127 |   override fun toString(): String = "%-33s : %s".format("Epochs", this.epochs)
128 | }
129 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/CyclesFixer.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers
  9 | 
 10 | import com.kotlinnlp.dependencytree.DependencyTree
 11 | import com.kotlinnlp.lssencoder.decoder.ScoredArcs
 12 | 
 13 | /**
 14 |  * Naive strategy to fix possible cycles in a [dependencyTree].
 15 |  *
 16 |  * @param dependencyTree the dependency tree
 17 |  * @param scoredArcs the scored arcs between pair of tree elements
 18 |  */
 19 | internal class CyclesFixer(private val dependencyTree: DependencyTree, private val scoredArcs: ScoredArcs) {
 20 | 
 21 |   /**
 22 |    * The set of direct elements of the tree (elements that aren't involved in cycles).
 23 |    */
 24 |   private lateinit var directElements: Set<Int>
 25 | 
 26 |   /**
 27 |    * Fix the cycles of the dependency tree.
 28 |    */
 29 |   fun fixCycles() {
 30 |     // The cycles are collected once, before any arc of the tree is modified.
 31 |     val cycles: List<DependencyTree.Path> = this.dependencyTree.getCycles()
 32 |     val cyclicElements: Set<Int> = cycles.toElementsSet()
 33 | 
 34 |     this.directElements = this.dependencyTree.elements.toSet() - cyclicElements
 35 |     for (cycle in cycles) this.fixCycle(cycle)
 36 |   }
 37 | 
 38 |   /**
 39 |    * @return the set of elements from a list of path
 40 |    */
 41 |   private fun List<DependencyTree.Path>.toElementsSet(): Set<Int> {
 42 |     // Collect the dependents of the arcs of all the paths into a single set.
 43 |     val dependents: MutableSet<Int> = mutableSetOf()
 44 |     for (path in this) path.arcs.mapTo(dependents) { arc -> arc.dependent }
 45 |     return dependents
 46 |   }
 47 | 
 48 |   /**
 49 |    * Remove a [cycle] from the dependency tree.
 50 |    *
 51 |    * @param cycle a cycle of the dependency tree
 52 |    */
 53 |   private fun fixCycle(cycle: DependencyTree.Path) {
 54 | 
 55 |     val dep: Int = this.removeLowestScoringArc(cycle.arcs)
 56 |     val (newGov: Int, score: Double) = this.findBestGovernor(dep)
 57 |     this.dependencyTree.setArc(dependent = dep, governor = newGov, score = score)
 58 |   }
 59 | 
 60 |   /**
 61 |    * Remove the lowest scoring arc and return the related dependent to be reattached.
 62 |    *
 63 |    * @param arcs a list of arcs
 64 |    *
 65 |    * @return the element to be reattached.
 66 |    */
 67 |   private fun removeLowestScoringArc(arcs: List<DependencyTree.Arc>): Int {
 68 | 
 69 |     val arc: DependencyTree.Arc = this.getLowestScoringArc(arcs)
 70 |     this.dependencyTree.removeArc(dependent = arc.dependent, governor = arc.governor)
 71 |     return arc.dependent
 72 |   }
 73 | 
 74 |   /**
 75 |    * @param arcs a list of arcs
 76 |    *
 77 |    * @return the lowest scoring arc according to the [scoredArcs].
 78 |    */
 79 |   private fun getLowestScoringArc(arcs: List<DependencyTree.Arc>): DependencyTree.Arc =
 80 |     arcs.minBy { arc -> this.scoredArcs.getScore(dependentId = arc.dependent, governorId = arc.governor) }!!
 81 | 
 82 |   /**
 83 |    * Find the best governor for the given element that doesn't introduce a cycle.
 84 |    *
 85 |    * @param element an element of the dependency tree
 86 |    *
 87 |    * @return the new governor id and the related score
 88 |    */
 89 |   private fun findBestGovernor(element: Int): Pair<Int, Double> {
 90 | 
 91 |     val headsMap: Map<Int, Double> = this.scoredArcs.getHeadsMap(element)
 92 | 
 93 |     val candidates: List<Int> = this.directElements.intersect(headsMap.keys).filter { candidateGov ->
 94 |       !this.dependencyTree.introduceCycle(dependent = element, governor = candidateGov)
 95 |     }
 96 | 
 97 |     return headsMap.filter { it.key in candidates }.maxBy { it.value }!!.toPair()
 98 |   }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/GreedyDependencyTreeBuilder.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers
 9 | 
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
12 | import com.kotlinnlp.lssencoder.decoder.ScoredArcs
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.language.ParsingToken
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
16 | 
17 | /**
18 |  * A helper that builds the dependency tree with the highest scoring configurations.
19 |  *
20 |  * @param lss the latent syntactic structure of the input sentence
21 |  * @param scoresMap the scored arcs from which the heads are selected
22 |  * @param labeler the labeler used to annotate the tree (if null the tree is built unlabeled)
23 |  */
24 | internal class GreedyDependencyTreeBuilder(
25 |   private val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
26 |   private val scoresMap: ScoredArcs,
27 |   private val labeler: Labeler?
28 | ) {
29 | 
30 |   /**
31 |    * Build a new dependency tree from the latent syntactic structure [lss], using the possible attachments in the
32 |    * [scoresMap].
33 |    * The tree is labeled only if a [labeler] has been given.
34 |    *
35 |    * @return the dependency tree built from the given LSS, with the cycles fixed
36 |    */
37 |   fun build(): DependencyTree {
38 | 
39 |     val tokensIds: List<Int> = this.lss.sentence.tokens.map { it.id }
40 | 
41 |     if (this.labeler == null) {
42 |       val unlabeledTree = DependencyTree.Unlabeled(tokensIds)
43 |       unlabeledTree.assignHighestScoringHeads()
44 |       unlabeledTree.fixCycles()
45 |       return unlabeledTree
46 |     }
47 | 
48 |     val labeledTree = DependencyTree.Labeled(tokensIds)
49 |     labeledTree.assignHighestScoringHeads()
50 |     labeledTree.fixCycles()
51 |     labeledTree.assignLabels()
52 | 
53 |     return labeledTree
54 |   }
55 | 
56 |   /**
57 |    * Assign the heads to this dependency tree using the highest scoring arcs of the [scoresMap].
58 |    * Cycles are allowed at this stage.
59 |    */
60 |   private fun DependencyTree.assignHighestScoringHeads() {
61 | 
62 |     val (topId: Int, topScore: Double) = scoresMap.findHighestScoringTop()
63 | 
64 |     this.setAttachmentScore(dependent = topId, score = topScore)
65 | 
66 |     for (depId in this.elements.filter { it != topId }) {
67 | 
68 |       val (govId: Int, score: Double) =
69 |         scoresMap.findHighestScoringHead(dependentId = depId, except = listOf(ScoredArcs.rootId))!!
70 | 
71 |       this.setArc(dependent = depId, governor = govId, allowCycle = true, score = score)
72 |     }
73 |   }
74 | 
75 |   /**
76 |    * Fix possible cycles using the [scoresMap].
77 |    */
78 |   private fun DependencyTree.fixCycles() {
79 |     CyclesFixer(dependencyTree = this, scoredArcs = scoresMap).fixCycles()
80 |   }
81 | 
82 |   /**
83 |    * Annotate this dependency tree with the first grammatical configuration predicted by the [labeler] for each token.
84 |    */
85 |   private fun DependencyTree.Labeled.assignLabels() {
86 | 
87 |     val predictions = labeler!!.predict(Labeler.Input(lss, this))
88 | 
89 |     for ((tokenId, configurations) in predictions) {
90 |       this.setGrammaticalConfiguration(dependent = tokenId, configuration = configurations.first().config)
91 |     }
92 |   }
93 | }
86 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/keyextractors/PosTagKeyExtractor.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors
 9 | 
10 | import com.kotlinnlp.neuralparser.language.ParsingSentence
11 | import com.kotlinnlp.neuralparser.language.ParsingToken
12 | import com.kotlinnlp.tokensencoder.embeddings.keyextractor.EmbeddingKeyExtractor
13 | 
14 | /**
15 |  * An [EmbeddingKeyExtractor] that uses the POS tag of a token as key.
16 |  */
17 | object PosTagKeyExtractor : EmbeddingKeyExtractor<ParsingToken, ParsingSentence> {
18 | 
19 |   /**
20 |    * Private val used to serialize the class (needed by Serializable).
21 |    */
22 |   @Suppress("unused")
23 |   private const val serialVersionUID: Long = 1L
24 | 
25 |   /**
26 |    * Extract the key of the token found at the position [tokenId] of the tokens list of the [sentence].
27 |    *
28 |    * @param sentence a parsing sentence
29 |    * @param tokenId the id of the token from which to extract the key (used as index of the tokens list)
30 |    *
31 |    * @return the POS as string, or "_" if the token has no POS
32 |    */
33 |   override fun getKey(sentence: ParsingSentence, tokenId: Int): String {
34 | 
35 |     val token = sentence.tokens[tokenId]
36 | 
37 |     return token.pos?.toString() ?: "_"
38 |   }
39 | }
34 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/PositionalEncoder.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules
  9 | 
 10 | import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
 11 | import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
 12 | import com.kotlinnlp.simplednn.core.optimizer.ParamsErrorsList
 13 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel
 14 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor
 15 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
 16 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory
 17 | 
 18 | /**
 19 |  * The PositionalEncoder.
 20 |  * It forwards each element of an input sequence through a [PointerNetworkProcessor] set with the sequence itself.
 21 |  *
 22 |  * @param model the model of the pointer network
 23 |  * @property id an identification number useful to track a specific encoder
 24 |  */
 25 | class PositionalEncoder(
 26 |   private val model: PointerNetworkModel,
 27 |   override val id: Int = 0
 28 | ) : NeuralProcessor<
 29 |   List<DenseNDArray>, // InputType
 30 |   List<DenseNDArray>, // OutputType
 31 |   List<DenseNDArray>, // ErrorsType
 32 |   PointerNetworkProcessor.InputErrors // InputErrorsType
 33 |   > {
 34 | 
 35 |   companion object {
 36 | 
 37 |     /**
 38 |      * Calculate the errors of a list of predictions with the softmax cross-entropy.
 39 |      * The gold output of the i-th prediction is the one-hot array with the one at the index i.
 40 |      *
 41 |      * @param predictions the list of predictions
 42 |      *
 43 |      * @return the errors of the given predictions
 44 |      */
 45 |     fun calculateErrors(predictions: List<DenseNDArray>): List<DenseNDArray> {
 46 | 
 47 |       val sequenceLength: Int = predictions.size
 48 | 
 49 |       return predictions.withIndex().map { (position, prediction) ->
 50 |         SoftmaxCrossEntropyCalculator.calculateErrors(
 51 |           output = prediction,
 52 |           outputGold = DenseNDArrayFactory.oneHotEncoder(length = sequenceLength, oneAt = position))
 53 |       }
 54 |     }
 55 |   }
 56 | 
 57 |   /**
 58 |    * Propagate the errors to the input.
 59 |    */
 60 |   override val propagateToInput: Boolean = true
 61 | 
 62 |   /**
 63 |    * The pointer processor used as encoder.
 64 |    */
 65 |   private val encoder = PointerNetworkProcessor(this.model)
 66 | 
 67 |   /**
 68 |    * The Forward.
 69 |    * The whole [input] is set as input sequence of the encoder, then each element is forwarded singularly.
 70 |    *
 71 |    * @param input the input
 72 |    *
 73 |    * @return the result of the forward
 74 |    */
 75 |   override fun forward(input: List<DenseNDArray>): List<DenseNDArray> {
 76 | 
 77 |     this.encoder.setInputSequence(input)
 78 | 
 79 |     return input.map { vector -> this.encoder.forward(vector) }
 80 |   }
 81 | 
 82 |   /**
 83 |    * The Backward.
 84 |    *
 85 |    * @param outputErrors the errors of the last forward
 86 |    */
 87 |   override fun backward(outputErrors: List<DenseNDArray>) {
 88 |     this.encoder.backward(outputErrors)
 89 |   }
 90 | 
 91 |   /**
 92 |    * Return the input errors of the last backward.
 93 |    *
 94 |    * @param copy not forwarded to the encoder, whose input errors are returned as they are
 95 |    *
 96 |    * @return the input errors
 97 |    */
 98 |   override fun getInputErrors(copy: Boolean): PointerNetworkProcessor.InputErrors {
 99 |     return this.encoder.getInputErrors()
100 |   }
101 | 
102 |   /**
103 |    * Return the params errors of the last backward.
104 |    *
105 |    * @param copy a Boolean indicating whether the returned errors must be a copy or a reference
106 |    *
107 |    * @return the parameters errors
108 |    */
109 |   override fun getParamsErrors(copy: Boolean): ParamsErrorsList {
110 |     return this.encoder.getParamsErrors(copy = copy)
111 |   }
112 | }
101 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/Labeler.kt:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
  2 |  *
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6 |  * -----------------------------------------------------------------------------*/
  7 | 
  8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler
  9 | 
 10 | import com.kotlinnlp.dependencytree.DependencyTree
 11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
 12 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
 13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
 14 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
 15 | import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
 16 | import com.kotlinnlp.simplednn.core.neuralprocessor.batchfeedforward.BatchFeedforwardProcessor
 17 | import com.kotlinnlp.simplednn.simplemath.ndarray.Shape
 18 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
 19 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory
 20 | import com.kotlinnlp.utils.notEmptyOr
 21 | 
 22 | /**
 23 |  * The Labeler.
 24 |  *
 25 |  * @property model the model
 26 |  * @param dropout the dropout probability (default 0.0)
 27 |  * @property id an identification number useful to track a specific encoder
 28 |  */
 29 | class Labeler(
 30 |   val model: LabelerModel,
 31 |   dropout: Double = 0.0,
 32 |   override val id: Int = 0
 33 | ) : NeuralProcessor<
 34 |   Labeler.Input, // InputType
 35 |   List<DenseNDArray>, // OutputType
 36 |   List<DenseNDArray>, // ErrorsType
 37 |   Labeler.InputErrors // InputErrorsType
 38 |   > {
 39 | 
 40 |   /**
 41 |    * The input of this labeler.
 42 |    *
 43 |    * @property lss the latent syntactic structure
 44 |    * @property dependencyTree the dependency tree
 45 |    */
 46 |   data class Input(val lss: LatentSyntacticStructure<*, *>, val dependencyTree: DependencyTree)
 47 | 
 48 |   /**
 49 |    * The input errors of this labeler.
 50 |    *
 51 |    * @property rootErrors the errors of the virtual root
 52 |    * @property contextErrors the errors of the context vectors
 53 |    */
 54 |   data class InputErrors(val rootErrors: DenseNDArray, val contextErrors: List<DenseNDArray>)
 55 | 
 56 |   /**
 57 |    * This labeler propagates the errors to the input.
 58 |    */
 59 |   override val propagateToInput: Boolean = true
 60 | 
 61 |   /**
 62 |    * The processor that classifies the grammar of a token.
 63 |    */
 64 |   private val processor =
 65 |     BatchFeedforwardProcessor<DenseNDArray>(model = this.model.networkModel, dropout = dropout, propagateToInput = true)
 66 | 
 67 |   /**
 68 |    * The dependency tree of the last input, set at each forward.
 69 |    */
 70 |   private lateinit var dependencyTree: DependencyTree
 71 | 
 72 |   /**
 73 |    * Score the possible grammatical configurations of each token of a given input.
 74 |    * The configurations with a score lower than the threshold of the model are discarded, unless they all are: in that
 75 |    * case the first valid configuration only is kept.
 76 |    *
 77 |    * @param input a [Labeler] input
 78 |    *
 79 |    * @return a map of valid grammatical configurations (sorted by descending score) associated to each token id
 80 |    */
 81 |   fun predict(input: Input): Map<Int, List<ScoredGrammar>> {
 82 | 
 83 |     val tree: DependencyTree = input.dependencyTree
 84 |     val predictions = mutableMapOf<Int, List<ScoredGrammar>>()
 85 | 
 86 |     this.forward(input).forEachIndexed { tokenIndex, outcome ->
 87 | 
 88 |       val configurations: List<ScoredGrammar> = outcome.toScoredGrammar()
 89 |       val tokenId: Int = tree.elements[tokenIndex]
 90 |       val sentence = input.lss.sentence as ParsingSentence
 91 |       val headIndex: Int? = tree.getHead(tokenId)?.let { tree.getPosition(it) }
 92 | 
 93 |       val validConfigurations: List<ScoredGrammar> = sentence.getValidConfigurations(
 94 |         tokenIndex = tokenIndex,
 95 |         headIndex = headIndex,
 96 |         configurations = configurations)
 97 | 
 98 |       predictions[tokenId] = validConfigurations
 99 |         .filter { it.score >= this.model.labelerScoreThreshold }
100 |         .notEmptyOr { validConfigurations.subList(0, 1) }
101 |     }
102 | 
103 |     return predictions
104 |   }
105 | 
106 |   /**
107 |    * Return the network outcomes for each token.
108 |    * The dependency tree of the [input] is stored to be used by the following calls.
109 |    *
110 |    * @param input a [Labeler] input
111 |    *
112 |    * @return the network outcomes for each token
113 |    */
114 |   override fun forward(input: Input): List<DenseNDArray> {
115 | 
116 |     this.dependencyTree = input.dependencyTree
117 | 
118 |     val features = input.lss.sentence.tokens.map { this.extractFeatures(tokenId = it.id, lss = input.lss) }
119 | 
120 |     return this.processor.forward(input = features.toTypedArray())
121 |   }
122 | 
123 |   /**
124 |    * Propagate the errors through the neural components of the labeler.
125 |    *
126 |    * @param outputErrors the list of errors
127 |    */
128 |   override fun backward(outputErrors: List<DenseNDArray>) {
129 |     this.processor.backward(outputErrors)
130 |   }
131 | 
132 |   /**
133 |    * Collect the errors of the context vectors and of the virtual root, summing into new zeros arrays the errors that
134 |    * the processor assigned to each token as dependent and to its head as governor.
135 |    *
136 |    * @param copy not used: the returned arrays are always new ones
137 |    *
138 |    * @return the input errors and the root errors
139 |    */
140 |   override fun getInputErrors(copy: Boolean): InputErrors {
141 | 
142 |     val inputErrors: List<List<DenseNDArray>> = this.processor.getInputsErrors(copy = false)
143 |     val contextErrors: List<DenseNDArray> =
144 |       inputErrors.map { DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize)) }
145 |     val rootErrors: DenseNDArray = DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize))
146 | 
147 |     for ((tokenIndex, tokenErrors) in inputErrors.withIndex()) {
148 | 
149 |       val (depErrors, govErrors) = tokenErrors
150 |       val tokenId: Int = this.dependencyTree.elements[tokenIndex]
151 |       val depVector: DenseNDArray = contextErrors[tokenIndex]
152 |       val headId = this.dependencyTree.getHead(tokenId)
153 | 
154 |       // The errors of the governor go to the virtual root when the token has no head.
155 |       val govVector: DenseNDArray =
156 |         if (headId != null) contextErrors[this.dependencyTree.getPosition(headId)] else rootErrors
157 | 
158 |       depVector.assignSum(depErrors)
159 |       govVector.assignSum(govErrors)
160 |     }
161 | 
162 |     return InputErrors(rootErrors = rootErrors, contextErrors = contextErrors)
163 |   }
164 | 
165 |   /**
166 |    * @param copy a Boolean indicating whether the returned errors must be a copy or a reference
167 |    *
168 |    * @return the errors of the [Labeler] parameters
169 |    */
170 |   override fun getParamsErrors(copy: Boolean) = this.processor.getParamsErrors(copy = copy)
171 | 
172 |   /**
173 |    * Transform the array resulting from the prediction into a list of [ScoredGrammar].
174 |    *
175 |    * @return a list of [ScoredGrammar], sorted by descending score
176 |    */
177 |   private fun DenseNDArray.toScoredGrammar(): List<ScoredGrammar> {
178 | 
179 |     val scoredConfigurations = List(size = this.length) { i ->
180 |       ScoredGrammar(config = getGrammaticalConfiguration(i), score = this[i])
181 |     }
182 | 
183 |     return scoredConfigurations.sortedByDescending { it.score }
184 |   }
185 | 
186 |   /**
187 |    * @param index a prediction index
188 |    *
189 |    * @return the grammatical configuration with the given [index]
190 |    */
191 |   private fun getGrammaticalConfiguration(index: Int): GrammaticalConfiguration {
192 |     return this.model.grammaticalConfigurations.getElement(index)!!
193 |   }
194 | 
195 |   /**
196 |    * @param tokenId the id of a token of the input sentence
197 |    * @param lss the latent syntactic structure of the input sentence
198 |    *
199 |    * @return the context vector of the token followed by the one of its head (the virtual root if it has no head)
200 |    */
201 |   private fun extractFeatures(tokenId: Int, lss: LatentSyntacticStructure<*, *>): List<DenseNDArray> {
202 | 
203 |     val depVector: DenseNDArray = lss.getContextVectorById(tokenId)
204 |     val govVector: DenseNDArray =
205 |       this.dependencyTree.getHead(tokenId)?.let { lss.getContextVectorById(it) } ?: lss.virtualRoot
206 | 
207 |     return listOf(depVector, govVector)
208 |   }
209 | }
190 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/LabelerModel.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler
 9 | 
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterion
13 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
14 | import com.kotlinnlp.simplednn.core.functionalities.activations.Softmax
15 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
16 | import com.kotlinnlp.simplednn.core.layers.LayerInterface
17 | import com.kotlinnlp.simplednn.core.layers.LayerType
18 | import com.kotlinnlp.simplednn.core.layers.StackedLayersParameters
19 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
20 | import com.kotlinnlp.utils.DictionarySet
21 | import java.io.Serializable
22 | 
23 | /**
24 |  * The model of the [Labeler].
25 |  *
26 |  * @property contextEncodingSize the size of the token encoding vectors
27 |  * @property grammaticalConfigurations the dictionary set of all the possible grammatical configurations
28 |  * @property lossCriterionType the training mode
29 |  */
30 | class LabelerModel(
31 |   val contextEncodingSize: Int,
32 |   val grammaticalConfigurations: DictionarySet<GrammaticalConfiguration>,
33 |   val lossCriterionType: LossCriterionType
34 | ) : Serializable {
35 | 
36 |   companion object {
37 | 
38 |     /**
39 |      * Private val used to serialize the class (needed by Serializable).
40 |      */
41 |     @Suppress("unused")
42 |     private const val serialVersionUID: Long = 1L
43 |   }
44 | 
45 |   /**
46 |    * The score threshold above which to consider a labeler output valid.
47 |    * It makes sense with the Softmax activation function.
48 |    */
49 |   internal val labelerScoreThreshold: Double = 1.0 / this.grammaticalConfigurations.size
50 | 
51 |   /**
52 |    * The Network model that predicts the grammatical configurations.
53 |    */
54 |   val networkModel = StackedLayersParameters(
55 |     LayerInterface(sizes = listOf(this.contextEncodingSize, this.contextEncodingSize)),
56 |     LayerInterface(
57 |       size = this.contextEncodingSize,
58 |       connectionType = LayerType.Connection.Affine,
59 |       activationFunction = Tanh),
60 |     LayerInterface(
61 |       type = LayerType.Input.Dense,
62 |       size = this.grammaticalConfigurations.size,
63 |       connectionType = LayerType.Connection.Feedforward,
64 |       activationFunction = when (this.lossCriterionType) {
65 |         LossCriterionType.Softmax -> Softmax()
66 |         LossCriterionType.HingeLoss -> null
67 |       })
68 |   )
69 | 
70 |   /**
71 |    * Return the errors of the given labeler predictions, with respect to a gold dependency tree.
72 |    * Each prediction is compared with the gold grammatical configuration of the related token.
73 |    *
74 |    * @param predictions the current network predictions
75 |    * @param goldTree the gold tree of the sentence
76 |    *
77 |    * @throws IllegalStateException if a gold configuration is not in the [grammaticalConfigurations]
78 |    *
79 |    * @return a list of predictions errors
80 |    */
81 |   fun calculateLoss(predictions: List<DenseNDArray>, goldTree: DependencyTree.Labeled): List<DenseNDArray> {
82 | 
83 |     // The criterion depends on the model configuration only: it is built once for all the predictions.
84 |     val lossCriterion: LossCriterion = LossCriterion(this.lossCriterionType)
85 | 
86 |     return predictions.mapIndexed { tokenIndex, prediction ->
87 | 
88 |       val tokenId: Int = goldTree.elements[tokenIndex]
89 |       val goldConfig = goldTree.getConfiguration(tokenId)
90 |       val goldIndex: Int = checkNotNull(this.grammaticalConfigurations.getId(goldConfig)) {
91 |         "Gold configuration of the token $tokenId missing in the dictionary: $goldConfig"
92 |       }
93 | 
94 |       lossCriterion.getPredictionErrors(prediction = prediction, goldIndex = goldIndex)
95 |     }
96 |   }
97 | }
96 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/HingeLoss.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
 9 | 
10 | import com.kotlinnlp.simplednn.core.functionalities.losses.getErrorsByHingeLoss
11 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
12 | 
13 | /**
14 |  * The [LossCriterion] that calculates the errors with the hinge loss method.
15 |  */
16 | class HingeLoss : LossCriterion {
17 | 
18 |   /**
19 |    * Calculate the errors of a prediction delegating to [getErrorsByHingeLoss].
20 |    *
21 |    * @param prediction a prediction array
22 |    * @param goldIndex the index of the gold value
23 |    *
24 |    * @return the errors of the given prediction
25 |    */
26 |   override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
27 |     return getErrorsByHingeLoss(prediction = prediction, goldIndex = goldIndex)
28 |   }
29 | }
27 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterion.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
 9 | 
10 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
11 | 
12 | /**
13 |  * The interface of a loss criterion, which calculates the errors of a prediction given the index of its gold value.
14 |  */
15 | interface LossCriterion {
16 | 
17 |   /**
18 |    * @param prediction a prediction array
19 |    * @param goldIndex the index of the gold value
20 |    *
21 |    * @return the errors of the given prediction
22 |    */
23 |   fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray
24 | 
25 |   companion object {
26 | 
27 |     /**
28 |      * The LossCriterion builder.
29 |      *
30 |      * @param type the loss criterion type
31 |      *
32 |      * @return a new loss criterion of the given type
33 |      */
34 |     operator fun invoke(type: LossCriterionType): LossCriterion {
35 |       return when (type) {
36 |         LossCriterionType.HingeLoss -> HingeLoss()
37 |         LossCriterionType.Softmax -> Softmax()
38 |       }
39 |     }
40 |   }
41 | }
38 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterionType.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
 9 | 
10 | /**
11 |  * The available loss criteria.
12 |  */
13 | enum class LossCriterionType {
14 | 
15 |   /**
16 |    * Calculate the errors with cross-entropy softmax.
17 |    */
18 |   Softmax,
19 | 
20 |   /**
21 |    * Calculate the errors with the hinge loss method.
22 |    */
23 |   HingeLoss
24 | }
17 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/ScoredGrammar.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
 9 | 
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | 
12 | /**
13 |  * A grammatical configuration predicted by the labeler, together with its score.
14 |  *
15 |  * @property config the grammatical configuration
16 |  * @property score the score
17 |  */
18 | data class ScoredGrammar(
19 |   val config: GrammaticalConfiguration,
20 |   val score: Double
21 | )
19 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/Softmax.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
 9 | 
10 | import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
11 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
12 | 
13 | /**
14 |  * The [LossCriterion] that calculates the errors with the cross-entropy softmax.
15 |  */
16 | class Softmax : LossCriterion {
17 | 
18 |   /**
19 |    * Calculate the errors of a prediction delegating to the [SoftmaxCrossEntropyCalculator].
20 |    *
21 |    * @param prediction a prediction array
22 |    * @param goldIndex the index of the gold value
23 |    *
24 |    * @return the errors of the given prediction
25 |    */
26 |   override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
27 |     return SoftmaxCrossEntropyCalculator.calculateErrors(output = prediction, goldIndex = goldIndex)
28 |   }
29 | }
27 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/BaseConverter.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.Token
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 | 
16 | /**
17 |  * The sentence converter from a [ParsingSentence] to a generic [Sentence].
18 |  */
19 | class BaseConverter : SentenceConverter<ParsingToken, ParsingSentence, Token, Sentence<Token>> {
20 | 
21 |   /**
22 |    * Convert a given [ParsingSentence] to a generic [Sentence] simply casting it.
23 |    * The same object is returned, no copy is made.
24 |    *
25 |    * @param sentence the input sentence
26 |    *
27 |    * @return the converted sentence
28 |    */
29 |   @Suppress("UNCHECKED_CAST")
30 |   override fun convert(sentence: ParsingSentence): Sentence<Token> {
31 |     return sentence as Sentence<Token>
32 |   }
33 | 
34 |   companion object {
35 | 
36 |     /**
37 |      * Private val used to serialize the class (needed by Serializable).
38 |      */
39 |     @Suppress("unused")
40 |     private const val serialVersionUID: Long = 1L
41 |   }
42 | }
40 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/FormConverter.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 | 
/**
 * A [SentenceConverter] that exposes a [ParsingSentence] as a [Sentence] of [FormToken].
 */
class FormConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, Sentence<FormToken>> {

  companion object {

    /**
     * Version identifier used by the serialization (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a given [ParsingSentence] to a sentence of [FormToken].
   * The conversion is an unchecked cast: the returned object is the input sentence itself.
   *
   * @param sentence the input sentence
   *
   * @return the same sentence, seen as a [Sentence] of [FormToken]
   */
  @Suppress("UNCHECKED_CAST")
  override fun convert(sentence: ParsingSentence): Sentence<FormToken> {
    return sentence as Sentence<FormToken>
  }
}


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/MorphoConverter.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * -----------------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
 9 | 
10 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 | 
/**
 * A [SentenceConverter] that exposes a [ParsingSentence] as a [MorphoSentence] of [FormToken].
 */
class MorphoConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, MorphoSentence<FormToken>> {

  companion object {

    /**
     * Version identifier used by the serialization (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a given [ParsingSentence] to a [MorphoSentence].
   * The conversion is an unchecked cast: the returned object is the input sentence itself.
   *
   * @param sentence the input sentence
   *
   * @return the same sentence, seen as a [MorphoSentence] of [FormToken]
   */
  @Suppress("UNCHECKED_CAST")
  override fun convert(sentence: ParsingSentence): MorphoSentence<FormToken> {
    return sentence as MorphoSentence<FormToken>
  }
}
40 | 


--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/utils/Extensions.kt:
--------------------------------------------------------------------------------
 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 2 |  *
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 6 |  * ------------------------------------------------------------------*/
 7 | 
 8 | package com.kotlinnlp.neuralparser.utils
 9 | 
10 | import com.kotlinnlp.conllio.CoNLLReader
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Sentence.InvalidTree
13 | import java.io.File
14 | 
/**
 * Load sentences from a CoNLL file, printing an informative message to the standard output before loading.
 *
 * @param type the string that describes the type of sentences (used in the printed message only)
 * @param filePath the file path
 * @param maxSentences the max number of sentences to load (null = unlimited)
 * @param skipNonProjective whether to skip non-projective sentences
 *
 * @return the list of loaded sentences
 */
fun loadSentences(type: String,
                  filePath: String,
                  maxSentences: Int?,
                  skipNonProjective: Boolean): List<CoNLLSentence> {

  val maxInfo: String = maxSentences?.let { " (max $it)" } ?: ""
  val projectivityInfo: String = if (skipNonProjective) " skipping non-projective" else ""

  // The message is composed with string templates only: the caller-provided values never end up inside a format
  // pattern, so a '%' contained in one of them cannot be interpreted as a format specifier.
  println("Loading $type sentences from '$filePath'$maxInfo$projectivityInfo...")

  return filePath.loadFromTreeBank(skipNonProjective = skipNonProjective, maxSentences = maxSentences)
}
38 | 
/**
 * Read the tree-bank at this path and return its CoNLL sentences.
 *
 * The limit given by [maxSentences] applies to the sentences read from the file, including the ones that are then
 * discarded because non-projective. The sentences beyond the limit are still iterated, but they are neither
 * validated nor collected.
 *
 * @param maxSentences the maximum number of sentences to load (null = unlimited)
 * @param skipNonProjective whether to skip non-projective sentences
 *
 * @throws InvalidTree if the tree of a sentence is not valid
 *
 * @return the list of sentences collected
 */
private fun String.loadFromTreeBank(maxSentences: Int? = null,
                                    skipNonProjective: Boolean = false): List<CoNLLSentence> {

  val loaded = mutableListOf<CoNLLSentence>()
  var readCount = 0

  CoNLLReader.forEachSentence(File(this)) { sentence ->

    val withinLimit: Boolean = maxSentences == null || readCount < maxSentences

    if (withinLimit) {

      // Only the sentences with annotated heads have a tree to check.
      if (sentence.hasAnnotatedHeads()) sentence.assertValidCoNLLTree()

      val keep: Boolean = !skipNonProjective || !sentence.isNonProjective()

      if (keep) loaded.add(sentence)
    }

    // Every sentence read is counted, whether it has been kept or not.
    readCount++
  }

  return loaded.toList()
}
69 | 
70 | 


--------------------------------------------------------------------------------