├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── README.md
├── examples
├── ExampleUtils.kt
├── evaluation
│ ├── CommandLineArguments.kt
│ └── EvaluateLHR.kt
├── pom.xml
└── training
│ ├── CommandLineArguments.kt
│ └── TrainLHR.kt
├── pom.xml
└── src
└── main
└── kotlin
└── com
└── kotlinnlp
└── neuralparser
├── NeuralParser.kt
├── NeuralParserModel.kt
├── helpers
├── Trainer.kt
├── labelerselector
│ ├── LabelerSelector.kt
│ ├── MorphoSelector.kt
│ └── NoFilterSelector.kt
├── preprocessors
│ ├── BasePreprocessor.kt
│ ├── CoNLLPreprocessor.kt
│ ├── MorphoPreprocessor.kt
│ └── SentencePreprocessor.kt
├── sentencebuilder
│ ├── CompositeTokenHelper.kt
│ ├── LabeledMorphoSynBuilder.kt
│ └── UnlabeledMorphoSynBuilder.kt
├── statistics
│ ├── BaseStatistics.kt
│ ├── MetricsCounter.kt
│ ├── SentenceMetrics.kt
│ └── Statistics.kt
└── validator
│ ├── CoNLLDependencyParser.kt
│ ├── CoNLLFileValidator.kt
│ └── Validator.kt
├── language
├── BaseSentence.kt
├── BaseToken.kt
├── CorpusDictionary.kt
├── Extensions.kt
├── ParsingSentence.kt
└── ParsingToken.kt
├── parsers
└── lhrparser
│ ├── LHRModel.kt
│ ├── LHRParser.kt
│ ├── LHRTrainer.kt
│ ├── LHRTransferLearning.kt
│ ├── helpers
│ ├── CyclesFixer.kt
│ ├── GreedyDependencyTreeBuilder.kt
│ └── keyextractors
│ │ └── PosTagKeyExtractor.kt
│ ├── neuralmodules
│ ├── PositionalEncoder.kt
│ └── labeler
│ │ ├── Labeler.kt
│ │ ├── LabelerModel.kt
│ │ └── utils
│ │ ├── HingeLoss.kt
│ │ ├── LossCriterion.kt
│ │ ├── LossCriterionType.kt
│ │ ├── ScoredGrammar.kt
│ │ └── Softmax.kt
│ └── sentenceconverters
│ ├── BaseConverter.kt
│ ├── FormConverter.kt
│ └── MorphoConverter.kt
└── utils
└── Extensions.kt
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Configuration ###
2 | config/*
3 | !config/configuration.yaml.example
4 |
5 | ### Intellij ###
6 | .idea/
7 | /out/
8 |
9 | ### Intellij Patch ###
10 | *.iml
11 |
12 | /resources/
13 |
14 | ### Maven ###
15 | target/
16 | pom.xml.tag
17 | pom.xml.releaseBackup
18 | pom.xml.versionsBackup
19 | pom.xml.next
20 | release.properties
21 | dependency-reduced-pom.xml
22 | buildNumber.properties
23 | .mvn/timing.properties
24 |
25 | # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
26 | !/.mvn/wrapper/maven-wrapper.jar
27 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | sudo: false
4 |
5 | addons:
6 | apt:
7 | packages:
8 | - oracle-java8-installer
9 |
10 | os:
11 | - linux
12 |
13 | dist: trusty
14 |
15 | jdk:
16 | - oraclejdk8
17 |
18 | install: true
19 |
20 | script: mvn test compile -B -Dmaven.javadoc.skip=true
21 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Mozilla Public License Version 2.0
2 | ==================================
3 |
4 | 1. Definitions
5 | --------------
6 |
7 | 1.1. "Contributor"
8 | means each individual or legal entity that creates, contributes to
9 | the creation of, or owns Covered Software.
10 |
11 | 1.2. "Contributor Version"
12 | means the combination of the Contributions of others (if any) used
13 | by a Contributor and that particular Contributor's Contribution.
14 |
15 | 1.3. "Contribution"
16 | means Covered Software of a particular Contributor.
17 |
18 | 1.4. "Covered Software"
19 | means Source Code Form to which the initial Contributor has attached
20 | the notice in Exhibit A, the Executable Form of such Source Code
21 | Form, and Modifications of such Source Code Form, in each case
22 | including portions thereof.
23 |
24 | 1.5. "Incompatible With Secondary Licenses"
25 | means
26 |
27 | (a) that the initial Contributor has attached the notice described
28 | in Exhibit B to the Covered Software; or
29 |
30 | (b) that the Covered Software was made available under the terms of
31 | version 1.1 or earlier of the License, but not also under the
32 | terms of a Secondary License.
33 |
34 | 1.6. "Executable Form"
35 | means any form of the work other than Source Code Form.
36 |
37 | 1.7. "Larger Work"
38 | means a work that combines Covered Software with other material, in
39 | a separate file or files, that is not Covered Software.
40 |
41 | 1.8. "License"
42 | means this document.
43 |
44 | 1.9. "Licensable"
45 | means having the right to grant, to the maximum extent possible,
46 | whether at the time of the initial grant or subsequently, any and
47 | all of the rights conveyed by this License.
48 |
49 | 1.10. "Modifications"
50 | means any of the following:
51 |
52 | (a) any file in Source Code Form that results from an addition to,
53 | deletion from, or modification of the contents of Covered
54 | Software; or
55 |
56 | (b) any new file in Source Code Form that contains any Covered
57 | Software.
58 |
59 | 1.11. "Patent Claims" of a Contributor
60 | means any patent claim(s), including without limitation, method,
61 | process, and apparatus claims, in any patent Licensable by such
62 | Contributor that would be infringed, but for the grant of the
63 | License, by the making, using, selling, offering for sale, having
64 | made, import, or transfer of either its Contributions or its
65 | Contributor Version.
66 |
67 | 1.12. "Secondary License"
68 | means either the GNU General Public License, Version 2.0, the GNU
69 | Lesser General Public License, Version 2.1, the GNU Affero General
70 | Public License, Version 3.0, or any later versions of those
71 | licenses.
72 |
73 | 1.13. "Source Code Form"
74 | means the form of the work preferred for making modifications.
75 |
76 | 1.14. "You" (or "Your")
77 | means an individual or a legal entity exercising rights under this
78 | License. For legal entities, "You" includes any entity that
79 | controls, is controlled by, or is under common control with You. For
80 | purposes of this definition, "control" means (a) the power, direct
81 | or indirect, to cause the direction or management of such entity,
82 | whether by contract or otherwise, or (b) ownership of more than
83 | fifty percent (50%) of the outstanding shares or beneficial
84 | ownership of such entity.
85 |
86 | 2. License Grants and Conditions
87 | --------------------------------
88 |
89 | 2.1. Grants
90 |
91 | Each Contributor hereby grants You a world-wide, royalty-free,
92 | non-exclusive license:
93 |
94 | (a) under intellectual property rights (other than patent or trademark)
95 | Licensable by such Contributor to use, reproduce, make available,
96 | modify, display, perform, distribute, and otherwise exploit its
97 | Contributions, either on an unmodified basis, with Modifications, or
98 | as part of a Larger Work; and
99 |
100 | (b) under Patent Claims of such Contributor to make, use, sell, offer
101 | for sale, have made, import, and otherwise transfer either its
102 | Contributions or its Contributor Version.
103 |
104 | 2.2. Effective Date
105 |
106 | The licenses granted in Section 2.1 with respect to any Contribution
107 | become effective for each Contribution on the date the Contributor first
108 | distributes such Contribution.
109 |
110 | 2.3. Limitations on Grant Scope
111 |
112 | The licenses granted in this Section 2 are the only rights granted under
113 | this License. No additional rights or licenses will be implied from the
114 | distribution or licensing of Covered Software under this License.
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a
116 | Contributor:
117 |
118 | (a) for any code that a Contributor has removed from Covered Software;
119 | or
120 |
121 | (b) for infringements caused by: (i) Your and any other third party's
122 | modifications of Covered Software, or (ii) the combination of its
123 | Contributions with other software (except as part of its Contributor
124 | Version); or
125 |
126 | (c) under Patent Claims infringed by Covered Software in the absence of
127 | its Contributions.
128 |
129 | This License does not grant any rights in the trademarks, service marks,
130 | or logos of any Contributor (except as may be necessary to comply with
131 | the notice requirements in Section 3.4).
132 |
133 | 2.4. Subsequent Licenses
134 |
135 | No Contributor makes additional grants as a result of Your choice to
136 | distribute the Covered Software under a subsequent version of this
137 | License (see Section 10.2) or under the terms of a Secondary License (if
138 | permitted under the terms of Section 3.3).
139 |
140 | 2.5. Representation
141 |
142 | Each Contributor represents that the Contributor believes its
143 | Contributions are its original creation(s) or it has sufficient rights
144 | to grant the rights to its Contributions conveyed by this License.
145 |
146 | 2.6. Fair Use
147 |
148 | This License is not intended to limit any rights You have under
149 | applicable copyright doctrines of fair use, fair dealing, or other
150 | equivalents.
151 |
152 | 2.7. Conditions
153 |
154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155 | in Section 2.1.
156 |
157 | 3. Responsibilities
158 | -------------------
159 |
160 | 3.1. Distribution of Source Form
161 |
162 | All distribution of Covered Software in Source Code Form, including any
163 | Modifications that You create or to which You contribute, must be under
164 | the terms of this License. You must inform recipients that the Source
165 | Code Form of the Covered Software is governed by the terms of this
166 | License, and how they can obtain a copy of this License. You may not
167 | attempt to alter or restrict the recipients' rights in the Source Code
168 | Form.
169 |
170 | 3.2. Distribution of Executable Form
171 |
172 | If You distribute Covered Software in Executable Form then:
173 |
174 | (a) such Covered Software must also be made available in Source Code
175 | Form, as described in Section 3.1, and You must inform recipients of
176 | the Executable Form how they can obtain a copy of such Source Code
177 | Form by reasonable means in a timely manner, at a charge no more
178 | than the cost of distribution to the recipient; and
179 |
180 | (b) You may distribute such Executable Form under the terms of this
181 | License, or sublicense it under different terms, provided that the
182 | license for the Executable Form does not attempt to limit or alter
183 | the recipients' rights in the Source Code Form under this License.
184 |
185 | 3.3. Distribution of a Larger Work
186 |
187 | You may create and distribute a Larger Work under terms of Your choice,
188 | provided that You also comply with the requirements of this License for
189 | the Covered Software. If the Larger Work is a combination of Covered
190 | Software with a work governed by one or more Secondary Licenses, and the
191 | Covered Software is not Incompatible With Secondary Licenses, this
192 | License permits You to additionally distribute such Covered Software
193 | under the terms of such Secondary License(s), so that the recipient of
194 | the Larger Work may, at their option, further distribute the Covered
195 | Software under the terms of either this License or such Secondary
196 | License(s).
197 |
198 | 3.4. Notices
199 |
200 | You may not remove or alter the substance of any license notices
201 | (including copyright notices, patent notices, disclaimers of warranty,
202 | or limitations of liability) contained within the Source Code Form of
203 | the Covered Software, except that You may alter any license notices to
204 | the extent required to remedy known factual inaccuracies.
205 |
206 | 3.5. Application of Additional Terms
207 |
208 | You may choose to offer, and to charge a fee for, warranty, support,
209 | indemnity or liability obligations to one or more recipients of Covered
210 | Software. However, You may do so only on Your own behalf, and not on
211 | behalf of any Contributor. You must make it absolutely clear that any
212 | such warranty, support, indemnity, or liability obligation is offered by
213 | You alone, and You hereby agree to indemnify every Contributor for any
214 | liability incurred by such Contributor as a result of warranty, support,
215 | indemnity or liability terms You offer. You may include additional
216 | disclaimers of warranty and limitations of liability specific to any
217 | jurisdiction.
218 |
219 | 4. Inability to Comply Due to Statute or Regulation
220 | ---------------------------------------------------
221 |
222 | If it is impossible for You to comply with any of the terms of this
223 | License with respect to some or all of the Covered Software due to
224 | statute, judicial order, or regulation then You must: (a) comply with
225 | the terms of this License to the maximum extent possible; and (b)
226 | describe the limitations and the code they affect. Such description must
227 | be placed in a text file included with all distributions of the Covered
228 | Software under this License. Except to the extent prohibited by statute
229 | or regulation, such description must be sufficiently detailed for a
230 | recipient of ordinary skill to be able to understand it.
231 |
232 | 5. Termination
233 | --------------
234 |
235 | 5.1. The rights granted under this License will terminate automatically
236 | if You fail to comply with any of its terms. However, if You become
237 | compliant, then the rights granted under this License from a particular
238 | Contributor are reinstated (a) provisionally, unless and until such
239 | Contributor explicitly and finally terminates Your grants, and (b) on an
240 | ongoing basis, if such Contributor fails to notify You of the
241 | non-compliance by some reasonable means prior to 60 days after You have
242 | come back into compliance. Moreover, Your grants from a particular
243 | Contributor are reinstated on an ongoing basis if such Contributor
244 | notifies You of the non-compliance by some reasonable means, this is the
245 | first time You have received notice of non-compliance with this License
246 | from such Contributor, and You become compliant prior to 30 days after
247 | Your receipt of the notice.
248 |
249 | 5.2. If You initiate litigation against any entity by asserting a patent
250 | infringement claim (excluding declaratory judgment actions,
251 | counter-claims, and cross-claims) alleging that a Contributor Version
252 | directly or indirectly infringes any patent, then the rights granted to
253 | You by any and all Contributors for the Covered Software under Section
254 | 2.1 of this License shall terminate.
255 |
256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257 | end user license agreements (excluding distributors and resellers) which
258 | have been validly granted by You or Your distributors under this License
259 | prior to termination shall survive termination.
260 |
261 | ************************************************************************
262 | * *
263 | * 6. Disclaimer of Warranty *
264 | * ------------------------- *
265 | * *
266 | * Covered Software is provided under this License on an "as is" *
267 | * basis, without warranty of any kind, either expressed, implied, or *
268 | * statutory, including, without limitation, warranties that the *
269 | * Covered Software is free of defects, merchantable, fit for a *
270 | * particular purpose or non-infringing. The entire risk as to the *
271 | * quality and performance of the Covered Software is with You. *
272 | * Should any Covered Software prove defective in any respect, You *
273 | * (not any Contributor) assume the cost of any necessary servicing, *
274 | * repair, or correction. This disclaimer of warranty constitutes an *
275 | * essential part of this License. No use of any Covered Software is *
276 | * authorized under this License except under this disclaimer. *
277 | * *
278 | ************************************************************************
279 |
280 | ************************************************************************
281 | * *
282 | * 7. Limitation of Liability *
283 | * -------------------------- *
284 | * *
285 | * Under no circumstances and under no legal theory, whether tort *
286 | * (including negligence), contract, or otherwise, shall any *
287 | * Contributor, or anyone who distributes Covered Software as *
288 | * permitted above, be liable to You for any direct, indirect, *
289 | * special, incidental, or consequential damages of any character *
290 | * including, without limitation, damages for lost profits, loss of *
291 | * goodwill, work stoppage, computer failure or malfunction, or any *
292 | * and all other commercial damages or losses, even if such party *
293 | * shall have been informed of the possibility of such damages. This *
294 | * limitation of liability shall not apply to liability for death or *
295 | * personal injury resulting from such party's negligence to the *
296 | * extent applicable law prohibits such limitation. Some *
297 | * jurisdictions do not allow the exclusion or limitation of *
298 | * incidental or consequential damages, so this exclusion and *
299 | * limitation may not apply to You. *
300 | * *
301 | ************************************************************************
302 |
303 | 8. Litigation
304 | -------------
305 |
306 | Any litigation relating to this License may be brought only in the
307 | courts of a jurisdiction where the defendant maintains its principal
308 | place of business and such litigation shall be governed by laws of that
309 | jurisdiction, without reference to its conflict-of-law provisions.
310 | Nothing in this Section shall prevent a party's ability to bring
311 | cross-claims or counter-claims.
312 |
313 | 9. Miscellaneous
314 | ----------------
315 |
316 | This License represents the complete agreement concerning the subject
317 | matter hereof. If any provision of this License is held to be
318 | unenforceable, such provision shall be reformed only to the extent
319 | necessary to make it enforceable. Any law or regulation which provides
320 | that the language of a contract shall be construed against the drafter
321 | shall not be used to construe this License against a Contributor.
322 |
323 | 10. Versions of the License
324 | ---------------------------
325 |
326 | 10.1. New Versions
327 |
328 | Mozilla Foundation is the license steward. Except as provided in Section
329 | 10.3, no one other than the license steward has the right to modify or
330 | publish new versions of this License. Each version will be given a
331 | distinguishing version number.
332 |
333 | 10.2. Effect of New Versions
334 |
335 | You may distribute the Covered Software under the terms of the version
336 | of the License under which You originally received the Covered Software,
337 | or under the terms of any subsequent version published by the license
338 | steward.
339 |
340 | 10.3. Modified Versions
341 |
342 | If you create software not governed by this License, and you want to
343 | create a new license for such software, you may create and use a
344 | modified version of this License if you rename the license and remove
345 | any references to the name of the license steward (except to note that
346 | such modified license differs from this License).
347 |
348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary
349 | Licenses
350 |
351 | If You choose to distribute Source Code Form that is Incompatible With
352 | Secondary Licenses under the terms of this version of the License, the
353 | notice described in Exhibit B of this License must be attached.
354 |
355 | Exhibit A - Source Code Form License Notice
356 | -------------------------------------------
357 |
358 | This Source Code Form is subject to the terms of the Mozilla Public
359 | License, v. 2.0. If a copy of the MPL was not distributed with this
360 | file, You can obtain one at http://mozilla.org/MPL/2.0/.
361 |
362 | If it is not possible or desirable to put the notice in a particular
363 | file, then You may include the notice in a location (such as a LICENSE
364 | file in a relevant directory) where a recipient would be likely to look
365 | for such a notice.
366 |
367 | You may add additional accurate notices of copyright ownership.
368 |
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 |
372 | This Source Code Form is "Incompatible With Secondary Licenses", as
373 | defined by the Mozilla Public License, v. 2.0.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NeuralParser [Maven Central](https://search.maven.org/search?q=g:%22com.kotlinnlp%22%20AND%20a:%22neuralparser%22) [Build Status](https://travis-ci.org/KotlinNLP/NeuralParser)
2 |
3 | NeuralParser is a very simple-to-use dependency parser, based on the
4 | [SimpleDNN](https://github.com/kotlinnlp/SimpleDNN "SimpleDNN on GitHub") library and the
5 | [SyntaxDecoder](https://github.com/kotlinnlp/SyntaxDecoder "SyntaxDecoder on GitHub") transition systems framework.
6 |
7 | NeuralParser is part of [KotlinNLP](http://kotlinnlp.com/ "KotlinNLP").
8 |
9 |
10 | ## Getting Started
11 |
12 | ### Import with Maven
13 |
14 | ```xml
15 | <dependency>
16 |     <groupId>com.kotlinnlp</groupId>
17 |     <artifactId>neuralparser</artifactId>
18 |     <version>0.6.5</version>
19 | </dependency>
20 | ```
21 |
22 | ### Examples
23 |
24 | Try some examples of training and evaluation of NeuralParser by running the files in the `examples` folder.
25 |
26 |
27 | ## License
28 |
29 | This software is released under the terms of the
30 | [Mozilla Public License, v. 2.0](https://mozilla.org/MPL/2.0/ "Mozilla Public License, v. 2.0").
31 |
32 |
33 | ## Contributions
34 |
35 | We greatly appreciate any bug reports and contributions, which can be made by filing an issue or making a pull
36 | request through the [GitHub page](https://github.com/kotlinnlp/NeuralParser "NeuralParser on GitHub").
37 |
--------------------------------------------------------------------------------
/examples/ExampleUtils.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
9 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
10 | import com.kotlinnlp.neuralparser.helpers.preprocessors.MorphoPreprocessor
11 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
12 |
/**
 * Create the [SentencePreprocessor] used by the example scripts.
 *
 * When a morphology dictionary is given, a [MorphoPreprocessor] backed by it is built; otherwise a plain
 * [BasePreprocessor] is returned.
 *
 * @param morphoDictionary a morphology dictionary (can be null)
 *
 * @return a new sentence preprocessor
 */
internal fun buildSentencePreprocessor(morphoDictionary: MorphologyDictionary?): SentencePreprocessor =
  if (morphoDictionary != null)
    MorphoPreprocessor(dictionary = morphoDictionary)
  else
    BasePreprocessor()
22 |
--------------------------------------------------------------------------------
/examples/evaluation/CommandLineArguments.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package evaluation
9 |
10 | import com.xenomachina.argparser.ArgParser
11 | import com.xenomachina.argparser.default
12 |
/**
 * The interpreter of command line arguments for the evaluation script.
 *
 * All the arguments are parsed eagerly at construction time (see the `init` block), so that invalid or missing
 * options are reported immediately instead of at the first property access.
 *
 * @param args the array of command line arguments
 */
class CommandLineArguments(args: Array<String>) {

  /**
   * The parser of the string arguments.
   */
  private val parser = ArgParser(args)

  /**
   * The file path of the serialized model (option `-m` / `--model-path`).
   */
  val modelPath: String by parser.storing(
    "-m",
    "--model-path",
    help="the file path of the serialized model"
  )

  /**
   * The file path of the validation set (option `-v` / `--validation-set`).
   */
  val validationSetPath: String by parser.storing(
    "-v",
    "--validation-set",
    help="the file path of the validation set"
  )

  /**
   * The file path of the serialized morphology dictionary (option `-d` / `--dictionary`).
   * It is null when the option is not given.
   */
  val morphoDictionaryPath: String? by parser.storing(
    "-d",
    "--dictionary",
    help="the file path of the serialized morphology dictionary"
  ).default { null }

  /**
   * Force parsing all arguments (only read ones are parsed by default).
   */
  init {
    parser.force()
  }
}
59 |
--------------------------------------------------------------------------------
/examples/evaluation/EvaluateLHR.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package evaluation
9 |
10 | import buildSentencePreprocessor
11 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
12 | import com.kotlinnlp.neuralparser.NeuralParser
13 | import com.kotlinnlp.neuralparser.NeuralParserModel
14 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
16 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
17 | import com.kotlinnlp.neuralparser.utils.loadSentences
18 | import com.kotlinnlp.utils.Timer
19 | import com.xenomachina.argparser.mainBody
20 | import java.io.File
21 | import java.io.FileInputStream
22 |
/**
 * Evaluate the model of an [LHRParser].
 *
 * The serialized model is loaded from the path given on the command line, then a [Validator] is run on the
 * validation set and the resulting evaluation is printed together with the elapsed time.
 * If a morphology dictionary path is given, it is loaded and passed to [buildSentencePreprocessor].
 *
 * Launch with the '-h' option for help about the command line arguments.
 *
 * @param args the array of command line arguments
 */
fun main(args: Array<String>) = mainBody {

  val parsedArgs = CommandLineArguments(args)

  val parser: NeuralParser<*> = LHRParser(
    model = parsedArgs.modelPath.let { path ->
      println("Loading model from '$path'.")
      // 'use' closes the file stream once the model has been loaded, even if loading fails.
      FileInputStream(File(path)).use { NeuralParserModel.load(it) } as LHRModel
    })

  val validator = Validator(
    neuralParser = parser,
    sentences = loadSentences(
      type = "validation",
      filePath = parsedArgs.validationSetPath,
      maxSentences = null,
      skipNonProjective = false),
    sentencePreprocessor = buildSentencePreprocessor(
      morphoDictionary = parsedArgs.morphoDictionaryPath?.let { path ->
        println("Loading serialized dictionary from '$path'...")
        // Same as above: do not leak the file handle of the dictionary.
        FileInputStream(File(path)).use { MorphologyDictionary.load(it) }
      }
    ))

  // The timer is created right before the evaluation, so loading is excluded from the elapsed time.
  val timer = Timer()
  val evaluation = validator.evaluate()

  println("\n$evaluation")
  println("\nElapsed time: ${timer.formatElapsedTime()}")
}
58 |
59 |
--------------------------------------------------------------------------------
/examples/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | 4.0.0
7 |
8 | com.kotlinnlp
9 | neuralparser-examples
10 | 0.1.0
11 | jar
12 |
13 |
14 | UTF-8
15 | 2.6
16 | 1.3.31
17 | 0.0.5
18 | 2.0.7
19 | 5.2
20 | 0.6.5
21 |
22 |
23 |
24 |
25 | jcenter
26 | https://jcenter.bintray.com/
27 |
28 |
29 |
30 |
31 |
32 | jcenter
33 | JCenter
34 | https://jcenter.bintray.com/
35 |
36 |
37 |
38 |
39 | .
40 |
41 |
42 |
43 | org.jetbrains.kotlin
44 | kotlin-maven-plugin
45 | ${kotlin.version}
46 |
47 | 1.8
48 |
49 |
50 |
51 | compile
52 | process-sources
53 |
54 | compile
55 |
56 |
57 |
58 |
59 |
60 |
61 | org.apache.maven.plugins
62 | maven-assembly-plugin
63 | ${assembly-plugin.version}
64 |
65 |
66 | lhr-training-assembly
67 | package
68 | single
69 |
70 | lhr-parser-train
71 |
72 |
73 | true
74 | training.TrainLHRKt
75 |
76 |
77 |
78 | jar-with-dependencies
79 |
80 |
81 |
82 |
83 | lhr-evaluation-assembly
84 | package
85 | single
86 |
87 | lhr-parser-eval
88 |
89 |
90 | true
91 | evaluation.EvaluateLHRKt
92 |
93 |
94 |
95 | jar-with-dependencies
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 | org.jetbrains.kotlin
107 | kotlin-stdlib-jdk8
108 | ${kotlin.version}
109 |
110 |
111 |
112 | org.jetbrains.kotlin
113 | kotlin-reflect
114 | ${kotlin.version}
115 |
116 |
117 |
118 | com.beust
119 | klaxon
120 | ${klaxon.version}
121 |
122 |
123 |
124 | com.xenomachina
125 | xenocom
126 | ${xenocom.version}
127 |
128 |
129 |
130 | com.xenomachina
131 | kotlin-argparser
132 | ${argparser.version}
133 |
134 |
135 |
136 | com.kotlinnlp
137 | neuralparser
138 | ${neuralparser.version}
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/examples/training/CommandLineArguments.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package training
9 |
10 | import com.xenomachina.argparser.ArgParser
11 | import com.xenomachina.argparser.InvalidArgumentException
12 | import com.xenomachina.argparser.default
13 |
14 | /**
15 | * The interpreter of command line arguments for the training script.
16 | *
17 | * @param args the array of command line arguments
18 | */
19 | class CommandLineArguments(args: Array<String>) {
20 |
21 | /**
22 | * The type of tokens encoding.
23 | *
24 | * TODO: AMBIGUOUS_POS
25 | */
26 | enum class TokensEncodingType {
27 | WORD_EMBEDDINGS,
28 | WORD_AND_POS_EMBEDDINGS,
29 | WORD_AND_EXT_AND_POS_EMBEDDINGS,
30 | MORPHO_FEATURES,
31 | CHARLM
32 | }
33 |
34 | /**
35 | * The parser of the string arguments.
36 | */
37 | private val parser = ArgParser(args)
38 |
39 | /**
40 | * The language ISO 639-1 code.
41 | */
42 | val langCode: String by parser.storing(
43 | "-l",
44 | "--language",
45 | help="the language ISO 639-1 code"
46 | )
47 |
48 | /**
49 | * The number of training epochs (default = 10).
50 | */
51 | val epochs: Int by parser.storing(
52 | "-e",
53 | "--epochs",
54 | help="the number of training epochs (default = 10)"
55 | ) { toInt() }.default(10)
56 |
57 | /**
58 | * The size of the batches of sentences (default = 1).
59 | */
60 | val batchSize: Int by parser.storing(
61 | "-b",
62 | "--batch-size",
63 | help="the size of the batches of sentences (default = 1)"
64 | ) { toInt() }.default(1)
65 |
66 | /**
67 | * The maximum number of sentences to load for training (default unlimited, represented by null).
68 | */
69 | val maxSentences: Int? by parser.storing(
70 | "-s",
71 | "--max-sentences",
72 | help="the maximum number of sentences to load for training (default unlimited)"
73 | ) { toInt() }.default { null }
74 |
75 | /**
76 | * The file path of the training set.
77 | */
78 | val trainingSetPath: String by parser.storing(
79 | "-t",
80 | "--training-set",
81 | help="the file path of the training set"
82 | )
83 |
84 | /**
85 | * The file path of the gold-POS training set.
86 | * TODO: Re-enable for LHR transfer learning.
87 | */
88 | // val goldPosSetPath: String? by parser.storing(
89 | // "-p",
90 | // "--pos-set",
91 | // help="the file path of the gold-POS training set"
92 | // ).default { null }
93 |
94 | /**
95 | * The file path of the validation set.
96 | */
97 | val validationSetPath: String by parser.storing(
98 | "-v",
99 | "--validation-set",
100 | help="the file path of the validation set"
101 | )
102 |
103 | /**
104 | * The path of the file in which to save the serialized model.
105 | */
106 | val modelPath: String by parser.storing(
107 | "-m",
108 | "--model-path",
109 | help="the path of the file in which to save the serialized model"
110 | )
111 |
112 | /**
113 | * The file path of the pre-trained word embeddings (required by the WORD_AND_EXT_AND_POS_EMBEDDINGS encoding).
114 | */
115 | val embeddingsPath: String? by parser.storing(
116 | "-w",
117 | "--trained-word-emb-path",
118 | help="the file path of the pre-trained word embeddings"
119 | ).default { null }
120 |
121 | /**
122 | * The number of stacked BiRNNs of the context encoder (default 2, must be >= 1).
123 | */
124 | val numOfContextLayers: Int by parser.storing(
125 | "-c",
126 | "--context-layers",
127 | help="the number of stacked BiRNNs of the context encoder (default 2)"
128 | ){ toInt() }
129 | .default(2)
130 | .addValidator { if (value < 1) throw InvalidArgumentException( "The number of context-layers must >= 1") }
131 |
132 | /**
133 | * The size of the word embedding vectors (default 150).
134 | */
135 | val wordEmbeddingSize: Int by parser.storing(
136 | "--word-emb-size",
137 | help="the size of the word embedding vectors (default 150)"
138 | ){ toInt() }.default(150)
139 |
140 | /**
141 | * The word embeddings dropout coefficient (default 0.25).
142 | */
143 | val wordDropoutCoefficient: Double by parser.storing(
144 | "--word-dropout",
145 | help="the word embeddings dropout coefficient (default 0.25)"
146 | ){ toDouble() }.default(0.25)
147 |
148 | /**
149 | * The size of the part-of-speech embedding vectors (default 50).
150 | */
151 | val posEmbeddingSize: Int by parser.storing(
152 | "--pos-emb-size",
153 | help="the size of the part-of-speech embedding vectors (default 50)"
154 | ){ toInt() }.default(50)
155 |
156 | /**
157 | * The part-of-speech embeddings dropout coefficient (default 0.0).
158 | */
159 | val posDropoutCoefficient: Double by parser.storing(
160 | "--pos-dropout",
161 | help="the part-of-speech embeddings dropout coefficient (default 0.0)"
162 | ){ toDouble() }.default(0.0)
163 |
164 | /**
165 | * Whether to skip non-projective sentences.
166 | */
167 | val skipNonProjective: Boolean by parser.flagging(
168 | "--skip-non-projective",
169 | help="whether to skip non-projective sentences"
170 | )
171 |
172 | /**
173 | * Whether to ignore punctuation errors.
174 | */
175 | val skipPunctuationErrors: Boolean by parser.flagging(
176 | "--skip-punct-err",
177 | help="whether to do not consider punctuation errors"
178 | )
179 |
180 | /**
181 | * Whether to disable the labeler.
182 | */
183 | val noLabeler: Boolean by parser.flagging(
184 | "--no-labeler",
185 | help="whether to do not use the labeler"
186 | )
187 |
188 | /**
189 | * Whether to skip the prediction of the POS tags.
190 | */
191 | val noPosPrediction: Boolean by parser.flagging(
192 | "--no-pos",
193 | help="whether to do not predict the POS tags"
194 | )
195 |
196 | /**
197 | * The file path of the serialized morphology dictionary (required by the MORPHO_FEATURES encoding).
198 | */
199 | val morphoDictionaryPath: String? by parser.storing(
200 | "-d",
201 | "--dictionary",
202 | help="the file path of the serialized morphology dictionary"
203 | ).default { null }
204 |
205 | /**
206 | * The file path of the lexicon dictionary.
207 | */
208 | val lexiconDictionaryPath: String? by parser.storing(
209 | "-x",
210 | "--lexicon",
211 | help="the file path of the lexicon dictionary"
212 | ).default { null }
213 |
214 | /**
215 | * The file path of the serialized characters language model (required by the CHARLM encoding).
216 | */
217 | val charLMModelPath: String? by parser.storing(
218 | "--charlm",
219 | help="the file path of the serialized characters language model"
220 | ).default { null }
221 |
222 | /**
223 | * The file path of the serialized characters language model for reverse encodings (required by the CHARLM encoding).
224 | */
225 | val charLMRevModelPath: String? by parser.storing(
226 | "--charlm-rev",
227 | help="the file path of the serialized characters language model for reverse encodings"
228 | ).default { null }
229 |
230 | /**
231 | * The type of tokens encoding (default [TokensEncodingType.WORD_AND_POS_EMBEDDINGS]).
232 | */
233 | val tokensEncodingType: TokensEncodingType by parser.mapping(
234 | "--tokens-word-emb" to TokensEncodingType.WORD_EMBEDDINGS,
235 | "--tokens-word-pos-emb" to TokensEncodingType.WORD_AND_POS_EMBEDDINGS,
236 | "--tokens-word-ext-pos-emb" to TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS,
237 | "--tokens-morpho" to TokensEncodingType.MORPHO_FEATURES,
238 | "--tokens-charlm" to TokensEncodingType.CHARLM,
239 | help = "the type of morphology encoding (default --tokens-word-pos-emb)"
240 | ).default { TokensEncodingType.WORD_AND_POS_EMBEDDINGS }
241 |
242 | /**
243 | * Whether to hide the details about the training.
244 | */
245 | val quiet: Boolean by parser.flagging(
246 | "-q",
247 | "--quiet",
248 | help="whether to do not show details about the training "
249 | )
250 |
251 | /**
252 | * Force parsing all arguments (only read ones are parsed by default).
253 | * Check the dependencies between more arguments.
254 | */
255 | init {
256 |
257 | parser.force()
258 |
259 | this.checkDependencies()
260 | }
261 |
262 | /**
263 | * Check the dependencies between more arguments, failing fast when the selected tokens encoding type needs a
264 | * path that has not been given (the training script needs these paths to build the tokens encoder).
265 | *
266 | * @throws RuntimeException if a path required by the selected tokens encoding type is missing
267 | */
268 | private fun checkDependencies() {
269 |
270 | when (this.tokensEncodingType) {
271 | TokensEncodingType.CHARLM -> {
272 | this.charLMModelPath ?: throw RuntimeException("Missing characters language model path")
273 | this.charLMRevModelPath ?: throw RuntimeException("Missing reverse characters language model path")
274 | }
275 | TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS ->
276 | this.embeddingsPath ?: throw RuntimeException("Missing pre-trained word embeddings path")
277 | TokensEncodingType.MORPHO_FEATURES ->
278 | this.morphoDictionaryPath ?: throw RuntimeException("Missing morphology dictionary path")
279 | TokensEncodingType.WORD_EMBEDDINGS, TokensEncodingType.WORD_AND_POS_EMBEDDINGS -> Unit // no extra path needed
280 | }
281 | }
282 | }
283 |
--------------------------------------------------------------------------------
/examples/training/TrainLHR.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package training
9 |
10 | import buildSentencePreprocessor
11 | import com.kotlinnlp.languagemodel.CharLM
12 | import com.kotlinnlp.linguisticdescription.language.getLanguageByIso
13 | import com.kotlinnlp.linguisticdescription.lexicon.LexiconDictionary
14 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
15 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
16 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
17 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
18 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
19 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
20 | import com.kotlinnlp.lssencoder.LSSModel
21 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer
22 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
23 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
24 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
25 | import com.kotlinnlp.simplednn.core.layers.LayerType
26 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
27 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
28 | import com.kotlinnlp.neuralparser.language.*
29 | import com.kotlinnlp.simplednn.deeplearning.birnn.BiRNNConfig
30 | import com.kotlinnlp.tokensencoder.embeddings.EmbeddingsEncoderModel
31 | import com.xenomachina.argparser.mainBody
32 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
33 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
34 | import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRTrainer
35 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
36 | import com.kotlinnlp.tokensencoder.wrapper.MirrorConverter
37 | import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.MorphoConverter
38 | import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors.PosTagKeyExtractor
39 | import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.FormConverter
40 | import com.kotlinnlp.tokensencoder.embeddings.keyextractor.NormWordKeyExtractor
41 | import com.kotlinnlp.neuralparser.utils.loadSentences
42 | import com.kotlinnlp.simplednn.core.embeddings.EmbeddingsMap
43 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
44 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge
45 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.ConcatMerge
46 | import com.kotlinnlp.tokensencoder.charlm.CharLMEncoderModel
47 | import com.kotlinnlp.tokensencoder.ensemble.EnsembleTokensEncoderModel
48 | import com.kotlinnlp.tokensencoder.morpho.FeaturesCollector
49 | import com.kotlinnlp.tokensencoder.morpho.MorphoEncoderModel
50 | import com.kotlinnlp.tokensencoder.wrapper.TokensEncoderWrapperModel
51 | import java.io.File
52 | import java.io.FileInputStream
53 |
54 | /**
55 | * Train the [LHRParser]: load the training sentences, build the parser and its trainer, then run the training.
56 | *
57 | * Launch with the '-h' option for help about the command line arguments.
58 | */
59 | fun main(args: Array<String>) = mainBody {
60 |
61 | val parsedArgs = CommandLineArguments(args)
62 |
63 | val trainingSentences: List<CoNLLSentence> = loadSentences(
64 | type = "training",
65 | filePath = parsedArgs.trainingSetPath,
66 | maxSentences = parsedArgs.maxSentences,
67 | skipNonProjective = parsedArgs.skipNonProjective)
68 |
69 | val corpus: CorpusDictionary = trainingSentences.let {
70 | println("Creating corpus dictionary...")
71 | CorpusDictionary(it)
72 | }
73 |
74 | val morphologyDictionary: MorphologyDictionary? = parsedArgs.morphoDictionaryPath?.let {
75 | println("Loading serialized dictionary from '$it'...")
76 | FileInputStream(File(it)).use { stream -> MorphologyDictionary.load(stream) } // release the file once loaded
77 | }
78 |
79 | val parser: LHRParser = buildParser(
80 | parsedArgs = parsedArgs,
81 | tokensEncoderWrapperModel = buildTokensEncoderWrapperModel(
82 | parsedArgs = parsedArgs,
83 | sentences = trainingSentences,
84 | corpus = corpus,
85 | morphologyDictionary = morphologyDictionary),
86 | corpus = corpus)
87 |
88 | val trainer = buildTrainer(parser = parser, parsedArgs = parsedArgs, morphologyDictionary = morphologyDictionary)
89 |
90 | println("\n-- MODEL")
91 | println(parser.model)
92 |
93 | println("\n-- START TRAINING ON %d SENTENCES".format(trainingSentences.size))
94 | println(trainer)
95 |
96 | trainer.train(trainingSentences = trainingSentences)
97 | }
98 |
99 | /**
100 | * Build an [LHRParser] around a newly created [LHRModel].
101 | *
102 | * @param parsedArgs the parsed command line arguments (language, context layers, labeler and POS flags are read)
103 | * @param tokensEncoderWrapperModel the tokens-encoder wrapper model
104 | * @param corpus the corpus dictionary
105 | *
106 | * @return a new parser
107 | */
108 | private fun buildParser(
109 | parsedArgs: CommandLineArguments,
110 | tokensEncoderWrapperModel: TokensEncoderWrapperModel,
111 | corpus: CorpusDictionary
112 | ): LHRParser = LHRParser(model = LHRModel(
113 | corpusDictionary = corpus,
114 | lssModel = LSSModel(
115 | language = getLanguageByIso(parsedArgs.langCode),
116 | tokensEncoderWrapperModel = tokensEncoderWrapperModel,
117 | contextBiRNNConfig = BiRNNConfig(
118 | connectionType = LayerType.Connection.LSTM,
119 | hiddenActivation = Tanh,
120 | numberOfLayers = parsedArgs.numOfContextLayers), // only the context encoder depth comes from the arguments
121 | headsBiRNNConfig = BiRNNConfig(
122 | connectionType = LayerType.Connection.LSTM,
123 | hiddenActivation = Tanh) // no explicit number of layers is given for the heads encoder
124 | ),
125 | useLabeler = !parsedArgs.noLabeler,
126 | lossCriterionType = LossCriterionType.Softmax, // fixed here: not exposed as a command line option
127 | predictPosTags = !parsedArgs.noPosPrediction))
128 |
129 | /**
130 | * Build a tokens-encoder wrapper model, depending on the tokens encoding type chosen by command line.
131 | * @param parsedArgs the parsed command line arguments
132 | * @param sentences the training sentences (used only by the MORPHO_FEATURES encoding)
133 | * @param corpus the corpus dictionary
134 | * @param morphologyDictionary the morphology dictionary (required by the MORPHO_FEATURES encoding, otherwise unused)
135 | * @return a new tokens-encoder wrapper model
136 | */
137 | private fun buildTokensEncoderWrapperModel(
138 | parsedArgs: CommandLineArguments,
139 | sentences: List<CoNLLSentence>, // used to collect the morphological features (MORPHO_FEATURES only)
140 | corpus: CorpusDictionary,
141 | morphologyDictionary: MorphologyDictionary?
142 | ): TokensEncoderWrapperModel =
143 |
144 | when (parsedArgs.tokensEncodingType) {
145 |
146 | CommandLineArguments.TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder
147 |
148 | val embeddingsMap = EmbeddingsMap.fromSet(
149 | size = parsedArgs.wordEmbeddingSize,
150 | elements = corpus.words.getElementsReversedSet())
151 |
152 | val preEmbeddingsMap = (parsedArgs.embeddingsPath ?: throw RuntimeException("Missing pre-trained word embeddings path")).let {
153 | println("Loading pre-trained word embeddings from '$it'...")
154 | EmbeddingsMap.load(filename = it)
155 | }
156 |
157 | val posEmbeddingsMap = EmbeddingsMap.fromSet(
158 | size = parsedArgs.posEmbeddingSize,
159 | elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet())
160 |
161 | TokensEncoderWrapperModel(
162 | model = EnsembleTokensEncoderModel(
163 | components = listOf(
164 | EnsembleTokensEncoderModel.ComponentModel(
165 | model = TokensEncoderWrapperModel(
166 | model = EmbeddingsEncoderModel.Base(
167 | embeddingsMap = preEmbeddingsMap,
168 | embeddingKeyExtractor = NormWordKeyExtractor(),
169 | dropout = parsedArgs.wordDropoutCoefficient),
170 | converter = FormConverter()),
171 | trainable = true),
172 | EnsembleTokensEncoderModel.ComponentModel(
173 | model = TokensEncoderWrapperModel(
174 | model = EmbeddingsEncoderModel.Base(
175 | embeddingsMap = embeddingsMap,
176 | embeddingKeyExtractor = NormWordKeyExtractor(),
177 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
178 | dropout = parsedArgs.wordDropoutCoefficient),
179 | converter = FormConverter()),
180 | trainable = true),
181 | EnsembleTokensEncoderModel.ComponentModel(
182 | model = TokensEncoderWrapperModel(
183 | model = EmbeddingsEncoderModel.Base(
184 | embeddingsMap = posEmbeddingsMap,
185 | embeddingKeyExtractor = PosTagKeyExtractor,
186 | frequencyDictionary = corpus.grammaticalConfigurations.getElements()
187 | .mapNotNull { it.posToString }
188 | .associateWith { 1 },
189 | dropout = parsedArgs.posDropoutCoefficient),
190 | converter = MirrorConverter()),
191 | trainable = true)
192 | ),
193 | outputMergeConfiguration = AffineMerge(
194 | outputSize = 100, // TODO
195 | activationFunction = null)),
196 | converter = MirrorConverter()
197 | )
198 | }
199 |
200 | CommandLineArguments.TokensEncodingType.WORD_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder
201 |
202 | val embeddingsMap = EmbeddingsMap.fromSet(
203 | size = parsedArgs.wordEmbeddingSize,
204 | elements = corpus.words.getElementsReversedSet())
205 |
206 | val posEmbeddingsMap = EmbeddingsMap.fromSet(
207 | size = parsedArgs.posEmbeddingSize,
208 | elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet())
209 |
210 | TokensEncoderWrapperModel(
211 | model = EnsembleTokensEncoderModel(
212 | components = listOf(
213 | EnsembleTokensEncoderModel.ComponentModel(
214 | model = TokensEncoderWrapperModel(
215 | model = EmbeddingsEncoderModel.Base(
216 | embeddingsMap = embeddingsMap,
217 | embeddingKeyExtractor = NormWordKeyExtractor(),
218 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
219 | dropout = parsedArgs.wordDropoutCoefficient),
220 | converter = FormConverter()),
221 | trainable = true),
222 | EnsembleTokensEncoderModel.ComponentModel(
223 | model = TokensEncoderWrapperModel(
224 | model = EmbeddingsEncoderModel.Base(
225 | embeddingsMap = posEmbeddingsMap,
226 | embeddingKeyExtractor = PosTagKeyExtractor,
227 | frequencyDictionary = corpus.grammaticalConfigurations.getElements()
228 | .mapNotNull { it.posToString }
229 | .associateWith { 1 },
230 | dropout = parsedArgs.posDropoutCoefficient),
231 | converter = MirrorConverter()),
232 | trainable = true)
233 | ),
234 | outputMergeConfiguration = ConcatMerge()),
235 | converter = MirrorConverter()
236 | )
237 | }
238 |
239 | CommandLineArguments.TokensEncodingType.WORD_EMBEDDINGS -> { // TODO: separate with a dedicated builder
240 |
241 | val embeddingsMap = EmbeddingsMap.fromSet(
242 | size = parsedArgs.wordEmbeddingSize,
243 | elements = corpus.words.getElementsReversedSet())
244 |
245 | TokensEncoderWrapperModel(
246 | model = EmbeddingsEncoderModel.Base(
247 | embeddingsMap = embeddingsMap,
248 | embeddingKeyExtractor = NormWordKeyExtractor(),
249 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) },
250 | dropout = parsedArgs.wordDropoutCoefficient),
251 | converter = FormConverter()
252 | )
253 | }
254 |
255 | CommandLineArguments.TokensEncodingType.CHARLM -> { // TODO: separate with a dedicated builder
256 | TokensEncoderWrapperModel(
257 | model = CharLMEncoderModel(
258 | dirCharLM = File(parsedArgs.charLMModelPath!!).inputStream().use { CharLM.load(it) }, // non-null: checked by CommandLineArguments
259 | revCharLM = File(parsedArgs.charLMRevModelPath!!).inputStream().use { CharLM.load(it) }, // stream closed after loading
260 | outputMergeConfiguration = AffineMerge(
261 | outputSize = 100, // TODO
262 | activationFunction = Tanh)),
263 | converter = FormConverter()
264 | )
265 | }
266 |
267 | CommandLineArguments.TokensEncodingType.MORPHO_FEATURES -> {
268 |
269 | val analyzer = MorphologicalAnalyzer(dictionary = morphologyDictionary ?: throw RuntimeException("Missing morphology dictionary"))
270 |
271 | val lexiconDictionary = parsedArgs.lexiconDictionaryPath?.let {
272 | println("Loading lexicon from '$it'...")
273 | LexiconDictionary.load(it)
274 | }
275 |
276 | val featuresDictionary = FeaturesCollector(
277 | lexicalDictionary = lexiconDictionary,
278 | sentences = sentences.mapIndexed { i, it -> it.toMorphoSentence(index = i, analyzer = analyzer)}
279 | ).collect()
280 |
281 | TokensEncoderWrapperModel(
282 | model = MorphoEncoderModel(
283 | lexiconDictionary = lexiconDictionary,
284 | featuresDictionary = featuresDictionary,
285 | tokenEncodingSize = parsedArgs.wordEmbeddingSize,
286 | activation = null),
287 | converter = MorphoConverter()
288 | )
289 | }
290 | }
291 |
292 |
293 | /**
294 | * Build a [MorphoSentence] from this [CoNLLSentence].
295 | *
296 | * @param index the position index of this sentence
297 | * @param analyzer a morphological analyzer
298 | *
299 | * @return a new morpho sentence
300 | */
301 | private fun CoNLLSentence.toMorphoSentence(index: Int, analyzer: MorphologicalAnalyzer): MorphoSentence<FormToken> {
302 |
303 | val baseTokens = this.tokens.toBaseTokens()
304 | val position = Position( // the sentence spans from the start of its first token to the end of its last one
305 | index = index,
306 | start = baseTokens.first().position.start,
307 | end = baseTokens.last().position.end)
308 | @Suppress("UNCHECKED_CAST") // NOTE(review): type arguments rebuilt from the FormToken/RealToken imports - confirm
309 | val sentence = BaseSentence(id = index, position = position, tokens = baseTokens) as RealSentence<RealToken>
310 |
311 | val analysis = analyzer.analyze(sentence)
312 |
313 | return object : MorphoSentence<FormToken> {
314 | override val tokens: List<FormToken> = this@toMorphoSentence.tokens // the original CoNLL tokens are kept
315 | override val morphoAnalysis: MorphologicalAnalysis? = analysis
316 | }
317 | }
318 |
319 | /**
320 | * Build a trainer for a given [LHRParser], together with the validator that evaluates it on the validation set.
321 | *
322 | * @param parser an LHR parser
323 | * @param parsedArgs the parsed command line arguments
324 | * @param morphologyDictionary a morphology dictionary (can be null), used to build the sentence preprocessor
325 | *
326 | * @return a trainer for the given [parser]
327 | */
328 | private fun buildTrainer(parser: LHRParser,
329 | parsedArgs: CommandLineArguments,
330 | morphologyDictionary: MorphologyDictionary?): LHRTrainer {
331 |
332 | val preprocessor: SentencePreprocessor = buildSentencePreprocessor(morphologyDictionary) // shared with the validator
333 |
334 | return LHRTrainer(
335 | parser = parser,
336 | epochs = parsedArgs.epochs,
337 | batchSize = parsedArgs.batchSize,
338 | validator = Validator(
339 | neuralParser = parser,
340 | sentences = loadSentences(
341 | type = "validation",
342 | filePath = parsedArgs.validationSetPath,
343 | maxSentences = null, // no limit on the validation set
344 | skipNonProjective = false), // the --skip-non-projective flag is not applied to the validation set
345 | sentencePreprocessor = preprocessor),
346 | modelFilename = parsedArgs.modelPath,
347 | skipPunctuationErrors = parsedArgs.skipPunctuationErrors,
348 | usePositionalEncodingErrors = false,
349 | updateMethod = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999), // fixed: not configurable by arguments
350 | sentencePreprocessor = preprocessor,
351 | verbose = !parsedArgs.quiet)
352 | }
353 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | 4.0.0
7 |
8 | com.kotlinnlp
9 | neuralparser
10 | 0.6.5
11 | jar
12 |
13 | ${project.groupId}:${project.artifactId}
14 |
15 | NeuralParser is a very simple to use dependency parser, based on the SimpleDNN library and
16 | the SyntaxDecoder transition systems framework.
17 |
18 | http://github.com/kotlinnlp/neuralparser
19 |
20 |
21 | scm:git:git://github.com/kotlinnlp/neuralparser.git
22 | scm:git:ssh://github.com:kotlinnlp/neuralparser.git
23 | http://github.com/kotlinnlp/neuralparser/tree/master
24 |
25 |
26 |
27 |
28 | KotlinNLP Authors
29 | github@kotlinnlp.com
30 | KotlinNLP
31 | http://www.kotlinnlp.com
32 |
33 |
34 |
35 |
36 |
37 | Apache License, Version 2.0
38 | http://www.apache.org/licenses/LICENSE-2.0.txt
39 | repo
40 |
41 |
42 |
43 |
44 |
45 | ossrh
46 | https://oss.sonatype.org/content/repositories/snapshots
47 |
48 |
49 | ossrh
50 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
51 |
52 |
53 |
54 |
55 | UTF-8
56 | 1.6.7
57 | 3.0.0
58 | 1.6
59 | oss_kotlinnlp
60 | C73F18F0
61 | 1.3.31
62 | 5.2
63 | 0.9.16
64 | 0.5.2
65 | 0.2.3
66 |
67 |
68 |
69 |
70 | jcenter
71 | https://jcenter.bintray.com/
72 |
73 |
74 |
75 |
76 |
77 | jcenter
78 | JCenter
79 | https://jcenter.bintray.com/
80 |
81 |
82 |
83 |
84 | src/main/kotlin
85 |
86 |
87 |
88 | org.jetbrains.kotlin
89 | kotlin-maven-plugin
90 | ${kotlin.version}
91 |
92 | 1.8
93 |
94 |
95 |
96 | compile
97 | compile
98 |
99 | compile
100 |
101 |
102 |
103 | test-compile
104 | test-compile
105 |
106 | test-compile
107 |
108 |
109 |
110 |
111 |
112 |
113 | org.jetbrains.dokka
114 | dokka-maven-plugin
115 | ${dokka.version}
116 |
117 | true
118 |
119 |
120 |
121 | prepare-package
122 |
123 | dokka
124 | javadoc
125 | javadocJar
126 |
127 |
128 |
129 | packages.md
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 | org.sonatype.plugins
138 | nexus-staging-maven-plugin
139 | ${nexus-staging-plugin.version}
140 | true
141 |
142 | ${oss.server.id}
143 | https://oss.sonatype.org/
144 | true
145 |
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-source-plugin
151 | ${maven-source-plugin.version}
152 |
153 |
154 | attach-sources
155 |
156 | jar-no-fork
157 |
158 |
159 |
160 |
161 |
162 |
163 | org.apache.maven.plugins
164 | maven-gpg-plugin
165 | ${maven-gpg-plugin.version}
166 |
167 | true
168 |
169 |
170 |
171 | sign-artifacts
172 | verify
173 |
174 | sign
175 |
176 |
177 | ${gpg.keyname}
178 | ${gpg.keyname}
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 | org.jetbrains.kotlin
189 | kotlin-stdlib-jdk8
190 | ${kotlin.version}
191 |
192 |
193 |
194 | org.jetbrains.kotlin
195 | kotlin-reflect
196 | ${kotlin.version}
197 |
198 |
199 |
200 | org.jetbrains.kotlin
201 | kotlin-test
202 | ${kotlin.version}
203 | test
204 |
205 |
206 |
207 | com.beust
208 | klaxon
209 | ${klaxon.version}
210 |
211 |
212 |
213 | com.kotlinnlp
214 | lssencoder
215 | ${lssencoder.version}
216 |
217 |
218 |
219 | com.kotlinnlp
220 | dependencytree
221 | ${dependencytree.version}
222 |
223 |
224 |
225 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParser.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 |
14 | /**
15 | * A Neural Parser.
16 | */
17 | interface NeuralParser<ModelType: NeuralParserModel> {
18 |
19 | /**
20 | * The model of this neural parser.
21 | */
22 | val model: ModelType
23 |
24 | /**
25 | * Whether this parser executes the morpho-syntactic labelling.
26 | */
27 | val labellingEnabled: Boolean
28 |
29 | /**
30 | * Parse a sentence, giving its dependency tree.
31 | *
32 | * @param sentence a [Sentence]
33 | *
34 | * @return the parsed sentence, as a [MorphoSynSentence]
35 | */
36 | fun parse(sentence: ParsingSentence): MorphoSynSentence
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParserModel.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser
9 |
10 | import com.kotlinnlp.linguisticdescription.language.Language
11 | import com.kotlinnlp.utils.Serializer
12 | import java.io.InputStream
13 | import java.io.OutputStream
14 | import java.io.Serializable
15 |
16 | /**
17 | * The serializable model of a [NeuralParser].
18 | *
19 | * @property language the language within the parser works
20 | */
21 | abstract class NeuralParserModel(val language: Language) : Serializable {
22 |
23 | companion object {
24 |
25 | /**
26 | * Version identifier used to serialize the class (needed by Serializable).
27 | */
28 | @Suppress("unused")
29 | private const val serialVersionUID: Long = 1L
30 |
31 | /**
32 | * Read a [NeuralParserModel] (serialized) from an input stream and decode it.
33 | * The decoding is delegated to [Serializer.deserialize].
34 | * @param inputStream the [InputStream] from which to read the serialized [NeuralParserModel]
35 | *
36 | * @return the [NeuralParserModel] read from [inputStream] and decoded
37 | */
38 | fun load(inputStream: InputStream): NeuralParserModel = Serializer.deserialize(inputStream)
39 | }
40 |
41 | /**
42 | * Serialize this [NeuralParserModel] and write it to an output stream, delegating to [Serializer.serialize].
43 | * NOTE(review): whether the stream gets closed depends on the Serializer - confirm before relying on it.
44 | * @param outputStream the [OutputStream] in which to write this serialized [NeuralParserModel]
45 | */
46 | fun dump(outputStream: OutputStream) = Serializer.serialize(this, outputStream)
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/Trainer.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers
9 |
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.dependencytree.DependencyTree
12 | import com.kotlinnlp.neuralparser.NeuralParser
13 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
14 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
15 | import com.kotlinnlp.neuralparser.helpers.statistics.Statistics
16 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
17 | import com.kotlinnlp.neuralparser.language.BaseSentence
18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
19 | import com.kotlinnlp.utils.ShuffledIndices
20 | import com.kotlinnlp.utils.Shuffler
21 | import com.kotlinnlp.utils.Timer
22 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
23 | import java.io.File
24 | import java.io.FileOutputStream
25 |
26 | /**
27 | * The training helper of the [NeuralParser].
28 | *
29 | * @param neuralParser a neural parser
30 | * @param batchSize the number of sentences that compose a batch
31 | * @param epochs the number of training epochs
32 | * @param validator the validation helper (if it is null no validation is done after each epoch)
33 | * @param modelFilename the name of the file in which to save the best trained model
34 | * @param minRelevantErrorsCountToUpdate the min count of relevant errors needed to update the neural parser (default 1)
35 | * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
36 | * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
37 | */
38 | abstract class Trainer(
39 | private val neuralParser: NeuralParser<*>,
40 | private val batchSize: Int,
41 | private val epochs: Int,
42 | private val validator: Validator?,
43 | private val modelFilename: String,
44 | private val minRelevantErrorsCountToUpdate: Int = 1,
45 | private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
46 | private val verbose: Boolean = true
47 | ) {
48 |
49 | /**
50 | * A timer to track the elapsed time.
51 | */
52 | private var timer = Timer()
53 |
54 | /**
55 | * The best accuracy reached during the training.
56 | */
57 | private var bestAccuracy: Double = -1.0 // -1 used as init value (all accuracy values are in the range [0.0, 1.0])
58 |
59 | /**
60 | * Check requirements.
61 | */
62 | init {
63 | require(this.epochs > 0) { "The number of epochs must be > 0" }
64 | require(this.batchSize > 0) { "The size of the batch must be > 0" }
65 | require(this.minRelevantErrorsCountToUpdate > 0) { "minRelevantErrorsCountToUpdate must be > 0" }
66 | }
67 |
68 | /**
69 | * Train the [neuralParser] with the given sentences, validating after each epoch if a [validator] is given.
70 | *
71 | * @param trainingSentences the sentences used to train the parser
72 | * @param shuffler a shuffle to shuffle the sentences at each epoch (can be null)
73 | */
74 | fun train(trainingSentences: List<CoNLLSentence>,
75 | shuffler: Shuffler? = Shuffler(enablePseudoRandom = true, seed = 743)) {
76 |
77 | (0 until this.epochs).forEach { i ->
78 |
79 | this.logTrainingStart(epochIndex = i)
80 |
81 | this.newEpoch()
82 | this.trainEpoch(trainingSentences = trainingSentences, shuffler = shuffler)
83 |
84 | this.logTrainingEnd()
85 |
86 | this.validator?.apply { // the calls below are methods of this trainer, run only if a validator is set
87 | logValidationStart()
88 | validateAndSaveModel()
89 | logValidationEnd()
90 | }
91 | }
92 | }
93 |
94 | /**
95 | * Train the parser for an epoch, visiting the sentences in the order given by the shuffled indices.
96 | *
97 | * @param trainingSentences the training sentences (each one must have annotated heads)
98 | * @param shuffler a shuffle to shuffle the sentences at each epoch (can be null)
99 | */
100 | private fun trainEpoch(trainingSentences: List<CoNLLSentence>,
101 | shuffler: Shuffler?) {
102 |
103 | val progress = ProgressIndicatorBar(trainingSentences.size)
104 |
105 | this.newBatch()
106 |
107 | ShuffledIndices(trainingSentences.size, shuffler = shuffler).forEachIndexed { i, sentenceIndex ->
108 |
109 | val endOfBatch: Boolean = (i + 1) % this.batchSize == 0 || i == trainingSentences.lastIndex // last one may be shorter
110 |
111 | progress.tick()
112 |
113 | val sentence: CoNLLSentence = trainingSentences[sentenceIndex]
114 |
115 | require(sentence.hasAnnotatedHeads()) {
116 | "The gold dependency tree of a sentence cannot be null during the training."
117 | }
118 |
119 | this.trainSentence(
120 | sentence = this.sentencePreprocessor.convert(BaseSentence.fromCoNLL(sentence, index = sentenceIndex)),
121 | goldTree = DependencyTree.Labeled(sentence))
122 |
123 | if (endOfBatch && this.getRelevantErrorsCount() >= this.minRelevantErrorsCountToUpdate) {
124 | this.update() // with too few relevant errors both calls are skipped and no new batch is started
125 | this.newBatch()
126 | }
127 | }
128 | }
129 |
130 |   /**
131 |    * Evaluate the [neuralParser] through the [validator] (that must be not null) and print the resulting statistics.
132 |    * The model is saved only when the UAS calculated without punctuation is greater than the best one reached so far.
133 |    */
134 |   private fun validateAndSaveModel() {
135 |
136 |     val stats: Statistics = this.validator!!.evaluate()
137 |     val accuracy = stats.noPunctuation.uas.perc
138 |
139 |     println("\n$stats")
140 |
141 |     // Only an improvement of the accuracy triggers the saving of the model.
142 |     if (accuracy > this.bestAccuracy) {
143 |       this.saveModel()
144 |       this.bestAccuracy = accuracy
145 |     }
146 |   }
147 |
148 |   /**
149 |    * Save the model to [modelFilename], closing the output stream when the dump ends.
150 |    */
151 |   private fun saveModel() {
152 |     // 'use' closes the stream even if the dump fails, avoiding to leak the file handle.
153 |     FileOutputStream(File(this.modelFilename)).use { this.neuralParser.model.dump(it) }
154 |
155 |     println("\nNEW BEST ACCURACY! Model saved to \"${this.modelFilename}\"")
156 |   }
157 |
158 |   /**
159 |    * Print the header of an epoch and restart the timer (only in verbose mode).
160 |    *
161 |    * @param epochIndex the current epoch index
162 |    */
163 |   private fun logTrainingStart(epochIndex: Int) {
164 |
165 |     if (!this.verbose) return
166 |
167 |     // Restart the timer, whose elapsed time is printed when the training of the epoch ends.
168 |     this.timer.reset()
169 |
170 |     println("\nEpoch ${epochIndex + 1} of ${this.epochs}")
171 |     println("\nStart training...")
172 |   }
173 |
174 |   /**
175 |    * Print the time elapsed since the last reset of the timer (only in verbose mode).
176 |    */
177 |   private fun logTrainingEnd() {
178 |
179 |     if (!this.verbose) return
180 |
181 |     println("Elapsed time: %s".format(this.timer.formatElapsedTime()))
182 |   }
183 |
184 |   /**
185 |    * Restart the timer and print an empty line before the validation starts (only in verbose mode).
186 |    */
187 |   private fun logValidationStart() {
188 |
189 |     if (!this.verbose) return
190 |
191 |     this.timer.reset()
192 |     println() // new line
193 |   }
194 |
195 |   /**
196 |    * Print the time elapsed since the validation started (only in verbose mode).
197 |    */
198 |   private fun logValidationEnd() {
199 |
200 |     if (!this.verbose) return
201 |
202 |     println("Elapsed time: %s".format(this.timer.formatElapsedTime()))
203 |   }
204 |
205 |   /**
206 |    * Hook called when a new batch starts. It does nothing by default.
207 |    */
208 |   protected open fun newBatch() {}
209 |
210 |   /**
211 |    * Hook called when a new epoch starts. It does nothing by default.
212 |    */
213 |   protected open fun newEpoch() {}
214 |
215 | /**
216 | * Update the [neuralParser]. It is called at the end of a batch in which enough relevant errors have been counted.
217 | */
218 | protected abstract fun update()
219 |
220 | /**
221 | * Train the parser with the given [sentence] and [goldTree]. It is called once per training sentence at each epoch.
222 | *
223 | * @param sentence the sentence to learn, already converted by the sentence preprocessor
224 | * @param goldTree the gold dependency tree, built from the annotated CoNLL sentence
225 | */
226 | protected abstract fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled)
227 |
228 | /**
229 | * @return the count of the relevant errors, compared with [minRelevantErrorsCountToUpdate] at the end of a batch
230 | */
231 | protected abstract fun getRelevantErrorsCount(): Int
232 | }
233 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/LabelerSelector.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
9 |
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
14 | import java.io.Serializable
15 |
16 | /**
17 |  * The selector of the valid configurations of the labeler and of the morphologies compatible with them.
18 |  */
19 | interface LabelerSelector : Serializable {
20 |
21 |   /**
22 |    * Get the list of scored grammatical configurations that are valid for a given attachment.
23 |    *
24 |    * @param configurations the list of grammatical configurations, sorted by descending score
25 |    * @param sentence the input sentence
26 |    * @param tokenIndex the index of the token to which the deprel must be assigned
27 |    * @param headIndex the index of the token head (can be null)
28 |    *
29 |    * @return the valid grammatical configurations for the given attachment
30 |    */
31 |   fun getValidConfigurations(configurations: List<ScoredGrammar>,
32 |                              sentence: ParsingSentence,
33 |                              tokenIndex: Int,
34 |                              headIndex: Int?): List<ScoredGrammar>
35 |
36 |   /**
37 |    * Get the morphologies of a given token that are compatible with the given grammatical configuration.
38 |    *
39 |    * @param sentence the input sentence
40 |    * @param tokenIndex the index of a token of the sentence
41 |    * @param configuration the grammatical configuration of the token
42 |    *
43 |    * @return the morphologies compatible with the given grammatical configuration
44 |    */
45 |   fun getValidMorphologies(sentence: ParsingSentence,
46 |                            tokenIndex: Int,
47 |                            configuration: GrammaticalConfiguration): Morphologies
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/MorphoSelector.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
9 |
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.POSTag
12 | import com.kotlinnlp.linguisticdescription.morphology.*
13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
14 | import com.kotlinnlp.linguisticdescription.syntax.dependencies.Unknown
15 | import com.kotlinnlp.neuralparser.language.ParsingSentence
16 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
17 | import com.kotlinnlp.utils.notEmptyOr
18 |
19 | /**
20 |  * The selector to use when the labeler predictions are defined as combinations of POS and syntactic types in the
21 |  * Base format.
22 |  */
23 | object MorphoSelector : LabelerSelector {
24 |
25 |   /**
26 |    * Private val used to serialize the class (needed by Serializable).
27 |    */
28 |   @Suppress("unused")
29 |   private const val serialVersionUID: Long = 1L
30 |
31 |   /**
32 |    * Get the list of scored grammatical configurations that are valid for a given attachment.
33 |    *
34 |    * @param configurations the list of grammatical configurations, sorted by descending score (must be not empty)
35 |    * @param sentence the input sentence (its morphological analysis must be not null)
36 |    * @param tokenIndex the index of the token to which the deprel must be assigned
37 |    * @param headIndex the index of the token head (can be null)
38 |    *
39 |    * @return the valid grammatical configurations for the given attachment
40 |    */
41 |   override fun getValidConfigurations(configurations: List<ScoredGrammar>,
42 |                                       sentence: ParsingSentence,
43 |                                       tokenIndex: Int,
44 |                                       headIndex: Int?): List<ScoredGrammar> {
45 |
46 |     val possibleMorphologies: Morphologies = sentence.morphoAnalysis!!.allMorphologies[tokenIndex]
47 |     val correctDirection = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex)
48 |     val possibleConfigurations: List<ScoredGrammar> = configurations.filter { it.config.direction == correctDirection }
49 |     val worstScore: Double = configurations.last().score
50 |     // When no configuration is valid, fall back on a single 'Unknown' dependency with the worst score.
51 |     return if (possibleMorphologies.isNotEmpty())
52 |       possibleConfigurations
53 |         .filter { sentence.areConfigurationCompatible(c = it.config, tokenIndex = tokenIndex) }
54 |         .notEmptyOr {
55 |           listOf(ScoredGrammar(
56 |             config = possibleMorphologies.first().buildUnknownConfig(correctDirection),
57 |             score = worstScore))
58 |         }
59 |     else
60 |       possibleConfigurations.filter { it.config.isSingleContentWord() }.notEmptyOr {
61 |         listOf(ScoredGrammar(
62 |           config = GrammaticalConfiguration(GrammaticalConfiguration.Component(
63 |             syntacticDependency = Unknown(correctDirection),
64 |             pos = POSTag.Base(POS.Noun))),
65 |           score = worstScore))
66 |       }
67 |   }
68 |
69 |   /**
70 |    * Get the morphologies of a given token that are compatible with the given grammatical configuration.
71 |    *
72 |    * @param sentence the input sentence
73 |    * @param tokenIndex the index of a token of the sentence
74 |    * @param configuration the grammatical configuration of the token
75 |    *
76 |    * @return the compatible morphologies, or a new one built from a single content word, or an empty collection
77 |    */
78 |   override fun getValidMorphologies(sentence: ParsingSentence,
79 |                                     tokenIndex: Int,
80 |                                     configuration: GrammaticalConfiguration): Morphologies {
81 |
82 |     val possibleMorphologies: Morphologies =
83 |       sentence.getCompatibleMorphologies(c = configuration, tokenIndex = tokenIndex)
84 |
85 |     return when {
86 |
87 |       possibleMorphologies.isNotEmpty() -> possibleMorphologies
88 |
89 |       configuration.type == GrammaticalConfiguration.Type.Single -> {
90 |
91 |         val pos: POSTag.Base = checkNotNull(configuration.components.single().pos as? POSTag.Base) {
92 |           "The POS cannot be null."
93 |         }
94 |
95 |         require(pos.type.isContentWord) {
96 |           "The grammatical configuration of tokens without morphological analysis must define a content word."
97 |         }
98 |         // Build a morphology from scratch, using the form of the token as lemma.
99 |         Morphologies(Morphology(SingleMorphology(
100 |           lemma = sentence.tokens[tokenIndex].form,
101 |           pos = pos.type,
102 |           allowDefaultValues = true)))
103 |       }
104 |
105 |       else -> Morphologies()
106 |     }
107 |   }
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/NoFilterSelector.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.labelerselector
9 |
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
12 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
15 |
16 | /**
17 |  * The selector that keeps all the configurations with the correct direction, without morphological filters.
18 |  */
19 | object NoFilterSelector : LabelerSelector {
20 |
21 |   /**
22 |    * Private val used to serialize the class (needed by Serializable).
23 |    */
24 |   @Suppress("unused")
25 |   private const val serialVersionUID: Long = 1L
26 |
27 |   /**
28 |    * Get the list of scored grammatical configurations that are valid for a given attachment.
29 |    *
30 |    * @param configurations the list of grammatical configurations, sorted by descending score
31 |    * @param sentence the input sentence
32 |    * @param tokenIndex the index of the token to which the deprel must be assigned
33 |    * @param headIndex the index of the token head (can be null)
34 |    *
35 |    * @return the configurations whose direction is the one defined by the given attachment
36 |    */
37 |   override fun getValidConfigurations(configurations: List<ScoredGrammar>,
38 |                                       sentence: ParsingSentence,
39 |                                       tokenIndex: Int,
40 |                                       headIndex: Int?): List<ScoredGrammar> {
41 |
42 |     val correctDirection = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex)
43 |
44 |     return configurations.filter { it.config.direction == correctDirection }
45 |   }
46 |
47 |   /**
48 |    * Return the morphologies of a token that have the same number of components of the given configuration.
49 |    *
50 |    * @param sentence the input sentence
51 |    * @param tokenIndex the index of a token of the sentence
52 |    * @param configuration the grammatical configuration of the token
53 |    *
54 |    * @return the morphologies with a matching number of components (empty if there is no morphological analysis)
55 |    */
56 |   override fun getValidMorphologies(sentence: ParsingSentence,
57 |                                     tokenIndex: Int,
58 |                                     configuration: GrammaticalConfiguration): Morphologies = Morphologies(
59 |     sentence.morphoAnalysis?.allMorphologies?.get(tokenIndex)?.filter {
60 |       it.components.size == configuration.components.size
61 |     } ?: emptyList()
62 |   )
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/BasePreprocessor.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
9 |
10 | import com.kotlinnlp.neuralparser.language.BaseSentence
11 | import com.kotlinnlp.neuralparser.language.ParsingSentence
12 | import com.kotlinnlp.neuralparser.language.ParsingToken
13 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector
14 |
15 | /**
16 |  * The basic preprocessor, that copies the tokens of a sentence without adding any other information to them.
17 |  */
18 | class BasePreprocessor : SentencePreprocessor {
19 |
20 |   companion object {
21 |
22 |     /**
23 |      * Private val used to serialize the class (needed by Serializable).
24 |      */
25 |     @Suppress("unused")
26 |     private const val serialVersionUID: Long = 1L
27 |   }
28 |
29 |   /**
30 |    * Build a [ParsingSentence] with the same tokens and position of a [BaseSentence], using the [NoFilterSelector].
31 |    *
32 |    * @param sentence a base sentence
33 |    *
34 |    * @return a sentence ready to be parsed
35 |    */
36 |   override fun convert(sentence: BaseSentence): ParsingSentence {
37 |     val tokens = sentence.tokens.map { ParsingToken(id = it.id, form = it.form, position = it.position) }
38 |
39 |     return ParsingSentence(tokens = tokens, labelerSelector = NoFilterSelector, position = sentence.position)
40 |   }
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/CoNLLPreprocessor.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
9 |
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.conllio.Token as CoNLLToken
12 | import com.kotlinnlp.neuralparser.language.BaseSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.language.ParsingToken
15 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector
16 |
17 | /**
18 |  * Pre-process a sentence that has been built from a [CoNLLSentence], copying the POS tags of the CoNLL tokens.
19 |  *
20 |  * @param conllSentences the list of CoNLL sentences from which the input base sentences are built
21 |  */
22 | class CoNLLPreprocessor(private val conllSentences: List<CoNLLSentence>) : SentencePreprocessor {
23 |
24 |   companion object {
25 |
26 |     /**
27 |      * Private val used to serialize the class (needed by Serializable).
28 |      */
29 |     @Suppress("unused")
30 |     private const val serialVersionUID: Long = 1L
31 |   }
32 |
33 |   /**
34 |    * Convert a [BaseSentence] to a [ParsingSentence].
35 |    *
36 |    * @param sentence a base sentence, whose position index selects the related sentence of [conllSentences]
37 |    *
38 |    * @return a sentence ready to be parsed
39 |    */
40 |   override fun convert(sentence: BaseSentence): ParsingSentence {
41 |
42 |     val conllTokens: List<CoNLLToken> = this.conllSentences[sentence.position.index].tokens
43 |
44 |     return ParsingSentence(
45 |       tokens = sentence.tokens.mapIndexed { i, it ->
46 |         ParsingToken(
47 |           id = it.id,
48 |           form = it.form,
49 |           position = it.position,
50 |           pos = conllTokens[i].posList
51 |         )
52 |       },
53 |       labelerSelector = NoFilterSelector,
54 |       position = sentence.position
55 |     )
56 |   }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/MorphoPreprocessor.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
9 |
10 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
11 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
13 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer
14 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
15 | import com.kotlinnlp.neuralparser.language.BaseSentence
16 | import com.kotlinnlp.neuralparser.language.ParsingSentence
17 | import com.kotlinnlp.neuralparser.language.ParsingToken
18 | import com.kotlinnlp.neuralparser.helpers.labelerselector.MorphoSelector
19 |
20 | /**
21 |  * Pre-process a sentence with a morphological analyzer, before the parsing starts.
22 |  *
23 |  * @param dictionary a morphologies dictionary
24 |  */
25 | class MorphoPreprocessor(private val dictionary: MorphologyDictionary) : SentencePreprocessor {
26 |
27 |   companion object {
28 |
29 |     /**
30 |      * Private val used to serialize the class (needed by Serializable).
31 |      */
32 |     @Suppress("unused")
33 |     private const val serialVersionUID: Long = 1L
34 |   }
35 |
36 |   /**
37 |    * A morphological analyzer as transient property (null until it is requested the first time).
38 |    */
39 |   @kotlin.jvm.Transient private var morphologicalAnalyzer: MorphologicalAnalyzer? = null
40 |
41 |   /**
42 |    * Convert a [BaseSentence] to a [ParsingSentence], attaching to it the morphological analysis of its tokens.
43 |    *
44 |    * @param sentence a base sentence
45 |    *
46 |    * @return a sentence ready to be parsed
47 |    */
48 |   override fun convert(sentence: BaseSentence): ParsingSentence {
49 |
50 |     @Suppress("UNCHECKED_CAST")
51 |     val morphoAnalysis: MorphologicalAnalysis = this.getOrInitAnalyzer().analyze(sentence as RealSentence<RealToken>)
52 |
53 |     return ParsingSentence(
54 |       tokens = sentence.tokens.map {
55 |         ParsingToken(
56 |           id = it.id,
57 |           form = it.form,
58 |           position = it.position
59 |         )
60 |       },
61 |       morphoAnalysis = morphoAnalysis,
62 |       labelerSelector = MorphoSelector,
63 |       position = sentence.position
64 |     )
65 |   }
66 |
67 |   /**
68 |    * Get the [MorphologicalAnalyzer] of this preprocessor, initializing it if needed (e.g. in case this class has
69 |    * just been deserialized).
70 |    *
71 |    * @return the morphological analyzer of this preprocessor
72 |    */
73 |   private fun getOrInitAnalyzer(): MorphologicalAnalyzer {
74 |
75 |     // The property is transient, so it is built on demand from the dictionary when it is missing.
76 |     val analyzer: MorphologicalAnalyzer = this.morphologicalAnalyzer ?: MorphologicalAnalyzer(this.dictionary)
77 |     this.morphologicalAnalyzer = analyzer
78 |     return analyzer
79 |   }
80 | }
81 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/SentencePreprocessor.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.preprocessors
9 |
10 | import com.kotlinnlp.neuralparser.language.ParsingSentence
11 | import com.kotlinnlp.neuralparser.language.BaseSentence
12 | import com.kotlinnlp.neuralparser.language.BaseToken
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 |
16 | /**
17 |  * Pre-process a sentence before the parsing starts, converting a [BaseSentence] to a [ParsingSentence].
18 |  */
19 | interface SentencePreprocessor : SentenceConverter<BaseToken, BaseSentence, ParsingToken, ParsingSentence>
20 |
21 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/CompositeTokenHelper.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.linguisticdescription.morphology.POS
13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticType
14 |
15 | /**
16 |  * @param dependencyTree the labeled dependency tree from which the governors are read
17 |  */
18 | internal class CompositeTokenHelper(private val dependencyTree: DependencyTree.Labeled) {
19 |
20 |   /**
21 |    * Get the ID of the governor of a component of a composite token.
22 |    *
23 |    * @param tokenId the ID of a parsing token
24 |    * @param componentIndex the index of a component of the token
25 |    * @param prevComponentId the ID given by the caller for the precedent component (null at the first component)
26 |    *
27 |    * @return the ID of the governor of the given component (can be null)
28 |    */
29 |   fun getComponentGovernorId(tokenId: Int,
30 |                              componentIndex: Int,
31 |                              prevComponentId: Int?): Int? {
32 |
33 |     val headId: Int? = this.dependencyTree.getHead(tokenId)
34 |     val tokenConfig: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)
35 |
36 |     // The first component always takes the governor of the whole token.
37 |     if (componentIndex == 0) return headId
38 |
39 |     return when {
40 |       tokenConfig.isPrepArt() -> when {
41 |         tokenConfig.isContin() -> this.dependencyTree.getMultiWordGovernorId(tokenId)
42 |         else -> headId
43 |       }
44 |       tokenConfig.isVerbEnclitic() -> prevComponentId!!
45 |       else -> null
46 |     }
47 |   }
48 |
49 |   /**
50 |    * Get the governor ID of a multi-word, given one of its tokens and going back through its ancestors in the
51 |    * dependency tree, until a token that is not a continuation is found.
52 |    * Note: the governor of a multi-word is the governor of its first token.
53 |    *
54 |    * @param tokenId the id of a token that is part of a multi-word
55 |    *
56 |    * @return the governor id of the multi-word of which the given token is part of
57 |    */
58 |   private fun DependencyTree.Labeled.getMultiWordGovernorId(tokenId: Int): Int? {
59 |
60 |     var currentId: Int = this.getHead(tokenId)!!
61 |
62 |     while (this.getConfiguration(currentId).isContin()) {
63 |       currentId = this.getHead(currentId)!!
64 |     }
65 |     return this.getHead(currentId)
66 |   }
67 |
68 |   /**
69 |    * @return true if at least one component of this configuration is a continuation of a multi-word, otherwise false
70 |    */
71 |   private fun GrammaticalConfiguration.isContin(): Boolean {
72 |     return this.components.any { component -> component.syntacticDependency.isSubTypeOf(SyntacticType.Contin) }
73 |   }
74 |
75 |   /**
76 |    * @return true if this configuration defines a composite PREP + ART, otherwise false
77 |    */
78 |   private fun GrammaticalConfiguration.isPrepArt(): Boolean {
79 |     if (this.components.size != 2) return false
80 |     return this.components[0].pos?.isSubTypeOf(POS.Prep) == true && this.components[1].pos?.isSubTypeOf(POS.Art) == true
81 |   }
82 |
83 |   /**
84 |    * @return true if this configuration defines a composite VERB + one or more PRON, otherwise false
85 |    */
86 |   private fun GrammaticalConfiguration.isVerbEnclitic(): Boolean {
87 |     if (this.components.size < 2 || this.components[0].pos?.isSubTypeOf(POS.Verb) != true) return false
88 |     return this.components.drop(1).all { it.pos?.isSubTypeOf(POS.Pron) == true }
89 |   }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/LabeledMorphoSynBuilder.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.linguisticdescription.morphology.ScoredMorphology
13 | import com.kotlinnlp.linguisticdescription.morphology.ScoredSingleMorphology
14 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
15 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
16 | import com.kotlinnlp.linguisticdescription.sentence.token.Word
17 | import com.kotlinnlp.linguisticdescription.sentence.token.WordTrace
18 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.SyntacticRelation
19 | import com.kotlinnlp.neuralparser.language.ParsingSentence
20 | import com.kotlinnlp.neuralparser.language.ParsingToken
21 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector
22 |
23 | /**
24 |  * A helper class that builds a [MorphoSynSentence] from a [ParsingSentence] and a [DependencyTree].
25 |  *
26 |  * @param parsingSentence a parsing sentence
27 |  * @param dependencyTree the tree that represents the dependencies and the grammatical configuration of the sentence
28 |  */
29 | internal class LabeledMorphoSynBuilder(
30 |   private val parsingSentence: ParsingSentence,
31 |   private val dependencyTree: DependencyTree.Labeled
32 | ) {
33 |
34 |   /**
35 |    * The next id that can be assigned to a new token of the sentence, used in case a new single component has to be
36 |    * created. It starts from the max id of the tokens of the sentence (that must be not empty) plus one.
37 |    */
38 |   private var nextAvailableId: Int = this.parsingSentence.tokens.asSequence().map { it.id }.max()!! + 1
39 |
40 |   /**
41 |    * Build the morpho-syntactic sentence using a [LabelerSelector] to select the valid morphologies.
42 |    *
43 |    * @return a new morpho-syntactic sentence built from the given [parsingSentence] and [dependencyTree]
44 |    */
45 |   fun buildSentence(): MorphoSynSentence = MorphoSynSentence(
46 |     id = 0,
47 |     confidence = 0.0,
48 |     tokens = this.parsingSentence.tokens.mapIndexed { i, token ->
49 |
50 |       val attachmentScore: Double = this.dependencyTree.getAttachmentScore(token.id)
51 |       // Each valid morphology is scored with the attachment score of its token.
52 |       val morphologies: List<ScoredMorphology> = this.parsingSentence.getValidMorphologies(
53 |         tokenIndex = i,
54 |         configuration = this.dependencyTree.getConfiguration(token.id)
55 |       ).map { morpho ->
56 |         ScoredMorphology(components = morpho.components, score = attachmentScore)
57 |       }
58 |
59 |       this.buildToken(tokenId = token.id, morphologies = morphologies)
60 |     },
61 |     position = this.parsingSentence.position
62 |   )
63 |
64 |   /**
65 |    * @param tokenId the id of a parsing token
66 |    * @param morphologies the possible morphologies of the token
67 |    *
68 |    * @return a new morpho-syntactic token built from the given parsing token
69 |    */
70 |   private fun buildToken(tokenId: Int, morphologies: List<ScoredMorphology>): MorphoSynToken {
71 |
72 |     val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)
73 |
74 |     require(morphologies.all { it.components.size == config.components.size }) {
75 |       "The given morphologies must have the same number of components of the given grammatical configuration."
76 |     }
77 |
78 |     return if (config.components.size == 1)
79 |       this.buildSingleToken(
80 |         tokenId = tokenId,
81 |         governorId = this.dependencyTree.getHead(tokenId),
82 |         grammaticalComponent = config.components.single(),
83 |         morphologies = morphologies.map { it.toSingle() })
84 |     else
85 |       this.buildCompositeToken(tokenId = tokenId, morphologies = morphologies)
86 |   }
87 |
88 |   /**
89 |    * @param tokenId the id of the new token
90 |    * @param morphologies the list of possible scored morphologies of the token
91 |    *
92 |    * @return a new composite token
93 |    */
94 |   private fun buildCompositeToken(tokenId: Int, morphologies: List<ScoredMorphology>): MorphoSynToken.Composite {
95 |
96 |     val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)
97 |     val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId)
98 |     val compositeTokenHandler = CompositeTokenHelper(this.dependencyTree)
99 |
100 |     val newToken = MorphoSynToken.Composite(
101 |       id = parsingToken.id,
102 |       form = parsingToken.form,
103 |       position = checkNotNull(parsingToken.position) { "Composite words must have a position." },
104 |       components = config.components.mapIndexed { i, component ->
105 |         this.buildSingleToken(
106 |           tokenId = tokenId,
107 |           componentId = this.nextAvailableId + i,
108 |           governorId = compositeTokenHandler.getComponentGovernorId(
109 |             tokenId = tokenId,
110 |             componentIndex = i,
111 |             prevComponentId = if (i > 0) this.nextAvailableId else null),
112 |           grammaticalComponent = component,
113 |           morphologies = morphologies.map { ScoredSingleMorphology(value = it.components[i], score = it.score) }
114 |         ) as Word
115 |       }
116 |     )
117 |
118 |     // Attention: the nextAvailableId must be set after the token has been created in order to calculate the
119 |     // components governors correctly.
120 |     this.nextAvailableId += config.components.size
121 |
122 |     return newToken
123 |   }
124 |
125 |   /**
126 |    * @param tokenId the id of the original token
127 |    * @param componentId the id of the token in case it is a component (otherwise null)
128 |    * @param governorId the id of the governor (null if it is the top)
129 |    * @param grammaticalComponent the grammatical configuration of the token as single component
130 |    * @param morphologies the list of possible scored morphologies of the token
131 |    *
132 |    * @return a new single token: a [Word] if the parsing token has a position, otherwise a [WordTrace]
133 |    */
134 |   private fun buildSingleToken(tokenId: Int,
135 |                                componentId: Int? = null,
136 |                                governorId: Int?,
137 |                                grammaticalComponent: GrammaticalConfiguration.Component,
138 |                                morphologies: List<ScoredSingleMorphology>): MorphoSynToken.Single {
139 |
140 |     val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)
141 |     val syntacticRelation = SyntacticRelation(
142 |       governor = governorId,
143 |       attachmentScore = this.dependencyTree.getAttachmentScore(tokenId),
144 |       dependency = grammaticalComponent.syntacticDependency)
145 |
146 |     // Unique morphologies by lemma and POS (for each pair the last morphology found is kept).
147 |     val uniqueMorphologies: List<ScoredSingleMorphology> =
148 |       morphologies.associateBy { Pair(it.value.lemma, it.value.pos) }.values.toList()
149 |
150 |     return if (parsingToken.position != null)
151 |       Word(
152 |         id = componentId ?: tokenId,
153 |         form = parsingToken.form,
154 |         position = parsingToken.position,
155 |         pos = grammaticalComponent.pos,
156 |         morphologies = uniqueMorphologies,
157 |         contextMorphologies = listOf(), // TODO: set it
158 |         syntacticRelation = syntacticRelation,
159 |         coReferences = null, // TODO: set it
160 |         semanticRelations = null) // TODO: set it
161 |     else
162 |       WordTrace(
163 |         id = componentId ?: tokenId,
164 |         form = parsingToken.form,
165 |         pos = grammaticalComponent.pos,
166 |         morphologies = uniqueMorphologies,
167 |         contextMorphologies = listOf(), // TODO: set it
168 |         syntacticRelation = syntacticRelation,
169 |         coReferences = null, // TODO: set it
170 |         semanticRelations = null) // TODO: set it
171 |   }
172 | }
173 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/UnlabeledMorphoSynBuilder.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
13 | import com.kotlinnlp.linguisticdescription.sentence.token.Word
14 | import com.kotlinnlp.linguisticdescription.sentence.token.WordTrace
15 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.SyntacticRelation
16 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
17 | import com.kotlinnlp.linguisticdescription.syntax.dependencies.Undefined
18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
19 | import com.kotlinnlp.neuralparser.language.ParsingToken
20 |
/**
 * Builder of a [MorphoSynSentence] starting from a [ParsingSentence] and its unlabeled [DependencyTree].
 *
 * The tokens that it creates carry only the attachment information (governor and attachment score): their POS is
 * null, their morphologies are empty and their dependency is [Undefined].
 *
 * @param parsingSentence the parsing sentence to convert
 * @param dependencyTree the unlabeled tree that holds the heads and the attachment scores of the sentence tokens
 */
internal class UnlabeledMorphoSynBuilder(
  private val parsingSentence: ParsingSentence,
  private val dependencyTree: DependencyTree.Unlabeled
) {

  /**
   * Convert each token of the [parsingSentence] into a single morpho-syntactic token, attaching it to the head defined
   * in the [dependencyTree].
   *
   * @return a new morpho-syntactic sentence, with id 0 and confidence 0.0
   */
  fun buildSentence(): MorphoSynSentence {

    val morphoSynTokens = this.parsingSentence.tokens.map { parsingToken ->
      this.buildSingleToken(tokenId = parsingToken.id, governorId = this.dependencyTree.getHead(parsingToken.id))
    }

    return MorphoSynSentence(
      id = 0,
      confidence = 0.0,
      tokens = morphoSynTokens,
      position = this.parsingSentence.position)
  }

  /**
   * @param tokenId the id of the parsing token to convert
   * @param governorId the id of its governor (null if the token is the top)
   *
   * @return a [WordTrace] if the parsing token has no position, otherwise a [Word]
   */
  private fun buildSingleToken(tokenId: Int, governorId: Int?): MorphoSynToken.Single {

    val sourceToken: ParsingToken = this.parsingSentence.getTokenById(tokenId)
    val sourcePosition = sourceToken.position

    // The tree is unlabeled: the dependency is always undefined.
    val relation = SyntacticRelation(
      governor = governorId,
      attachmentScore = this.dependencyTree.getAttachmentScore(tokenId),
      dependency = Undefined(direction = SyntacticDependency.Direction.NULL))

    if (sourcePosition == null)
      return WordTrace(
        id = tokenId,
        form = sourceToken.form,
        pos = null,
        morphologies = listOf(),
        contextMorphologies = listOf(), // TODO: set it
        syntacticRelation = relation,
        coReferences = null, // TODO: set it
        semanticRelations = null)

    return Word(
      id = tokenId,
      form = sourceToken.form,
      position = sourcePosition,
      pos = null,
      morphologies = listOf(),
      contextMorphologies = listOf(), // TODO: set it
      syntacticRelation = relation,
      coReferences = null, // TODO: set it
      semanticRelations = null) // TODO: set it
  }
}
81 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/BaseStatistics.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.statistics
9 |
10 | import com.kotlinnlp.utils.stats.StatMetric
11 |
/**
 * The base statistics of a parsing evaluation.
 *
 * @property las the labeled attachment score
 * @property uas the unlabeled attachment score
 * @property ps the POS tag accuracy score
 * @property ds the deprel accuracy score
 * @property slas the sentence labeled attachment score
 * @property suas the sentence unlabeled attachment score
 */
open class BaseStatistics(
  val las: StatMetric,
  val uas: StatMetric,
  val ps: StatMetric,
  val ds: StatMetric,
  val slas: StatMetric,
  val suas: StatMetric
) {

  /**
   * @return a multi-line report of these statistics, one metric per line
   */
  override fun toString(): String {

    // 'trimIndent' removes the common indentation together with the blank first and last lines.
    return """
      - Labeled attachment score: $las
      - Unlabeled attachment score: $uas
      - Deprel accuracy score: $ds
      - POS tag accuracy score: $ps
      - Sentence labeled attachment score: $slas
      - Sentence unlabeled attachment score: $suas
    """.trimIndent()
  }
}
44 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/MetricsCounter.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.statistics
9 |
10 | import com.kotlinnlp.utils.stats.StatMetric
11 |
/**
 * The counters from which the statistic metrics are calculated.
 *
 * @property labeledAttachments the number of correct labeled attachments
 * @property unlabeledAttachments the number of correct unlabeled attachments
 * @property correctPOSTags the number of correct POS tags
 * @property correctDeprels the number of correct deprels
 * @property correctLabeledSentences the number of sentences with all the labeled attachments correct
 * @property correctUnlabeledSentences the number of sentences with all the unlabeled attachments correct
 * @property totalSentences the total amount of sentences
 * @property totalTokens the total amount of tokens
 */
data class MetricsCounter(
  var labeledAttachments: Int = 0,
  var unlabeledAttachments: Int = 0,
  var correctPOSTags: Int = 0,
  var correctDeprels: Int = 0,
  var correctLabeledSentences: Int = 0,
  var correctUnlabeledSentences: Int = 0,
  var totalSentences: Int = 0,
  var totalTokens: Int = 0
) {

  /**
   * Convert the counters into metrics: the token-level ones are related to [totalTokens], the sentence-level ones to
   * [totalSentences].
   *
   * @return the base statistics
   */
  fun toStatistics(): BaseStatistics {

    val las = StatMetric(count = this.labeledAttachments, total = this.totalTokens)
    val uas = StatMetric(count = this.unlabeledAttachments, total = this.totalTokens)
    val ps = StatMetric(count = this.correctPOSTags, total = this.totalTokens)
    val ds = StatMetric(count = this.correctDeprels, total = this.totalTokens)
    val slas = StatMetric(count = this.correctLabeledSentences, total = this.totalSentences)
    val suas = StatMetric(count = this.correctUnlabeledSentences, total = this.totalSentences)

    return BaseStatistics(las = las, uas = uas, ps = ps, ds = ds, slas = slas, suas = suas)
  }
}
48 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/SentenceMetrics.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.statistics
9 |
/**
 * The correctness flags of a single parsed sentence.
 *
 * All the flags are true by default and are expected to be turned off as soon as a wrong attachment is found.
 *
 * @property correctLabeled whether all the attachments of the sentence are correct, deprel labels included
 * @property correctUnlabeled whether all the attachments of the sentence are correct, deprel labels excluded
 * @property correctLabeledNoPunct same as [correctLabeled], ignoring the punctuation tokens
 * @property correctUnlabeledNoPunct same as [correctUnlabeled], ignoring the punctuation tokens
 */
internal data class SentenceMetrics(
  var correctLabeled: Boolean = true,
  var correctUnlabeled: Boolean = true,
  var correctLabeledNoPunct: Boolean = true,
  var correctUnlabeledNoPunct: Boolean = true
)
24 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/statistics/Statistics.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.statistics
9 |
10 | import com.kotlinnlp.utils.stats.StatMetric
11 |
/**
 * The complete parsing statistics: the base ones, calculated on all the tokens, plus the same metrics calculated
 * without considering the punctuation.
 *
 * @property las the labeled attachment score
 * @property uas the unlabeled attachment score
 * @property ps the POS tag accuracy score
 * @property ds the deprel accuracy score
 * @property slas the sentence labeled attachment score
 * @property suas the sentence unlabeled attachment score
 * @property noPunctuation the statistics calculated without considering the punctuation tokens
 */
class Statistics(
  las: StatMetric,
  uas: StatMetric,
  ps: StatMetric,
  ds: StatMetric,
  slas: StatMetric,
  suas: StatMetric,
  val noPunctuation: BaseStatistics
) : BaseStatistics(las = las, uas = uas, ps = ps, ds = ds, slas = slas, suas = suas) {

  /**
   * @return a report composed by the base statistics followed by the ones without punctuation
   */
  override fun toString(): String {

    // The template is un-indented before the reports are inserted, so that their own layout is not altered.
    val template: String = """
      Evaluation stats:
      %s

      Evaluation stats without considering punctuation:
      %s
    """.trimIndent()

    val allTokensReport: String = super.toString()
    val noPunctuationReport: String = this.noPunctuation.toString()

    return template.format(allTokensReport, noPunctuationReport)
  }
}
50 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/CoNLLDependencyParser.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.validator
9 |
10 | import com.kotlinnlp.linguisticdescription.POSTag
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Token as CoNLLToken
13 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
14 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken
15 | import com.kotlinnlp.neuralparser.NeuralParser
16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
17 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
18 | import com.kotlinnlp.neuralparser.language.BaseSentence
19 | import com.kotlinnlp.utils.notEmptyOr
20 |
/**
 * A wrapper of a generic [NeuralParser] that makes it work on sentences in CoNLL format.
 *
 * @property neuralParser the neural parser used to parse the sentences
 * @property sentencePreprocessor the preprocessor applied to a sentence before parsing it (e.g. to perform the
 *                                morphological analysis)
 */
class CoNLLDependencyParser(
  private val neuralParser: NeuralParser<*>,
  private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor()
) {

  /**
   * Parse a CoNLL sentence and copy the result of the parsing into a new CoNLL sentence.
   *
   * @param sentence the sentence to parse, in CoNLL format
   * @param index the index of the sentence within the list of sentences of the input dataset
   *
   * @return a copy of the given sentence in which the heads, the POS tags and the syntactic dependencies of the tokens
   *         are replaced with the parsed ones
   */
  fun parse(sentence: CoNLLSentence, index: Int): CoNLLSentence {

    val inputSentence = this.sentencePreprocessor.convert(BaseSentence.fromCoNLL(sentence, index = index))
    val parsedSentence: MorphoSynSentence = this.neuralParser.parse(inputSentence)

    val annotatedTokens = sentence.tokens.map { conllToken ->

      val parsedToken: MorphoSynToken = parsedSentence.getTokenById(conllToken.id)

      conllToken.copy(
        head = parsedToken.syntacticRelation.governor ?: 0, // Note: the CoNLL root ID is 0
        posList = parsedToken.flatPOS.notEmptyOr { listOf(POSTag(CoNLLToken.EMPTY_FILLER)) },
        syntacticDependencies = parsedToken.flatSyntacticRelations.map { relation -> relation.dependency }
      )
    }

    return sentence.copy(tokens = annotatedTokens)
  }
}
57 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/CoNLLFileValidator.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.validator
9 |
10 | import com.kotlinnlp.conllio.CoNLLUEvaluator
11 | import com.kotlinnlp.conllio.CoNLLWriter
12 | import com.kotlinnlp.conllio.CoNLLXEvaluator
13 | import com.kotlinnlp.conllio.Token as CoNLLToken
14 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
15 | import com.kotlinnlp.neuralparser.NeuralParser
16 | import com.kotlinnlp.neuralparser.utils.loadSentences
17 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
18 | import java.io.File
19 |
/**
 * Validate a system output CoNLL file comparing it to a gold CoNLL file.
 *
 * The gold sentences are parsed with the given parser, the result is written to an output CoNLL file and that file
 * is compared with the gold one through the CoNLL evaluator (the CoNLL-U one if the gold file path ends with
 * ".conllu", otherwise the CoNLL-X one).
 *
 * @param neuralParser a neural parser
 * @param goldFilePath the path of the file containing the gold tree-bank, in CoNLL format.
 * @param outputFilePath the file path of the output CoNLL corpus (default = null -> a temporary file is used)
 * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 */
class CoNLLFileValidator(
  neuralParser: NeuralParser<*>,
  private val goldFilePath: String,
  private val outputFilePath: String? = null,
  private val verbose: Boolean = true
) {

  /**
   * Return a temporary file absolute path.
   * Attention: a new temporary file is created each time this property is read.
   *
   * @return the path of a temporary file generated at runtime
   */
  private val defaultOutputPath: String
    get() {
      // Only the file name can be used as prefix of a temporary file, not the directories of the gold path.
      val prefix: String = File("${this.goldFilePath}_output").name

      return File.createTempFile(prefix, ".conll").path
    }

  /**
   * The CoNLL Evaluator
   */
  private val conllEvaluator = if (this.goldFilePath.endsWith(".conllu")) CoNLLUEvaluator else CoNLLXEvaluator

  /**
   * The parser wrapper to parse sentences in CoNLL format.
   */
  private val conllParser = CoNLLDependencyParser(neuralParser)

  /**
   * Parse the sentences of the gold file and print the statistics resulting from the official CoNLL evaluation
   * script.
   * The statistics are only printed to the standard output, nothing is returned.
   */
  fun evaluate() {

    val parsedSentences: List<CoNLLSentence> = this.parseSentences(sentences = loadSentences(
      type = "validation",
      filePath = this.goldFilePath,
      maxSentences = null,
      skipNonProjective = false))

    print("\nCoNLL official script evaluation:\n%s".format(this.evaluateWithCoNLLScript(parsedSentences)))
  }

  /**
   * Parse the validation CoNLL sentences.
   *
   * @param sentences the gold sentences to parse
   *
   * @return the list of parsed CoNLL sentences, parallel to the given ones
   */
  private fun parseSentences(sentences: List<CoNLLSentence>): List<CoNLLSentence> {

    val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(sentences.size) else null

    if (this.verbose) println("Start parsing of %d sentences:".format(sentences.size))

    return sentences.mapIndexed { i, sentence ->

      progress?.tick()

      this.conllParser.parse(sentence, index = i)
    }
  }

  /**
   * Get the output of the official CoNLL evaluation script.
   *
   * The parsed sentences are written to the [outputFilePath] (or to a new temporary file if it is null) before
   * running the evaluation.
   *
   * @param parsedSentences a list of parsed sentences, parallel to the gold sentences
   *
   * @return the output of the official CoNLL evaluation script
   */
  private fun evaluateWithCoNLLScript(parsedSentences: List<CoNLLSentence>): String? {

    val outputPath: String = this.outputFilePath ?: this.defaultOutputPath

    CoNLLWriter.toFile(sentences = parsedSentences, writeComments = true, outputFilePath = outputPath)

    return this.conllEvaluator.evaluate(systemFilePath = outputPath, goldFilePath = this.goldFilePath)
  }
}
103 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/validator/Validator.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.helpers.validator
9 |
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.conllio.Token as CoNLLToken
12 | import com.kotlinnlp.dependencytree.DependencyTree
13 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
14 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency
15 | import com.kotlinnlp.neuralparser.NeuralParser
16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
17 | import com.kotlinnlp.neuralparser.helpers.statistics.MetricsCounter
18 | import com.kotlinnlp.neuralparser.helpers.statistics.SentenceMetrics
19 | import com.kotlinnlp.neuralparser.helpers.statistics.Statistics
20 | import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
21 |
/**
 * The Validator.
 *
 * It parses a list of gold CoNLL sentences with a [NeuralParser] and measures the accuracy of the parsing, comparing
 * the dependency trees of the parsed sentences with the gold ones.
 *
 * @param neuralParser the neural parser
 * @property sentences the sentences to parse containing the gold annotation (all the heads must be annotated)
 * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 * @property verbose a Boolean indicating if the verbose mode is enabled (default = true)
 *
 * @throws IllegalArgumentException if a gold sentence has not all the heads annotated
 */
class Validator(
  private val neuralParser: NeuralParser<*>,
  val sentences: List<CoNLLSentence>,
  sentencePreprocessor: SentencePreprocessor,
  private val verbose: Boolean = true
) {

  companion object {

    /**
     * The regular expression to match punctuation forms.
     */
    val punctuationRegex = Regex("^[-!\"#%&'()*,./:;?@\\[\\]_{}]+$")
  }

  init {
    require(sentences.all { it.hasAnnotatedHeads() }) {
      "A gold sentence must have a dependency tree with all heads annotated."
    }
  }

  /**
   * A counter of statistic metrics.
   */
  private lateinit var counter: MetricsCounter

  /**
   * A counter of statistic metrics, without considering punctuation.
   */
  private lateinit var counterNoPunct: MetricsCounter

  /**
   * The metrics of the sentence that is currently being evaluated.
   */
  private lateinit var sentenceMetrics: SentenceMetrics

  /**
   * The parser wrapper to parse sentences in CoNLL format.
   */
  private val conllParser = CoNLLDependencyParser(
    neuralParser = this.neuralParser,
    sentencePreprocessor = sentencePreprocessor)

  /**
   * Get statistics about the evaluation of the parsing accuracy on the given [sentences].
   *
   * @throws IllegalArgumentException if a parsed dependency tree has not the same size of its gold
   *
   * @return the statistics of the parsing accuracy
   */
  fun evaluate(): Statistics {

    val parsedSentences: List<CoNLLSentence> = this.parseSentences()

    this.initCounters(parsedSentences)

    this.sentences.zip(parsedSentences).forEach { (goldSentence, parsedSentence) ->

      val goldTree: DependencyTree = this.buildTree(goldSentence)
      val parsedTree: DependencyTree = this.buildTree(parsedSentence, allowCycles = true)

      require(parsedTree.size == goldTree.size) { "The dependency tree and its gold haven't the same size" }

      // The sentence metrics are reset for each sentence and updated token by token.
      this.sentenceMetrics = SentenceMetrics()

      goldSentence.tokens.forEach { this.addTokenMetrics(token = it, parsedTree = parsedTree, goldTree = goldTree) }

      this.updateCorrectSentences()
    }

    return this.buildStats()
  }

  /**
   * @param sentence a CoNLL sentence
   * @param allowCycles if true it allows to create cycles when building the tree
   *
   * @return a new dependency tree based on the given sentence (labeled only if the labelling is enabled in the parser)
   */
  private fun buildTree(sentence: CoNLLSentence, allowCycles: Boolean = false): DependencyTree =
    if (this.neuralParser.labellingEnabled)
      DependencyTree.Labeled(sentence = sentence, allowCycles = allowCycles)
    else
      DependencyTree.Unlabeled(sentence = sentence, allowCycles = allowCycles)

  /**
   * Parse the validation CoNLL sentences.
   *
   * @return the list of parsed CoNLL sentences, parallel to the gold [sentences]
   */
  private fun parseSentences(): List<CoNLLSentence> {

    val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(this.sentences.size) else null

    if (this.verbose) println("Start parsing of %d sentences:".format(this.sentences.size))

    return this.sentences.mapIndexed { i, sentence ->

      progress?.tick()

      this.conllParser.parse(sentence, index = i)
    }
  }

  /**
   * Initialize the metrics counters.
   *
   * Note: the total tokens of the counter without punctuation are not set here, they are counted token by token in
   * [addTokenMetrics].
   *
   * @param parsedSentences a list of parsed sentences
   */
  private fun initCounters(parsedSentences: List<CoNLLSentence>) {

    this.counter = MetricsCounter()
    this.counterNoPunct = MetricsCounter()

    this.counter.totalSentences = parsedSentences.size
    this.counterNoPunct.totalSentences = parsedSentences.size
    this.counter.totalTokens = parsedSentences.sumBy { it.tokens.count() }
  }

  /**
   * Add the statistic metrics of a given [token].
   *
   * @param token a token of a sentence
   * @param parsedTree the dependency tree of the parsed sentence
   * @param goldTree the gold dependency tree of the parsed sentence
   */
  private fun addTokenMetrics(token: CoNLLToken, parsedTree: DependencyTree, goldTree: DependencyTree) {

    val isNotPunct: Boolean = !punctuationRegex.matches(token.form)

    // The configurations (and so the dependencies) are null when the trees are unlabeled.
    val parsedConfig: GrammaticalConfiguration? = (parsedTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
    val goldConfig: GrammaticalConfiguration? = (goldTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
    val parsedDependencies: List<SyntacticDependency>? = parsedConfig?.components?.map { it.syntacticDependency }
    val goldDependencies: List<SyntacticDependency>? = goldConfig?.components?.map { it.syntacticDependency }

    if (isNotPunct) this.counterNoPunct.totalTokens++

    if (parsedTree.getHead(token.id) == goldTree.getHead(token.id)) {

      this.addCorrectAttachment(isNotPunct)

      if (parsedDependencies == goldDependencies)
        this.addCorrectLabeledAttachment(isNotPunct)
      else
        this.addIncorrectLabeledAttachment(isNotPunct)

    } else {
      this.addIncorrectAttachment(isNotPunct)
      this.addIncorrectLabeledAttachment(isNotPunct)
    }

    if (parsedConfig?.components?.map { it.pos } == goldConfig?.components?.map { it.pos })
      this.addCorrectPOSTag(isNotPunct)

    if (this.haveSameDeprels(parsedDependencies, goldDependencies))
      this.addCorrectDeprel(isNotPunct)
  }

  /**
   * Compare the parsed dependencies of a token with the gold ones.
   *
   * Two lists are considered the same only if they contain the same number of dependencies: without this check the
   * 'zip' would silently ignore the exceeding elements of the longer list (and would match anything with an empty
   * list).
   *
   * @param parsedDependencies the parsed dependencies of the components of a token (null if the tree is unlabeled)
   * @param goldDependencies the gold dependencies of the components of a token (null if the tree is unlabeled)
   *
   * @return true if the dependencies are both null, or if they are pairwise equal (also softly), otherwise false
   */
  private fun haveSameDeprels(parsedDependencies: List<SyntacticDependency>?,
                              goldDependencies: List<SyntacticDependency>?): Boolean = when {
    parsedDependencies == null || goldDependencies == null -> parsedDependencies == goldDependencies
    parsedDependencies.size != goldDependencies.size -> false
    else -> parsedDependencies == goldDependencies ||
      parsedDependencies.zip(goldDependencies).all { (parsed, gold) -> parsed.softEquals(gold) }
  }

  /**
   * Add a correct attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addCorrectAttachment(isNotPunct: Boolean) {

    this.counter.unlabeledAttachments++

    if (isNotPunct) this.counterNoPunct.unlabeledAttachments++
  }

  /**
   * Add an incorrect attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addIncorrectAttachment(isNotPunct: Boolean) {

    this.sentenceMetrics.correctUnlabeled = false

    if (isNotPunct) this.sentenceMetrics.correctUnlabeledNoPunct = false
  }

  /**
   * Add a correct labeled attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addCorrectLabeledAttachment(isNotPunct: Boolean) {

    this.counter.labeledAttachments++

    if (isNotPunct) this.counterNoPunct.labeledAttachments++
  }

  /**
   * Add an incorrect labeled attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addIncorrectLabeledAttachment(isNotPunct: Boolean) {

    this.sentenceMetrics.correctLabeled = false

    if (isNotPunct) this.sentenceMetrics.correctLabeledNoPunct = false
  }

  /**
   * Add a correct POS tag to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the POS tag is related to a non-punctuation token
   */
  private fun addCorrectPOSTag(isNotPunct: Boolean) {

    this.counter.correctPOSTags++

    if (isNotPunct) this.counterNoPunct.correctPOSTags++
  }

  /**
   * Add a correct deprel to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the deprel is related to a non-punctuation token
   */
  private fun addCorrectDeprel(isNotPunct: Boolean) {

    this.counter.correctDeprels++

    if (isNotPunct) this.counterNoPunct.correctDeprels++
  }

  /**
   * Update the counters of correct sentences with the current [sentenceMetrics].
   */
  private fun updateCorrectSentences() {

    if (this.sentenceMetrics.correctLabeled) this.counter.correctLabeledSentences++
    if (this.sentenceMetrics.correctUnlabeled) this.counter.correctUnlabeledSentences++
    if (this.sentenceMetrics.correctLabeledNoPunct) this.counterNoPunct.correctLabeledSentences++
    if (this.sentenceMetrics.correctUnlabeledNoPunct) this.counterNoPunct.correctUnlabeledSentences++
  }

  /**
   * Build the statistics related to the current counted metrics.
   *
   * @return the statistics calculated on all the tokens, including the ones calculated without the punctuation
   */
  private fun buildStats(): Statistics {

    val punctStats = this.counter.toStatistics()
    val noPunctStats = this.counterNoPunct.toStatistics()

    return Statistics(
      las = punctStats.las,
      uas = punctStats.uas,
      ps = punctStats.ps,
      ds = punctStats.ds,
      slas = punctStats.slas,
      suas = punctStats.suas,
      noPunctuation = noPunctStats)
  }
}
290 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/BaseSentence.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
12 | import com.kotlinnlp.linguisticdescription.sentence.SentenceIdentificable
13 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
14 |
15 | /**
16 | * A base real sentence, identified by an id and located in the original text by a position.
17 | *
18 | * @property id the id of the sentence, unique within a list of sentences
19 | * @property tokens the list of tokens that compose the sentence
20 | * @property position the position of this sentence in the original text
21 | */
22 | data class BaseSentence(
23 | val id: Int,
24 | override val tokens: List,
25 | override val position: Position
26 | ) : RealSentence, SentenceIdentificable() {
27 |
28 | companion object {
29 |
30 | /**
31 | * Convert a CoNLL sentence to a [BaseSentence] that spans from the start of its first token to the end of its last one.
32 | *
33 | * @param sentence a CoNLL sentence (with no tokens the calls to `first()` and `last()` throw a NoSuchElementException)
34 | * @param index the index of the sentence within a list of sentences, used both as id and as index of the position
35 | *
36 | * @return a real sentence of real tokens
37 | */
38 | fun fromCoNLL(sentence: CoNLLSentence, index: Int): BaseSentence {
39 |
40 | val baseTokens = sentence.tokens.toBaseTokens()
41 |
42 | return BaseSentence(
43 | id = index, // the index is unique within a list of sentences
44 | tokens = baseTokens,
45 | position = Position(
46 | index = index,
47 | start = baseTokens.first().position.start, // the sentence starts where its first token starts
48 | end = baseTokens.last().position.end)) // the sentence ends where its last token ends
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/BaseToken.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken
11 | import com.kotlinnlp.linguisticdescription.sentence.token.TokenIdentificable
12 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
13 |
/**
 * The base implementation of a real token, identified by an id.
 *
 * @property id the id of the token, an incremental integer starting from 0 within a sentence
 * @property form the form of the token
 * @property position the position of the token in the original text
 */
data class BaseToken(
  override val id: Int,
  override val form: String,
  override val position: Position
) : RealToken, TokenIdentificable
26 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/CorpusDictionary.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.google.common.collect.HashMultimap
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Token as CoNLLToken
13 | import com.kotlinnlp.linguisticdescription.POSTag
14 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
15 | import com.kotlinnlp.utils.DictionarySet
16 | import java.io.Serializable
17 |
18 | /**
19 |  * The dictionary of the words, the POS tags and the grammatical configurations collected from a CoNLL corpus.
20 |  */
21 | class CorpusDictionary : Serializable {
22 |
23 |   companion object {
24 |
25 |     /**
26 |      * Private val used to serialize the class (needed by Serializable).
27 |      */
28 |     @Suppress("unused")
29 |     private const val serialVersionUID: Long = 1L
30 |
31 |     /**
32 |      * Create a new dictionary populated with the information of all the tokens of the given [sentences]
33 |      * (normalized forms, POS tags and grammatical configurations).
34 |      *
35 |      * @param sentences a list of CoNLL sentences
36 |      *
37 |      * @return a new corpus dictionary
38 |      */
39 |     operator fun invoke(sentences: List<CoNLLSentence>): CorpusDictionary {
40 |
41 |       val dictionary = CorpusDictionary()
42 |
43 |       sentences.forEach { it.tokens.forEach { token -> dictionary.addInfo(token) } }
44 |
45 |       return dictionary
46 |     }
47 |   }
48 |
49 |   /**
50 |    * The set of the normalized forms of the tokens.
51 |    */
52 |   val words = DictionarySet<String>()
53 |
54 |   /**
55 |    * The map of normalized forms to the POS tags lists with which they have been seen.
56 |    */
57 |   val formsToPosTags: HashMultimap<String, List<POSTag>> = HashMultimap.create()
58 |
59 |   /**
60 |    * The dictionary set of all the grammatical configurations seen in the corpus.
61 |    */
62 |   val grammaticalConfigurations = DictionarySet<GrammaticalConfiguration>()
63 |
64 |   /**
65 |    * Register the normalized form, the POS tags and the grammatical configuration of a given [token]. If the POS
66 |    * and the dependencies lists differ in size, the shorter must hold a single element, reused for each component.
67 |    * @param token the token of a sentence
68 |    */
69 |   private fun addInfo(token: CoNLLToken) {
70 |
71 |     this.words.add(token.normalizedForm)
72 |
73 |     this.formsToPosTags.put(token.normalizedForm, token.posList)
74 |
75 |     this.grammaticalConfigurations.add(GrammaticalConfiguration(*Array(
76 |       size = maxOf(token.posList.size, token.syntacticDependencies.size),
77 |       init = { i -> GrammaticalConfiguration.Component(
78 |         pos = token.posList.getOrElse(i) { token.posList.single() },
79 |         syntacticDependency = token.syntacticDependencies.getOrElse(i) { token.syntacticDependencies.single() })
80 |       })))
81 |   }
82 | }
83 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/Extensions.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
11 | import com.kotlinnlp.conllio.Token as CoNLLToken
12 |
13 | /**
14 |  * @return the base tokens of this list, with char offsets (inclusive) computed as if forms were joined by one space
15 |  */
16 | fun List<CoNLLToken>.toBaseTokens(): List<BaseToken> {
17 |
18 |   var end = -2 // so that the first token starts at offset 0
19 |
20 |   return this.mapIndexed { i, token ->
21 |
22 |     val start = end + 2 // each couple of consecutive tokens is separated by a spacing char
23 |     end = start + token.form.length - 1
24 |
25 |     BaseToken(
26 |       id = token.id,
27 |       form = token.form,
28 |       position = Position(index = i, start = start, end = end)
29 |     )
30 |   }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingSentence.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis
12 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies
13 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
14 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence
15 | import com.kotlinnlp.linguisticdescription.sentence.SentenceIdentificable
16 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
17 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector
18 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
19 |
20 | /**
21 |  * The sentence used as input of the [com.kotlinnlp.neuralparser.NeuralParser].
22 |  * NOTE(review): the supertypes arguments are assumed to be [ParsingToken], as for [tokens] — confirm.
23 |  * @property tokens the list of tokens of the sentence
24 |  * @property morphoAnalysis the morphological analysis of the tokens (can be null)
25 |  * @property position the position of the sentence in the text
26 |  * @param labelerSelector the selector of the configurations and the morphologies that are valid for a token
27 |  */
28 | class ParsingSentence(
29 |   override val tokens: List<ParsingToken>,
30 |   override val morphoAnalysis: MorphologicalAnalysis? = null,
31 |   override val position: Position,
32 |   private val labelerSelector: LabelerSelector
33 | ) : MorphoSentence<ParsingToken>, RealSentence<ParsingToken>, SentenceIdentificable<ParsingToken>() {
34 |
35 |   /**
36 |    * Check whether the morphologies of the token are compatible with the given configuration [c].
37 |    * Middle multi-words morphologies are compared partially (only with the "CONTIN" components).
38 |    * The [morphoAnalysis] must not be null when this method is called.
39 |    * @param c the grammatical configuration
40 |    * @param tokenIndex the index of a token of the sentence
41 |    *
42 |    * @return true if at least one morphology of the token is compatible with the configuration, otherwise false
43 |    */
44 |   fun areConfigurationCompatible(c: GrammaticalConfiguration, tokenIndex: Int): Boolean =
45 |     this.morphoAnalysis!!.startMorphologies[tokenIndex].any { c.isCompatible(it) } ||
46 |       this.morphoAnalysis.middleMWMorphologies[tokenIndex].any { c.isPartiallyCompatible(it) }
47 |
48 |   /**
49 |    * @param c the grammatical configuration
50 |    * @param tokenIndex the index of a token of the sentence (the [morphoAnalysis] must not be null)
51 |    *
52 |    * @return the token morphologies (including the multi-words) that are compatible with the given configuration
53 |    */
54 |   fun getCompatibleMorphologies(c: GrammaticalConfiguration, tokenIndex: Int): Morphologies = Morphologies(
55 |     this.morphoAnalysis!!.allMorphologies[tokenIndex].filter {
56 |       c.isCompatible(it) // TODO: || c.isPartiallyCompatible(it)
57 |     })
58 |
59 |   /**
60 |    * Get the scored grammatical configurations that the labeler selector considers valid for a given attachment.
61 |    *
62 |    * @param tokenIndex the index of the token to which one of the [configurations] must be assigned
63 |    * @param headIndex the index of the token head (can be null)
64 |    * @param configurations the list of grammatical configurations, sorted by descending score
65 |    *
66 |    * @return the valid grammatical configurations for the given attachment
67 |    */
68 |   fun getValidConfigurations(tokenIndex: Int,
69 |                              headIndex: Int?,
70 |                              configurations: List<ScoredGrammar>): List<ScoredGrammar> =
71 |     this.labelerSelector.getValidConfigurations(
72 |       sentence = this,
73 |       tokenIndex = tokenIndex,
74 |       headIndex = headIndex,
75 |       configurations = configurations)
76 |
77 |   /**
78 |    * Get the morphologies of a token that the labeler selector considers valid for a grammatical configuration.
79 |    *
80 |    * @param tokenIndex the index of a token of the sentence
81 |    * @param configuration the grammatical configuration of the token
82 |    *
83 |    * @return the morphologies compatible with the given grammatical configuration
84 |    */
85 |   fun getValidMorphologies(tokenIndex: Int,
86 |                            configuration: GrammaticalConfiguration): Morphologies =
87 |     this.labelerSelector.getValidMorphologies(
88 |       sentence = this,
89 |       tokenIndex = tokenIndex,
90 |       configuration = configuration)
91 | }
92 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingToken.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.language
9 |
10 | import com.kotlinnlp.linguisticdescription.POSTag
11 | import com.kotlinnlp.linguisticdescription.sentence.token.*
12 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position
13 |
14 | /**
15 |  * The token of the [ParsingSentence].
16 |  *
17 |  * @property id the id of the token, unique within its sentence
18 |  * @property form the form
19 |  * @property pos the list of part-of-speech tags associated to the token (more for composite tokens, can be null)
20 |  * @property position the position of the token in the text (null if it is a trace)
21 |  */
22 | data class ParsingToken(
23 |   override val id: Int,
24 |   override val form: String,
25 |   val pos: List<POSTag>? = null, // TODO: find a better solution
26 |   val position: Position?
27 | ) : FormToken, TokenIdentificable
28 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRModel.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
9 |
10 | import com.kotlinnlp.lssencoder.LSSModel
11 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.LabelerModel
12 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
13 | import com.kotlinnlp.neuralparser.NeuralParserModel
14 | import com.kotlinnlp.neuralparser.language.CorpusDictionary
15 | import com.kotlinnlp.neuralparser.language.ParsingSentence
16 | import com.kotlinnlp.neuralparser.language.ParsingToken
17 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
18 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge
19 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel
20 | import com.kotlinnlp.utils.Serializer
21 | import java.io.InputStream
22 |
23 | /**
24 |  * The model of the [LHRParser].
25 |  * The language given to the base [NeuralParserModel] is the one of the [lssModel].
26 |  *
27 |  * @param corpusDictionary the corpus dictionary that provides the grammatical configurations of the labeler
28 |  * @property lssModel the model of the LSS encoder
29 |  * @property useLabeler whether to use the labeler (if false the [labelerModel] is null)
30 |  * @property lossCriterionType the training mode of the labeler
31 |  * @property predictPosTags whether to predict the POS tags together with the Deprels
32 |  */
33 | class LHRModel(
34 |   corpusDictionary: CorpusDictionary,
35 |   val lssModel: LSSModel<ParsingToken, ParsingSentence>,
36 |   val useLabeler: Boolean,
37 |   val lossCriterionType: LossCriterionType,
38 |   val predictPosTags: Boolean
39 | ) : NeuralParserModel(lssModel.language) {
40 |
41 |   companion object {
42 |
43 |     /**
44 |      * Private val used to serialize the class (needed by Serializable).
45 |      */
46 |     @Suppress("unused")
47 |     private const val serialVersionUID: Long = 1L
48 |
49 |     /**
50 |      * Read a [LHRModel] (serialized) from an input stream and decode it.
51 |      *
52 |      * @param inputStream the [InputStream] from which to read the serialized [LHRModel]
53 |      *
54 |      * @return the [LHRModel] read from [inputStream] and decoded
55 |      */
56 |     fun load(inputStream: InputStream): LHRModel = Serializer.deserialize(inputStream)
57 |   }
58 |
59 |   /**
60 |    * The model of the Labeler (null if [useLabeler] is false).
61 |    */
62 |   val labelerModel: LabelerModel? = if (this.useLabeler)
63 |     LabelerModel(
64 |       contextEncodingSize = this.lssModel.contextVectorsSize,
65 |       grammaticalConfigurations = corpusDictionary.grammaticalConfigurations,
66 |       lossCriterionType = this.lossCriterionType)
67 |   else
68 |     null
69 |
70 |   /**
71 |    * The model of the pointer network used for the positional encoding.
72 |    */
73 |   val pointerNetworkModel = PointerNetworkModel(
74 |     inputSize = this.lssModel.contextVectorsSize,
75 |     vectorSize = this.lssModel.contextVectorsSize,
76 |     mergeConfig = AffineMerge(outputSize = 100, activationFunction = Tanh))
77 |
78 |   /**
79 |    * @return the string representation of this model (the first row is labeled with the tokens encoder class name)
80 |    */
81 |   override fun toString(): String = """
82 |     %-33s : %s
83 |     %-33s : %s
84 |     %-33s : %s
85 |     %-33s : %s
86 |     %-33s : %s
87 |   """.trimIndent().format(
88 |     this.lssModel.tokensEncoderWrapperModel.model::class.simpleName, this.lssModel.tokensEncoderWrapperModel.model,
89 |     "Context Encoder", this.lssModel.contextBiRNNConfig,
90 |     "Heads Encoder", this.lssModel.headsBiRNNConfig,
91 |     "Labeler training mode", this.lossCriterionType,
92 |     "Predict POS tags", this.predictPosTags
93 |   )
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRParser.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
12 | import com.kotlinnlp.lssencoder.LSSEncoder
13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
14 | import com.kotlinnlp.lssencoder.decoder.CosineDecoder
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
16 | import com.kotlinnlp.neuralparser.NeuralParser
17 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.LabeledMorphoSynBuilder
18 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.UnlabeledMorphoSynBuilder
19 | import com.kotlinnlp.neuralparser.language.ParsingSentence
20 | import com.kotlinnlp.neuralparser.language.ParsingToken
21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.GreedyDependencyTreeBuilder
22 |
23 | /**
24 |  * The Latent Head Representation (LHR) Parser.
25 |  *
26 |  * Implemented as described in the following publication:
27 |  * [Non-Projective Dependency Parsing via Latent Heads Representation (LHR)](https://arxiv.org/abs/1802.02116)
28 |  *
29 |  * @property model the parser model
30 |  * @param contextDropout the dropout probability of the context encodings (default 0.0)
31 |  * @param headsDropout the dropout probability of the latent heads encodings (default 0.0)
32 |  */
33 | class LHRParser(
34 |   override val model: LHRModel,
35 |   contextDropout: Double = 0.0,
36 |   headsDropout: Double = 0.0
37 | ) : NeuralParser<LHRModel> {
38 |
39 |   /**
40 |    * Whether this parser executes the morpho-syntactic labelling (it mirrors the `useLabeler` flag of the model).
41 |    */
42 |   override val labellingEnabled: Boolean = this.model.useLabeler
43 |
44 |   /**
45 |    * The Encoder of the Latent Syntactic Structure.
46 |    */
47 |   private val lssEncoder =
48 |     LSSEncoder(model = this.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout)
49 |
50 |   /**
51 |    * The labeler (null if the model has no labeler model).
52 |    */
53 |   private val labeler: Labeler? = this.model.labelerModel?.let { Labeler(it) }
54 |
55 |   /**
56 |    * Parse a sentence, returning the morpho-syntactic sentence built from its dependency tree.
57 |    * The dependency tree is obtained by decoding a latent syntactic structure.
58 |    * If the tree built is labeled the sentence is built with the labeled builder, otherwise with the unlabeled one.
59 |    *
60 |    * @param sentence a parsing sentence
61 |    *
62 |    * @return the morpho-syntactic sentence built from the dependency tree predicted for the given [sentence]
63 |    */
64 |   override fun parse(sentence: ParsingSentence): MorphoSynSentence {
65 |
66 |     val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.lssEncoder.forward(sentence)
67 |
68 |     val dependencyTree: DependencyTree = GreedyDependencyTreeBuilder(
69 |       lss = lss,
70 |       scoresMap = CosineDecoder().decode(lss),
71 |       labeler = this.labeler
72 |     ).build()
73 |
74 |     return when (dependencyTree) {
75 |
76 |       is DependencyTree.Labeled ->
77 |         LabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = dependencyTree).buildSentence()
78 |
79 |       is DependencyTree.Unlabeled ->
80 |         UnlabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = dependencyTree).buildSentence()
81 |     }
82 |   }
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTrainer.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
9 |
10 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
11 | import com.kotlinnlp.dependencytree.DependencyTree
12 | import com.kotlinnlp.lssencoder.LSSEncoder
13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
14 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
15 | import com.kotlinnlp.neuralparser.helpers.Trainer
16 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
17 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
18 | import com.kotlinnlp.neuralparser.language.ParsingSentence
19 | import com.kotlinnlp.neuralparser.language.ParsingToken
20 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder
21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder.Companion.calculateErrors
22 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator
23 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod
24 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
25 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer
26 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor
27 | import com.kotlinnlp.simplednn.simplemath.assignSum
28 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
29 | import com.kotlinnlp.simplednn.utils.scheduling.BatchScheduling
30 | import com.kotlinnlp.simplednn.utils.scheduling.EpochScheduling
31 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling
32 |
33 | /**
34 |  * The trainer of the [LHRParser].
35 |  *
36 |  * @param parser the LHR parser to train
37 |  * @param batchSize the size of the batches of sentences
38 |  * @param epochs the number of training epochs
39 |  * @param validator the validation helper (if it is null no validation is done after each epoch)
40 |  * @param modelFilename the name of the file in which to save the best trained model
41 |  * @param updateMethod the update method shared to all the parameters of the parser (Learning Rate, ADAM, AdaGrad, ...)
42 |  * @param contextDropout the dropout probability of the context encodings (default 0.25)
43 |  * @param headsDropout the dropout probability of the latent heads encodings (default 0.25)
44 |  * @param labelerDropout the dropout probability of the labeler (default 0.25)
45 |  * @param skipPunctuationErrors whether to ignore the latent heads errors of the tokens for which `isComma` is true
46 |  * @param usePositionalEncodingErrors whether to calculate and propagate the positional encoding errors
47 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
48 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
49 |  */
50 | class LHRTrainer(
51 |   private val parser: LHRParser,
52 |   private val batchSize: Int,
53 |   private val epochs: Int,
54 |   validator: Validator?,
55 |   modelFilename: String,
56 |   private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999),
57 |   contextDropout: Double = 0.25,
58 |   headsDropout: Double = 0.25,
59 |   labelerDropout: Double = 0.25,
60 |   private val skipPunctuationErrors: Boolean,
61 |   usePositionalEncodingErrors: Boolean,
62 |   sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
63 |   verbose: Boolean = true
64 | ) : Trainer(
65 |   neuralParser = parser,
66 |   batchSize = batchSize,
67 |   epochs = epochs,
68 |   validator = validator,
69 |   modelFilename = modelFilename,
70 |   minRelevantErrorsCountToUpdate = 1,
71 |   sentencePreprocessor = sentencePreprocessor,
72 |   verbose = verbose
73 | ) {
74 |
75 |   /**
76 |    * The encoder of the Latent Syntactic Structure (it shares the LSS model of the parser).
77 |    */
78 |   private val lssEncoder =
79 |     LSSEncoder(model = this.parser.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout)
80 |
81 |   /**
82 |    * The labeler (null if the model of the parser has no labeler model).
83 |    */
84 |   private val labeler: Labeler? = this.parser.model.labelerModel?.let { Labeler(model = it, dropout = labelerDropout) }
85 |
86 |   /**
87 |    * The positional encoder (null if the positional encoding errors are not used).
88 |    */
89 |   private val positionalEncoder: PositionalEncoder? = if (usePositionalEncodingErrors)
90 |     PositionalEncoder(this.parser.model.pointerNetworkModel)
91 |   else
92 |     null
93 |
94 |   /**
95 |    * The pointer network optimizer.
96 |    */
97 |   private val pointerNetworkOptimizer = ParamsOptimizer(this.updateMethod)
98 |
99 |   /**
100 |    * The optimizer of the LSS encoder.
101 |    */
102 |   private val lssEncoderOptimizer = ParamsOptimizer(this.updateMethod)
103 |
104 |   /**
105 |    * The optimizer of the labeler (can be null).
106 |    */
107 |   private val labelerOptimizer: ParamsOptimizer? = this.parser.model.labelerModel?.let {
108 |     ParamsOptimizer(this.updateMethod)
109 |   }
110 |
111 |   /**
112 |    * The epoch counter (incremented at each new epoch).
113 |    */
114 |   private var epochCount: Int = 0
115 |
116 |   /**
117 |    * Group the optimizers all together (the one of the labeler can be null).
118 |    */
119 |   private val optimizers: List<ParamsOptimizer?> = listOf(
120 |     this.lssEncoderOptimizer,
121 |     this.labelerOptimizer,
122 |     this.pointerNetworkOptimizer)
123 |
124 |   /**
125 |    * @return a string representation of the configuration of this Trainer
126 |    */
127 |   override fun toString(): String = """
128 |     %-33s : %s
129 |     %-33s : %s
130 |     %-33s : %s
131 |   """.trimIndent().format(
132 |     "Epochs", this.epochs,
133 |     "Batch size", this.batchSize,
134 |     "Skip punctuation errors", this.skipPunctuationErrors
135 |   )
136 |
137 |   /**
138 |    * Beat the occurrence of a new batch (notified to the update method only if it supports the batch scheduling).
139 |    */
140 |   override fun newBatch() {
141 |     if (this.updateMethod is BatchScheduling) this.updateMethod.newBatch()
142 |   }
143 |
144 |   /**
145 |    * Beat the occurrence of a new epoch (notified to the update method only if it supports the epoch scheduling).
146 |    */
147 |   override fun newEpoch() {
148 |
149 |     if (this.updateMethod is EpochScheduling) this.updateMethod.newEpoch()
150 |
151 |     this.epochCount++
152 |   }
153 |
154 |   /**
155 |    * Update the model parameters, calling the update of each non-null optimizer.
156 |    */
157 |   override fun update() {
158 |     this.optimizers.forEach { it?.update() }
159 |   }
160 |
161 |   /**
162 |    * @return the count of the relevant errors (always 1)
163 |    */
164 |   override fun getRelevantErrorsCount(): Int = 1
165 |
166 |   /**
167 |    * Method to call before learning a new sentence (it notifies a new example to the update method, if supported).
168 |    */
169 |   private fun beforeSentenceLearning() {
170 |     if (this.updateMethod is ExampleScheduling) this.updateMethod.newExample()
171 |   }
172 |
173 |   /**
174 |    * Train the parser with the given [sentence] and [goldTree].
175 |    *
176 |    * @param sentence the sentence
177 |    * @param goldTree the gold tree of the sentence
178 |    */
179 |   override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) {
180 |
181 |     this.beforeSentenceLearning()
182 |
183 |     val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.lssEncoder.forward(sentence)
184 |     val latentHeadsErrors = calculateLatentHeadsErrors(lss, goldTree)
185 |
186 |     val labelerErrors: List<DenseNDArray>? = this.labeler?.let {
187 |       val labelerPrediction: List<DenseNDArray> = it.forward(Labeler.Input(lss, goldTree))
188 |       this.parser.model.labelerModel?.calculateLoss(labelerPrediction, goldTree)
189 |     }
190 |
191 |     val positionalEncoderErrors: PointerNetworkProcessor.InputErrors? = this.positionalEncoder?.let {
192 |       it.propagateErrors(calculateErrors(it.forward(lss.contextVectors)), this.pointerNetworkOptimizer)
193 |     }
194 |
195 |     this.propagateErrors(
196 |       latentHeadsErrors = latentHeadsErrors,
197 |       labelerErrors = labelerErrors,
198 |       positionalEncoderErrors = positionalEncoderErrors)
199 |   }
200 |
201 |   /**
202 |    * Calculate the errors of the latent heads as MSE errors respect to the expected latent heads.
203 |    *
204 |    * @param lss the latent syntactic structure
205 |    * @param goldTree the gold tree of the sentence
206 |    *
207 |    * @return the errors of the latent heads
208 |    */
209 |   private fun calculateLatentHeadsErrors(lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
210 |                                          goldTree: DependencyTree): List<DenseNDArray> =
211 |     MSECalculator().calculateErrors(
212 |       outputSequence = lss.latentHeads,
213 |       outputGoldSequence = this.getExpectedLatentHeads(lss, goldTree))
214 |
215 |   /**
216 |    * Return a list containing the expected latent heads, one for each token of the sentence.
217 |    *
218 |    * @param lss the latent syntactic structure
219 |    * @param goldTree the gold tree of the sentence
220 |    *
221 |    * @return the expected latent heads
222 |    */
223 |   private fun getExpectedLatentHeads(lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
224 |                                      goldTree: DependencyTree): List<DenseNDArray> =
225 |
226 |     lss.sentence.tokens.map { token ->
227 |
228 |       val goldHeadId: Int? = goldTree.getHead(token.id)
229 |
230 |       when {
231 |         goldHeadId == null -> lss.virtualRoot
232 |         this.skipPunctuationErrors && token.isComma -> lss.getLatentHeadById(token.id) // expected = actual: no errors
233 |         else -> lss.getContextVectorById(goldHeadId)
234 |       }
235 |     }
236 |
237 |   /**
238 |    * Propagate the errors through the encoders.
239 |    *
240 |    * @param latentHeadsErrors the latent heads errors
241 |    * @param labelerErrors the labeler errors (must not be null if the labeler is used)
242 |    * @param positionalEncoderErrors the positional encoder errors
243 |    */
244 |   private fun propagateErrors(latentHeadsErrors: List<DenseNDArray>,
245 |                               labelerErrors: List<DenseNDArray>?,
246 |                               positionalEncoderErrors: PointerNetworkProcessor.InputErrors?) {
247 |
248 |     val contextVectorsErrors: List<DenseNDArray> = latentHeadsErrors.map { it.zerosLike() }
249 |
250 |     positionalEncoderErrors?.let { contextVectorsErrors.assignSum(it.inputVectorsErrors) }
251 |
252 |     this.labeler?.propagateErrors(labelerErrors!!, this.labelerOptimizer!!, copy = false)?.let { labelerInputErrors ->
253 |       contextVectorsErrors.assignSum(labelerInputErrors.contextErrors)
254 |       this.propagateRootErrors(labelerInputErrors.rootErrors)
255 |     }
256 |
257 |     this.lssEncoder.backward(outputErrors = LSSEncoder.OutputErrors(
258 |       size = latentHeadsErrors.size,
259 |       contextVectors = contextVectorsErrors,
260 |       latentHeads = latentHeadsErrors))
261 |
262 |     this.lssEncoderOptimizer.accumulate(this.lssEncoder.getParamsErrors(copy = false))
263 |   }
264 |
265 |   /**
266 |    * Propagate the [errors] through the virtual root embedding, applying the update method directly to it.
267 |    *
268 |    * @param errors the errors
269 |    */
270 |   private fun propagateRootErrors(errors: DenseNDArray) {
271 |     this.updateMethod.update(array = this.parser.model.lssModel.rootEmbedding, errors = errors)
272 |   }
273 | }
274 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTransferLearning.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.lssencoder.LSSEncoder
12 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
13 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
14 | import com.kotlinnlp.neuralparser.helpers.Trainer
15 | import com.kotlinnlp.neuralparser.helpers.validator.Validator
16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
17 | import com.kotlinnlp.neuralparser.language.ParsingSentence
18 | import com.kotlinnlp.neuralparser.language.ParsingToken
19 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator
20 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod
21 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
22 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer
23 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
24 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling
25 |
26 | /**
27 |  * The transfer learning training helper.
28 |  *
29 |  * @param referenceParser the neural parser used as reference
30 |  * @param targetParser the neural parser to train via transfer learning
31 |  * @param epochs the number of training epochs
32 |  * @param validator the validation helper (if it is null no validation is done after each epoch)
33 |  * @param modelFilename the name of the file in which to save the best trained model
34 |  * @param updateMethod the update method (Learning Rate, ADAM, AdaGrad, ...), RADAM by default
35 |  * @param contextDropout the dropout probability of the target context encodings (default 0.0)
36 |  * @param headsDropout the dropout probability of the target latent heads encodings (default 0.0)
37 |  * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
38 |  * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
39 |  */
40 | class LHRTransferLearning(
41 |   private val referenceParser: LHRParser,
42 |   private val targetParser: LHRParser,
43 |   private val epochs: Int,
44 |   validator: Validator?,
45 |   modelFilename: String,
46 |   private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999),
47 |   contextDropout: Double = 0.0,
48 |   headsDropout: Double = 0.0,
49 |   sentencePreprocessor: SentencePreprocessor = BasePreprocessor(),
50 |   verbose: Boolean = true
51 | ) : Trainer(
52 |   neuralParser = targetParser,
53 |   batchSize = 1,
54 |   epochs = epochs,
55 |   validator = validator,
56 |   modelFilename = modelFilename,
57 |   minRelevantErrorsCountToUpdate = 1,
58 |   sentencePreprocessor = sentencePreprocessor,
59 |   verbose = verbose
60 | ) {
61 |
62 |   /**
63 |    * The [LSSEncoder] of the reference parser, built without dropout.
64 |    */
65 |   private val referenceLSSEncoder: LSSEncoder<ParsingToken, ParsingSentence> =
66 |     LSSEncoder(model = this.referenceParser.model.lssModel)
67 |
68 |   /**
69 |    * The [LSSEncoder] of the target parser, built with the given context and heads dropout.
70 |    */
71 |   private val targetLSSEncoder: LSSEncoder<ParsingToken, ParsingSentence> =
72 |     LSSEncoder(model = this.targetParser.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout)
73 |
74 |   /**
75 |    * The optimizer that accumulates and applies the params errors of the target LSS encoder.
76 |    */
77 |   private val targetLSSEncoderOptimizer = ParamsOptimizer(this.updateMethod)
78 |
79 |   /**
80 |    * Train the [targetParser] with the given [sentence].
81 |    * Transfer the knowledge acquired by the LSS encoder of a reference parser to that of the target parser.
82 |    *
83 |    * @param sentence the input sentence
84 |    * @param goldTree the gold tree of the sentence (not used here: the reference context vectors act as gold)
85 |    */
86 |   override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) {
87 |
88 |     this.beforeSentenceLearning()
89 |
90 |     val targetLSS: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.targetLSSEncoder.forward(sentence)
91 |     val refLSS: LatentSyntacticStructure<ParsingToken, ParsingSentence> = this.referenceLSSEncoder.forward(sentence)
92 |
93 |     val contextErrors: List<DenseNDArray> = MSECalculator().calculateErrors(
94 |       outputSequence = targetLSS.contextVectors,
95 |       outputGoldSequence = refLSS.contextVectors)
96 |
97 |     this.targetLSSEncoder.backward(LSSEncoder.OutputErrors(size = sentence.tokens.size, contextVectors = contextErrors))
98 |     this.targetLSSEncoderOptimizer.accumulate(this.targetLSSEncoder.getParamsErrors())
99 |   }
100 |
101 |   /**
102 |    * Method to call before learning a new sentence.
103 |    */
104 |   private fun beforeSentenceLearning() {
105 |     if (this.updateMethod is ExampleScheduling) this.updateMethod.newExample()
106 |   }
107 |
108 |   /**
109 |    * Update the model parameters.
110 |    */
111 |   override fun update() {
112 |     this.targetLSSEncoderOptimizer.update()
113 |   }
114 |
115 |   /**
116 |    * @return the count of the relevant errors (always 1)
117 |    */
118 |   override fun getRelevantErrorsCount(): Int = 1
119 |
120 |   /**
121 |    * @return a string representation of the configuration of this Trainer (only the epochs are reported)
122 |    */
123 |   override fun toString(): String = """
124 |     %-33s : %s
125 |   """.trimIndent().format(
126 |     "Epochs", this.epochs
127 |   )
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/CyclesFixer.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.lssencoder.decoder.ScoredArcs
12 |
13 | /**
14 |  * Naive strategy to fix possible cycles in a [dependencyTree].
15 |  *
16 |  * @param dependencyTree the dependency tree
17 |  * @param scoredArcs the scored arcs between pair of tree elements
18 |  */
19 | internal class CyclesFixer(private val dependencyTree: DependencyTree, private val scoredArcs: ScoredArcs) {
20 |
21 |   /**
22 |    * The set of direct elements of the tree (elements that aren't involved in cycles).
23 |    * It is computed once by [fixCycles], before the first cycle is fixed.
24 |   private lateinit var directElements: Set<Int>
25 |
26 |   /**
27 |    * Fix the cycles of the dependency tree.
28 |    */
29 |   fun fixCycles() {
30 |
31 |     val cycles: List<DependencyTree.Path> = this.dependencyTree.getCycles()
32 |
33 |     this.directElements = this.dependencyTree.elements.toSet() - cycles.toElementsSet()
34 |
35 |     cycles.forEach { this.fixCycle(it) }
36 |   }
37 |
38 |   /**
39 |    * @return the set of the dependents of all the arcs contained in this list of paths
40 |    */
41 |   private fun List<DependencyTree.Path>.toElementsSet(): Set<Int> {
42 |
43 |     val elements = mutableSetOf<Int>()
44 |     this.forEach { path -> elements += path.arcs.map { it.dependent } }
45 |     return elements
46 |   }
47 |
48 |   /**
49 |    * Remove a [cycle] from the dependency tree, re-attaching the dependent of its lowest scoring arc.
50 |    *
51 |    * @param cycle a cycle of the dependency tree
52 |    */
53 |   private fun fixCycle(cycle: DependencyTree.Path) {
54 |
55 |     val dep: Int = this.removeLowestScoringArc(cycle.arcs)
56 |     val (newGov: Int, score: Double) = this.findBestGovernor(dep)
57 |     this.dependencyTree.setArc(dependent = dep, governor = newGov, score = score)
58 |   }
59 |
60 |   /**
61 |    * Remove the lowest scoring arc and return the related dependent to be reattached.
62 |    *
63 |    * @param arcs a list of arcs
64 |    *
65 |    * @return the element to be reattached.
66 |    */
67 |   private fun removeLowestScoringArc(arcs: List<DependencyTree.Arc>): Int {
68 |
69 |     val arc: DependencyTree.Arc = this.getLowestScoringArc(arcs)
70 |     this.dependencyTree.removeArc(dependent = arc.dependent, governor = arc.governor)
71 |     return arc.dependent
72 |   }
73 |
74 |   /**
75 |    * @param arcs a list of arcs (a null-pointer exception is raised if it is empty)
76 |    *
77 |    * @return the lowest scoring arc according to the [scoredArcs].
78 |    */
79 |   private fun getLowestScoringArc(arcs: List<DependencyTree.Arc>): DependencyTree.Arc =
80 |     arcs.minBy { arc -> this.scoredArcs.getScore(dependentId = arc.dependent, governorId = arc.governor) }!!
81 |
82 |   /**
83 |    * Find the best governor for the given element that doesn't introduce a cycle.
84 |    * Only the direct elements are candidates; a null-pointer exception is raised if none of them is valid.
85 |    * @param element an element of the dependency tree
86 |    *
87 |    * @return the new governor id and the related score
88 |    */
89 |   private fun findBestGovernor(element: Int): Pair<Int, Double> {
90 |
91 |     val headsMap: Map<Int, Double> = this.scoredArcs.getHeadsMap(element)
92 |
93 |     val candidates: List<Int> = this.directElements.intersect(headsMap.keys).filter { candidateGov ->
94 |       !this.dependencyTree.introduceCycle(dependent = element, governor = candidateGov)
95 |     }
96 |
97 |     return headsMap.filter { it.key in candidates }.maxBy { it.value }!!.toPair()
98 |   }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/GreedyDependencyTreeBuilder.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
12 | import com.kotlinnlp.lssencoder.decoder.ScoredArcs
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.language.ParsingToken
15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler
16 |
17 | /**
18 |  * A helper that builds the dependency tree with the highest scoring configurations.
19 |  *
20 |  * @param lss the latent syntactic structure of the input sentence
21 |  * @param scoresMap the scored arcs from which the heads are selected
22 |  * @param labeler the labeler used to annotate the tree (if it is null an unlabeled tree is built)
23 |  */
24 | internal class GreedyDependencyTreeBuilder(
25 |   private val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
26 |   private val scoresMap: ScoredArcs,
27 |   private val labeler: Labeler?
28 | ) {
29 |
30 |   /**
31 |    * Build a new dependency tree from the latent syntactic structure [lss], using the possible attachments in the
32 |    * [scoresMap].
33 |    *
34 |    * @return the annotated dependency tree with the highest score, built from the given LSS
35 |    */
36 |   fun build(): DependencyTree =
37 |     if (this.labeler != null)
38 |       DependencyTree.Labeled(this.lss.sentence.tokens.map { it.id }).apply {
39 |         assignHighestScoringHeads()
40 |         fixCycles()
41 |         assignLabels()
42 |       }
43 |     else
44 |       DependencyTree.Unlabeled(this.lss.sentence.tokens.map { it.id }).apply {
45 |         assignHighestScoringHeads()
46 |         fixCycles()
47 |       }
48 |
49 |   /**
50 |    * Assign the heads to this dependency tree using the highest scoring arcs of the [scoresMap].
51 |    * The highest scoring top gets only its attachment score, every other element is attached to its best governor.
52 |    */
53 |   private fun DependencyTree.assignHighestScoringHeads() {
54 |
55 |     val (topId: Int, topScore: Double) = scoresMap.findHighestScoringTop()
56 |
57 |     this.setAttachmentScore(dependent = topId, score = topScore)
58 |
59 |     this.elements.filter { it != topId }.forEach { depId ->
60 |
61 |       val (govId: Int, score: Double) = scoresMap.findHighestScoringHead(
62 |         dependentId = depId,
63 |         except = listOf(ScoredArcs.rootId))!!
64 |
65 |       // Cycles are allowed at this stage: build() calls fixCycles() right after the heads assignment.
66 |       this.setArc(dependent = depId, governor = govId, allowCycle = true, score = score)
67 |     }
68 |   }
69 |
70 |   /**
71 |    * Fix possible cycles using the [scoresMap].
72 |    */
73 |   private fun DependencyTree.fixCycles() = CyclesFixer(dependencyTree = this, scoredArcs = scoresMap).fixCycles()
74 |
75 |   /**
76 |    * Annotate this dependency tree with the labels.
77 |    * Each token receives the first grammatical configuration predicted for it by the [labeler], that must be not null.
78 |    */
79 |   private fun DependencyTree.Labeled.assignLabels() {
80 |
81 |     labeler!!.predict(Labeler.Input(lss, this)).forEach { tokenId, configurations ->
82 |       this.setGrammaticalConfiguration(dependent = tokenId, configuration = configurations.first().config)
83 |     }
84 |   }
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/keyextractors/PosTagKeyExtractor.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors
9 |
10 | import com.kotlinnlp.neuralparser.language.ParsingSentence
11 | import com.kotlinnlp.neuralparser.language.ParsingToken
12 | import com.kotlinnlp.tokensencoder.embeddings.keyextractor.EmbeddingKeyExtractor
13 |
14 | /**
15 |  * An [EmbeddingKeyExtractor] by POS tag.
16 |  */
17 | object PosTagKeyExtractor : EmbeddingKeyExtractor<ParsingToken, ParsingSentence> {
18 |
19 |   /**
20 |    * Private val used to serialize the class (needed by Serializable).
21 |    */
22 |   @Suppress("unused")
23 |   private const val serialVersionUID: Long = 1L
24 |
25 |   /**
26 |    * @param sentence a generic sentence
27 |    * @param tokenId the id of the token, used as index of `sentence.tokens` (NOTE(review): confirm that ids are positions)
28 |    *
29 |    * @return the POS of the token as string, or "_" if the token has no POS
30 |    */
31 |   override fun getKey(sentence: ParsingSentence, tokenId: Int): String =
32 |     sentence.tokens[tokenId].pos?.toString() ?: "_"
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/PositionalEncoder.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules
9 |
10 | import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
11 | import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
12 | import com.kotlinnlp.simplednn.core.optimizer.ParamsErrorsList
13 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel
14 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor
15 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
16 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory
17 |
18 | /**
19 |  * The PositionalEncoder, a wrapper of a pointer network processor that is run once for each input element.
20 |  *
21 |  * @param model the model of the pointer network
22 |  * @property id an identification number useful to track a specific encoder
23 |  */
24 | class PositionalEncoder(
25 |   private val model: PointerNetworkModel,
26 |   override val id: Int = 0
27 | ) : NeuralProcessor<
28 |   List<DenseNDArray>, // InputType
29 |   List<DenseNDArray>, // OutputType
30 |   List<DenseNDArray>, // ErrorsType
31 |   PointerNetworkProcessor.InputErrors // InputErrorsType
32 | > {
33 |
34 |   companion object {
35 |
36 |     /**
37 |      * @param predictions the list of predictions (the gold of the i-th one is the one-hot vector with 1 at index i)
38 |      *
39 |      * @return the errors of the given predictions
40 |      */
41 |     fun calculateErrors(predictions: List<DenseNDArray>): List<DenseNDArray> {
42 |
43 |       return predictions.mapIndexed { index, prediction ->
44 |         val expectedValues = DenseNDArrayFactory.oneHotEncoder(length = predictions.size, oneAt = index)
45 |         SoftmaxCrossEntropyCalculator.calculateErrors(output = prediction, outputGold = expectedValues)
46 |       }
47 |     }
48 |   }
49 |
50 |   /**
51 |    * Propagate the errors to the input.
52 |    */
53 |   override val propagateToInput: Boolean = true
54 |
55 |   /**
56 |    * The pointer processor used as encoder.
57 |    */
58 |   private val encoder = PointerNetworkProcessor(this.model)
59 |
60 |   /**
61 |    * The Forward: set the [input] as input sequence of the encoder, then forward each of its elements.
62 |    *
63 |    * @param input the input
64 |    *
65 |    * @return the result of the forward, one array for each input element
66 |    */
67 |   override fun forward(input: List<DenseNDArray>): List<DenseNDArray> {
68 |
69 |     this.encoder.setInputSequence(input)
70 |
71 |     return input.map { this.encoder.forward(it) }
72 |   }
73 |
74 |   /**
75 |    * The Backward.
76 |    *
77 |    * @param outputErrors the errors of the last forward
78 |    */
79 |   override fun backward(outputErrors: List<DenseNDArray>) {
80 |     this.encoder.backward(outputErrors)
81 |   }
82 |
83 |   /**
84 |    * Return the input errors of the last backward.
85 |    *
86 |    * @param copy whether to return by value or by reference (currently not forwarded to the wrapped encoder)
87 |    *
88 |    * @return the input errors
89 |    */
90 |   override fun getInputErrors(copy: Boolean): PointerNetworkProcessor.InputErrors = this.encoder.getInputErrors()
91 |
92 |   /**
93 |    * Return the params errors of the last backward.
94 |    *
95 |    * @param copy a Boolean indicating whether the returned errors must be a copy or a reference (default true)
96 |    *
97 |    * @return the parameters errors
98 |    */
99 |   override fun getParamsErrors(copy: Boolean): ParamsErrorsList = this.encoder.getParamsErrors(copy = copy)
100 | }
101 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/Labeler.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure
13 | import com.kotlinnlp.neuralparser.language.ParsingSentence
14 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
15 | import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
16 | import com.kotlinnlp.simplednn.core.neuralprocessor.batchfeedforward.BatchFeedforwardProcessor
17 | import com.kotlinnlp.simplednn.simplemath.ndarray.Shape
18 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
19 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory
20 | import com.kotlinnlp.utils.notEmptyOr
21 |
22 | /**
23 |  * The Labeler.
24 |  *
25 |  * @property model the model
26 |  * @param dropout the dropout probability (default 0.0)
27 |  * @property id an identification number useful to track a specific encoder
28 |  */
29 | class Labeler(
30 |   val model: LabelerModel,
31 |   dropout: Double = 0.0,
32 |   override val id: Int = 0
33 | ) : NeuralProcessor<
34 |   Labeler.Input, // InputType
35 |   List<DenseNDArray>, // OutputType
36 |   List<DenseNDArray>, // ErrorsType
37 |   Labeler.InputErrors // InputErrorsType
38 | > {
39 |
40 |   /**
41 |    * The input of this labeler.
42 |    *
43 |    * @param lss the latent syntactic structure
44 |    * @param dependencyTree the dependency tree
45 |    */
46 |   data class Input(val lss: LatentSyntacticStructure<*, *>, val dependencyTree: DependencyTree)
47 |
48 |   /**
49 |    * The input errors of this labeler.
50 |    *
51 |    * @param rootErrors the errors of the virtual root
52 |    * @param contextErrors the errors of the context vectors
53 |    */
54 |   data class InputErrors(val rootErrors: DenseNDArray, val contextErrors: List<DenseNDArray>)
55 |
56 |   /**
57 |    * This encoder propagates the errors to the input.
58 |    */
59 |   override val propagateToInput: Boolean = true
60 |
61 |   /**
62 |    * The processor that classifies the grammar of a token.
63 |    */
64 |   private val processor =
65 |     BatchFeedforwardProcessor<DenseNDArray>(model = this.model.networkModel, dropout = dropout, propagateToInput = true)
66 |
67 |   /**
68 |    * The dependency tree of the last input, set at each forward and read again when building the input errors.
69 |    */
70 |   private lateinit var dependencyTree: DependencyTree
71 |
72 |   /**
73 |    * Score the possible grammatical configurations of each token of a given input.
74 |    *
75 |    * @param input a [Labeler] input
76 |    *
77 |    * @return a map of valid grammatical configurations (sorted by descending score) associated to each token id
78 |    */
79 |   fun predict(input: Input): Map<Int, List<ScoredGrammar>> {
80 |
81 |     return this.forward(input)
82 |       .asSequence()
83 |       .map { it.toScoredGrammar() }
84 |       .withIndex()
85 |       .associate { (tokenIndex, configurations) ->
86 |
87 |         val tokenId: Int = input.dependencyTree.elements[tokenIndex]
88 |
89 |         val validConfigurations: List<ScoredGrammar> = (input.lss.sentence as ParsingSentence).getValidConfigurations(
90 |           tokenIndex = tokenIndex,
91 |           headIndex = input.dependencyTree.getHead(tokenId)?.let { input.dependencyTree.getPosition(it) },
92 |           configurations = configurations)
93 |
94 |         // Keep the configurations that reach the score threshold, or only the first one if none reaches it.
95 |         tokenId to validConfigurations
96 |           .filter { it.score >= this.model.labelerScoreThreshold }
97 |           .notEmptyOr { validConfigurations.subList(0, 1) }
98 |       }
99 |   }
100 |
101 |   /**
102 |    * Return the network outcomes for each token.
103 |    *
104 |    * @param input a [Labeler] input
105 |    *
106 |    * @return the network outcomes for each token
107 |    */
108 |   override fun forward(input: Input): List<DenseNDArray> {
109 |
110 |     this.dependencyTree = input.dependencyTree
111 |
112 |     return this.processor.forward(
113 |       input = input.lss.sentence.tokens.map { this.extractFeatures(tokenId = it.id, lss = input.lss) }.toTypedArray())
114 |   }
115 |
116 |   /**
117 |    * Propagate the errors through the neural components of the labeler.
118 |    *
119 |    * @param outputErrors the list of errors
120 |    */
121 |   override fun backward(outputErrors: List<DenseNDArray>) {
122 |     this.processor.backward(outputErrors)
123 |   }
124 |
125 |   /**
126 |    * @param copy not used: the returned arrays are always newly created and filled by sum
127 |    * @return the input errors and the root errors
128 |    */
129 |   override fun getInputErrors(copy: Boolean): InputErrors {
130 |
131 |     val inputErrors: List<List<DenseNDArray>> = this.processor.getInputsErrors(copy = false)
132 |
133 |     val contextErrors = List(size = inputErrors.size, init = {
134 |       DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize))
135 |     })
136 |
137 |     val rootErrors: DenseNDArray = DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize))
138 |
139 |     inputErrors.forEachIndexed { tokenIndex, (depErrors, govErrors) ->
140 |
141 |       val tokenId: Int = this.dependencyTree.elements[tokenIndex]
142 |       val depVector: DenseNDArray = contextErrors[tokenIndex]
143 |       val govVector: DenseNDArray = this.dependencyTree.getHead(tokenId)?.let {
144 |         contextErrors[this.dependencyTree.getPosition(it)]
145 |       } ?: rootErrors
146 |
147 |       depVector.assignSum(depErrors)
148 |       govVector.assignSum(govErrors)
149 |     }
150 |
151 |     return InputErrors(rootErrors = rootErrors, contextErrors = contextErrors)
152 |   }
153 |
154 |   /**
155 |    * @param copy a Boolean indicating whether the returned errors must be a copy or a reference
156 |    *
157 |    * @return the errors of the [Labeler] parameters
158 |    */
159 |   override fun getParamsErrors(copy: Boolean) = this.processor.getParamsErrors(copy = copy)
160 |
161 |   /**
162 |    * Transform the array resulting from the prediction into a list of [ScoredGrammar].
163 |    *
164 |    * @return a list of [ScoredGrammar], sorted by descending score
165 |    */
166 |   private fun DenseNDArray.toScoredGrammar(): List<ScoredGrammar> = (0 until this.length)
167 |     .map { i -> ScoredGrammar(getGrammaticalConfiguration(i), score = this[i]) }
168 |     .sortedWith(compareByDescending { it.score })
169 |
170 |   /**
171 |    * @param index a prediction index
172 |    *
173 |    * @return the grammatical configuration with the given [index]
174 |    */
175 |   private fun getGrammaticalConfiguration(index: Int): GrammaticalConfiguration =
176 |     this.model.grammaticalConfigurations.getElement(index)!!
177 |
178 |   /**
179 |    * @param tokenId the id of a token of the input sentence
180 |    * @param lss the latent syntactic structure of the input sentence
181 |    *
182 |    * @return the features of the token: its context vector and the one of its head (or the virtual root if it has none)
183 |    */
184 |   private fun extractFeatures(tokenId: Int, lss: LatentSyntacticStructure<*, *>): List<DenseNDArray> =
185 |     listOf(
186 |       lss.getContextVectorById(tokenId),
187 |       this.dependencyTree.getHead(tokenId)?.let { lss.getContextVectorById(it) } ?: lss.virtualRoot
188 |     )
189 | }
190 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/LabelerModel.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler
9 |
10 | import com.kotlinnlp.dependencytree.DependencyTree
11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
12 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterion
13 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
14 | import com.kotlinnlp.simplednn.core.functionalities.activations.Softmax
15 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
16 | import com.kotlinnlp.simplednn.core.layers.LayerInterface
17 | import com.kotlinnlp.simplednn.core.layers.LayerType
18 | import com.kotlinnlp.simplednn.core.layers.StackedLayersParameters
19 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
20 | import com.kotlinnlp.utils.DictionarySet
21 | import java.io.Serializable
22 |
23 | /**
24 |  * The model of the [Labeler].
25 |  *
26 |  * @property contextEncodingSize the size of the token encoding vectors
27 |  * @property grammaticalConfigurations the dictionary set of all the possible grammatical configurations
28 |  * @property lossCriterionType the training mode
29 |  */
30 | class LabelerModel(
31 |   val contextEncodingSize: Int,
32 |   val grammaticalConfigurations: DictionarySet<GrammaticalConfiguration>,
33 |   val lossCriterionType: LossCriterionType
34 | ) : Serializable {
35 |
36 |   companion object {
37 |
38 |     /**
39 |      * Private val used to serialize the class (needed by Serializable).
40 |      */
41 |     @Suppress("unused")
42 |     private const val serialVersionUID: Long = 1L
43 |   }
44 |
45 |   /**
46 |    * The score threshold that a labeler output must reach to be considered valid (1 / number of configurations).
47 |    * It makes sense with the Softmax activation function.
48 |    */
49 |   internal val labelerScoreThreshold: Double = 1.0 / this.grammaticalConfigurations.size
50 |
51 |   /**
52 |    * The Network model that predicts the grammatical configurations.
53 |    */
54 |   val networkModel = StackedLayersParameters(
55 |     LayerInterface(sizes = listOf(this.contextEncodingSize, this.contextEncodingSize)),
56 |     LayerInterface(
57 |       size = this.contextEncodingSize,
58 |       connectionType = LayerType.Connection.Affine,
59 |       activationFunction = Tanh),
60 |     LayerInterface(
61 |       type = LayerType.Input.Dense,
62 |       size = this.grammaticalConfigurations.size,
63 |       connectionType = LayerType.Connection.Feedforward,
64 |       activationFunction = when (this.lossCriterionType) {
65 |         LossCriterionType.Softmax -> Softmax()
66 |         LossCriterionType.HingeLoss -> null
67 |       })
68 |   )
69 |
70 |   /**
71 |    * Return the errors of a given labeler predictions, respect to a gold dependency tree.
72 |    * Errors are calculated comparing the predictions with the gold grammatical configurations, using the criterion
73 |    * selected by the [lossCriterionType].
74 |    * The gold configuration of each token must be contained in the [grammaticalConfigurations].
75 |    *
76 |    * @param predictions the current network predictions
77 |    * @param goldTree the gold tree of the sentence
78 |    *
79 |    * @return a list of predictions errors
80 |    */
81 |   fun calculateLoss(predictions: List<DenseNDArray>, goldTree: DependencyTree.Labeled): List<DenseNDArray> {
82 |
83 |     // The criterion depends on the model only: build it once instead of once per token.
84 |     val criterion: LossCriterion = LossCriterion(this.lossCriterionType)
85 |
86 |     return predictions.mapIndexed { tokenIndex, prediction ->
87 |
88 |       // The i-th prediction is compared with the configuration of the i-th element of the gold tree.
89 |       val tokenId: Int = goldTree.elements[tokenIndex]
90 |       val goldIndex: Int = this.grammaticalConfigurations.getId(goldTree.getConfiguration(tokenId))!!
91 |
92 |       criterion.getPredictionErrors(prediction = prediction, goldIndex = goldIndex)
93 |     }
94 |   }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/HingeLoss.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
9 |
10 | import com.kotlinnlp.simplednn.core.functionalities.losses.getErrorsByHingeLoss
11 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
12 |
13 | /**
14 |  * The [LossCriterion] that computes the errors of a prediction with the hinge loss method.
15 |  */
16 | class HingeLoss : LossCriterion {
17 |
18 |   /**
19 |    * @param prediction a prediction array
20 |    * @param goldIndex the index of the gold value
21 |    * @return the errors of the given prediction
22 |    */
23 |   override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
24 |     return getErrorsByHingeLoss(prediction = prediction, goldIndex = goldIndex)
25 |   }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterion.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
9 |
10 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
11 |
12 | /**
13 |  * The criterion used to calculate the errors of a prediction, given the index of its gold value.
14 |  */
15 | interface LossCriterion {
16 |
17 |   /**
18 |    * @param prediction a prediction array
19 |    * @param goldIndex the index of the gold value
20 |    *
21 |    * @return the errors of the given prediction
22 |    */
23 |   fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray
24 |
25 |   companion object {
26 |
27 |     /**
28 |      * Build the [LossCriterion] that corresponds to the given [type].
29 |      *
30 |      * @param type the loss criterion type
31 |      */
32 |     operator fun invoke(type: LossCriterionType): LossCriterion = when (type) {
33 |       LossCriterionType.Softmax -> Softmax()
34 |       LossCriterionType.HingeLoss -> HingeLoss()
35 |     }
36 |   }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterionType.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
9 |
10 | /**
11 |  * The available loss criteria, each naming the method used to calculate the errors.
12 |  */
13 | enum class LossCriterionType {
14 |   Softmax, // calculate the errors with cross-entropy softmax
15 |   HingeLoss // calculate the errors with the hinge loss method
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/ScoredGrammar.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
9 |
10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
11 |
12 | /**
13 |  * The outcome of a single prediction of the labeler: a grammatical [config] together with its [score].
14 |  */
15 | data class ScoredGrammar(
16 |   val config: GrammaticalConfiguration,
17 |   val score: Double
18 | )
19 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/Softmax.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils
9 |
10 | import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
11 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
12 |
13 | /**
14 |  * A [LossCriterion] whose errors are calculated with the softmax cross-entropy.
15 |  */
16 | class Softmax : LossCriterion {
17 |
18 |   /**
19 |    * Calculate the errors of a prediction with respect to its gold value.
20 |    *
21 |    * @param prediction a prediction array
22 |    * @param goldIndex the index of the gold value
23 |    *
24 |    * @return the errors of the given prediction
25 |    */
26 |   override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
27 |     return SoftmaxCrossEntropyCalculator.calculateErrors(output = prediction, goldIndex = goldIndex)
28 |   }
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/BaseConverter.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.Token
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 |
16 | /**
17 |  * The sentence converter from a [ParsingSentence] to a generic [Sentence] of [Token]s.
18 |  */
19 | class BaseConverter : SentenceConverter<ParsingToken, ParsingSentence, Token, Sentence<Token>> {
20 |
21 |   companion object {
22 |
23 |     /**
24 |      * Private val used to serialize the class (needed by Serializable).
25 |      */
26 |     @Suppress("unused")
27 |     private const val serialVersionUID: Long = 1L
28 |   }
29 |
30 |   /**
31 |    * Convert a given [ParsingSentence] to a generic [Sentence] by simply casting it.
32 |    *
33 |    * @param sentence the input sentence
34 |    *
35 |    * @return the same sentence instance, seen as a generic [Sentence]
36 |    */
37 |   @Suppress("UNCHECKED_CAST")
38 |   override fun convert(sentence: ParsingSentence): Sentence<Token> = sentence as Sentence<Token>
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/FormConverter.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.Sentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 |
16 | /**
17 |  * The sentence converter from a [ParsingSentence] to a sentence of [FormToken]s.
18 |  */
19 | class FormConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, Sentence<FormToken>> {
20 |
21 |   companion object {
22 |
23 |     /**
24 |      * Private val used to serialize the class (needed by Serializable).
25 |      */
26 |     @Suppress("unused")
27 |     private const val serialVersionUID: Long = 1L
28 |   }
29 |
30 |   /**
31 |    * Convert a given [ParsingSentence] to a sentence of [FormToken]s by simply casting it.
32 |    *
33 |    * @param sentence the input sentence
34 |    *
35 |    * @return the same sentence instance, seen as a sentence of [FormToken]s
36 |    */
37 |   @Suppress("UNCHECKED_CAST")
38 |   override fun convert(sentence: ParsingSentence): Sentence<FormToken> = sentence as Sentence<FormToken>
39 | }
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/MorphoConverter.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters
9 |
10 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
11 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
12 | import com.kotlinnlp.neuralparser.language.ParsingSentence
13 | import com.kotlinnlp.neuralparser.language.ParsingToken
14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter
15 |
16 | /**
17 |  * The sentence converter from a [ParsingSentence] to a [MorphoSentence] of [FormToken]s.
18 |  */
19 | class MorphoConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, MorphoSentence<FormToken>> {
20 |
21 |   companion object {
22 |
23 |     /**
24 |      * Private val used to serialize the class (needed by Serializable).
25 |      */
26 |     @Suppress("unused")
27 |     private const val serialVersionUID: Long = 1L
28 |   }
29 |
30 |   /**
31 |    * Convert a given [ParsingSentence] to a [MorphoSentence] by simply casting it.
32 |    *
33 |    * @param sentence the input sentence
34 |    *
35 |    * @return the same sentence instance, seen as a [MorphoSentence]
36 |    */
37 |   @Suppress("UNCHECKED_CAST")
38 |   override fun convert(sentence: ParsingSentence): MorphoSentence<FormToken> = sentence as MorphoSentence<FormToken>
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/utils/Extensions.kt:
--------------------------------------------------------------------------------
1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
2 | *
3 | * This Source Code Form is subject to the terms of the Mozilla Public
4 | * License, v. 2.0. If a copy of the MPL was not distributed with this
5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/
7 |
8 | package com.kotlinnlp.neuralparser.utils
9 |
10 | import com.kotlinnlp.conllio.CoNLLReader
11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence
12 | import com.kotlinnlp.conllio.Sentence.InvalidTree
13 | import java.io.File
14 |
15 | /**
16 |  * Load sentences from a CoNLL file, printing a message that describes the loading.
17 |  *
18 |  * @param type the string that describes the type of sentences (used in the printed message only)
19 |  * @param filePath the file path
20 |  * @param maxSentences the max number of sentences to load (null = unlimited)
21 |  * @param skipNonProjective whether to skip non-projective sentences
22 |  *
23 |  * @return the list of loaded sentences
24 |  */
25 | fun loadSentences(type: String,
26 |                   filePath: String,
27 |                   maxSentences: Int?,
28 |                   skipNonProjective: Boolean): List<CoNLLSentence> {
29 |
30 |   val maxInfo: String = maxSentences?.let { " (max $it)" } ?: ""
31 |   val skipInfo: String = if (skipNonProjective) " skipping non-projective" else ""
32 |
33 |   // A pure string template: a '%' within `type` cannot be misread as a format specifier.
34 |   println("Loading $type sentences from '$filePath'$maxInfo$skipInfo...")
35 |
36 |   return filePath.loadFromTreeBank(skipNonProjective = skipNonProjective, maxSentences = maxSentences)
37 | }
38 |
39 | /**
40 |  * Return a list of CoNLL sentences from a tree-bank at this path.
41 |  *
42 |  * Note: the [maxSentences] limit is applied to the sentences read from the file, so the sentences skipped
43 |  * because they are non-projective count towards it as well.
44 |  *
45 |  * @param maxSentences the maximum number of sentences to read (null = unlimited)
46 |  * @param skipNonProjective whether to skip non-projective sentences
47 |  *
48 |  * @throws InvalidTree if the tree of a sentence is not valid
49 |  *
50 |  * @return the list of sentences that have been read and not skipped
51 |  */
52 | private fun String.loadFromTreeBank(maxSentences: Int? = null,
53 |                                     skipNonProjective: Boolean = false): List<CoNLLSentence> {
54 |
55 |   val sentences = mutableListOf<CoNLLSentence>()
56 |   var index = 0
57 |
58 |   CoNLLReader.forEachSentence(File(this)) { sentence ->
59 |
60 |     if (maxSentences == null || index < maxSentences) {
61 |
62 |       // The tree is validated only when the sentence has annotated heads.
63 |       if (sentence.hasAnnotatedHeads()) sentence.assertValidCoNLLTree()
64 |
65 |       if (!(skipNonProjective && sentence.isNonProjective())) sentences.add(sentence)
66 |     }
67 |
68 |     // Incremented for every sentence read, whether it has been kept or skipped.
69 |     index++
70 |   }
71 |
72 |   return sentences.toList()
73 | }
74 |
75 |
--------------------------------------------------------------------------------