├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── examples ├── ExampleUtils.kt ├── evaluation │ ├── CommandLineArguments.kt │ └── EvaluateLHR.kt ├── pom.xml └── training │ ├── CommandLineArguments.kt │ └── TrainLHR.kt ├── pom.xml └── src └── main └── kotlin └── com └── kotlinnlp └── neuralparser ├── NeuralParser.kt ├── NeuralParserModel.kt ├── helpers ├── Trainer.kt ├── labelerselector │ ├── LabelerSelector.kt │ ├── MorphoSelector.kt │ └── NoFilterSelector.kt ├── preprocessors │ ├── BasePreprocessor.kt │ ├── CoNLLPreprocessor.kt │ ├── MorphoPreprocessor.kt │ └── SentencePreprocessor.kt ├── sentencebuilder │ ├── CompositeTokenHelper.kt │ ├── LabeledMorphoSynBuilder.kt │ └── UnlabeledMorphoSynBuilder.kt ├── statistics │ ├── BaseStatistics.kt │ ├── MetricsCounter.kt │ ├── SentenceMetrics.kt │ └── Statistics.kt └── validator │ ├── CoNLLDependencyParser.kt │ ├── CoNLLFileValidator.kt │ └── Validator.kt ├── language ├── BaseSentence.kt ├── BaseToken.kt ├── CorpusDictionary.kt ├── Extensions.kt ├── ParsingSentence.kt └── ParsingToken.kt ├── parsers └── lhrparser │ ├── LHRModel.kt │ ├── LHRParser.kt │ ├── LHRTrainer.kt │ ├── LHRTransferLearning.kt │ ├── helpers │ ├── CyclesFixer.kt │ ├── GreedyDependencyTreeBuilder.kt │ └── keyextractors │ │ └── PosTagKeyExtractor.kt │ ├── neuralmodules │ ├── PositionalEncoder.kt │ └── labeler │ │ ├── Labeler.kt │ │ ├── LabelerModel.kt │ │ └── utils │ │ ├── HingeLoss.kt │ │ ├── LossCriterion.kt │ │ ├── LossCriterionType.kt │ │ ├── ScoredGrammar.kt │ │ └── Softmax.kt │ └── sentenceconverters │ ├── BaseConverter.kt │ ├── FormConverter.kt │ └── MorphoConverter.kt └── utils └── Extensions.kt /.gitignore: -------------------------------------------------------------------------------- 1 | ### Configuration ### 2 | config/* 3 | !config/configuration.yaml.example 4 | 5 | ### Intellij ### 6 | .idea/ 7 | /out/ 8 | 9 | ### Intellij Patch ### 10 | *.iml 11 | 12 | /resources/ 13 | 14 | ### Maven ### 15 | target/ 16 | pom.xml.tag 17 | 
pom.xml.releaseBackup 18 | pom.xml.versionsBackup 19 | pom.xml.next 20 | release.properties 21 | dependency-reduced-pom.xml 22 | buildNumber.properties 23 | .mvn/timing.properties 24 | 25 | # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) 26 | !/.mvn/wrapper/maven-wrapper.jar 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | sudo: false 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - oracle-java8-installer 9 | 10 | os: 11 | - linux 12 | 13 | dist: trusty 14 | 15 | jdk: 16 | - oraclejdk8 17 | 18 | install: true 19 | 20 | script: mvn test compile -B -Dmaven.javadoc.skip=true 21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. 
"Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. 
"You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. 
You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. 
Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NeuralParser [![Maven Central](https://img.shields.io/maven-central/v/com.kotlinnlp/neuralparser.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22com.kotlinnlp%22%20AND%20a:%22neuralparser%22) [![Build Status](https://travis-ci.org/KotlinNLP/NeuralParser.svg?branch=master)](https://travis-ci.org/KotlinNLP/NeuralParser) 2 | 3 | NeuralParser is a very simple to use dependency parser, based on the 4 | [SimpleDNN](https://github.com/kotlinnlp/SimpleDNN "SimpleDNN on GitHub") library and the 5 | [SyntaxDecoder](https://github.com/kotlinnlp/SyntaxDecoder "SyntaxDecoder on GitHub") transition systems framework. 6 | 7 | NeuralParser is part of [KotlinNLP](http://kotlinnlp.com/ "KotlinNLP"). 8 | 9 | 10 | ## Getting Started 11 | 12 | ### Import with Maven 13 | 14 | ```xml 15 | <dependency> 16 | <groupId>com.kotlinnlp</groupId> 17 | <artifactId>neuralparser</artifactId> 18 | <version>0.6.5</version> 19 | </dependency> 20 | ``` 21 | 22 | ### Examples 23 | 24 | Try some examples of training and evaluation of NeuralParser running the files in the `examples` folder. 25 | 26 | 27 | ## License 28 | 29 | This software is released under the terms of the 30 | [Mozilla Public License, v. 2.0](https://mozilla.org/MPL/2.0/ "Mozilla Public License, v. 2.0") 31 | 32 | 33 | ## Contributions 34 | 35 | We greatly appreciate any bug reports and contributions, which can be made by filing an issue or making a pull 36 | request through the [github page](https://github.com/kotlinnlp/NeuralParser "NeuralParser on GitHub"). 37 | -------------------------------------------------------------------------------- /examples/ExampleUtils.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. 
If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
import com.kotlinnlp.neuralparser.helpers.preprocessors.MorphoPreprocessor
import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor

/**
 * Build a [SentencePreprocessor].
 *
 * @param morphoDictionary a morphology dictionary, or null if none is available
 *
 * @return a [MorphoPreprocessor] built on [morphoDictionary] when it is not null, otherwise a [BasePreprocessor]
 */
internal fun buildSentencePreprocessor(morphoDictionary: MorphologyDictionary?): SentencePreprocessor =
  morphoDictionary?.let { MorphoPreprocessor(dictionary = it) } ?: BasePreprocessor()

--------------------------------------------------------------------------------
/examples/evaluation/CommandLineArguments.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

package evaluation

import com.xenomachina.argparser.ArgParser
import com.xenomachina.argparser.default

/**
 * The interpreter of command line arguments for the evaluation script.
 *
 * Each option is exposed as a read-only property delegated to the underlying [ArgParser].
 *
 * @param args the array of command line arguments
 */
class CommandLineArguments(args: Array<String>) {

  /**
   * The parser of the string arguments.
   */
  private val parser = ArgParser(args)

  /**
   * The file path of the serialized model (required, `-m` / `--model-path`).
   */
  val modelPath: String by parser.storing(
    "-m",
    "--model-path",
    help="the file path of the serialized model"
  )

  /**
   * The file path of the validation set (required, `-v` / `--validation-set`).
   */
  val validationSetPath: String by parser.storing(
    "-v",
    "--validation-set",
    help="the file path of the validation set"
  )

  /**
   * The file path of the serialized morphology dictionary (`-d` / `--dictionary`), null if not given.
   */
  val morphoDictionaryPath: String? by parser.storing(
    "-d",
    "--dictionary",
    help="the file path of the serialized morphology dictionary"
  ).default { null }

  /**
   * Force parsing all arguments (only read ones are parsed by default).
   */
  init {
    parser.force()
  }
}

--------------------------------------------------------------------------------
/examples/evaluation/EvaluateLHR.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present The KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* ------------------------------------------------------------------*/

package evaluation

import buildSentencePreprocessor
import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary
import com.kotlinnlp.neuralparser.NeuralParser
import com.kotlinnlp.neuralparser.NeuralParserModel
import com.kotlinnlp.neuralparser.helpers.validator.Validator
import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
import com.kotlinnlp.neuralparser.utils.loadSentences
import com.kotlinnlp.utils.Timer
import com.xenomachina.argparser.mainBody
import java.io.File
import java.io.FileInputStream

/**
 * Evaluate the model of an [LHRParser].
 *
 * The serialized model is loaded from the path given on the command line, a [Validator] is built on the
 * validation set (optionally with a morphology dictionary), and the evaluation result is printed together with
 * the elapsed time.
 *
 * Launch with the '-h' option for help about the command line arguments.
 *
 * @param args the command line arguments
 */
fun main(args: Array<String>) = mainBody {

  val parsedArgs = CommandLineArguments(args)

  val parser: NeuralParser<*> = LHRParser(
    model = parsedArgs.modelPath.let { path ->
      println("Loading model from '$path'.")
      // Close the file stream as soon as the model has been read, instead of leaking it.
      FileInputStream(File(path)).use { NeuralParserModel.load(it) as LHRModel }
    })

  val validator = Validator(
    neuralParser = parser,
    sentences = loadSentences(
      type = "validation",
      filePath = parsedArgs.validationSetPath,
      maxSentences = null,
      skipNonProjective = false),
    sentencePreprocessor = buildSentencePreprocessor(
      morphoDictionary = parsedArgs.morphoDictionaryPath?.let { path ->
        println("Loading serialized dictionary from '$path'...")
        // Same as for the model: the stream is only needed while the dictionary is being read.
        FileInputStream(File(path)).use { MorphologyDictionary.load(it) }
      }
    ))

  // The timer is created after loading, so the elapsed time covers the evaluation only.
  val timer = Timer()
  val evaluation = validator.evaluate()

  println("\n$evaluation")
  println("\nElapsed time: ${timer.formatElapsedTime()}")
}

--------------------------------------------------------------------------------
/examples/pom.xml:
-------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 4.0.0 7 | 8 | com.kotlinnlp 9 | neuralparser-examples 10 | 0.1.0 11 | jar 12 | 13 | 14 | UTF-8 15 | 2.6 16 | 1.3.31 17 | 0.0.5 18 | 2.0.7 19 | 5.2 20 | 0.6.5 21 | 22 | 23 | 24 | 25 | jcenter 26 | https://jcenter.bintray.com/ 27 | 28 | 29 | 30 | 31 | 32 | jcenter 33 | JCenter 34 | https://jcenter.bintray.com/ 35 | 36 | 37 | 38 | 39 | . 40 | 41 | 42 | 43 | org.jetbrains.kotlin 44 | kotlin-maven-plugin 45 | ${kotlin.version} 46 | 47 | 1.8 48 | 49 | 50 | 51 | compile 52 | process-sources 53 | 54 | compile 55 | 56 | 57 | 58 | 59 | 60 | 61 | org.apache.maven.plugins 62 | maven-assembly-plugin 63 | ${assembly-plugin.version} 64 | 65 | 66 | lhr-training-assembly 67 | package 68 | single 69 | 70 | lhr-parser-train 71 | 72 | 73 | true 74 | training.TrainLHRKt 75 | 76 | 77 | 78 | jar-with-dependencies 79 | 80 | 81 | 82 | 83 | lhr-evaluation-assembly 84 | package 85 | single 86 | 87 | lhr-parser-eval 88 | 89 | 90 | true 91 | evaluation.EvaluateLHRKt 92 | 93 | 94 | 95 | jar-with-dependencies 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | org.jetbrains.kotlin 107 | kotlin-stdlib-jdk8 108 | ${kotlin.version} 109 | 110 | 111 | 112 | org.jetbrains.kotlin 113 | kotlin-reflect 114 | ${kotlin.version} 115 | 116 | 117 | 118 | com.beust 119 | klaxon 120 | ${klaxon.version} 121 | 122 | 123 | 124 | com.xenomachina 125 | xenocom 126 | ${xenocom.version} 127 | 128 | 129 | 130 | com.xenomachina 131 | kotlin-argparser 132 | ${argparser.version} 133 | 134 | 135 | 136 | com.kotlinnlp 137 | neuralparser 138 | ${neuralparser.version} 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /examples/training/CommandLineArguments.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 
2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package training 9 | 10 | import com.xenomachina.argparser.ArgParser 11 | import com.xenomachina.argparser.InvalidArgumentException 12 | import com.xenomachina.argparser.default 13 | 14 | /** 15 | * The interpreter of command line arguments for the training script. 16 | * 17 | * @param args the array of command line arguments 18 | */ 19 | class CommandLineArguments(args: Array) { 20 | 21 | /** 22 | * The type of tokens encoding. 23 | * 24 | * TODO: AMBIGUOUS_POS 25 | */ 26 | enum class TokensEncodingType { 27 | WORD_EMBEDDINGS, 28 | WORD_AND_POS_EMBEDDINGS, 29 | WORD_AND_EXT_AND_POS_EMBEDDINGS, 30 | MORPHO_FEATURES, 31 | CHARLM 32 | } 33 | 34 | /** 35 | * The parser of the string arguments. 36 | */ 37 | private val parser = ArgParser(args) 38 | 39 | /** 40 | * The language code 41 | */ 42 | val langCode: String by parser.storing( 43 | "-l", 44 | "--language", 45 | help="the language ISO 639-1 code" 46 | ) 47 | 48 | /** 49 | * The number of training epochs (default = 10). 50 | */ 51 | val epochs: Int by parser.storing( 52 | "-e", 53 | "--epochs", 54 | help="the number of training epochs (default = 10)" 55 | ) { toInt() }.default(10) 56 | 57 | /** 58 | * The size of the batches of sentences (default = 1). 59 | */ 60 | val batchSize: Int by parser.storing( 61 | "-b", 62 | "--batch-size", 63 | help="the size of the batches of sentences (default = 1)" 64 | ) { toInt() }.default(1) 65 | 66 | /** 67 | * The maximum number of sentences to load for training (default unlimited) 68 | */ 69 | val maxSentences: Int? 
by parser.storing( 70 | "-s", 71 | "--max-sentences", 72 | help="the maximum number of sentences to load for training (default unlimited)" 73 | ) { toInt() }.default { null } 74 | 75 | /** 76 | * The file path of the training set. 77 | */ 78 | val trainingSetPath: String by parser.storing( 79 | "-t", 80 | "--training-set", 81 | help="the file path of the training set" 82 | ) 83 | 84 | /** 85 | * The file path of the gold-POS training set. 86 | * TODO: Re-enable for LHR transfer learning. 87 | */ 88 | // val goldPosSetPath: String? by parser.storing( 89 | // "-p", 90 | // "--pos-set", 91 | // help="the file path of the gold-POS training set" 92 | // ).default { null } 93 | 94 | /** 95 | * The file path of the validation set. 96 | */ 97 | val validationSetPath: String by parser.storing( 98 | "-v", 99 | "--validation-set", 100 | help="the file path of the validation set" 101 | ) 102 | 103 | /** 104 | * The path of the file in which to save the serialized model. 105 | */ 106 | val modelPath: String by parser.storing( 107 | "-m", 108 | "--model-path", 109 | help="the path of the file in which to save the serialized model" 110 | ) 111 | 112 | /** 113 | * The file path of the pre-trained word embeddings. 114 | */ 115 | val embeddingsPath: String? by parser.storing( 116 | "-w", 117 | "--trained-word-emb-path", 118 | help="the file path of the pre-trained word embeddings" 119 | ).default { null } 120 | 121 | /** 122 | * The number of stacked BiRNNs of the context encoder (default 2). 123 | */ 124 | val numOfContextLayers: Int by parser.storing( 125 | "-c", 126 | "--context-layers", 127 | help="the number of stacked BiRNNs of the context encoder (default 2)" 128 | ){ toInt() } 129 | .default(2) 130 | .addValidator { if (value < 1) throw InvalidArgumentException( "The number of context-layers must >= 1") } 131 | 132 | /** 133 | * The size of the word embedding vectors. 
134 | */ 135 | val wordEmbeddingSize: Int by parser.storing( 136 | "--word-emb-size", 137 | help="the size of the word embedding vectors (default 150)" 138 | ){ toInt() }.default(150) 139 | 140 | /** 141 | * The word embeddings dropout coefficient. 142 | */ 143 | val wordDropoutCoefficient: Double by parser.storing( 144 | "--word-dropout", 145 | help="the word embeddings dropout coefficient (default 0.25)" 146 | ){ toDouble() }.default(0.25) 147 | 148 | /** 149 | * The size of the part-of-speech embedding vectors. 150 | */ 151 | val posEmbeddingSize: Int by parser.storing( 152 | "--pos-emb-size", 153 | help="the size of the part-of-speech embedding vectors (default 50)" 154 | ){ toInt() }.default(50) 155 | 156 | /** 157 | * The part-of-speech embeddings dropout coefficient. 158 | */ 159 | val posDropoutCoefficient: Double by parser.storing( 160 | "--pos-dropout", 161 | help="the part-of-speech embeddings dropout coefficient (default 0.0)" 162 | ){ toDouble() }.default(0.0) 163 | 164 | /** 165 | * Whether to skip non-projective sentences. 166 | */ 167 | val skipNonProjective: Boolean by parser.flagging( 168 | "--skip-non-projective", 169 | help="whether to skip non-projective sentences" 170 | ) 171 | 172 | /** 173 | * Whether to do not consider punctuation errors. 174 | */ 175 | val skipPunctuationErrors: Boolean by parser.flagging( 176 | "--skip-punct-err", 177 | help="whether to do not consider punctuation errors" 178 | ) 179 | 180 | /** 181 | * Whether to do not use the labeler. 182 | */ 183 | val noLabeler: Boolean by parser.flagging( 184 | "--no-labeler", 185 | help="whether to do not use the labeler" 186 | ) 187 | 188 | /** 189 | * Whether to do not predict the POS tags. 190 | */ 191 | val noPosPrediction: Boolean by parser.flagging( 192 | "--no-pos", 193 | help="whether to do not predict the POS tags" 194 | ) 195 | 196 | /** 197 | * The file path of the serialized morphology dictionary. 198 | */ 199 | val morphoDictionaryPath: String? 
by parser.storing( 200 | "-d", 201 | "--dictionary", 202 | help="the file path of the serialized morphology dictionary" 203 | ).default { null } 204 | 205 | /** 206 | * The file path of the lexicon dictionary. 207 | */ 208 | val lexiconDictionaryPath: String? by parser.storing( 209 | "-x", 210 | "--lexicon", 211 | help="the file path of the lexicon dictionary" 212 | ).default { null } 213 | 214 | /** 215 | * The file path of the serialized characters language model. 216 | */ 217 | val charLMModelPath: String? by parser.storing( 218 | "--charlm", 219 | help="the file path of the serialized characters language model" 220 | ).default { null } 221 | 222 | /** 223 | * The file path of the serialized characters language model for reverse encodings. 224 | */ 225 | val charLMRevModelPath: String? by parser.storing( 226 | "--charlm-rev", 227 | help="the file path of the serialized characters language model for reverse encodings" 228 | ).default { null } 229 | 230 | /** 231 | * The type of morphology encoding. 232 | */ 233 | val tokensEncodingType: TokensEncodingType by parser.mapping( 234 | "--tokens-word-emb" to TokensEncodingType.WORD_EMBEDDINGS, 235 | "--tokens-word-pos-emb" to TokensEncodingType.WORD_AND_POS_EMBEDDINGS, 236 | "--tokens-word-ext-pos-emb" to TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS, 237 | "--tokens-morpho" to TokensEncodingType.MORPHO_FEATURES, 238 | "--tokens-charlm" to TokensEncodingType.CHARLM, 239 | help = "the type of morphology encoding (default --tokens-word-pos-emb)" 240 | ).default { TokensEncodingType.WORD_AND_POS_EMBEDDINGS } 241 | 242 | /** 243 | * Whether to do not show details about the training. 244 | */ 245 | val quiet: Boolean by parser.flagging( 246 | "-q", 247 | "--quiet", 248 | help="whether to do not show details about the training " 249 | ) 250 | 251 | /** 252 | * Force parsing all arguments (only read ones are parsed by default). 253 | * Check the dependencies between more arguments. 
254 | */ 255 | init { 256 | 257 | parser.force() 258 | 259 | this.checkDependencies() 260 | } 261 | 262 | /** 263 | * Check the dependencies between more arguments. 264 | */ 265 | private fun checkDependencies() { 266 | 267 | if (this.tokensEncodingType == TokensEncodingType.CHARLM) { 268 | this.charLMModelPath ?: throw RuntimeException("Missing characters language model path") 269 | this.charLMRevModelPath ?: throw RuntimeException("Missing reverse characters language model path") 270 | } 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /examples/training/TrainLHR.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package training 9 | 10 | import buildSentencePreprocessor 11 | import com.kotlinnlp.languagemodel.CharLM 12 | import com.kotlinnlp.linguisticdescription.language.getLanguageByIso 13 | import com.kotlinnlp.linguisticdescription.lexicon.LexiconDictionary 14 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis 15 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence 16 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence 17 | import com.kotlinnlp.linguisticdescription.sentence.token.FormToken 18 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken 19 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position 20 | import com.kotlinnlp.lssencoder.LSSModel 21 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer 22 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary 23 | 
import com.kotlinnlp.conllio.Sentence as CoNLLSentence
import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
import com.kotlinnlp.simplednn.core.layers.LayerType
import com.kotlinnlp.neuralparser.helpers.validator.Validator
import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
import com.kotlinnlp.neuralparser.language.*
import com.kotlinnlp.simplednn.deeplearning.birnn.BiRNNConfig
import com.kotlinnlp.tokensencoder.embeddings.EmbeddingsEncoderModel
import com.xenomachina.argparser.mainBody
import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRModel
import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRParser
import com.kotlinnlp.neuralparser.parsers.lhrparser.LHRTrainer
import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
import com.kotlinnlp.tokensencoder.wrapper.MirrorConverter
import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.MorphoConverter
import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors.PosTagKeyExtractor
import com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters.FormConverter
import com.kotlinnlp.tokensencoder.embeddings.keyextractor.NormWordKeyExtractor
import com.kotlinnlp.neuralparser.utils.loadSentences
import com.kotlinnlp.simplednn.core.embeddings.EmbeddingsMap
import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod
import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge
import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.ConcatMerge
import com.kotlinnlp.tokensencoder.charlm.CharLMEncoderModel
import com.kotlinnlp.tokensencoder.ensemble.EnsembleTokensEncoderModel
import com.kotlinnlp.tokensencoder.morpho.FeaturesCollector
import com.kotlinnlp.tokensencoder.morpho.MorphoEncoderModel
import com.kotlinnlp.tokensencoder.wrapper.TokensEncoderWrapperModel
import java.io.File
import java.io.FileInputStream

/**
 * Train the [LHRParser].
 *
 * The training sentences are loaded first and used to build the corpus dictionary, then the optional morphology
 * dictionary is loaded, the parser and its trainer are built and the training is started.
 *
 * Launch with the '-h' option for help about the command line arguments.
 *
 * @param args the command line arguments
 */
fun main(args: Array<String>) = mainBody {

  val parsedArgs = CommandLineArguments(args)

  val trainingSentences: List<CoNLLSentence> = loadSentences(
    type = "training",
    filePath = parsedArgs.trainingSetPath,
    maxSentences = parsedArgs.maxSentences,
    skipNonProjective = parsedArgs.skipNonProjective)

  val corpus: CorpusDictionary = trainingSentences.let {
    println("Creating corpus dictionary...")
    CorpusDictionary(it)
  }

  val morphologyDictionary: MorphologyDictionary? = parsedArgs.morphoDictionaryPath?.let { path ->
    println("Loading serialized dictionary from '$path'...")
    // The stream is closed as soon as the dictionary has been read, also in case of errors.
    FileInputStream(File(path)).use { MorphologyDictionary.load(it) }
  }

  val parser: LHRParser = buildParser(
    parsedArgs = parsedArgs,
    tokensEncoderWrapperModel = buildTokensEncoderWrapperModel(
      parsedArgs = parsedArgs,
      sentences = trainingSentences,
      corpus = corpus,
      morphologyDictionary = morphologyDictionary),
    corpus = corpus)

  val trainer = buildTrainer(parser = parser, parsedArgs = parsedArgs, morphologyDictionary = morphologyDictionary)

  println("\n-- MODEL")
  println(parser.model)

  println("\n-- START TRAINING ON %d SENTENCES".format(trainingSentences.size))
  println(trainer)

  trainer.train(trainingSentences = trainingSentences)
}

/**
 * Build the LHR Parser.
101 | * 102 | * @param parsedArgs the parsed command line arguments 103 | * @param tokensEncoderWrapperModel the tokens-encoder wrapper model 104 | * @param corpus the corpus dictionary 105 | * 106 | * @return a new parser 107 | */ 108 | private fun buildParser( 109 | parsedArgs: CommandLineArguments, 110 | tokensEncoderWrapperModel: TokensEncoderWrapperModel, 111 | corpus: CorpusDictionary 112 | ): LHRParser = LHRParser(model = LHRModel( 113 | corpusDictionary = corpus, 114 | lssModel = LSSModel( 115 | language = getLanguageByIso(parsedArgs.langCode), 116 | tokensEncoderWrapperModel = tokensEncoderWrapperModel, 117 | contextBiRNNConfig = BiRNNConfig( 118 | connectionType = LayerType.Connection.LSTM, 119 | hiddenActivation = Tanh, 120 | numberOfLayers = parsedArgs.numOfContextLayers), 121 | headsBiRNNConfig = BiRNNConfig( 122 | connectionType = LayerType.Connection.LSTM, 123 | hiddenActivation = Tanh) 124 | ), 125 | useLabeler = !parsedArgs.noLabeler, 126 | lossCriterionType = LossCriterionType.Softmax, 127 | predictPosTags = !parsedArgs.noPosPrediction)) 128 | 129 | /** 130 | * Build a tokens-encoder wrapper model. 131 | * 132 | * @param parsedArgs the parsed command line arguments 133 | * @param corpus the corpus dictionary 134 | * 135 | * @return a new tokens-encoder wrapper model 136 | */ 137 | private fun buildTokensEncoderWrapperModel( 138 | parsedArgs: CommandLineArguments, 139 | sentences: List, // TODO: it will be used to initialize the MorphoEncoder 140 | corpus: CorpusDictionary, 141 | morphologyDictionary: MorphologyDictionary? 
142 | ): TokensEncoderWrapperModel = 143 | 144 | when (parsedArgs.tokensEncodingType) { 145 | 146 | CommandLineArguments.TokensEncodingType.WORD_AND_EXT_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder 147 | 148 | val embeddingsMap = EmbeddingsMap.fromSet( 149 | size = parsedArgs.wordEmbeddingSize, 150 | elements = corpus.words.getElementsReversedSet()) 151 | 152 | val preEmbeddingsMap = parsedArgs.embeddingsPath!!.let { 153 | println("Loading pre-trained word embeddings from '$it'...") 154 | EmbeddingsMap.load(filename = it) 155 | } 156 | 157 | val posEmbeddingsMap = EmbeddingsMap.fromSet( 158 | size = parsedArgs.posEmbeddingSize, 159 | elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet()) 160 | 161 | TokensEncoderWrapperModel( 162 | model = EnsembleTokensEncoderModel( 163 | components = listOf( 164 | EnsembleTokensEncoderModel.ComponentModel( 165 | model = TokensEncoderWrapperModel( 166 | model = EmbeddingsEncoderModel.Base( 167 | embeddingsMap = preEmbeddingsMap, 168 | embeddingKeyExtractor = NormWordKeyExtractor(), 169 | dropout = parsedArgs.wordDropoutCoefficient), 170 | converter = FormConverter()), 171 | trainable = true), 172 | EnsembleTokensEncoderModel.ComponentModel( 173 | model = TokensEncoderWrapperModel( 174 | model = EmbeddingsEncoderModel.Base( 175 | embeddingsMap = embeddingsMap, 176 | embeddingKeyExtractor = NormWordKeyExtractor(), 177 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) }, 178 | dropout = parsedArgs.wordDropoutCoefficient), 179 | converter = FormConverter()), 180 | trainable = true), 181 | EnsembleTokensEncoderModel.ComponentModel( 182 | model = TokensEncoderWrapperModel( 183 | model = EmbeddingsEncoderModel.Base( 184 | embeddingsMap = posEmbeddingsMap, 185 | embeddingKeyExtractor = PosTagKeyExtractor, 186 | frequencyDictionary = corpus.grammaticalConfigurations.getElements() 187 | .mapNotNull { it.posToString } 188 | 
.associateWith { 1 }, 189 | dropout = parsedArgs.posDropoutCoefficient), 190 | converter = MirrorConverter()), 191 | trainable = true) 192 | ), 193 | outputMergeConfiguration = AffineMerge( 194 | outputSize = 100, // TODO 195 | activationFunction = null)), 196 | converter = MirrorConverter() 197 | ) 198 | } 199 | 200 | CommandLineArguments.TokensEncodingType.WORD_AND_POS_EMBEDDINGS -> { // TODO: separate with a dedicated builder 201 | 202 | val embeddingsMap = EmbeddingsMap.fromSet( 203 | size = parsedArgs.wordEmbeddingSize, 204 | elements = corpus.words.getElementsReversedSet()) 205 | 206 | val posEmbeddingsMap = EmbeddingsMap.fromSet( 207 | size = parsedArgs.posEmbeddingSize, 208 | elements = corpus.grammaticalConfigurations.getElements().mapNotNull { it.posToString }.toSet()) 209 | 210 | TokensEncoderWrapperModel( 211 | model = EnsembleTokensEncoderModel( 212 | components = listOf( 213 | EnsembleTokensEncoderModel.ComponentModel( 214 | model = TokensEncoderWrapperModel( 215 | model = EmbeddingsEncoderModel.Base( 216 | embeddingsMap = embeddingsMap, 217 | embeddingKeyExtractor = NormWordKeyExtractor(), 218 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) }, 219 | dropout = parsedArgs.wordDropoutCoefficient), 220 | converter = FormConverter()), 221 | trainable = true), 222 | EnsembleTokensEncoderModel.ComponentModel( 223 | model = TokensEncoderWrapperModel( 224 | model = EmbeddingsEncoderModel.Base( 225 | embeddingsMap = posEmbeddingsMap, 226 | embeddingKeyExtractor = PosTagKeyExtractor, 227 | frequencyDictionary = corpus.grammaticalConfigurations.getElements() 228 | .mapNotNull { it.posToString } 229 | .associateWith { 1 }, 230 | dropout = parsedArgs.posDropoutCoefficient), 231 | converter = MirrorConverter()), 232 | trainable = true) 233 | ), 234 | outputMergeConfiguration = ConcatMerge()), 235 | converter = MirrorConverter() 236 | ) 237 | } 238 | 239 | CommandLineArguments.TokensEncodingType.WORD_EMBEDDINGS -> { // 
TODO: separate with a dedicated builder 240 | 241 | val embeddingsMap = EmbeddingsMap.fromSet( 242 | size = parsedArgs.wordEmbeddingSize, 243 | elements = corpus.words.getElementsReversedSet()) 244 | 245 | TokensEncoderWrapperModel( 246 | model = EmbeddingsEncoderModel.Base( 247 | embeddingsMap = embeddingsMap, 248 | embeddingKeyExtractor = NormWordKeyExtractor(), 249 | frequencyDictionary = corpus.words.getElements().associateWith { corpus.words.getCount(it) }, 250 | dropout = parsedArgs.wordDropoutCoefficient), 251 | converter = FormConverter() 252 | ) 253 | } 254 | 255 | CommandLineArguments.TokensEncodingType.CHARLM -> { // TODO: separate with a dedicated builder 256 | TokensEncoderWrapperModel( 257 | model = CharLMEncoderModel( 258 | dirCharLM = CharLM.load(File(parsedArgs.charLMModelPath!!).inputStream()), 259 | revCharLM = CharLM.load(File(parsedArgs.charLMRevModelPath!!).inputStream()), 260 | outputMergeConfiguration = AffineMerge( 261 | outputSize = 100, // TODO 262 | activationFunction = Tanh)), 263 | converter = FormConverter() 264 | ) 265 | } 266 | 267 | CommandLineArguments.TokensEncodingType.MORPHO_FEATURES -> { 268 | 269 | val analyzer = MorphologicalAnalyzer(dictionary = morphologyDictionary!!) 
270 | 271 | val lexiconDictionary = parsedArgs.lexiconDictionaryPath?.let { 272 | println("Loading lexicon from '$it'...") 273 | LexiconDictionary.load(it) 274 | } 275 | 276 | val featuresDictionary = FeaturesCollector( 277 | lexicalDictionary = lexiconDictionary, 278 | sentences = sentences.mapIndexed { i, it -> it.toMorphoSentence(index = i, analyzer = analyzer)} 279 | ).collect() 280 | 281 | TokensEncoderWrapperModel( 282 | model = MorphoEncoderModel( 283 | lexiconDictionary = lexiconDictionary, 284 | featuresDictionary = featuresDictionary, 285 | tokenEncodingSize = parsedArgs.wordEmbeddingSize, 286 | activation = null), 287 | converter = MorphoConverter() 288 | ) 289 | } 290 | } 291 | 292 | 293 | /** 294 | * Build a [MorphoSentence] from this [CoNLLSentence]. 295 | * 296 | * @param index the position index of this sentence 297 | * @param analyzer a morphological analyzer 298 | * 299 | * @return a new morpho sentence 300 | */ 301 | private fun CoNLLSentence.toMorphoSentence(index: Int, analyzer: MorphologicalAnalyzer): MorphoSentence { 302 | 303 | val baseTokens = this.tokens.toBaseTokens() 304 | val position = Position( 305 | index = index, 306 | start = baseTokens.first().position.start, 307 | end = baseTokens.last().position.end) 308 | @Suppress("UNCHECKED_CAST") 309 | val sentence = BaseSentence(id = index, position = position, tokens = baseTokens) as RealSentence 310 | 311 | val analysis = analyzer.analyze(sentence) 312 | 313 | return object : MorphoSentence { 314 | override val tokens: List = this@toMorphoSentence.tokens 315 | override val morphoAnalysis: MorphologicalAnalysis? = analysis 316 | } 317 | } 318 | 319 | /** 320 | * Build a trainer for a given [LHRParser]. 
321 | * 322 | * @param parser an LHR parser 323 | * @param parsedArgs the parsed command line arguments 324 | * @param morphologyDictionary a morphology dictionary 325 | * 326 | * @return a trainer for the given [parser] 327 | */ 328 | private fun buildTrainer(parser: LHRParser, 329 | parsedArgs: CommandLineArguments, 330 | morphologyDictionary: MorphologyDictionary?): LHRTrainer { 331 | 332 | val preprocessor: SentencePreprocessor = buildSentencePreprocessor(morphologyDictionary) 333 | 334 | return LHRTrainer( 335 | parser = parser, 336 | epochs = parsedArgs.epochs, 337 | batchSize = parsedArgs.batchSize, 338 | validator = Validator( 339 | neuralParser = parser, 340 | sentences = loadSentences( 341 | type = "validation", 342 | filePath = parsedArgs.validationSetPath, 343 | maxSentences = null, 344 | skipNonProjective = false), 345 | sentencePreprocessor = preprocessor), 346 | modelFilename = parsedArgs.modelPath, 347 | skipPunctuationErrors = parsedArgs.skipPunctuationErrors, 348 | usePositionalEncodingErrors = false, 349 | updateMethod = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999), 350 | sentencePreprocessor = preprocessor, 351 | verbose = !parsedArgs.quiet) 352 | } 353 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 4.0.0 7 | 8 | com.kotlinnlp 9 | neuralparser 10 | 0.6.5 11 | jar 12 | 13 | ${project.groupId}:${project.artifactId} 14 | 15 | NeuralParser is a very simple to use dependency parser, based on the SimpleDNN library and 16 | the SyntaxDecoder transition systems framework. 
17 | 18 | http://github.com/kotlinnlp/neuralparser 19 | 20 | 21 | scm:git:git://github.com/kotlinnlp/neuralparser.git 22 | scm:git:ssh://github.com:kotlinnlp/neuralparser.git 23 | http://github.com/kotlinnlp/neuralparser/tree/master 24 | 25 | 26 | 27 | 28 | KotlinNLP Authors 29 | github@kotlinnlp.com 30 | KotlinNLP 31 | http://www.kotlinnlp.com 32 | 33 | 34 | 35 | 36 | 37 | Apache License, Version 2.0 38 | http://www.apache.org/licenses/LICENSE-2.0.txt 39 | repo 40 | 41 | 42 | 43 | 44 | 45 | ossrh 46 | https://oss.sonatype.org/content/repositories/snapshots 47 | 48 | 49 | ossrh 50 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 51 | 52 | 53 | 54 | 55 | UTF-8 56 | 1.6.7 57 | 3.0.0 58 | 1.6 59 | oss_kotlinnlp 60 | C73F18F0 61 | 1.3.31 62 | 5.2 63 | 0.9.16 64 | 0.5.2 65 | 0.2.3 66 | 67 | 68 | 69 | 70 | jcenter 71 | https://jcenter.bintray.com/ 72 | 73 | 74 | 75 | 76 | 77 | jcenter 78 | JCenter 79 | https://jcenter.bintray.com/ 80 | 81 | 82 | 83 | 84 | src/main/kotlin 85 | 86 | 87 | 88 | org.jetbrains.kotlin 89 | kotlin-maven-plugin 90 | ${kotlin.version} 91 | 92 | 1.8 93 | 94 | 95 | 96 | compile 97 | compile 98 | 99 | compile 100 | 101 | 102 | 103 | test-compile 104 | test-compile 105 | 106 | test-compile 107 | 108 | 109 | 110 | 111 | 112 | 113 | org.jetbrains.dokka 114 | dokka-maven-plugin 115 | ${dokka.version} 116 | 117 | true 118 | 119 | 120 | 121 | prepare-package 122 | 123 | dokka 124 | javadoc 125 | javadocJar 126 | 127 | 128 | 129 | packages.md 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | org.sonatype.plugins 138 | nexus-staging-maven-plugin 139 | ${nexus-staging-plugin.version} 140 | true 141 | 142 | ${oss.server.id} 143 | https://oss.sonatype.org/ 144 | true 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-source-plugin 151 | ${maven-source-plugin.version} 152 | 153 | 154 | attach-sources 155 | 156 | jar-no-fork 157 | 158 | 159 | 160 | 161 | 162 | 163 | org.apache.maven.plugins 164 | maven-gpg-plugin 165 | 
${maven-gpg-plugin.version} 166 | 167 | true 168 | 169 | 170 | 171 | sign-artifacts 172 | verify 173 | 174 | sign 175 | 176 | 177 | ${gpg.keyname} 178 | ${gpg.keyname} 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | org.jetbrains.kotlin 189 | kotlin-stdlib-jdk8 190 | ${kotlin.version} 191 | 192 | 193 | 194 | org.jetbrains.kotlin 195 | kotlin-reflect 196 | ${kotlin.version} 197 | 198 | 199 | 200 | org.jetbrains.kotlin 201 | kotlin-test 202 | ${kotlin.version} 203 | test 204 | 205 | 206 | 207 | com.beust 208 | klaxon 209 | ${klaxon.version} 210 | 211 | 212 | 213 | com.kotlinnlp 214 | lssencoder 215 | ${lssencoder.version} 216 | 217 | 218 | 219 | com.kotlinnlp 220 | dependencytree 221 | ${dependencytree.version} 222 | 223 | 224 | 225 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParser.kt:
--------------------------------------------------------------------------------
/* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * ------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser

import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence
import com.kotlinnlp.linguisticdescription.sentence.Sentence
import com.kotlinnlp.neuralparser.language.ParsingSentence

/**
 * A Neural Parser.
 *
 * NOTE(review): the declaration of the type parameter was missing from this listing, although `ModelType` is used
 * by the [model] property. It has been reconstructed with [NeuralParserModel] as upper bound, consistently with
 * the trainer that dumps `neuralParser.model` through a star-projected parser; the `out` variance is an assumption
 * to be confirmed against the repository.
 *
 * @param ModelType the type of the model of this parser
 */
interface NeuralParser<out ModelType : NeuralParserModel> {

  /**
   * The model of this neural parser.
   */
  val model: ModelType

  /**
   * Whether this parser executes the morpho-syntactic labelling.
   */
  val labellingEnabled: Boolean

  /**
   * Parse a sentence, giving its dependency tree.
   *
   * @param sentence the [Sentence] to parse, given as parsing sentence
   *
   * @return the dependency tree predicted for the given [sentence]
   */
  fun parse(sentence: ParsingSentence): MorphoSynSentence
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/NeuralParserModel.kt:
--------------------------------------------------------------------------------
/* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * ------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser

import com.kotlinnlp.linguisticdescription.language.Language
import com.kotlinnlp.utils.Serializer
import java.io.InputStream
import java.io.OutputStream
import java.io.Serializable

/**
 * The serializable model of a [NeuralParser].
 *
 * @property language the language within the parser works
 */
abstract class NeuralParserModel(val language: Language) : Serializable {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L

    /**
     * Read a [NeuralParserModel] (serialized) from an input stream and decode it.
     * The given stream is not closed by this method.
     *
     * @param inputStream the [InputStream] from which to read the serialized [NeuralParserModel]
     *
     * @return the [NeuralParserModel] read from [inputStream] and decoded
     */
    fun load(inputStream: InputStream): NeuralParserModel = Serializer.deserialize(inputStream)
  }

  /**
   * Serialize this [NeuralParserModel] and write it to an output stream.
   * The given stream is not closed by this method.
   *
   * @param outputStream the [OutputStream] in which to write this serialized [NeuralParserModel]
   */
  fun dump(outputStream: OutputStream) = Serializer.serialize(this, outputStream)
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/helpers/Trainer.kt:
--------------------------------------------------------------------------------
/* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * ------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.helpers

import com.kotlinnlp.conllio.Sentence as CoNLLSentence
import com.kotlinnlp.dependencytree.DependencyTree
import com.kotlinnlp.neuralparser.NeuralParser
import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor
import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor
import com.kotlinnlp.neuralparser.helpers.statistics.Statistics
import com.kotlinnlp.neuralparser.helpers.validator.Validator
import com.kotlinnlp.neuralparser.language.BaseSentence
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.utils.ShuffledIndices
import com.kotlinnlp.utils.Shuffler
import com.kotlinnlp.utils.Timer
import com.kotlinnlp.utils.progressindicator.ProgressIndicatorBar
import java.io.File
import java.io.FileOutputStream

/**
 * The training helper of the [NeuralParser].
28 | * 29 | * @param neuralParser a neural parser 30 | * @param batchSize the number of sentences that compose a batch 31 | * @param epochs the number of training epochs 32 | * @param validator the validation helper (if it is null no validation is done after each epoch) 33 | * @param modelFilename the name of the file in which to save the best trained model 34 | * @param minRelevantErrorsCountToUpdate the min count of relevant errors needed to update the neural parser (default 1) 35 | * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis) 36 | * @param verbose a Boolean indicating if the verbose mode is enabled (default = true) 37 | */ 38 | abstract class Trainer( 39 | private val neuralParser: NeuralParser<*>, 40 | private val batchSize: Int, 41 | private val epochs: Int, 42 | private val validator: Validator?, 43 | private val modelFilename: String, 44 | private val minRelevantErrorsCountToUpdate: Int = 1, 45 | private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor(), 46 | private val verbose: Boolean = true 47 | ) { 48 | 49 | /** 50 | * A timer to track the elapsed time. 51 | */ 52 | private var timer = Timer() 53 | 54 | /** 55 | * The best accuracy reached during the training. 56 | */ 57 | private var bestAccuracy: Double = -1.0 // -1 used as init value (all accuracy values are in the range [0.0, 1.0]) 58 | 59 | /** 60 | * Check requirements. 61 | */ 62 | init { 63 | require(this.epochs > 0) { "The number of epochs must be > 0" } 64 | require(this.batchSize > 0) { "The size of the batch must be > 0" } 65 | require(this.minRelevantErrorsCountToUpdate > 0) { "minRelevantErrorsCountToUpdate must be > 0" } 66 | } 67 | 68 | /** 69 | * Train the [neuralParser] with the given sentences. 
70 | * 71 | * @param trainingSentences the sentences used to train the parser 72 | * @param shuffler a shuffle to shuffle the sentences at each epoch (can be null) 73 | */ 74 | fun train(trainingSentences: List, 75 | shuffler: Shuffler? = Shuffler(enablePseudoRandom = true, seed = 743)) { 76 | 77 | (0 until this.epochs).forEach { i -> 78 | 79 | this.logTrainingStart(epochIndex = i) 80 | 81 | this.newEpoch() 82 | this.trainEpoch(trainingSentences = trainingSentences, shuffler = shuffler) 83 | 84 | this.logTrainingEnd() 85 | 86 | this.validator?.apply { 87 | logValidationStart() 88 | validateAndSaveModel() 89 | logValidationEnd() 90 | } 91 | } 92 | } 93 | 94 | /** 95 | * Train the parser for an epoch. 96 | * 97 | * @param trainingSentences the training sentences 98 | * @param shuffler a shuffle to shuffle the sentences at each epoch (can be null) 99 | */ 100 | private fun trainEpoch(trainingSentences: List, 101 | shuffler: Shuffler?) { 102 | 103 | val progress = ProgressIndicatorBar(trainingSentences.size) 104 | 105 | this.newBatch() 106 | 107 | ShuffledIndices(trainingSentences.size, shuffler = shuffler).forEachIndexed { i, sentenceIndex -> 108 | 109 | val endOfBatch: Boolean = (i + 1) % this.batchSize == 0 || i == trainingSentences.lastIndex 110 | 111 | progress.tick() 112 | 113 | val sentence: CoNLLSentence = trainingSentences[sentenceIndex] 114 | 115 | require(sentence.hasAnnotatedHeads()) { 116 | "The gold dependency tree of a sentence cannot be null during the evaluation." 117 | } 118 | 119 | this.trainSentence( 120 | sentence = this.sentencePreprocessor.convert(BaseSentence.fromCoNLL(sentence, index = sentenceIndex)), 121 | goldTree = DependencyTree.Labeled(sentence)) 122 | 123 | if (endOfBatch && this.getRelevantErrorsCount() >= this.minRelevantErrorsCountToUpdate) { 124 | this.update() 125 | this.newBatch() 126 | } 127 | } 128 | } 129 | 130 | /** 131 | * Validate the [neuralParser] with the validation helper and save the best model. 
132 | * The [validator] is required to be not null. 133 | */ 134 | private fun validateAndSaveModel() { 135 | 136 | val stats: Statistics = this.validator!!.evaluate() 137 | 138 | println("\n$stats") 139 | 140 | if (stats.noPunctuation.uas.perc > this.bestAccuracy) { 141 | 142 | this.saveModel() 143 | 144 | this.bestAccuracy = stats.noPunctuation.uas.perc 145 | } 146 | } 147 | 148 | /** 149 | * Save the model to [modelFilename]. 150 | */ 151 | private fun saveModel() { 152 | 153 | this.neuralParser.model.dump(FileOutputStream(File(this.modelFilename))) 154 | 155 | println("\nNEW BEST ACCURACY! Model saved to \"${this.modelFilename}\"") 156 | } 157 | 158 | /** 159 | * Log when training starts. 160 | * 161 | * @param epochIndex the current epoch index 162 | */ 163 | private fun logTrainingStart(epochIndex: Int) { 164 | 165 | if (this.verbose) { 166 | 167 | this.timer.reset() 168 | 169 | println("\nEpoch ${epochIndex + 1} of ${this.epochs}") 170 | println("\nStart training...") 171 | } 172 | } 173 | 174 | /** 175 | * Log when training ends. 176 | */ 177 | private fun logTrainingEnd() { 178 | 179 | if (this.verbose) { 180 | println("Elapsed time: %s".format(this.timer.formatElapsedTime())) 181 | } 182 | } 183 | 184 | /** 185 | * Log when validation starts. 186 | */ 187 | private fun logValidationStart() { 188 | 189 | if (this.verbose) { 190 | this.timer.reset() 191 | println() // new line 192 | } 193 | } 194 | 195 | /** 196 | * Log when validation ends. 197 | */ 198 | private fun logValidationEnd() { 199 | 200 | if (this.verbose) { 201 | println("Elapsed time: %s".format(this.timer.formatElapsedTime())) 202 | } 203 | } 204 | 205 | /** 206 | * Beat the occurrence of a new batch. 207 | */ 208 | protected open fun newBatch() = Unit 209 | 210 | /** 211 | * Beat the occurrence of a new epoch. 212 | */ 213 | protected open fun newEpoch() = Unit 214 | 215 | /** 216 | * Update the [neuralParser]. 
217 | */ 218 | protected abstract fun update() 219 | 220 | /** 221 | * Train the parser with the given [sentence] and [goldTree]. 222 | * 223 | * @param sentence a sentence 224 | * @param goldTree the gold dependency tree 225 | */ 226 | protected abstract fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) 227 | 228 | /** 229 | * @return the count of the relevant errors 230 | */ 231 | protected abstract fun getRelevantErrorsCount(): Int 232 | } 233 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/LabelerSelector.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.labelerselector 9 | 10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies 12 | import com.kotlinnlp.neuralparser.language.ParsingSentence 13 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar 14 | import java.io.Serializable 15 | 16 | /** 17 | * The selector of valid configurations of the labeler and compatible morphologies. 18 | */ 19 | interface LabelerSelector : Serializable { 20 | 21 | /** 22 | * Get the list of scored grammatical configurations that are valid for a given attachment. 
23 | * 24 | * @param configurations the list of grammatical configurations, sorted by descending score 25 | * @param sentence the input sentence 26 | * @param tokenIndex the index of the token to which the deprel must be assigned 27 | * @param headIndex the index of the token head (can be null) 28 | * 29 | * @return the valid grammatical configurations for the given attachment 30 | */ 31 | fun getValidConfigurations(configurations: List, 32 | sentence: ParsingSentence, 33 | tokenIndex: Int, 34 | headIndex: Int?): List 35 | 36 | /** 37 | * Get the morphologies of a given token that are compatible with the given grammatical configuration. 38 | * 39 | * @param sentence the input sentence 40 | * @param tokenIndex the index of a token of the sentence 41 | * @param configuration the grammatical configuration of the token 42 | * 43 | * @return the morphologies compatible with the given grammatical configuration 44 | */ 45 | fun getValidMorphologies(sentence: ParsingSentence, 46 | tokenIndex: Int, 47 | configuration: GrammaticalConfiguration): Morphologies 48 | } 49 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/MorphoSelector.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.labelerselector 9 | 10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 11 | import com.kotlinnlp.linguisticdescription.POSTag 12 | import com.kotlinnlp.linguisticdescription.morphology.* 13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency 14 | import com.kotlinnlp.linguisticdescription.syntax.dependencies.Unknown 15 | import com.kotlinnlp.neuralparser.language.ParsingSentence 16 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar 17 | import com.kotlinnlp.utils.notEmptyOr 18 | 19 | /** 20 | * The selector to use when the labeler predictions are defined as combinations of POS and syntactic types in the 21 | * Base format. 22 | */ 23 | object MorphoSelector : LabelerSelector { 24 | 25 | /** 26 | * Private val used to serialize the class (needed by Serializable). 27 | */ 28 | @Suppress("unused") 29 | private const val serialVersionUID: Long = 1L 30 | 31 | /** 32 | * Get the list of scored grammatical configurations that are valid for a given attachment. 
33 | * 34 | * @param configurations the list of grammatical configurations, sorted by descending score 35 | * @param sentence the input sentence 36 | * @param tokenIndex the index of the token to which the deprel must be assigned 37 | * @param headIndex the index of the token head (can be null) 38 | * 39 | * @return the valid grammatical configurations for the given attachment 40 | */ 41 | override fun getValidConfigurations(configurations: List, 42 | sentence: ParsingSentence, 43 | tokenIndex: Int, 44 | headIndex: Int?): List { 45 | 46 | val possibleMorphologies: Morphologies = sentence.morphoAnalysis!!.allMorphologies[tokenIndex] 47 | val correctDirection = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex) 48 | val possibleConfigurations: List = configurations.filter { it.config.direction == correctDirection } 49 | val worstScore: Double = configurations.last().score 50 | 51 | return if (possibleMorphologies.isNotEmpty()) 52 | possibleConfigurations 53 | .filter { sentence.areConfigurationCompatible(c = it.config, tokenIndex = tokenIndex) } 54 | .notEmptyOr { 55 | listOf(ScoredGrammar( 56 | config = possibleMorphologies.first().buildUnknownConfig(correctDirection), 57 | score = worstScore)) 58 | } 59 | else 60 | possibleConfigurations.filter { it.config.isSingleContentWord() }.notEmptyOr { 61 | listOf(ScoredGrammar( 62 | config = GrammaticalConfiguration(GrammaticalConfiguration.Component( 63 | syntacticDependency = Unknown(correctDirection), 64 | pos = POSTag.Base(POS.Noun))), 65 | score = worstScore)) 66 | } 67 | } 68 | 69 | /** 70 | * Get the morphologies of a given token that are compatible with the given grammatical configuration. 
71 | * 72 | * @param sentence the input sentence 73 | * @param tokenIndex the index of a token of the sentence 74 | * @param configuration the grammatical configuration of the token 75 | * 76 | * @return the morphologies compatible with the given deprel 77 | */ 78 | override fun getValidMorphologies(sentence: ParsingSentence, 79 | tokenIndex: Int, 80 | configuration: GrammaticalConfiguration): Morphologies { 81 | 82 | val possibleMorphologies: Morphologies = 83 | sentence.getCompatibleMorphologies(c = configuration, tokenIndex = tokenIndex) 84 | 85 | return when { 86 | 87 | possibleMorphologies.isNotEmpty() -> possibleMorphologies 88 | 89 | configuration.type == GrammaticalConfiguration.Type.Single -> { 90 | 91 | val pos: POSTag.Base = checkNotNull(configuration.components.single().pos as? POSTag.Base) { 92 | "The POS cannot be null." 93 | } 94 | 95 | require(pos.type.isContentWord) { 96 | "The grammatical configuration of tokens without morphological analysis must define a content word." 97 | } 98 | 99 | Morphologies(Morphology(SingleMorphology( 100 | lemma = sentence.tokens[tokenIndex].form, 101 | pos = pos.type, 102 | allowDefaultValues = true))) 103 | } 104 | 105 | else -> Morphologies() 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/labelerselector/NoFilterSelector.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.labelerselector 9 | 10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 11 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies 12 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticDependency 13 | import com.kotlinnlp.neuralparser.language.ParsingSentence 14 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar 15 | 16 | /** 17 | * The selector that does not filter. 18 | */ 19 | object NoFilterSelector : LabelerSelector { 20 | 21 | /** 22 | * Private val used to serialize the class (needed by Serializable). 23 | */ 24 | @Suppress("unused") 25 | private const val serialVersionUID: Long = 1L 26 | 27 | /** 28 | * Get the list of scored grammatical configurations that are valid for a given attachment. 29 | * 30 | * @param configurations the list of grammatical configurations, sorted by descending score 31 | * @param sentence the input sentence 32 | * @param tokenIndex the index of the token to which the deprel must be assigned 33 | * @param headIndex the index of the token head (can be null) 34 | * 35 | * @return the valid grammatical configurations for the given attachment 36 | */ 37 | override fun getValidConfigurations(configurations: List, 38 | sentence: ParsingSentence, 39 | tokenIndex: Int, 40 | headIndex: Int?): List { 41 | 42 | val correctDirection = SyntacticDependency.Direction(tokenIndex = tokenIndex, headIndex = headIndex) 43 | 44 | return configurations.filter { it.config.direction == correctDirection } 45 | } 46 | 47 | /** 48 | * Return all the morphologies as valid. 
49 | * 50 | * @param sentence the input sentence 51 | * @param tokenIndex the index of a token of the sentence 52 | * @param configuration the grammatical configuration of the token 53 | * 54 | * @return all the given morphologies 55 | */ 56 | override fun getValidMorphologies(sentence: ParsingSentence, 57 | tokenIndex: Int, 58 | configuration: GrammaticalConfiguration) = Morphologies( 59 | sentence.morphoAnalysis?.allMorphologies?.get(tokenIndex)?.filter { 60 | it.components.size == configuration.components.size 61 | } ?: emptyList() 62 | ) 63 | } 64 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/BasePreprocessor.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors 9 | 10 | import com.kotlinnlp.neuralparser.language.BaseSentence 11 | import com.kotlinnlp.neuralparser.language.ParsingSentence 12 | import com.kotlinnlp.neuralparser.language.ParsingToken 13 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector 14 | 15 | /** 16 | * Pre-process a sentence before the parsing starts. 17 | */ 18 | class BasePreprocessor : SentencePreprocessor { 19 | 20 | companion object { 21 | 22 | /** 23 | * Private val used to serialize the class (needed by Serializable). 24 | */ 25 | @Suppress("unused") 26 | private const val serialVersionUID: Long = 1L 27 | } 28 | 29 | /** 30 | * Convert a [BaseSentence] to a [ParsingSentence]. 
31 | * 32 | * @param sentence a base sentence 33 | * 34 | * @return a sentence ready to be parsed 35 | */ 36 | override fun convert(sentence: BaseSentence) = ParsingSentence( 37 | tokens = sentence.tokens.map { ParsingToken(id = it.id, form = it.form, position = it.position) }, 38 | labelerSelector = NoFilterSelector, 39 | position = sentence.position 40 | ) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/CoNLLPreprocessor.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors 9 | 10 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence 11 | import com.kotlinnlp.conllio.Token as CoNLLToken 12 | import com.kotlinnlp.neuralparser.language.BaseSentence 13 | import com.kotlinnlp.neuralparser.language.ParsingSentence 14 | import com.kotlinnlp.neuralparser.language.ParsingToken 15 | import com.kotlinnlp.neuralparser.helpers.labelerselector.NoFilterSelector 16 | 17 | /** 18 | * Pre-process a sentence that has been built from a [CoNLLSentence]. 19 | * 20 | * @param conllSentences the list of CoNLL sentences from which the input base sentences are built 21 | */ 22 | class CoNLLPreprocessor(private val conllSentences: List) : SentencePreprocessor { 23 | 24 | companion object { 25 | 26 | /** 27 | * Private val used to serialize the class (needed by Serializable). 
28 | */ 29 | @Suppress("unused") 30 | private const val serialVersionUID: Long = 1L 31 | } 32 | 33 | /** 34 | * Convert a [BaseSentence] to a [ParsingSentence]. 35 | * 36 | * @param sentence a base sentence 37 | * 38 | * @return a sentence ready to be parsed 39 | */ 40 | override fun convert(sentence: BaseSentence): ParsingSentence { 41 | 42 | val conllTokens: List = this.conllSentences[sentence.position.index].tokens 43 | 44 | return ParsingSentence( 45 | tokens = sentence.tokens.mapIndexed { i, it -> 46 | ParsingToken( 47 | id = it.id, 48 | form = it.form, 49 | position = it.position, 50 | pos = conllTokens[i].posList 51 | ) 52 | }, 53 | labelerSelector = NoFilterSelector, 54 | position = sentence.position 55 | ) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/MorphoPreprocessor.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors 9 | 10 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis 11 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence 12 | import com.kotlinnlp.linguisticdescription.sentence.token.RealToken 13 | import com.kotlinnlp.morphologicalanalyzer.MorphologicalAnalyzer 14 | import com.kotlinnlp.morphologicalanalyzer.dictionary.MorphologyDictionary 15 | import com.kotlinnlp.neuralparser.language.BaseSentence 16 | import com.kotlinnlp.neuralparser.language.ParsingSentence 17 | import com.kotlinnlp.neuralparser.language.ParsingToken 18 | import com.kotlinnlp.neuralparser.helpers.labelerselector.MorphoSelector 19 | 20 | /** 21 | * Pre-process a sentence with a morphological analyzer, before the parsing starts. 22 | * 23 | * @param dictionary a morphologies dictionary 24 | */ 25 | class MorphoPreprocessor(private val dictionary: MorphologyDictionary) : SentencePreprocessor { 26 | 27 | companion object { 28 | 29 | /** 30 | * Private val used to serialize the class (needed by Serializable). 31 | */ 32 | @Suppress("unused") 33 | private const val serialVersionUID: Long = 1L 34 | } 35 | 36 | /** 37 | * A morphological analyzer as transient property. 38 | */ 39 | @kotlin.jvm.Transient private var morphologicalAnalyzer: MorphologicalAnalyzer? = null 40 | 41 | /** 42 | * Convert a [BaseSentence] to a [ParsingSentence]. 
43 | * 44 | * @param sentence a base sentence 45 | * 46 | * @return a sentence ready to be parsed 47 | */ 48 | override fun convert(sentence: BaseSentence): ParsingSentence { 49 | 50 | @Suppress("UNCHECKED_CAST") 51 | val morphoAnalysis: MorphologicalAnalysis = this.getOrInitAnalyzer().analyze(sentence as RealSentence) 52 | 53 | return ParsingSentence( 54 | tokens = sentence.tokens.map { 55 | ParsingToken( 56 | id = it.id, 57 | form = it.form, 58 | position = it.position 59 | ) 60 | }, 61 | morphoAnalysis = morphoAnalysis, 62 | labelerSelector = MorphoSelector, 63 | position = sentence.position 64 | ) 65 | } 66 | 67 | /** 68 | * Get the [MorphologicalAnalyzer] of this preprocessor, eventually initializing it (in case this class has just been 69 | * deserialized). 70 | * 71 | * @return the morphological analyzer of this preprocessor 72 | */ 73 | private fun getOrInitAnalyzer(): MorphologicalAnalyzer { 74 | 75 | if (this.morphologicalAnalyzer == null) 76 | this.morphologicalAnalyzer = MorphologicalAnalyzer(this.dictionary) 77 | 78 | return this.morphologicalAnalyzer!! 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/preprocessors/SentencePreprocessor.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.preprocessors 9 | 10 | import com.kotlinnlp.neuralparser.language.ParsingSentence 11 | import com.kotlinnlp.neuralparser.language.BaseSentence 12 | import com.kotlinnlp.neuralparser.language.BaseToken 13 | import com.kotlinnlp.neuralparser.language.ParsingToken 14 | import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter 15 | 16 | /** 17 | * Pre-process a sentence before the parsing starts. 18 | */ 19 | interface SentencePreprocessor : SentenceConverter 20 | 21 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/CompositeTokenHelper.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder 9 | 10 | import com.kotlinnlp.dependencytree.DependencyTree 11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 12 | import com.kotlinnlp.linguisticdescription.morphology.POS 13 | import com.kotlinnlp.linguisticdescription.syntax.SyntacticType 14 | 15 | /** 16 | * @param dependencyTree the dependency tree 17 | */ 18 | internal class CompositeTokenHelper(private val dependencyTree: DependencyTree.Labeled) { 19 | 20 | /** 21 | * Get the ID of the governor of a component of a composite token. 
22 | * 23 | * @param tokenId the ID of a parsing token 24 | * @param componentIndex the index of a component of the token 25 | * @param prevComponentId the ID assigned to the precedent component (null at the first component) 26 | * 27 | * @return the ID of the governor of the given component 28 | */ 29 | fun getComponentGovernorId(tokenId: Int, 30 | componentIndex: Int, 31 | prevComponentId: Int?): Int? { 32 | 33 | val governorId: Int? = this.dependencyTree.getHead(tokenId) 34 | val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId) 35 | 36 | val isContin: Boolean = config.isContin() 37 | val isPrepArt: Boolean = config.isPrepArt() 38 | val isVerbEnclitic: Boolean = config.isVerbEnclitic() 39 | 40 | return when { 41 | componentIndex == 0 -> governorId 42 | isPrepArt && !isContin -> governorId 43 | isPrepArt && isContin -> this.dependencyTree.getMultiWordGovernorId(tokenId) 44 | isVerbEnclitic -> prevComponentId!! 45 | else -> null 46 | } 47 | } 48 | 49 | /** 50 | * Get the governor ID of a multi-word, given one of its tokens and going back through its ancestors in the dependency 51 | * tree. 52 | * Note: the governor of a multi-word is the governor of it first token. 53 | * 54 | * @param tokenId the id of a token that is part of a multi-word 55 | * 56 | * @return the governor id of the multi-word of which the given token is part of 57 | */ 58 | private fun DependencyTree.Labeled.getMultiWordGovernorId(tokenId: Int): Int? { 59 | 60 | var multiWordStartId: Int = this.getHead(tokenId)!! 61 | 62 | while (this.getConfiguration(multiWordStartId).isContin()) 63 | multiWordStartId = this.getHead(multiWordStartId)!! 
64 | 65 | return this.getHead(multiWordStartId) 66 | } 67 | 68 | /** 69 | * @return true if this configuration defines the continuation of a multi-word, otherwise false 70 | */ 71 | private fun GrammaticalConfiguration.isContin(): Boolean = this.components.any { 72 | it.syntacticDependency.isSubTypeOf(SyntacticType.Contin) 73 | } 74 | 75 | /** 76 | * @return true if this configuration defines a composite PREP + ART, otherwise false 77 | */ 78 | private fun GrammaticalConfiguration.isPrepArt(): Boolean = 79 | this.components.size == 2 && 80 | this.components[0].pos?.isSubTypeOf(POS.Prep) == true && 81 | this.components[1].pos?.isSubTypeOf(POS.Art) == true 82 | 83 | /** 84 | * @return true if this configuration defines a composite VERB + PRON, otherwise false 85 | */ 86 | private fun GrammaticalConfiguration.isVerbEnclitic(): Boolean = 87 | this.components.size >= 2 && 88 | this.components[0].pos?.isSubTypeOf(POS.Verb) == true && 89 | this.components.subList(1, this.components.size).all { it.pos?.isSubTypeOf(POS.Pron) == true } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/LabeledMorphoSynBuilder.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.helpers.sentencebuilder 9 | 10 | import com.kotlinnlp.dependencytree.DependencyTree 11 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 12 | import com.kotlinnlp.linguisticdescription.morphology.ScoredMorphology 13 | import com.kotlinnlp.linguisticdescription.morphology.ScoredSingleMorphology 14 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence 15 | import com.kotlinnlp.linguisticdescription.sentence.token.MorphoSynToken 16 | import com.kotlinnlp.linguisticdescription.sentence.token.Word 17 | import com.kotlinnlp.linguisticdescription.sentence.token.WordTrace 18 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.SyntacticRelation 19 | import com.kotlinnlp.neuralparser.language.ParsingSentence 20 | import com.kotlinnlp.neuralparser.language.ParsingToken 21 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector 22 | 23 | /** 24 | * A helper class that builds a [MorphoSynSentence] from a [ParsingSentence] and a [DependencyTree]. 25 | * 26 | * @param parsingSentence a parsing sentence 27 | * @param dependencyTree the tree that represents the dependencies and the grammatical configuration of the sentence 28 | */ 29 | internal class LabeledMorphoSynBuilder( 30 | private val parsingSentence: ParsingSentence, 31 | private val dependencyTree: DependencyTree.Labeled 32 | ) { 33 | 34 | /** 35 | * The next id that can be assigned to a new token of the sentence, used in case a new single component has to be 36 | * created. 37 | */ 38 | private var nextAvailableId: Int = this.parsingSentence.tokens.asSequence().map { it.id }.max()!! + 1 39 | 40 | /** 41 | * Build the morpho-syntactic sentence using a [LabelerSelector] to select the valid morphologies. 
42 | * 43 | * @return a new morpho-syntactic sentence built from the given [parsingSentence] and [dependencyTree] 44 | */ 45 | fun buildSentence(): MorphoSynSentence = MorphoSynSentence( 46 | id = 0, 47 | confidence = 0.0, 48 | tokens = this.parsingSentence.tokens.mapIndexed { i, token -> 49 | 50 | val attachmentScore: Double = this.dependencyTree.getAttachmentScore(token.id) 51 | 52 | val morphologies: List = this.parsingSentence.getValidMorphologies( 53 | tokenIndex = i, 54 | configuration = this.dependencyTree.getConfiguration(token.id) 55 | ).map { morpho -> 56 | ScoredMorphology(components = morpho.components, score = attachmentScore) 57 | } 58 | 59 | this.buildToken(tokenId = token.id, morphologies = morphologies) 60 | }, 61 | position = this.parsingSentence.position 62 | ) 63 | 64 | /** 65 | * @param tokenId the id of a parsing token 66 | * @param morphologies the possible morphologies of the token 67 | * 68 | * @return a new morpho-syntactic token build from the given parsing token 69 | */ 70 | private fun buildToken(tokenId: Int, morphologies: List): MorphoSynToken { 71 | 72 | val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId) 73 | 74 | require(morphologies.all { it.components.size == config.components.size }) { 75 | "The given morphologies must have the same number of components of the given grammatical configuration." 
76 | } 77 | 78 | return if (config.components.size == 1) 79 | this.buildSingleToken( 80 | tokenId = tokenId, 81 | governorId = this.dependencyTree.getHead(tokenId), 82 | grammaticalComponent = config.components.single(), 83 | morphologies = morphologies.map { it.toSingle() }) 84 | else 85 | this.buildCompositeToken(tokenId = tokenId, morphologies = morphologies) 86 | } 87 | 88 | /** 89 | * @param tokenId the id of the new token 90 | * @param morphologies the list of possible scored morphologies of the token 91 | * 92 | * @return a new composite token 93 | */ 94 | private fun buildCompositeToken(tokenId: Int, morphologies: List): MorphoSynToken.Composite { 95 | 96 | val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId) 97 | val config: GrammaticalConfiguration = this.dependencyTree.getConfiguration(tokenId) 98 | val compositeTokenHandler = CompositeTokenHelper(this.dependencyTree) 99 | 100 | val newToken = MorphoSynToken.Composite( 101 | id = parsingToken.id, 102 | form = parsingToken.form, 103 | position = checkNotNull(parsingToken.position) { "Composite words must have a position." }, 104 | components = config.components.mapIndexed { i, component -> 105 | this.buildSingleToken( 106 | tokenId = tokenId, 107 | componentId = this.nextAvailableId + i, 108 | governorId = compositeTokenHandler.getComponentGovernorId( 109 | tokenId = tokenId, 110 | componentIndex = i, 111 | prevComponentId = if (i > 0) this.nextAvailableId else null), 112 | grammaticalComponent = component, 113 | morphologies = morphologies.map { ScoredSingleMorphology(value = it.components[i], score = it.score) } 114 | ) as Word 115 | } 116 | ) 117 | 118 | // Attention: the nextAvailableId must be set after the token has been created in order to calculate the 119 | // components governors correctly. 
120 | this.nextAvailableId += config.components.size 121 | 122 | return newToken 123 | } 124 | 125 | /** 126 | * @param tokenId the id of the original token 127 | * @param componentId the id of the token in case it is a component (otherwise null) 128 | * @param governorId the id of the governor (null if it is the top) 129 | * @param grammaticalComponent the grammatical configuration of the token as single component 130 | * @param morphologies the list of possible scored morphologies of the token 131 | * 132 | * @return a new single token 133 | */ 134 | private fun buildSingleToken(tokenId: Int, 135 | componentId: Int? = null, 136 | governorId: Int?, 137 | grammaticalComponent: GrammaticalConfiguration.Component, 138 | morphologies: List): MorphoSynToken.Single { 139 | 140 | val parsingToken: ParsingToken = this.parsingSentence.getTokenById(tokenId) 141 | val syntacticRelation = SyntacticRelation( 142 | governor = governorId, 143 | attachmentScore = this.dependencyTree.getAttachmentScore(tokenId), 144 | dependency = grammaticalComponent.syntacticDependency) 145 | 146 | // Unique morphologies by lemma and POS. 
147 | val uniqueMorphologies: List = 148 | morphologies.associateBy { Pair(it.value.lemma, it.value.pos) }.values.toList() 149 | 150 | return if (parsingToken.position != null) 151 | Word( 152 | id = componentId ?: tokenId, 153 | form = parsingToken.form, 154 | position = parsingToken.position, 155 | pos = grammaticalComponent.pos, 156 | morphologies = uniqueMorphologies, 157 | contextMorphologies = listOf(), // TODO: set it 158 | syntacticRelation = syntacticRelation, 159 | coReferences = null, // TODO: set it 160 | semanticRelations = null) // TODO: set it 161 | else 162 | WordTrace( 163 | id = componentId ?: tokenId, 164 | form = parsingToken.form, 165 | pos = grammaticalComponent.pos, 166 | morphologies = uniqueMorphologies, 167 | contextMorphologies = listOf(), // TODO: set it 168 | syntacticRelation = syntacticRelation, 169 | coReferences = null, // TODO: set it 170 | semanticRelations = null) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/helpers/sentencebuilder/UnlabeledMorphoSynBuilder.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
/**
 * Builds a [MorphoSynSentence] out of a [ParsingSentence] and the unlabeled [DependencyTree] predicted for it.
 *
 * Since the tree is unlabeled, every token receives an [Undefined] dependency and no POS / morphology information.
 *
 * @param parsingSentence the sentence that has been parsed
 * @param dependencyTree the unlabeled tree holding the head and the attachment score of each token
 */
internal class UnlabeledMorphoSynBuilder(
  private val parsingSentence: ParsingSentence,
  private val dependencyTree: DependencyTree.Unlabeled
) {

  /**
   * @return a new morpho-syntactic sentence with one single token per token of the [parsingSentence]
   */
  fun buildSentence(): MorphoSynSentence {

    val singleTokens = this.parsingSentence.tokens.map { parsingToken ->
      this.buildSingleToken(
        tokenId = parsingToken.id,
        governorId = this.dependencyTree.getHead(parsingToken.id))
    }

    return MorphoSynSentence(
      id = 0,
      confidence = 0.0,
      tokens = singleTokens,
      position = this.parsingSentence.position)
  }

  /**
   * @param tokenId the id of the token to build
   * @param governorId the id of its governor, or null if the token is the top of the tree
   *
   * @return a [Word] if the parsing token has a position, otherwise a [WordTrace]
   */
  private fun buildSingleToken(tokenId: Int, governorId: Int?): MorphoSynToken.Single {

    val source: ParsingToken = this.parsingSentence.getTokenById(tokenId)

    // The dependency type is unknown for an unlabeled tree: only governor and score are meaningful.
    val relation = SyntacticRelation(
      governor = governorId,
      attachmentScore = this.dependencyTree.getAttachmentScore(tokenId),
      dependency = Undefined(direction = SyntacticDependency.Direction.NULL))

    val tokenPosition = source.position

    // A token without a position is built as a trace.
    if (tokenPosition == null) {
      return WordTrace(
        id = tokenId,
        form = source.form,
        pos = null,
        morphologies = listOf(),
        contextMorphologies = listOf(), // TODO: set it
        syntacticRelation = relation,
        coReferences = null, // TODO: set it
        semanticRelations = null)
    }

    return Word(
      id = tokenId,
      form = source.form,
      position = tokenPosition,
      pos = null,
      morphologies = listOf(),
      contextMorphologies = listOf(), // TODO: set it
      syntacticRelation = relation,
      coReferences = null, // TODO: set it
      semanticRelations = null) // TODO: set it
  }
}
/**
 * Base parsing statistics: a plain holder of the six metrics produced by an evaluation.
 *
 * The class is open so that it can be extended with further groups of metrics.
 *
 * @property las labeled attachment score
 * @property uas unlabeled attachment score
 * @property ps POS tag accuracy score
 * @property ds deprel accuracy score
 * @property slas sentence labeled attachment score
 * @property suas sentence unlabeled attachment score
 */
open class BaseStatistics(
  val las: StatMetric,
  val uas: StatMetric,
  val ps: StatMetric,
  val ds: StatMetric,
  val slas: StatMetric,
  val suas: StatMetric) {

  /**
   * Render the metrics as a bullet list, one metric per line.
   *
   * Each metric is interpolated through its own string representation. The leading newline of the raw string is
   * removed and the common indentation is trimmed, so the result starts directly with the first bullet.
   *
   * @return this statistics formatted into a string
   */
  override fun toString(): String = """
    - Labeled attachment score: $las
    - Unlabeled attachment score: $uas
    - Deprel accuracy score: $ds
    - POS tag accuracy score: $ps
    - Sentence labeled attachment score: $slas
    - Sentence unlabeled attachment score: $suas
  """
    .removePrefix("\n")
    .trimIndent()
}
/**
 * Mutable accumulator of the raw counts from which the parsing statistics are derived.
 *
 * @property labeledAttachments the number of correct labeled attachments
 * @property unlabeledAttachments the number of correct unlabeled attachments
 * @property correctPOSTags the number of correct POS tags
 * @property correctDeprels the number of correct deprels
 * @property correctLabeledSentences the number of sentences with all labeled attachments correct
 * @property correctUnlabeledSentences the number of sentences with all unlabeled attachments correct
 * @property totalSentences the total amount of sentences
 * @property totalTokens the total amount of tokens
 */
data class MetricsCounter(
  var labeledAttachments: Int = 0,
  var unlabeledAttachments: Int = 0,
  var correctPOSTags: Int = 0,
  var correctDeprels: Int = 0,
  var correctLabeledSentences: Int = 0,
  var correctUnlabeledSentences: Int = 0,
  var totalSentences: Int = 0,
  var totalTokens: Int = 0
) {

  /**
   * Convert the current counts into metrics: token-level counts are related to [totalTokens], sentence-level
   * counts to [totalSentences].
   *
   * @return the base statistics
   */
  fun toStatistics(): BaseStatistics = BaseStatistics(
    las = this.perToken(this.labeledAttachments),
    uas = this.perToken(this.unlabeledAttachments),
    ps = this.perToken(this.correctPOSTags),
    ds = this.perToken(this.correctDeprels),
    slas = this.perSentence(this.correctLabeledSentences),
    suas = this.perSentence(this.correctUnlabeledSentences))

  /**
   * @param count a token-level count
   *
   * @return the metric of the given count over the total amount of tokens
   */
  private fun perToken(count: Int) = StatMetric(count = count, total = this.totalTokens)

  /**
   * @param count a sentence-level count
   *
   * @return the metric of the given count over the total amount of sentences
   */
  private fun perSentence(count: Int) = StatMetric(count = count, total = this.totalSentences)
}
/**
 * Per-sentence correctness flags.
 *
 * All the flags start as true and are meant to be switched off as soon as a wrong attachment is found.
 *
 * @property correctLabeled whether all the attachments of the sentence are correct, deprel labels included
 * @property correctUnlabeled whether all the attachments of the sentence are correct, deprel labels excluded
 * @property correctLabeledNoPunct same as [correctLabeled], ignoring the punctuation tokens
 * @property correctUnlabeledNoPunct same as [correctUnlabeled], ignoring the punctuation tokens
 */
internal data class SentenceMetrics(
  var correctLabeled: Boolean = true,
  var correctUnlabeled: Boolean = true,
  var correctLabeledNoPunct: Boolean = true,
  var correctUnlabeledNoPunct: Boolean = true)
/**
 * Parsing statistics, including ones calculated without considering the punctuation.
 *
 * The six base metrics are forwarded to [BaseStatistics]; the punctuation-free metrics are kept as a separate
 * [BaseStatistics] instance.
 *
 * @property las labeled attachment score
 * @property uas unlabeled attachment score
 * @property ps POS tag accuracy score
 * @property ds deprel accuracy score
 * @property slas sentence labeled attachment score
 * @property suas sentence unlabeled attachment score
 * @property noPunctuation statistics without considering punctuation tokens
 */
class Statistics(
  las: StatMetric,
  uas: StatMetric,
  ps: StatMetric,
  ds: StatMetric,
  slas: StatMetric,
  suas: StatMetric,
  val noPunctuation: BaseStatistics
) : BaseStatistics(las = las, uas = uas, ps = ps, ds = ds, slas = slas, suas = suas) {

  /**
   * Render two titled sections: the base statistics first, then the ones without punctuation.
   *
   * The two `%s` placeholders of the template are filled with the string form of the base metrics (as rendered by
   * the superclass) and of [noPunctuation], in this order.
   *
   * @return this statistics formatted into a string
   */
  override fun toString(): String = """
    Evaluation stats:
    %s

    Evaluation stats without considering punctuation:
    %s
  """
    .removePrefix("\n")
    .trimIndent()
    .format(
      super.toString(),
      this.noPunctuation.toString()
    )
}
/**
 * A helper that wraps a generic [NeuralParser] so that it can work on CoNLL sentences.
 *
 * @property neuralParser the neural parser that is fed with, and whose output is mapped back to, CoNLL sentences
 * @property sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 */
class CoNLLDependencyParser(
  private val neuralParser: NeuralParser<*>,
  private val sentencePreprocessor: SentencePreprocessor = BasePreprocessor()
) {

  /**
   * Parse a CoNLL sentence.
   *
   * The sentence is converted to a base sentence, preprocessed and parsed; then a copy of the input is returned
   * in which the head, the POS list and the syntactic dependencies of each token are replaced by the parsed ones.
   *
   * @param sentence the sentence to parse, in CoNLL format
   * @param index the index of the sentence within the list of sentences of the input dataset
   *
   * @return the parsed sentence in CoNLL format
   */
  fun parse(sentence: CoNLLSentence, index: Int): CoNLLSentence {

    val baseSentence = BaseSentence.fromCoNLL(sentence, index = index)
    val parsingInput = this.sentencePreprocessor.convert(baseSentence)
    val parsedSentence: MorphoSynSentence = this.neuralParser.parse(parsingInput)

    val annotatedTokens = sentence.tokens.map { conllToken ->

      val parsedToken: MorphoSynToken = parsedSentence.getTokenById(conllToken.id)

      conllToken.copy(
        head = parsedToken.syntacticRelation.governor ?: 0, // Note: the CoNLL root ID is 0
        posList = parsedToken.flatPOS.notEmptyOr { listOf(POSTag(CoNLLToken.EMPTY_FILLER)) },
        syntacticDependencies = parsedToken.flatSyntacticRelations.map { relation -> relation.dependency }
      )
    }

    return sentence.copy(tokens = annotatedTokens)
  }
}
/**
 * Validate a system output CoNLL file comparing it to a gold CoNLL file.
 *
 * @param neuralParser a neural parser
 * @param goldFilePath the path of the file containing the gold tree-bank, in CoNLL format.
 * @param outputFilePath the file path of the output CoNLL corpus (default = null -> a temporary file is used)
 * @param verbose a Boolean indicating if the verbose mode is enabled (default = true)
 */
class CoNLLFileValidator(
  neuralParser: NeuralParser<*>,
  private val goldFilePath: String,
  private val outputFilePath: String? = null,
  private val verbose: Boolean = true
) {

  /**
   * Create a new temporary file and return its path.
   *
   * Note: a new file is created at each access. The file is registered for deletion at the JVM termination, so
   * that the evaluations that do not specify an output path do not leave files behind.
   *
   * @return the path of a temporary file generated at runtime
   */
  private val defaultOutputPath: String
    get() {

      // Only the name of the gold file is used, so that the prefix never contains path separators.
      val prefix = "${File(this.goldFilePath).name}_output"

      return File.createTempFile(prefix, ".conll").apply { deleteOnExit() }.path
    }

  /**
   * The CoNLL Evaluator, chosen by the extension of the gold file (CoNLL-U for ".conllu", CoNLL-X otherwise).
   */
  private val conllEvaluator = if (this.goldFilePath.endsWith(".conllu")) CoNLLUEvaluator else CoNLLXEvaluator

  /**
   * The parser wrapper to parse sentences in CoNLL format.
   */
  private val conllParser = CoNLLDependencyParser(neuralParser)

  /**
   * Parse the gold sentences and print the statistics resulting from the official CoNLL evaluation script.
   *
   * Nothing is returned: the output of the script is printed to the standard output.
   */
  fun evaluate() {

    val parsedSentences: List<CoNLLSentence> = this.parseSentences(sentences = loadSentences(
      type = "validation",
      filePath = this.goldFilePath,
      maxSentences = null,
      skipNonProjective = false))

    print("\nCoNLL official script evaluation:\n%s".format(this.evaluateWithCoNLLScript(parsedSentences)))
  }

  /**
   * Parse the validation CoNLL sentences.
   *
   * @param sentences the sentences to parse, in CoNLL format
   *
   * @return the list of parsed CoNLL sentences, parallel to the given ones
   */
  private fun parseSentences(sentences: List<CoNLLSentence>): List<CoNLLSentence> {

    val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(sentences.size) else null

    if (this.verbose) println("Start parsing of %d sentences:".format(sentences.size))

    return sentences.mapIndexed { i, sentence ->

      progress?.tick()

      this.conllParser.parse(sentence, index = i)
    }
  }

  /**
   * Write the parsed sentences to the output file and get the output of the official CoNLL evaluation script.
   *
   * @param parsedSentences a list of parsed sentences, parallel to the gold sentences
   *
   * @return the output of the official CoNLL evaluation script
   */
  private fun evaluateWithCoNLLScript(parsedSentences: List<CoNLLSentence>): String? {

    val outputPath: String = this.outputFilePath ?: this.defaultOutputPath

    CoNLLWriter.toFile(sentences = parsedSentences, writeComments = true, outputFilePath = outputPath)

    return this.conllEvaluator.evaluate(systemFilePath = outputPath, goldFilePath = this.goldFilePath)
  }
}
/**
 * The Validator: it parses a list of gold CoNLL sentences and measures the accuracy of the parser against their
 * annotations, both considering and ignoring the punctuation tokens.
 *
 * @param neuralParser the neural parser
 * @property sentences the sentences to parse containing the gold annotation
 * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis)
 * @property verbose a Boolean indicating if the verbose mode is enabled (default = true)
 *
 * @throws IllegalArgumentException if a gold sentence has heads that are not annotated
 */
class Validator(
  private val neuralParser: NeuralParser<*>,
  val sentences: List<CoNLLSentence>,
  sentencePreprocessor: SentencePreprocessor,
  private val verbose: Boolean = true
) {

  companion object {

    /**
     * The regular expression to match punctuation forms (forms composed by punctuation chars only).
     */
    val punctuationRegex = Regex("^[-!\"#%&'()*,./:;?@\\[\\]_{}]+$")
  }

  init {
    require(sentences.all { it.hasAnnotatedHeads() }) {
      "A gold sentence must have a dependency tree with all heads annotated."
    }
  }

  /**
   * A counter of statistic metrics.
   */
  private lateinit var counter: MetricsCounter

  /**
   * A counter of statistic metrics, without considering punctuation.
   */
  private lateinit var counterNoPunct: MetricsCounter

  /**
   * The metrics of the sentence that is currently being evaluated.
   */
  private lateinit var sentenceMetrics: SentenceMetrics

  /**
   * The parser wrapper to parse sentences in CoNLL format.
   */
  private val conllParser = CoNLLDependencyParser(
    neuralParser = this.neuralParser,
    sentencePreprocessor = sentencePreprocessor)

  /**
   * Get statistics about the evaluation of the parsing accuracy on the given [sentences].
   *
   * The counters are re-initialized at each call.
   *
   * @throws IllegalArgumentException if a parsed tree and its gold have different sizes
   *
   * @return the statistics of the parsing accuracy
   */
  fun evaluate(): Statistics {

    val parsedSentences: List<CoNLLSentence> = this.parseSentences()

    this.initCounters(parsedSentences)

    this.sentences.zip(parsedSentences).forEach { (goldSentence, parsedSentence) ->

      val goldTree: DependencyTree = this.buildTree(goldSentence)
      val parsedTree: DependencyTree = this.buildTree(parsedSentence, allowCycles = true)

      require(parsedTree.size == goldTree.size) { "The dependency tree and its gold haven't the same size" }

      // The sentence flags start as "all correct" and are switched off by the first wrong token.
      this.sentenceMetrics = SentenceMetrics()

      goldSentence.tokens.forEach { token ->
        this.addTokenMetrics(token = token, parsedTree = parsedTree, goldTree = goldTree)
      }

      this.updateCorrectSentences()
    }

    return this.buildStats()
  }

  /**
   * @param sentence a CoNLL sentence
   * @param allowCycles if true it allows to create cycles when building the tree
   *
   * @return a new dependency tree based on the given sentence, labeled only if the parser has the labelling enabled
   */
  private fun buildTree(sentence: CoNLLSentence, allowCycles: Boolean = false): DependencyTree =
    if (this.neuralParser.labellingEnabled)
      DependencyTree.Labeled(sentence = sentence, allowCycles = allowCycles)
    else
      DependencyTree.Unlabeled(sentence = sentence, allowCycles = allowCycles)

  /**
   * Parse the validation CoNLL sentences.
   *
   * @return the list of parsed CoNLL sentences, parallel to [sentences]
   */
  private fun parseSentences(): List<CoNLLSentence> {

    val progress: ProgressIndicatorBar? = if (this.verbose) ProgressIndicatorBar(this.sentences.size) else null

    if (this.verbose) println("Start parsing of %d sentences:".format(this.sentences.size))

    return this.sentences.mapIndexed { i, sentence ->

      progress?.tick()

      this.conllParser.parse(sentence, index = i)
    }
  }

  /**
   * Initialize the metrics counters.
   *
   * The total of the tokens without punctuation is not set here: it is incremented token by token during the
   * evaluation.
   *
   * @param parsedSentences a list of parsed sentences
   */
  private fun initCounters(parsedSentences: List<CoNLLSentence>) {

    this.counter = MetricsCounter()
    this.counterNoPunct = MetricsCounter()

    this.counter.totalSentences = parsedSentences.size
    this.counterNoPunct.totalSentences = parsedSentences.size
    this.counter.totalTokens = parsedSentences.sumBy { it.tokens.size }
  }

  /**
   * Add the statistic metrics of a given [token].
   *
   * When a tree is not labeled its configuration (and so its dependencies) is null: two null values are
   * considered equal in the comparisons below.
   *
   * @param token a token of a sentence
   * @param parsedTree the dependency tree of the parsed sentence
   * @param goldTree the gold dependency tree of the parsed sentence
   */
  private fun addTokenMetrics(token: CoNLLToken, parsedTree: DependencyTree, goldTree: DependencyTree) {

    val isNotPunct: Boolean = !punctuationRegex.matches(token.form)
    val parsedConfig: GrammaticalConfiguration? = (parsedTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
    val goldConfig: GrammaticalConfiguration? = (goldTree as? DependencyTree.Labeled)?.getConfiguration(token.id)
    val parsedDependencies: List<SyntacticDependency>? = parsedConfig?.components?.map { it.syntacticDependency }
    val goldDependencies: List<SyntacticDependency>? = goldConfig?.components?.map { it.syntacticDependency }

    if (isNotPunct) this.counterNoPunct.totalTokens++

    if (parsedTree.getHead(token.id) == goldTree.getHead(token.id)) {

      this.addCorrectAttachment(isNotPunct)

      if (parsedDependencies == goldDependencies)
        this.addCorrectLabeledAttachment(isNotPunct)
      else
        this.addUncorrectLabeledAttachment(isNotPunct)

    } else {
      this.addUncorrectAttachment(isNotPunct)
      this.addUncorrectLabeledAttachment(isNotPunct)
    }

    if (parsedConfig?.components?.map { it.pos } == goldConfig?.components?.map { it.pos })
      this.addCorrectPOSTag(isNotPunct)

    // The sizes must be compared explicitly: `zip` silently truncates to the shortest list, so a configuration with
    // missing or extra components would otherwise be counted as correct.
    val softMatchingDeprels: Boolean = parsedDependencies != null
      && goldDependencies != null
      && parsedDependencies.size == goldDependencies.size
      && parsedDependencies.zip(goldDependencies).all { (parsed, gold) -> parsed.softEquals(gold) }

    if (softMatchingDeprels || parsedDependencies == goldDependencies)
      this.addCorrectDeprel(isNotPunct)
  }

  /**
   * Add a correct attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addCorrectAttachment(isNotPunct: Boolean) {

    this.counter.unlabeledAttachments++

    if (isNotPunct) this.counterNoPunct.unlabeledAttachments++
  }

  /**
   * Add an incorrect attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addUncorrectAttachment(isNotPunct: Boolean) {

    this.sentenceMetrics.correctUnlabeled = false

    if (isNotPunct) this.sentenceMetrics.correctUnlabeledNoPunct = false
  }

  /**
   * Add a correct labeled attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addCorrectLabeledAttachment(isNotPunct: Boolean) {

    this.counter.labeledAttachments++

    if (isNotPunct) this.counterNoPunct.labeledAttachments++
  }

  /**
   * Add an incorrect labeled attachment to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the attachment is related to a non-punctuation token
   */
  private fun addUncorrectLabeledAttachment(isNotPunct: Boolean) {

    this.sentenceMetrics.correctLabeled = false

    if (isNotPunct) this.sentenceMetrics.correctLabeledNoPunct = false
  }

  /**
   * Add a correct POS tag to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the POS tag is related to a non-punctuation token
   */
  private fun addCorrectPOSTag(isNotPunct: Boolean) {

    this.counter.correctPOSTags++

    if (isNotPunct) this.counterNoPunct.correctPOSTags++
  }

  /**
   * Add a correct deprel to the current statistic metrics.
   *
   * @param isNotPunct a Boolean indicating if the deprel is related to a non-punctuation token
   */
  private fun addCorrectDeprel(isNotPunct: Boolean) {

    this.counter.correctDeprels++

    if (isNotPunct) this.counterNoPunct.correctDeprels++
  }

  /**
   * Update the counters of correct sentences with the current [sentenceMetrics].
   */
  private fun updateCorrectSentences() {

    if (this.sentenceMetrics.correctLabeled) this.counter.correctLabeledSentences++
    if (this.sentenceMetrics.correctUnlabeled) this.counter.correctUnlabeledSentences++
    if (this.sentenceMetrics.correctLabeledNoPunct) this.counterNoPunct.correctLabeledSentences++
    if (this.sentenceMetrics.correctUnlabeledNoPunct) this.counterNoPunct.correctUnlabeledSentences++
  }

  /**
   * Build the statistics related to the current counted metrics.
   *
   * @return the statistics built from both the counters, with and without punctuation
   */
  private fun buildStats(): Statistics {

    val punctStats = this.counter.toStatistics()
    val noPunctStats = this.counterNoPunct.toStatistics()

    return Statistics(
      las = punctStats.las,
      uas = punctStats.uas,
      ps = punctStats.ps,
      ds = punctStats.ds,
      slas = punctStats.slas,
      suas = punctStats.suas,
      noPunctuation = noPunctStats)
  }
}
/**
 * A base real sentence.
 *
 * @property id the id of the sentence, unique within a list of sentences
 * @property tokens the list of tokens that compose the sentence
 * @property position the position of this sentence in the original text
 */
data class BaseSentence(
  val id: Int,
  override val tokens: List<BaseToken>,
  override val position: Position
) : RealSentence<BaseToken>, SentenceIdentificable<BaseToken>() {

  companion object {

    /**
     * Convert a CoNLL sentence to a [BaseSentence].
     *
     * The position of the sentence spans from the start of its first token to the end of its last token.
     *
     * @param sentence a CoNLL sentence
     * @param index the index of the sentence within a list of sentences
     *
     * @throws NoSuchElementException if the given sentence has no tokens
     *
     * @return a real sentence of real tokens
     */
    fun fromCoNLL(sentence: CoNLLSentence, index: Int): BaseSentence {

      val baseTokens: List<BaseToken> = sentence.tokens.toBaseTokens()

      return BaseSentence(
        id = index, // the index is unique within a list of sentences
        tokens = baseTokens,
        position = Position(
          index = index,
          start = baseTokens.first().position.start,
          end = baseTokens.last().position.end))
    }
  }
}
/**
 * The minimal real token: an identifier, a surface form and a position.
 *
 * @property id the token id, an incremental integer starting from 0 within a sentence
 * @property form the form of the token
 * @property position the position of the token in the original text
 */
data class BaseToken(
  override val id: Int,
  override val form: String,
  override val position: Position) : RealToken, TokenIdentificable
/**
 * The CorpusDictionary: the words, the POS tags per form and the grammatical configurations seen in a corpus.
 */
class CorpusDictionary : Serializable {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L

    /**
     * Create a new corpus populated with the information contained in the given [sentences] (words, POS tags and
     * deprels).
     *
     * @param sentences a list of sentences
     *
     * @return a new corpus dictionary
     */
    operator fun invoke(sentences: List<CoNLLSentence>): CorpusDictionary {

      val dictionary = CorpusDictionary()

      sentences.forEach { sentence -> sentence.tokens.forEach { token -> dictionary.addInfo(token) } }

      return dictionary
    }
  }

  /**
   * The words (as normalized forms).
   */
  val words = DictionarySet<String>()

  /**
   * The map of (normalized) forms to their possible POS tags.
   */
  val formsToPosTags: HashMultimap<String, List<POSTag>> = HashMultimap.create()

  /**
   * The dictionary set of all the possible grammatical configurations.
   */
  val grammaticalConfigurations = DictionarySet<GrammaticalConfiguration>()

  /**
   * Add the info of a given [token] into this dictionary.
   *
   * The grammatical configuration has as many components as the longest between the POS list and the dependencies
   * list of the token. The shortest list is expected to contain a single element, which is repeated for all the
   * components (`single()` fails otherwise).
   *
   * @param token the token of a sentence
   */
  private fun addInfo(token: CoNLLToken) {

    this.words.add(token.normalizedForm)

    this.formsToPosTags.put(token.normalizedForm, token.posList)

    val componentsCount: Int = maxOf(token.posList.size, token.syntacticDependencies.size)

    val components = Array(componentsCount) { i ->
      GrammaticalConfiguration.Component(
        pos = token.posList.getOrElse(i) { token.posList.single() },
        syntacticDependency = token.syntacticDependencies.getOrElse(i) { token.syntacticDependencies.single() })
    }

    this.grammaticalConfigurations.add(GrammaticalConfiguration(*components))
  }
}
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.language 9 | 10 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position 11 | import com.kotlinnlp.conllio.Token as CoNLLToken 12 | 13 | /** Convert these tokens to base tokens, computing their positions as if the forms were joined by a single spacing char, with the first token starting at the char offset 0 (both `start` and `end` are inclusive offsets). 14 | * @return a list of base real tokens 15 | */ 16 | fun List.toBaseTokens(): List { 17 | 18 | var end = -2 19 | 20 | return this.mapIndexed { i, it -> 21 | 22 | val start = end + 2 // each couple of consecutive tokens is separated by a spacing char 23 | end = start + it.form.length - 1 24 | 25 | BaseToken( 26 | id = it.id, 27 | form = it.form, 28 | position = Position(index = i, start = start, end = end) 29 | ) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingSentence.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.language 9 | 10 | import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration 11 | import com.kotlinnlp.linguisticdescription.morphology.MorphologicalAnalysis 12 | import com.kotlinnlp.linguisticdescription.morphology.Morphologies 13 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence 14 | import com.kotlinnlp.linguisticdescription.sentence.RealSentence 15 | import com.kotlinnlp.linguisticdescription.sentence.SentenceIdentificable 16 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position 17 | import com.kotlinnlp.neuralparser.helpers.labelerselector.LabelerSelector 18 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar 19 | 20 | /** 21 | * The sentence used as input of the [com.kotlinnlp.neuralparser.NeuralParser]. 22 | * 23 | * @property tokens the list of tokens of the sentence 24 | * @property morphoAnalysis the morphological analysis of the tokens (can be null, but it is required by [areConfigurationCompatible] and [getCompatibleMorphologies]) 25 | * @property position the position of the sentence in the text 26 | * @param labelerSelector the labeler selector used to select the grammatical configurations compatible with the sentence 27 | */ 28 | class ParsingSentence( 29 | override val tokens: List, 30 | override val morphoAnalysis: MorphologicalAnalysis? = null, 31 | override val position: Position, 32 | private val labelerSelector: LabelerSelector 33 | ) : MorphoSentence, RealSentence, SentenceIdentificable() { 34 | 35 | /** 36 | * Check whether the morphologies of the token are compatible with the given configuration [c]. 37 | * Middle multi-words morphologies are compared partially (only with the "CONTIN" components).
38 | * The [morphoAnalysis] must be not null (it is dereferenced with `!!`). 39 | * @param c the grammatical configuration 40 | * @param tokenIndex the index of a token of the sentence 41 | * 42 | * @return true if the morphologies of the token are compatible with the given configuration, otherwise false 43 | */ 44 | fun areConfigurationCompatible(c: GrammaticalConfiguration, tokenIndex: Int): Boolean = 45 | this.morphoAnalysis!!.startMorphologies[tokenIndex].any { c.isCompatible(it) } || 46 | this.morphoAnalysis.middleMWMorphologies[tokenIndex].any { c.isPartiallyCompatible(it) } 47 | 48 | /** Filter all the morphologies of a token keeping the ones fully compatible with [c] (the partial compatibility is not considered yet, see the TODO). The [morphoAnalysis] must be not null. 49 | * @param c the grammatical configuration 50 | * @param tokenIndex the index of a token of the sentence 51 | * 52 | * @return the token morphologies (including the multi-words) that are compatible with the given configuration 53 | */ 54 | fun getCompatibleMorphologies(c: GrammaticalConfiguration, tokenIndex: Int) = Morphologies( 55 | this.morphoAnalysis!!.allMorphologies[tokenIndex].filter { 56 | c.isCompatible(it) // TODO: || c.isPartiallyCompatible(it) 57 | }) 58 | 59 | /** 60 | * Get the list of scored grammatical configurations that are valid for a given attachment. 61 | * The selection is delegated to the [labelerSelector]. 62 | * @param tokenIndex the index of the token to which one of the [configurations] must be assigned 63 | * @param headIndex the index of the token head (can be null) 64 | * @param configurations the list of grammatical configurations, sorted by descending score 65 | * 66 | * @return the valid grammatical configurations for the given attachment 67 | */ 68 | fun getValidConfigurations(tokenIndex: Int, 69 | headIndex: Int?, 70 | configurations: List): List = 71 | this.labelerSelector.getValidConfigurations( 72 | sentence = this, 73 | tokenIndex = tokenIndex, 74 | headIndex = headIndex, 75 | configurations = configurations) 76 | 77 | /** 78 | * Get the morphologies of a given token that are compatible with the given grammatical configuration.
79 | * The selection is delegated to the [labelerSelector]. 80 | * @param tokenIndex the index of a token of the sentence 81 | * @param configuration the grammatical configuration of the token 82 | * 83 | * @return the morphologies compatible with the given grammatical configuration 84 | */ 85 | fun getValidMorphologies(tokenIndex: Int, 86 | configuration: GrammaticalConfiguration): Morphologies = 87 | this.labelerSelector.getValidMorphologies( 88 | sentence = this, 89 | tokenIndex = tokenIndex, 90 | configuration = configuration) 91 | } 92 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/language/ParsingToken.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.language 9 | 10 | import com.kotlinnlp.linguisticdescription.POSTag 11 | import com.kotlinnlp.linguisticdescription.sentence.token.* 12 | import com.kotlinnlp.linguisticdescription.sentence.token.properties.Position 13 | 14 | /** 15 | * The token of the [ParsingSentence]. 16 | * 17 | * @property id the id of the token, unique within its sentence 18 | * @property form the form 19 | * @property pos the list of part-of-speech tags associated to the token (more than one for composite tokens, can be null) 20 | * @property position the position of the token in the text (null if it is a trace) 21 | */ 22 | data class ParsingToken( 23 | override val id: Int, 24 | override val form: String, 25 | val pos: List? = null, // TODO: find a better solution 26 | val position: Position?
27 | ) : FormToken, TokenIdentificable 28 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRModel.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser 9 | 10 | import com.kotlinnlp.lssencoder.LSSModel 11 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.LabelerModel 12 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType 13 | import com.kotlinnlp.neuralparser.NeuralParserModel 14 | import com.kotlinnlp.neuralparser.language.CorpusDictionary 15 | import com.kotlinnlp.neuralparser.language.ParsingSentence 16 | import com.kotlinnlp.neuralparser.language.ParsingToken 17 | import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh 18 | import com.kotlinnlp.simplednn.core.layers.models.merge.mergeconfig.AffineMerge 19 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel 20 | import com.kotlinnlp.utils.Serializer 21 | import java.io.InputStream 22 | 23 | /** 24 | * The model of the [LHRParser].
25 | * 26 | * @property language the language in which the parser works (the one of the [lssModel]) 27 | * @param corpusDictionary the corpus dictionary whose grammatical configurations are given to the labeler model 28 | * @property lssModel the model of the LSS encoder 29 | * @property useLabeler whether to use the labeler 30 | * @property lossCriterionType the training mode of the labeler 31 | * @property predictPosTags whether to predict the POS tags together with the Deprels 32 | */ 33 | class LHRModel( 34 | corpusDictionary: CorpusDictionary, 35 | val lssModel: LSSModel, 36 | val useLabeler: Boolean, 37 | val lossCriterionType: LossCriterionType, 38 | val predictPosTags: Boolean 39 | ) : NeuralParserModel(lssModel.language) { 40 | 41 | companion object { 42 | 43 | /** 44 | * Private val used to serialize the class (needed by Serializable). 45 | */ 46 | @Suppress("unused") 47 | private const val serialVersionUID: Long = 1L 48 | 49 | /** 50 | * Read a [LHRModel] (serialized) from an input stream and decode it. 51 | * 52 | * @param inputStream the [InputStream] from which to read the serialized [LHRModel] 53 | * 54 | * @return the [LHRModel] read from [inputStream] and decoded 55 | */ 56 | fun load(inputStream: InputStream): LHRModel = Serializer.deserialize(inputStream) 57 | } 58 | 59 | /** 60 | * The model of the Labeler (null if [useLabeler] is false). 61 | */ 62 | val labelerModel: LabelerModel? = if (this.useLabeler) 63 | LabelerModel( 64 | contextEncodingSize = this.lssModel.contextVectorsSize, 65 | grammaticalConfigurations = corpusDictionary.grammaticalConfigurations, 66 | lossCriterionType = this.lossCriterionType) 67 | else 68 | null 69 | 70 | /** 71 | * The model of the pointer network used for the positional encoding.
72 | */ 73 | val pointerNetworkModel = PointerNetworkModel( 74 | inputSize = this.lssModel.contextVectorsSize, 75 | vectorSize = this.lssModel.contextVectorsSize, 76 | mergeConfig = AffineMerge(outputSize = 100, activationFunction = Tanh)) 77 | 78 | /** 79 | * @return a multi-line summary of this model (tokens encoder, context and heads encoders, labeler training mode, POS tags prediction) 80 | */ 81 | override fun toString(): String = """ 82 | %-33s : %s 83 | %-33s : %s 84 | %-33s : %s 85 | %-33s : %s 86 | %-33s : %s 87 | """.trimIndent().format( 88 | this.lssModel.tokensEncoderWrapperModel.model::class.simpleName, this.lssModel.tokensEncoderWrapperModel.model, 89 | "Context Encoder", this.lssModel.contextBiRNNConfig, 90 | "Heads Encoder", this.lssModel.headsBiRNNConfig, 91 | "Labeler training mode", this.lossCriterionType, 92 | "Predict POS tags", this.predictPosTags 93 | ) 94 | } 95 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRParser.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser 9 | 10 | import com.kotlinnlp.dependencytree.DependencyTree 11 | import com.kotlinnlp.linguisticdescription.sentence.MorphoSynSentence 12 | import com.kotlinnlp.lssencoder.LSSEncoder 13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure 14 | import com.kotlinnlp.lssencoder.decoder.CosineDecoder 15 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler 16 | import com.kotlinnlp.neuralparser.NeuralParser 17 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.LabeledMorphoSynBuilder 18 | import com.kotlinnlp.neuralparser.helpers.sentencebuilder.UnlabeledMorphoSynBuilder 19 | import com.kotlinnlp.neuralparser.language.ParsingSentence 20 | import com.kotlinnlp.neuralparser.language.ParsingToken 21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.GreedyDependencyTreeBuilder 22 | 23 | /** 24 | * The Latent Head Representation (LHR) Parser. 25 | * 26 | * Implemented as described in the following publication: 27 | * [Non-Projective Dependency Parsing via Latent Heads Representation (LHR)](https://arxiv.org/abs/1802.02116) 28 | * 29 | * @property model the parser model 30 | * @param contextDropout the dropout probability of the context encodings (default 0.0) 31 | * @param headsDropout the dropout probability of the latent heads encodings (default 0.0) 32 | */ 33 | class LHRParser( 34 | override val model: LHRModel, 35 | contextDropout: Double = 0.0, 36 | headsDropout: Double = 0.0 37 | ) : NeuralParser { 38 | 39 | /** 40 | * Whether this parser executes the morpho-syntactic labelling. 41 | */ 42 | override val labellingEnabled: Boolean = this.model.useLabeler 43 | 44 | /** 45 | * The Encoder of the Latent Syntactic Structure.
46 | */ 47 | private val lssEncoder = 48 | LSSEncoder(model = this.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout) 49 | 50 | /** 51 | * The labeler (null if the model has no labeler model). 52 | */ 53 | private val labeler: Labeler? = this.model.labelerModel?.let { Labeler(it) } 54 | 55 | /** 56 | * Parse a sentence, returning the morpho-syntactic sentence built from its dependency tree. 57 | * The dependency tree is obtained by decoding a latent syntactic structure. 58 | * If the labeler is available, the dependency tree is labeled and could contain grammatical information. 59 | * 60 | * @param sentence a parsing sentence 61 | * 62 | * @return the morpho-syntactic sentence built from the dependency tree predicted for the given [sentence] 63 | */ 64 | override fun parse(sentence: ParsingSentence): MorphoSynSentence { 65 | 66 | val lss: LatentSyntacticStructure = this.lssEncoder.forward(sentence) 67 | 68 | val dependencyTree: DependencyTree = GreedyDependencyTreeBuilder( 69 | lss = lss, 70 | scoresMap = CosineDecoder().decode(lss), 71 | labeler = this.labeler 72 | ).build() 73 | 74 | return when (dependencyTree) { 75 | 76 | is DependencyTree.Labeled -> 77 | LabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = dependencyTree).buildSentence() 78 | 79 | is DependencyTree.Unlabeled -> 80 | UnlabeledMorphoSynBuilder(parsingSentence = sentence, dependencyTree = dependencyTree).buildSentence() 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTrainer.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser 9 | 10 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler 11 | import com.kotlinnlp.dependencytree.DependencyTree 12 | import com.kotlinnlp.lssencoder.LSSEncoder 13 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure 14 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor 15 | import com.kotlinnlp.neuralparser.helpers.Trainer 16 | import com.kotlinnlp.neuralparser.helpers.validator.Validator 17 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor 18 | import com.kotlinnlp.neuralparser.language.ParsingSentence 19 | import com.kotlinnlp.neuralparser.language.ParsingToken 20 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder 21 | import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.PositionalEncoder.Companion.calculateErrors 22 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator 23 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod 24 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod 25 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer 26 | import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor 27 | import com.kotlinnlp.simplednn.simplemath.assignSum 28 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray 29 | import com.kotlinnlp.simplednn.utils.scheduling.BatchScheduling 30 | import com.kotlinnlp.simplednn.utils.scheduling.EpochScheduling 31 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling 32 | 33 | /** 34 | * The training helper of the [LHRParser].
35 | * 36 | * @param parser a neural parser 37 | * @param batchSize the size of the batches of sentences 38 | * @param epochs the number of training epochs 39 | * @param validator the validation helper (if it is null no validation is done after each epoch) 40 | * @param modelFilename the name of the file in which to save the best trained model 41 | * @param updateMethod the update method shared by all the parameters of the parser (Learning Rate, ADAM, AdaGrad, ...), RADAM by default 42 | * @param contextDropout the dropout probability of the context encodings (default 0.25) 43 | * @param headsDropout the dropout probability of the latent heads encodings (default 0.25) 44 | * @param labelerDropout the dropout probability of the labeler (default 0.25) 45 | * @param skipPunctuationErrors whether to ignore the punctuation errors 46 | * @param usePositionalEncodingErrors whether to calculate and propagate the positional encoding errors 47 | * @param sentencePreprocessor the sentence preprocessor (e.g. to perform morphological analysis) 48 | * @param verbose a Boolean indicating if the verbose mode is enabled (default = true) 49 | */ 50 | class LHRTrainer( 51 | private val parser: LHRParser, 52 | private val batchSize: Int, 53 | private val epochs: Int, 54 | validator: Validator?, 55 | modelFilename: String, 56 | private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999), 57 | contextDropout: Double = 0.25, 58 | headsDropout: Double = 0.25, 59 | labelerDropout: Double = 0.25, 60 | private val skipPunctuationErrors: Boolean, 61 | usePositionalEncodingErrors: Boolean, 62 | sentencePreprocessor: SentencePreprocessor = BasePreprocessor(), 63 | verbose: Boolean = true 64 | ) : Trainer( 65 | neuralParser = parser, 66 | batchSize = batchSize, 67 | epochs = epochs, 68 | validator = validator, 69 | modelFilename = modelFilename, 70 | minRelevantErrorsCountToUpdate = 1, 71 | sentencePreprocessor = sentencePreprocessor, 72 | verbose = verbose 73 | ) {
74 | 75 | /** 76 | * The encoder of the Latent Syntactic Structure. 77 | */ 78 | private val lssEncoder = 79 | LSSEncoder(model = this.parser.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout) 80 | 81 | /** 82 | * The labeler (null if the model of the parser has no labeler model). 83 | */ 84 | private val labeler: Labeler? = this.parser.model.labelerModel?.let { Labeler(model = it, dropout = labelerDropout) } 85 | 86 | /** 87 | * The positional encoder (null if the positional encoding errors are not used). 88 | */ 89 | private val positionalEncoder: PositionalEncoder? = if (usePositionalEncodingErrors) 90 | PositionalEncoder(this.parser.model.pointerNetworkModel) 91 | else 92 | null 93 | 94 | /** 95 | * The pointer network optimizer. 96 | */ 97 | private val pointerNetworkOptimizer = ParamsOptimizer(this.updateMethod) 98 | 99 | /** 100 | * The optimizer of the LSS encoder. 101 | */ 102 | private val lssEncoderOptimizer = ParamsOptimizer(this.updateMethod) 103 | 104 | /** 105 | * The optimizer of the labeler (can be null). 106 | */ 107 | private val labelerOptimizer: ParamsOptimizer? = this.parser.model.labelerModel?.let { 108 | ParamsOptimizer(this.updateMethod) 109 | } 110 | 111 | /** 112 | * The epoch counter, incremented at each new epoch. 113 | */ 114 | private var epochCount: Int = 0 115 | 116 | /** 117 | * All the optimizers grouped together (the one of the labeler can be null). 118 | */ 119 | private val optimizers: List = listOf( 120 | this.lssEncoderOptimizer, 121 | this.labelerOptimizer, 122 | this.pointerNetworkOptimizer) 123 | 124 | /** 125 | * @return a string representation of the configuration of this Trainer 126 | */ 127 | override fun toString(): String = """ 128 | %-33s : %s 129 | %-33s : %s 130 | %-33s : %s 131 | """.trimIndent().format( 132 | "Epochs", this.epochs, 133 | "Batch size", this.batchSize, 134 | "Skip punctuation errors", this.skipPunctuationErrors 135 | ) 136 | 137 | /** 138 | * Beat the occurrence of a new batch (forwarded to the update method if it supports the batch scheduling).
139 | */ 140 | override fun newBatch() { 141 | if (this.updateMethod is BatchScheduling) this.updateMethod.newBatch() 142 | } 143 | 144 | /** 145 | * Beat the occurrence of a new epoch (forwarded to the update method if it supports the epoch scheduling) and increment the epoch counter. 146 | */ 147 | override fun newEpoch() { 148 | 149 | if (this.updateMethod is EpochScheduling) this.updateMethod.newEpoch() 150 | 151 | this.epochCount++ 152 | } 153 | 154 | /** 155 | * Update the model parameters through all the available optimizers. 156 | */ 157 | override fun update() { 158 | this.optimizers.forEach { it?.update() } 159 | } 160 | 161 | /** 162 | * @return the count of the relevant errors (always 1) 163 | */ 164 | override fun getRelevantErrorsCount(): Int = 1 165 | 166 | /** 167 | * Method to call before learning a new sentence. 168 | */ 169 | private fun beforeSentenceLearning() { 170 | if (this.updateMethod is ExampleScheduling) this.updateMethod.newExample() 171 | } 172 | 173 | /** 174 | * Train the parser with the given [sentence] and [goldTree]. 175 | * 176 | * @param sentence the sentence 177 | * @param goldTree the gold tree of the sentence 178 | */ 179 | override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) { 180 | 181 | this.beforeSentenceLearning() 182 | 183 | val lss: LatentSyntacticStructure = this.lssEncoder.forward(sentence) 184 | val latentHeadsErrors = calculateLatentHeadsErrors(lss, goldTree) 185 | 186 | val labelerErrors: List? = this.labeler?.let { 187 | val labelerPrediction: List = it.forward(Labeler.Input(lss, goldTree)) 188 | this.parser.model.labelerModel?.calculateLoss(labelerPrediction, goldTree) 189 | } 190 | 191 | val positionalEncoderErrors: PointerNetworkProcessor.InputErrors?
= this.positionalEncoder?.let { 192 | it.propagateErrors(calculateErrors(it.forward(lss.contextVectors)), this.pointerNetworkOptimizer) 193 | } 194 | 195 | this.propagateErrors( 196 | latentHeadsErrors = latentHeadsErrors, 197 | labelerErrors = labelerErrors, 198 | positionalEncoderErrors = positionalEncoderErrors) 199 | } 200 | 201 | /** 202 | * Calculate the errors of the latent heads with the [MSECalculator], comparing them with the expected ones. 203 | * 204 | * @param lss the latent syntactic structure 205 | * @param goldTree the gold tree of the sentence 206 | * 207 | * @return the errors of the latent heads 208 | */ 209 | private fun calculateLatentHeadsErrors(lss: LatentSyntacticStructure, 210 | goldTree: DependencyTree): List = 211 | MSECalculator().calculateErrors( 212 | outputSequence = lss.latentHeads, 213 | outputGoldSequence = this.getExpectedLatentHeads(lss, goldTree)) 214 | 215 | /** 216 | * Return a list containing the expected latent heads, one for each token of the sentence. 217 | * It is the virtual root if the token has no gold head, the latent head of the token itself (no errors) if it is a comma and [skipPunctuationErrors] is true, otherwise the context vector of its gold head. 218 | * @param lss the latent syntactic structure 219 | * @param goldTree the gold tree of the sentence 220 | * 221 | * @return the expected latent heads 222 | */ 223 | private fun getExpectedLatentHeads(lss: LatentSyntacticStructure, 224 | goldTree: DependencyTree): List = 225 | 226 | lss.sentence.tokens.map { token -> 227 | 228 | val goldHeadId: Int? = goldTree.getHead(token.id) 229 | 230 | when { 231 | goldHeadId == null -> lss.virtualRoot 232 | this.skipPunctuationErrors && token.isComma -> lss.getLatentHeadById(token.id) // no errors 233 | else -> lss.getContextVectorById(goldHeadId) 234 | } 235 | } 236 | 237 | /** 238 | * Propagate the errors through the encoders. 239 | * 240 | * @param latentHeadsErrors the latent heads errors 241 | * @param labelerErrors the labeler errors (can be null) 242 | * @param positionalEncoderErrors the positional encoder errors (can be null) 243 | */ 244 | private fun propagateErrors(latentHeadsErrors: List, 245 | labelerErrors: List?, 246 | positionalEncoderErrors: PointerNetworkProcessor.InputErrors?)
{ 247 | 248 | val contextVectorsErrors: List = latentHeadsErrors.map { it.zerosLike() } 249 | 250 | positionalEncoderErrors?.let { contextVectorsErrors.assignSum(it.inputVectorsErrors) } 251 | 252 | this.labeler?.propagateErrors(labelerErrors!!, this.labelerOptimizer!!, copy = false)?.let { labelerInputErrors -> 253 | contextVectorsErrors.assignSum(labelerInputErrors.contextErrors) 254 | this.propagateRootErrors(labelerInputErrors.rootErrors) 255 | } 256 | 257 | this.lssEncoder.backward(outputErrors = LSSEncoder.OutputErrors( 258 | size = latentHeadsErrors.size, 259 | contextVectors = contextVectorsErrors, 260 | latentHeads = latentHeadsErrors)) 261 | 262 | this.lssEncoderOptimizer.accumulate(this.lssEncoder.getParamsErrors(copy = false)) 263 | } 264 | 265 | /** 266 | * Propagate the [errors] to the virtual root embedding, updating it directly with the update method. 267 | * 268 | * @param errors the errors 269 | */ 270 | private fun propagateRootErrors(errors: DenseNDArray) { 271 | this.updateMethod.update(array = this.parser.model.lssModel.rootEmbedding, errors = errors) 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/LHRTransferLearning.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser 9 | 10 | import com.kotlinnlp.dependencytree.DependencyTree 11 | import com.kotlinnlp.lssencoder.LSSEncoder 12 | import com.kotlinnlp.lssencoder.LatentSyntacticStructure 13 | import com.kotlinnlp.neuralparser.helpers.preprocessors.SentencePreprocessor 14 | import com.kotlinnlp.neuralparser.helpers.Trainer 15 | import com.kotlinnlp.neuralparser.helpers.validator.Validator 16 | import com.kotlinnlp.neuralparser.helpers.preprocessors.BasePreprocessor 17 | import com.kotlinnlp.neuralparser.language.ParsingSentence 18 | import com.kotlinnlp.neuralparser.language.ParsingToken 19 | import com.kotlinnlp.simplednn.core.functionalities.losses.MSECalculator 20 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.UpdateMethod 21 | import com.kotlinnlp.simplednn.core.functionalities.updatemethods.radam.RADAMMethod 22 | import com.kotlinnlp.simplednn.core.optimizer.ParamsOptimizer 23 | import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray 24 | import com.kotlinnlp.simplednn.utils.scheduling.ExampleScheduling 25 | 26 | /** 27 | * The transfer learning training helper. 28 | * 29 | * @param referenceParser the neural parser used as reference 30 | * @param targetParser the neural parser to train via transfer learning 31 | * @param epochs the number of training epochs 32 | * @param validator the validation helper (if it is null no validation is done after each epoch) 33 | * @param modelFilename the name of the file in which to save the best trained model 34 | * @param updateMethod the update method (Learning Rate, ADAM, AdaGrad, ...) 35 | * @param contextDropout the dropout probability of the target context encodings (default 0.0) 36 | * @param headsDropout the dropout probability of the target latent heads encodings (default 0.0) 37 | * @param sentencePreprocessor the sentence preprocessor (e.g.
to perform morphological analysis) 38 | * @param verbose a Boolean indicating if the verbose mode is enabled (default = true) 39 | */ 40 | class LHRTransferLearning( 41 | private val referenceParser: LHRParser, 42 | private val targetParser: LHRParser, 43 | private val epochs: Int, 44 | validator: Validator?, 45 | modelFilename: String, 46 | private val updateMethod: UpdateMethod<*> = RADAMMethod(stepSize = 0.001, beta1 = 0.9, beta2 = 0.999), 47 | contextDropout: Double = 0.0, 48 | headsDropout: Double = 0.0, 49 | sentencePreprocessor: SentencePreprocessor = BasePreprocessor(), 50 | verbose: Boolean = true 51 | ) : Trainer( 52 | neuralParser = targetParser, 53 | batchSize = 1, 54 | epochs = epochs, 55 | validator = validator, 56 | modelFilename = modelFilename, 57 | minRelevantErrorsCountToUpdate = 1, 58 | sentencePreprocessor = sentencePreprocessor, 59 | verbose = verbose 60 | ) { 61 | 62 | /** 63 | * The [LSSEncoder] of the reference parser. 64 | */ 65 | private val referenceLSSEncoder: LSSEncoder = 66 | LSSEncoder(model = this.referenceParser.model.lssModel) 67 | 68 | /** 69 | * The [LSSEncoder] of the target parser. 70 | */ 71 | private val targetLSSEncoder: LSSEncoder = 72 | LSSEncoder(model = this.targetParser.model.lssModel, contextDropout = contextDropout, headsDropout = headsDropout) 73 | 74 | /** 75 | * The optimizer of the LSS encoder of the target parser. 76 | */ 77 | private val targetLSSEncoderOptimizer = ParamsOptimizer(this.updateMethod) 78 | 79 | /** 80 | * Train the [targetParser] with the given [sentence] (the [goldTree] is not used). 81 | * Transfer the knowledge acquired by the LSS encoder of a reference parser to that of the target parser.
82 | * The errors are calculated with the [MSECalculator], comparing the context vectors of the target encoder with those of the reference one. 83 | * @param sentence the input sentence 84 | * @param goldTree the gold tree of the sentence (not used) 85 | */ 86 | override fun trainSentence(sentence: ParsingSentence, goldTree: DependencyTree.Labeled) { 87 | 88 | this.beforeSentenceLearning() 89 | 90 | val targetLSS: LatentSyntacticStructure = this.targetLSSEncoder.forward(sentence) 91 | val refLSS: LatentSyntacticStructure = this.referenceLSSEncoder.forward(sentence) 92 | 93 | val contextErrors: List = MSECalculator().calculateErrors( 94 | outputSequence = targetLSS.contextVectors, 95 | outputGoldSequence = refLSS.contextVectors) 96 | 97 | this.targetLSSEncoder.backward(LSSEncoder.OutputErrors(size = sentence.tokens.size, contextVectors = contextErrors)) 98 | this.targetLSSEncoderOptimizer.accumulate((this.targetLSSEncoder.getParamsErrors())) 99 | } 100 | 101 | /** 102 | * Method to call before learning a new sentence. 103 | */ 104 | private fun beforeSentenceLearning() { 105 | if (this.updateMethod is ExampleScheduling) this.updateMethod.newExample() 106 | } 107 | 108 | /** 109 | * Update the model parameters. 110 | */ 111 | override fun update() { 112 | this.targetLSSEncoderOptimizer.update() 113 | } 114 | 115 | /** 116 | * @return the count of the relevant errors (always 1) 117 | */ 118 | override fun getRelevantErrorsCount(): Int = 1 119 | 120 | /** 121 | * @return a string representation of the configuration of this Trainer 122 | */ 123 | override fun toString(): String = """ 124 | %-33s : %s 125 | """.trimIndent().format( 126 | "Epochs", this.epochs 127 | ) 128 | } 129 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/CyclesFixer.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-present KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0.
If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | * -----------------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers 9 | 10 | import com.kotlinnlp.dependencytree.DependencyTree 11 | import com.kotlinnlp.lssencoder.decoder.ScoredArcs 12 | 13 | /** 14 | * Naive strategy to fix possible cycles in a [dependencyTree]. 15 | * 16 | * @param dependencyTree the dependency tree 17 | * @param scoredArcs the scored arcs between pairs of tree elements 18 | */ 19 | internal class CyclesFixer(private val dependencyTree: DependencyTree, private val scoredArcs: ScoredArcs) { 20 | 21 | /** 22 | * The set of direct elements of the tree (elements that aren't involved in cycles), calculated in [fixCycles] before fixing them. 23 | */ 24 | private lateinit var directElements: Set 25 | 26 | /** 27 | * Fix the cycles of the dependency tree. 28 | */ 29 | fun fixCycles() { 30 | 31 | val cycles: List = this.dependencyTree.getCycles() 32 | 33 | this.directElements = this.dependencyTree.elements.toSet() - cycles.toElementsSet() 34 | 35 | cycles.forEach { this.fixCycle(it) } 36 | } 37 | 38 | /** 39 | * @return the set of the dependents of the arcs of these paths 40 | */ 41 | private fun List.toElementsSet(): Set { 42 | 43 | val elements = mutableSetOf() 44 | this.forEach { path -> elements += path.arcs.map { it.dependent } } 45 | return elements 46 | } 47 | 48 | /** 49 | * Remove a [cycle] from the dependency tree, removing its lowest scoring arc and re-attaching the related dependent to its best governor. 50 | * 51 | * @param cycle a cycle of the dependency tree 52 | */ 53 | private fun fixCycle(cycle: DependencyTree.Path) { 54 | 55 | val dep: Int = this.removeLowestScoringArc(cycle.arcs) 56 | val (newGov: Int, score: Double) = this.findBestGovernor(dep) 57 | this.dependencyTree.setArc(dependent = dep, governor = newGov, score = score) 58 | } 59 | 60 | /** 61 | * Remove the lowest scoring arc and return the related dependent to be reattached.
62 | * 63 | * @param arcs a list of arcs 64 | * 65 | * @return the element to be reattached. 66 | */ 67 | private fun removeLowestScoringArc(arcs: List): Int { 68 | 69 | val arc: DependencyTree.Arc = this.getLowestScoringArc(arcs) 70 | this.dependencyTree.removeArc(dependent = arc.dependent, governor = arc.governor) 71 | return arc.dependent 72 | } 73 | 74 | /** 75 | * @param arcs a list of arcs (the `!!` assumes that it is not empty) 76 | * 77 | * @return the lowest scoring arc according to the [scoredArcs]. 78 | */ 79 | private fun getLowestScoringArc(arcs: List): DependencyTree.Arc = 80 | arcs.minBy { arc -> this.scoredArcs.getScore(dependentId = arc.dependent, governorId = arc.governor) }!! 81 | 82 | /** 83 | * Find the best governor for the given element that doesn't introduce a cycle. 84 | * The candidates are the direct elements among the possible heads of the element; the `!!` assumes that at least one of them is valid. 85 | * @param element an element of the dependency tree 86 | * 87 | * @return the new governor id and the related score 88 | */ 89 | private fun findBestGovernor(element: Int): Pair { 90 | 91 | val headsMap: Map = this.scoredArcs.getHeadsMap(element) 92 | 93 | val candidates: List = this.directElements.intersect(headsMap.keys).filter { candidateGov -> 94 | !this.dependencyTree.introduceCycle(dependent = element, governor = candidateGov) 95 | } 96 | 97 | return headsMap.filter { it.key in candidates }.maxBy { it.value }!!.toPair() 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/GreedyDependencyTreeBuilder.kt: -------------------------------------------------------------------------------- 1 | /* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved. 2 | * 3 | * This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* ------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers

import com.kotlinnlp.dependencytree.DependencyTree
import com.kotlinnlp.lssencoder.LatentSyntacticStructure
import com.kotlinnlp.lssencoder.decoder.ScoredArcs
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.language.ParsingToken
import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.Labeler

/**
 * A helper that builds the dependency tree with the highest scoring configurations.
 *
 * @param lss the latent syntactic structure of the input sentence
 * @param scoresMap the scored arcs from which the attachments are chosen
 * @param labeler the labeler used to annotate the tree (if null, an unlabeled tree is built)
 */
internal class GreedyDependencyTreeBuilder(
  private val lss: LatentSyntacticStructure<ParsingToken, ParsingSentence>,
  private val scoresMap: ScoredArcs,
  private val labeler: Labeler?
) {

  /**
   * Build a new dependency tree from the latent syntactic structure [lss], using the possible attachments in the
   * [scoresMap].
   *
   * @return the annotated dependency tree with the highest score, built from the given LSS
   */
  fun build(): DependencyTree {

    val elements: List<Int> = this.lss.sentence.tokens.map { it.id }
    val labeler: Labeler? = this.labeler // local copy, smart-cast to non-null in the labeled branch

    return if (labeler != null)
      DependencyTree.Labeled(elements).apply {
        assignHighestScoringHeads()
        fixCycles()
        assignLabels(labeler)
      }
    else
      DependencyTree.Unlabeled(elements).apply {
        assignHighestScoringHeads()
        fixCycles()
      }
  }

  /**
   * Assign the heads to this dependency tree using the highest scoring arcs of the [scoresMap].
   * The arcs are set allowing cycles, which must be fixed in a following step.
   *
   * @throws IllegalStateException if no head is available for an element that is not the top
   */
  private fun DependencyTree.assignHighestScoringHeads() {

    val (topId: Int, topScore: Double) = scoresMap.findHighestScoringTop()

    this.setAttachmentScore(dependent = topId, score = topScore)

    this.elements.filter { it != topId }.forEach { depId ->

      val (govId: Int, score: Double) = checkNotNull(
        scoresMap.findHighestScoringHead(dependentId = depId, except = listOf(ScoredArcs.rootId))
      ) { "No candidate head found for the element $depId." }

      this.setArc(
        dependent = depId,
        governor = govId,
        allowCycle = true,
        score = score)
    }
  }

  /**
   * Fix possible cycles using the [scoresMap].
   */
  private fun DependencyTree.fixCycles() = CyclesFixer(dependencyTree = this, scoredArcs = scoresMap).fixCycles()

  /**
   * Annotate this dependency tree with the labels, taking the first (highest scoring) configuration predicted for
   * each token.
   *
   * @param labeler the labeler that predicts the grammatical configurations
   */
  private fun DependencyTree.Labeled.assignLabels(labeler: Labeler) {

    labeler.predict(Labeler.Input(lss, this)).forEach { (tokenId, configurations) ->
      this.setGrammaticalConfiguration(dependent = tokenId, configuration = configurations.first().config)
    }
  }
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/helpers/keyextractors/PosTagKeyExtractor.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.helpers.keyextractors

import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.language.ParsingToken
import com.kotlinnlp.tokensencoder.embeddings.keyextractor.EmbeddingKeyExtractor

/**
 * An [EmbeddingKeyExtractor] by POS tag.
 */
object PosTagKeyExtractor : EmbeddingKeyExtractor<ParsingToken, ParsingSentence> {

  /**
   * Private val used to serialize the class (needed by Serializable).
   */
  @Suppress("unused")
  private const val serialVersionUID: Long = 1L

  /**
   * Note: the [tokenId] is used as index of the tokens list of the [sentence].
   *
   * @param sentence a generic sentence
   * @param tokenId the id of the token from which to extract the key
   *
   * @return the POS as string, or "_" if the token has no POS
   */
  override fun getKey(sentence: ParsingSentence, tokenId: Int): String =
    sentence.tokens[tokenId].pos?.toString() ?: "_"
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/PositionalEncoder.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules

import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
import com.kotlinnlp.simplednn.core.optimizer.ParamsErrorsList
import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkModel
import com.kotlinnlp.simplednn.deeplearning.attention.pointernetwork.PointerNetworkProcessor
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory

/**
 * The PositionalEncoder.
 *
 * It wraps a [PointerNetworkProcessor]: each element of the input sequence is forwarded through the pointer network
 * after the whole sequence has been set as its input sequence.
 *
 * @param model the model of the pointer network
 * @property id an identification number useful to track a specific encoder
 */
class PositionalEncoder(
  private val model: PointerNetworkModel,
  override val id: Int = 0
) : NeuralProcessor<
  List<DenseNDArray>, // InputType
  List<DenseNDArray>, // OutputType
  List<DenseNDArray>, // ErrorsType
  PointerNetworkProcessor.InputErrors // InputErrorsType
  > {

  companion object {

    /**
     * Calculate the errors of the predictions expecting that the i-th prediction points to the i-th position (the
     * gold of each prediction is the one-hot encoding of its own index).
     *
     * @param predictions the list of prediction
     *
     * @return the errors of the given predictions
     */
    fun calculateErrors(predictions: List<DenseNDArray>): List<DenseNDArray> =
      predictions.mapIndexed { index, prediction ->

        val expectedValues: DenseNDArray =
          DenseNDArrayFactory.oneHotEncoder(length = predictions.size, oneAt = index)

        SoftmaxCrossEntropyCalculator.calculateErrors(output = prediction, outputGold = expectedValues)
      }
  }

  /**
   * Propagate the errors to the input.
   */
  override val propagateToInput: Boolean = true

  /**
   * The pointer processor used as encoder.
   */
  private val encoder = PointerNetworkProcessor(this.model)

  /**
   * The Forward.
   *
   * @param input the input
   *
   * @return the result of the forward
   */
  override fun forward(input: List<DenseNDArray>): List<DenseNDArray> {

    // The input sequence must be set before forwarding its single elements.
    this.encoder.setInputSequence(input)

    return input.map { this.encoder.forward(it) }
  }

  /**
   * The Backward.
   *
   * @param outputErrors the errors of the last forward
   */
  override fun backward(outputErrors: List<DenseNDArray>) {
    this.encoder.backward(outputErrors)
  }

  /**
   * Return the input errors of the last backward.
   *
   * Note: the [copy] flag is not forwarded to the wrapped processor, whose `getInputErrors` is called without
   * arguments.
   *
   * @param copy whether to return by value or by reference (default true)
   *
   * @return the input errors
   */
  override fun getInputErrors(copy: Boolean): PointerNetworkProcessor.InputErrors = this.encoder.getInputErrors()

  /**
   * Return the params errors of the last backward.
   *
   * @param copy a Boolean indicating whether the returned errors must be a copy or a reference (default true)
   *
   * @return the parameters errors
   */
  override fun getParamsErrors(copy: Boolean): ParamsErrorsList = this.encoder.getParamsErrors(copy = copy)
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/Labeler.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler

import com.kotlinnlp.dependencytree.DependencyTree
import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
import com.kotlinnlp.lssencoder.LatentSyntacticStructure
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.ScoredGrammar
import com.kotlinnlp.simplednn.core.neuralprocessor.NeuralProcessor
import com.kotlinnlp.simplednn.core.neuralprocessor.batchfeedforward.BatchFeedforwardProcessor
import com.kotlinnlp.simplednn.simplemath.ndarray.Shape
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArrayFactory
import com.kotlinnlp.utils.notEmptyOr

/**
 * The Labeler.
 *
 * It scores the grammatical configurations of each token, encoding it with its own context vector and the one of
 * its head (or the virtual root, if the token has no head).
 *
 * @property model the model
 * @param dropout the dropout probability (default 0.0)
 * @property id an identification number useful to track a specific encoder
 */
class Labeler(
  val model: LabelerModel,
  dropout: Double = 0.0,
  override val id: Int = 0
) : NeuralProcessor<
  Labeler.Input, // InputType
  List<DenseNDArray>, // OutputType
  List<DenseNDArray>, // ErrorsType
  Labeler.InputErrors // InputErrorsType
  > {

  /**
   * The input of this labeler.
   *
   * @param lss the latent syntactic structure
   * @param dependencyTree the dependency tree
   */
  data class Input(val lss: LatentSyntacticStructure<*, *>, val dependencyTree: DependencyTree)

  /**
   * The input errors of this labeler.
   *
   * @param rootErrors the errors of the virtual root
   * @param contextErrors the errors of the context vectors
   */
  data class InputErrors(val rootErrors: DenseNDArray, val contextErrors: List<DenseNDArray>)

  /**
   * This encoder propagate the errors to the input.
   */
  override val propagateToInput: Boolean = true

  /**
   * The processor that classify the grammar of a token.
   */
  private val processor = BatchFeedforwardProcessor<DenseNDArray>(
    model = this.model.networkModel,
    dropout = dropout,
    propagateToInput = true)

  /**
   * The dependency tree of the last input, used during the training.
   * It is set at each [forward].
   */
  private lateinit var dependencyTree: DependencyTree

  /**
   * Score the possible grammatical configurations of each token of a given input.
   *
   * The configurations with a score lower than the threshold of the model are discarded. If no one reaches it, only
   * the first valid configuration is kept.
   *
   * @param input a [Labeler] input
   *
   * @return a map of valid grammatical configurations (sorted by descending score) associated to each token id
   */
  fun predict(input: Input): Map<Int, List<ScoredGrammar>> =
    this.forward(input)
      .asSequence()
      .map { it.toScoredGrammar() }
      .withIndex()
      .associate { (tokenIndex, configurations) ->

        val tokenId: Int = input.dependencyTree.elements[tokenIndex]

        val validConfigurations: List<ScoredGrammar> =
          (input.lss.sentence as ParsingSentence).getValidConfigurations(
            tokenIndex = tokenIndex,
            headIndex = input.dependencyTree.getHead(tokenId)?.let { input.dependencyTree.getPosition(it) },
            configurations = configurations)

        tokenId to validConfigurations
          .filter { it.score >= this.model.labelerScoreThreshold }
          .notEmptyOr { validConfigurations.subList(0, 1) }
      }

  /**
   * Return the network outcomes for each token.
   *
   * @param input a [Labeler] input
   *
   * @return the network outcomes for each token
   */
  override fun forward(input: Input): List<DenseNDArray> {

    // The tree must be set before extracting the features, which depend on the head of each token.
    this.dependencyTree = input.dependencyTree

    return this.processor.forward(
      input = input.lss.sentence.tokens
        .map { this.extractFeatures(tokenId = it.id, lss = input.lss) }
        .toTypedArray())
  }

  /**
   * Propagate the errors through the neural components of the labeler.
   *
   * @param outputErrors the list of errors
   */
  override fun backward(outputErrors: List<DenseNDArray>) {

    this.processor.backward(outputErrors)
  }

  /**
   * Return the errors of the context vectors and of the virtual root, accumulating the errors that each token
   * receives as dependent and as governor.
   *
   * Note: the returned arrays are always created anew, regardless of the [copy] flag.
   *
   * @param copy not used
   *
   * @return the input errors and the root errors
   */
  override fun getInputErrors(copy: Boolean): InputErrors {

    val inputErrors: List<List<DenseNDArray>> = this.processor.getInputsErrors(copy = false)

    val contextErrors: List<DenseNDArray> = List(size = inputErrors.size, init = {
      DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize))
    })

    val rootErrors: DenseNDArray = DenseNDArrayFactory.zeros(Shape(this.model.contextEncodingSize))

    inputErrors.forEachIndexed { tokenIndex, (depErrors, govErrors) ->

      val tokenId: Int = this.dependencyTree.elements[tokenIndex]
      val depVector: DenseNDArray = contextErrors[tokenIndex]

      // The governor errors go to the head of the token, or to the virtual root if it has no head.
      val govVector: DenseNDArray = this.dependencyTree.getHead(tokenId)?.let {
        contextErrors[this.dependencyTree.getPosition(it)]
      } ?: rootErrors

      depVector.assignSum(depErrors)
      govVector.assignSum(govErrors)
    }

    return InputErrors(rootErrors = rootErrors, contextErrors = contextErrors)
  }

  /**
   * @param copy a Boolean indicating whether the returned errors must be a copy or a reference
   *
   * @return the errors of the [Labeler] parameters
   */
  override fun getParamsErrors(copy: Boolean) = this.processor.getParamsErrors(copy = copy)

  /**
   * Transform the array resulting from the prediction into a list of [ScoredGrammar].
   *
   * @return a list of [ScoredGrammar], sorted by descending score
   */
  private fun DenseNDArray.toScoredGrammar(): List<ScoredGrammar> = (0 until this.length)
    .map { i -> ScoredGrammar(getGrammaticalConfiguration(i), score = this[i]) }
    .sortedByDescending { it.score }

  /**
   * @param index a prediction index
   *
   * @throws IllegalStateException if the dictionary of the model has no configuration with the given index
   *
   * @return the grammatical configuration with the given [index]
   */
  private fun getGrammaticalConfiguration(index: Int): GrammaticalConfiguration =
    checkNotNull(this.model.grammaticalConfigurations.getElement(index)) {
      "No grammatical configuration with index $index in the dictionary of the model."
    }

  /**
   * @param tokenId the id of a token of the input sentence
   * @param lss the latent syntactic structure of the input sentence
   *
   * @return the list of features that encode the given token (its context vector and the one of its head)
   */
  private fun extractFeatures(tokenId: Int, lss: LatentSyntacticStructure<*, *>): List<DenseNDArray> =
    listOf(
      lss.getContextVectorById(tokenId),
      this.dependencyTree.getHead(tokenId)?.let { lss.getContextVectorById(it) } ?: lss.virtualRoot
    )
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/LabelerModel.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler

import com.kotlinnlp.dependencytree.DependencyTree
import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration
import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterion
import com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils.LossCriterionType
import com.kotlinnlp.simplednn.core.functionalities.activations.Softmax
import com.kotlinnlp.simplednn.core.functionalities.activations.Tanh
import com.kotlinnlp.simplednn.core.layers.LayerInterface
import com.kotlinnlp.simplednn.core.layers.LayerType
import com.kotlinnlp.simplednn.core.layers.StackedLayersParameters
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray
import com.kotlinnlp.utils.DictionarySet
import java.io.Serializable

/**
 * The model of the [Labeler].
 *
 * @property contextEncodingSize the size of the token encoding vectors
 * @property grammaticalConfigurations the dictionary set of all the possible grammatical configurations
 * @property lossCriterionType the training mode
 */
class LabelerModel(
  val contextEncodingSize: Int,
  val grammaticalConfigurations: DictionarySet<GrammaticalConfiguration>,
  val lossCriterionType: LossCriterionType
) : Serializable {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * The score threshold above which to consider a labeler output valid.
   * It is the inverse of the number of grammatical configurations.
   * It makes sense with the Softmax activation function.
   */
  internal val labelerScoreThreshold: Double = 1.0 / this.grammaticalConfigurations.size

  /**
   * The Network model that predicts the grammatical configurations.
   * The output layer has no activation function when the loss criterion is the hinge loss.
   */
  val networkModel = StackedLayersParameters(
    LayerInterface(sizes = listOf(this.contextEncodingSize, this.contextEncodingSize)),
    LayerInterface(
      size = this.contextEncodingSize,
      connectionType = LayerType.Connection.Affine,
      activationFunction = Tanh),
    LayerInterface(
      type = LayerType.Input.Dense,
      size = this.grammaticalConfigurations.size,
      connectionType = LayerType.Connection.Feedforward,
      activationFunction = when (this.lossCriterionType) {
        LossCriterionType.Softmax -> Softmax()
        LossCriterionType.HingeLoss -> null
      })
  )

  /**
   * Return the errors of a given labeler predictions, respect to a gold dependency tree.
   * Errors are calculated comparing the last predictions done with the given gold grammatical configurations.
   *
   * @param predictions the current network predictions
   * @param goldTree the gold tree of the sentence
   *
   * @throws IllegalStateException if a gold configuration is not in the dictionary of the model
   *
   * @return a list of predictions errors
   */
  fun calculateLoss(predictions: List<DenseNDArray>, goldTree: DependencyTree.Labeled): List<DenseNDArray> {

    // The criterion depends only on the model, so it is built once for all the predictions.
    val lossCriterion: LossCriterion = LossCriterion(this.lossCriterionType)

    return predictions.mapIndexed { tokenIndex, prediction ->

      val tokenId: Int = goldTree.elements[tokenIndex]

      val goldIndex: Int = checkNotNull(this.grammaticalConfigurations.getId(goldTree.getConfiguration(tokenId))) {
        "The gold grammatical configuration of the token $tokenId is not in the dictionary of the model."
      }

      lossCriterion.getPredictionErrors(prediction = prediction, goldIndex = goldIndex)
    }
  }
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/HingeLoss.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0.
If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils

import com.kotlinnlp.simplednn.core.functionalities.losses.getErrorsByHingeLoss
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray

/**
 * A [LossCriterion] based on the hinge loss.
 */
class HingeLoss : LossCriterion {

  /**
   * Calculate the errors of a prediction with the hinge loss method.
   *
   * @param prediction a prediction array
   * @param goldIndex the index of the gold value
   *
   * @return the errors of the given prediction
   */
  override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
    return getErrorsByHingeLoss(prediction = prediction, goldIndex = goldIndex)
  }
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterion.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils

import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray

/**
 * A criterion that calculates the errors of a prediction of the labeler.
 */
interface LossCriterion {

  companion object {

    /**
     * Build the [LossCriterion] that corresponds to a given type.
     *
     * @param type the loss criterion type
     *
     * @return a new loss criterion of the given type
     */
    operator fun invoke(type: LossCriterionType): LossCriterion {
      return when (type) {
        LossCriterionType.Softmax -> Softmax()
        LossCriterionType.HingeLoss -> HingeLoss()
      }
    }
  }

  /**
   * Calculate the errors of a prediction respect to the index of its gold value.
   *
   * @param prediction a prediction array
   * @param goldIndex the index of the gold value
   *
   * @return the errors of the given prediction
   */
  fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/LossCriterionType.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils

/**
 * The available loss criterion.
*
 * @property Softmax calculate the errors with cross-entropy softmax
 * @property HingeLoss calculate the errors with the hinge loss method
 */
enum class LossCriterionType {
  Softmax,
  HingeLoss
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/ScoredGrammar.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils

import com.kotlinnlp.linguisticdescription.GrammaticalConfiguration

/**
 * A grammatical configuration together with the score assigned to it by the labeler.
 *
 * @property config the grammatical configuration
 * @property score the score
 */
data class ScoredGrammar(
  val config: GrammaticalConfiguration,
  val score: Double
)

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/neuralmodules/labeler/utils/Softmax.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.neuralmodules.labeler.utils

import com.kotlinnlp.simplednn.core.functionalities.losses.SoftmaxCrossEntropyCalculator
import com.kotlinnlp.simplednn.simplemath.ndarray.dense.DenseNDArray

/**
 * A [LossCriterion] based on the cross-entropy softmax.
 */
class Softmax : LossCriterion {

  /**
   * Calculate the errors of a prediction with the [SoftmaxCrossEntropyCalculator].
   *
   * @param prediction a prediction array
   * @param goldIndex the index of the gold value
   *
   * @return the errors of the given prediction
   */
  override fun getPredictionErrors(prediction: DenseNDArray, goldIndex: Int): DenseNDArray {
    return SoftmaxCrossEntropyCalculator.calculateErrors(output = prediction, goldIndex = goldIndex)
  }
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/BaseConverter.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters

import com.kotlinnlp.linguisticdescription.sentence.Sentence
import com.kotlinnlp.linguisticdescription.sentence.token.Token
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.language.ParsingToken
import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter

/**
 * The sentence converter from a [ParsingSentence] to a generic [Sentence].
 */
class BaseConverter : SentenceConverter<ParsingToken, ParsingSentence, Token, Sentence<Token>> {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a given [ParsingSentence] to a generic [Sentence] simply casting it.
   * The cast is unchecked: no new object is created.
   *
   * @param sentence the input sentence
   *
   * @return the converted sentence
   */
  @Suppress("UNCHECKED_CAST")
  override fun convert(sentence: ParsingSentence): Sentence<Token> = sentence as Sentence<Token>
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/FormConverter.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters

import com.kotlinnlp.linguisticdescription.sentence.Sentence
import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.language.ParsingToken
import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter

/**
 * The sentence converter from a [ParsingSentence] to a sentence of [FormToken].
 */
class FormConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, Sentence<FormToken>> {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a given [ParsingSentence] to a sentence of [FormToken] simply casting it.
   * The cast is unchecked: no new object is created.
   *
   * @param sentence the input sentence
   *
   * @return the converted sentence
   */
  @Suppress("UNCHECKED_CAST")
  override fun convert(sentence: ParsingSentence): Sentence<FormToken> = sentence as Sentence<FormToken>
}
--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/parsers/lhrparser/sentenceconverters/MorphoConverter.kt:
--------------------------------------------------------------------------------
/* Copyright 2018-present KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* -----------------------------------------------------------------------------*/

package com.kotlinnlp.neuralparser.parsers.lhrparser.sentenceconverters

import com.kotlinnlp.linguisticdescription.sentence.MorphoSentence
import com.kotlinnlp.linguisticdescription.sentence.token.FormToken
import com.kotlinnlp.neuralparser.language.ParsingSentence
import com.kotlinnlp.neuralparser.language.ParsingToken
import com.kotlinnlp.tokensencoder.wrapper.SentenceConverter

/**
 * The sentence converter from a [ParsingSentence] to a [MorphoSentence].
 */
class MorphoConverter : SentenceConverter<ParsingToken, ParsingSentence, FormToken, MorphoSentence<FormToken>> {

  companion object {

    /**
     * Private val used to serialize the class (needed by Serializable).
     */
    @Suppress("unused")
    private const val serialVersionUID: Long = 1L
  }

  /**
   * Convert a given [ParsingSentence] to a [MorphoSentence] simply casting it.
   * The cast is unchecked: no new object is created.
   *
   * @param sentence the input sentence
   *
   * @return the converted sentence
   */
  @Suppress("UNCHECKED_CAST")
  override fun convert(sentence: ParsingSentence): MorphoSentence<FormToken> = sentence as MorphoSentence<FormToken>
}

--------------------------------------------------------------------------------
/src/main/kotlin/com/kotlinnlp/neuralparser/utils/Extensions.kt:
--------------------------------------------------------------------------------
/* Copyright 2017-present The KotlinNLP Authors. All Rights Reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 | * ------------------------------------------------------------------*/ 7 | 8 | package com.kotlinnlp.neuralparser.utils 9 | 10 | import com.kotlinnlp.conllio.CoNLLReader 11 | import com.kotlinnlp.conllio.Sentence as CoNLLSentence 12 | import com.kotlinnlp.conllio.Sentence.InvalidTree 13 | import java.io.File 14 | 15 | /** 16 | * Load sentences from a CoNLL file, printing a loading message to the standard output first. 17 | * 18 | * @param type the string that describes the type of sentences (used only in the printed message; it is interpolated into the format pattern, so it must not contain '%') 19 | * @param filePath the file path 20 | * @param maxSentences the max number of sentences to load (null = unlimited) 21 | * @param skipNonProjective whether to skip non-projective sentences 22 | * 23 | * @return the list of loaded sentences 24 | */ 25 | fun loadSentences(type: String, 26 | filePath: String, 27 | maxSentences: Int?, 28 | skipNonProjective: Boolean): List { 29 | 30 | println("Loading $type sentences from '%s'%s%s...".format( 31 | filePath, 32 | maxSentences?.let { " (max $it)" } ?: "", 33 | if (skipNonProjective) " skipping non-projective" else "" 34 | )) 35 | 36 | return filePath.loadFromTreeBank(skipNonProjective = skipNonProjective, maxSentences = maxSentences) 37 | } 38 | 39 | /** 40 | * Return a list of CoNLL sentences from a tree-bank at this path. 41 | * 42 | * @param maxSentences the maximum number of sentences to load (null = unlimited); the limit is applied to the sentences read, so the ones skipped as non-projective still count towards it 43 | * @param skipNonProjective whether to skip non-projective sentences 44 | * @return the list of loaded sentences 45 | * @throws InvalidTree if the tree of a sentence is not valid (checked only for sentences with annotated heads that fall within the limit) 46 | */ 47 | private fun String.loadFromTreeBank(maxSentences: Int? 
= null, 48 | skipNonProjective: Boolean = false): List { 49 | 50 | var index = 0 51 | val sentences = ArrayList() 52 | 53 | CoNLLReader.forEachSentence(File(this)) { sentence -> 54 | 55 | if (maxSentences == null || index < maxSentences) { 56 | 57 | if (sentence.hasAnnotatedHeads()) sentence.assertValidCoNLLTree() 58 | 59 | val skip: Boolean = skipNonProjective && sentence.isNonProjective() 60 | 61 | if (!skip) sentences.add(sentence) 62 | } 63 | 64 | index++ /* counts every sentence passed to this callback, whether it was kept, skipped or beyond the limit */ 65 | } 66 | 67 | return sentences.toList() 68 | } 69 | 70 | --------------------------------------------------------------------------------