├── .clang-format ├── .gitignore ├── .travis.yml ├── AUTHORS ├── CITATION.cff ├── COPYING ├── ChangeLog ├── ChangeLog.old ├── Doxyfile ├── INSTALL ├── Makefile.am ├── NEWS ├── README.md ├── ZHfstOspeller.cc ├── ZHfstOspeller.h ├── ZHfstOspellerXmlMetadata.cc ├── ZHfstOspellerXmlMetadata.h ├── authors.xml ├── autogen.sh ├── configure.ac ├── doc └── index.html ├── edit2-small.png ├── hfst-ol.cc ├── hfst-ol.h ├── hfst-ospell-office.1 ├── hfst-ospell.1 ├── hfstol-stdafx.h ├── hfstospell.pc.in ├── m4 └── ax_check_compile_flag.m4 ├── main-cicling.cc ├── main-fsmnlp-2012.cc ├── main-ispell.cc ├── main-lrec2013.cc ├── main-norvig.cc ├── main-survey.cc ├── main.cc ├── office.cc ├── ol-exceptions.h ├── ospell.cc ├── ospell.h ├── predict.cc ├── test └── editdist.py ├── tests ├── acceptor.basic.txt ├── analyse-spell.sh ├── analyser.default.txt ├── bad-errormodel.sh ├── bad_errormodel.zhfst ├── basic-edit1.sh ├── basic-zhfst.sh ├── basic_test.xml ├── empty-descriptions.sh ├── empty-locale.sh ├── empty-titles.sh ├── empty-zhfst.sh ├── empty_descriptions.xml ├── empty_descriptions.zhfst ├── empty_locale.xml ├── empty_locale.zhfst ├── empty_titles.xml ├── empty_titles.zhfst ├── errmodel.basic.txt ├── errmodel.edit1.txt ├── errmodel.extrachars.txt ├── no-errormodel.sh ├── no_errmodel.xml ├── no_errormodel.zhfst ├── speller_analyser.zhfst ├── speller_basic.zhfst ├── speller_edit1.zhfst ├── test.strings ├── trailing-spaces.sh ├── trailing_spaces.xml └── trailing_spaces.zhfst ├── windows-Makefile.am └── windows-configure.ac /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: GNU 3 | --- 4 | Language: Cpp 5 | AccessModifierOffset: -2 6 | AlignAfterOpenBracket: Align 7 | AlignArrayOfStructures: None 8 | AlignConsecutiveMacros: None 9 | AlignConsecutiveAssignments: None 10 | AlignConsecutiveBitFields: None 11 | AlignConsecutiveDeclarations: None 12 | AlignEscapedNewlines: Right 13 | AlignOperands: Align 14 | 
AlignTrailingComments: true 15 | AllowAllArgumentsOnNextLine: true 16 | AllowAllParametersOfDeclarationOnNextLine: true 17 | AllowShortEnumsOnASingleLine: true 18 | AllowShortBlocksOnASingleLine: Never 19 | AllowShortCaseLabelsOnASingleLine: false 20 | AllowShortFunctionsOnASingleLine: All 21 | AllowShortLambdasOnASingleLine: All 22 | AllowShortIfStatementsOnASingleLine: Never 23 | AllowShortLoopsOnASingleLine: false 24 | AlwaysBreakAfterDefinitionReturnType: All 25 | AlwaysBreakAfterReturnType: AllDefinitions 26 | AlwaysBreakBeforeMultilineStrings: false 27 | AlwaysBreakTemplateDeclarations: MultiLine 28 | AttributeMacros: 29 | - __capability 30 | BinPackArguments: true 31 | BinPackParameters: true 32 | BraceWrapping: 33 | AfterCaseLabel: true 34 | AfterClass: true 35 | AfterControlStatement: Always 36 | AfterEnum: true 37 | AfterFunction: true 38 | AfterNamespace: true 39 | AfterObjCDeclaration: true 40 | AfterStruct: true 41 | AfterUnion: true 42 | AfterExternBlock: true 43 | BeforeCatch: true 44 | BeforeElse: true 45 | BeforeLambdaBody: false 46 | BeforeWhile: true 47 | IndentBraces: true 48 | SplitEmptyFunction: true 49 | SplitEmptyRecord: true 50 | SplitEmptyNamespace: true 51 | BreakBeforeBinaryOperators: All 52 | BreakBeforeConceptDeclarations: true 53 | BreakBeforeBraces: Allman 54 | BreakBeforeInheritanceComma: false 55 | BreakInheritanceList: BeforeColon 56 | BreakBeforeTernaryOperators: true 57 | BreakConstructorInitializersBeforeComma: false 58 | BreakConstructorInitializers: BeforeColon 59 | BreakAfterJavaFieldAnnotations: false 60 | BreakStringLiterals: true 61 | ColumnLimit: 79 62 | CommentPragmas: '^ IWYU pragma:' 63 | QualifierAlignment: Leave 64 | CompactNamespaces: false 65 | ConstructorInitializerIndentWidth: 4 66 | ContinuationIndentWidth: 4 67 | Cpp11BracedListStyle: false 68 | DeriveLineEnding: true 69 | DerivePointerAlignment: false 70 | DisableFormat: false 71 | EmptyLineAfterAccessModifier: Never 72 | EmptyLineBeforeAccessModifier: 
LogicalBlock 73 | ExperimentalAutoDetectBinPacking: false 74 | PackConstructorInitializers: BinPack 75 | BasedOnStyle: '' 76 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 77 | AllowAllConstructorInitializersOnNextLine: true 78 | FixNamespaceComments: false 79 | ForEachMacros: 80 | - foreach 81 | - Q_FOREACH 82 | - BOOST_FOREACH 83 | IfMacros: 84 | - KJ_IF_MAYBE 85 | IncludeBlocks: Preserve 86 | IncludeCategories: 87 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 88 | Priority: 2 89 | SortPriority: 0 90 | CaseSensitive: false 91 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 92 | Priority: 3 93 | SortPriority: 0 94 | CaseSensitive: false 95 | - Regex: '.*' 96 | Priority: 1 97 | SortPriority: 0 98 | CaseSensitive: false 99 | IncludeIsMainRegex: '(Test)?$' 100 | IncludeIsMainSourceRegex: '' 101 | IndentAccessModifiers: false 102 | IndentCaseLabels: false 103 | IndentCaseBlocks: false 104 | IndentGotoLabels: true 105 | IndentPPDirectives: None 106 | IndentExternBlock: AfterExternBlock 107 | IndentRequires: false 108 | IndentWidth: 4 109 | IndentWrappedFunctionNames: false 110 | InsertTrailingCommas: None 111 | JavaScriptQuotes: Leave 112 | JavaScriptWrapImports: true 113 | KeepEmptyLinesAtTheStartOfBlocks: true 114 | LambdaBodyIndentation: Signature 115 | MacroBlockBegin: '' 116 | MacroBlockEnd: '' 117 | MaxEmptyLinesToKeep: 1 118 | NamespaceIndentation: None 119 | ObjCBinPackProtocolList: Auto 120 | ObjCBlockIndentWidth: 2 121 | ObjCBreakBeforeNestedBlockParam: true 122 | ObjCSpaceAfterProperty: false 123 | ObjCSpaceBeforeProtocolList: true 124 | PenaltyBreakAssignment: 2 125 | PenaltyBreakBeforeFirstCallParameter: 19 126 | PenaltyBreakComment: 300 127 | PenaltyBreakFirstLessLess: 120 128 | PenaltyBreakOpenParenthesis: 0 129 | PenaltyBreakString: 1000 130 | PenaltyBreakTemplateDeclaration: 10 131 | PenaltyExcessCharacter: 1000000 132 | PenaltyReturnTypeOnItsOwnLine: 60 133 | PenaltyIndentedWhitespace: 0 134 | PointerAlignment: Right 135 | PPIndentWidth: -1 136 | 
ReferenceAlignment: Pointer 137 | ReflowComments: true 138 | RemoveBracesLLVM: false 139 | SeparateDefinitionBlocks: Leave 140 | ShortNamespaceLines: 1 141 | SortIncludes: CaseSensitive 142 | SortJavaStaticImport: Before 143 | SortUsingDeclarations: true 144 | SpaceAfterCStyleCast: false 145 | SpaceAfterLogicalNot: false 146 | SpaceAfterTemplateKeyword: true 147 | SpaceBeforeAssignmentOperators: true 148 | SpaceBeforeCaseColon: false 149 | SpaceBeforeCpp11BracedList: false 150 | SpaceBeforeCtorInitializerColon: true 151 | SpaceBeforeInheritanceColon: true 152 | SpaceBeforeParens: ControlStatements 153 | SpaceBeforeParensOptions: 154 | AfterControlStatements: false 155 | AfterForeachMacros: false 156 | AfterFunctionDefinitionName: false 157 | AfterFunctionDeclarationName: false 158 | AfterIfMacros: false 159 | AfterOverloadedOperator: false 160 | BeforeNonEmptyParentheses: false 161 | SpaceAroundPointerQualifiers: Default 162 | SpaceBeforeRangeBasedForLoopColon: true 163 | SpaceInEmptyBlock: false 164 | SpaceInEmptyParentheses: false 165 | SpacesBeforeTrailingComments: 1 166 | SpacesInAngles: Never 167 | SpacesInConditionalStatement: false 168 | SpacesInContainerLiterals: true 169 | SpacesInCStyleCastParentheses: false 170 | SpacesInLineCommentPrefix: 171 | Minimum: 1 172 | Maximum: -1 173 | SpacesInParentheses: false 174 | SpacesInSquareBrackets: false 175 | SpaceBeforeSquareBrackets: false 176 | BitFieldColonSpacing: Both 177 | Standard: c++03 178 | StatementAttributeLikeMacros: 179 | - Q_EMIT 180 | StatementMacros: 181 | - Q_UNUSED 182 | - QT_REQUIRE_VERSION 183 | TabWidth: 8 184 | UseCRLF: false 185 | UseTab: Never 186 | WhitespaceSensitiveMacros: 187 | - STRINGIZE 188 | - PP_STRINGIZE 189 | - BOOST_PP_STRINGIZE 190 | - NS_SWIFT_NAME 191 | - CF_SWIFT_NAME 192 | ... 
193 | 194 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .deps 2 | .libs 3 | autom4te.cache 4 | build-aux 5 | *.o 6 | *.lo 7 | Makefile 8 | Makefile.in 9 | config.* 10 | configure 11 | hfst-ospell 12 | hfst-ospell-office 13 | hfstospell.pc 14 | libhfstospell.la 15 | libtool 16 | stamp-h1 17 | aclocal.m4 18 | m4/libtool.m4 19 | m4/ltoptions.m4 20 | m4/ltsugar.m4 21 | m4/ltversion.m4 22 | m4/lt~obsolete.m4 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: cpp 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - libxml++2.6-dev 9 | - libarchive-dev 10 | 11 | script: 12 | - autoreconf -fi 13 | - ./configure --disable-static --enable-zhfst 14 | - make -j3 15 | - make -j1 check 16 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Authors of HFST ospell 2 | ---------------------- 3 | 4 | This lists authors relevant for copyright issues. See also THANKS. 5 | 6 | 2012-2017, Erik Axelson 7 | 2010-2016, Sam Hardwick 8 | 2015-2017, Tino Didriksen 9 | 2010-2016, Sjur Nørstebø Moshagen 10 | 2010-2016, Tommi Pirinen 11 | 2013, Francis M. 
Tyers 12 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | authors: 2 | - family-names: Pirinen 3 | given-names: Flammie A 4 | orcid: "https://orcid.org/0000-0003-1207-5395" 5 | - family-names: Hardwick 6 | given-names: Sam 7 | cff-version: 1.2.0 8 | date-released: "2022-03-13" 9 | keywords: 10 | - spell-checking 11 | - nlp 12 | message: If you use this software, please cite it using these metadata. 13 | repository-code: "https://github.com/hfst/hfst-ospell" 14 | title: HFST ospell 15 | version: 0.5.3 16 | preferred-citation: 17 | authors: 18 | - family-names: Pirinen 19 | given-names: Flammie A 20 | - family-names: Hardwick 21 | given-names: Sam 22 | - family-names: Lindén 23 | given-names: Krister 24 | title: "Effect of language and error models on efficiency of finite-state spell-checking and correction" 25 | type: article 26 | license: Apache-2.0 27 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, 5 | 2006 Free Software Foundation, Inc. 6 | 7 | This file is free documentation; the Free Software Foundation gives 8 | unlimited permission to copy, distribute and modify it. 9 | 10 | Basic Installation 11 | ================== 12 | 13 | Briefly, the shell commands `./configure; make; make install' should 14 | configure, build, and install this package. The following 15 | more-detailed instructions are generic; see the `README' file for 16 | instructions specific to this package. 17 | 18 | The `configure' shell script attempts to guess correct values for 19 | various system-dependent variables used during compilation. It uses 20 | those values to create a `Makefile' in each directory of the package. 21 | It may also create one or more `.h' files containing system-dependent 22 | definitions. 
Finally, it creates a shell script `config.status' that 23 | you can run in the future to recreate the current configuration, and a 24 | file `config.log' containing compiler output (useful mainly for 25 | debugging `configure'). 26 | 27 | It can also use an optional file (typically called `config.cache' 28 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 29 | the results of its tests to speed up reconfiguring. Caching is 30 | disabled by default to prevent problems with accidental use of stale 31 | cache files. 32 | 33 | If you need to do unusual things to compile the package, please try 34 | to figure out how `configure' could check whether to do them, and mail 35 | diffs or instructions to the address given in the `README' so they can 36 | be considered for the next release. If you are using the cache, and at 37 | some point `config.cache' contains results you don't want to keep, you 38 | may remove or edit it. 39 | 40 | The file `configure.ac' (or `configure.in') is used to create 41 | `configure' by a program called `autoconf'. You need `configure.ac' if 42 | you want to change it or regenerate `configure' using a newer version 43 | of `autoconf'. 44 | 45 | The simplest way to compile this package is: 46 | 47 | 1. `cd' to the directory containing the package's source code and type 48 | `./configure' to configure the package for your system. 49 | 50 | Running `configure' might take a while. While running, it prints 51 | some messages telling which features it is checking for. 52 | 53 | 2. Type `make' to compile the package. 54 | 55 | 3. Optionally, type `make check' to run any self-tests that come with 56 | the package. 57 | 58 | 4. Type `make install' to install the programs and any data files and 59 | documentation. 60 | 61 | 5. You can remove the program binaries and object files from the 62 | source code directory by typing `make clean'. 
To also remove the 63 | files that `configure' created (so you can compile the package for 64 | a different kind of computer), type `make distclean'. There is 65 | also a `make maintainer-clean' target, but that is intended mainly 66 | for the package's developers. If you use it, you may have to get 67 | all sorts of other programs in order to regenerate files that came 68 | with the distribution. 69 | 70 | Compilers and Options 71 | ===================== 72 | 73 | Some systems require unusual options for compilation or linking that the 74 | `configure' script does not know about. Run `./configure --help' for 75 | details on some of the pertinent environment variables. 76 | 77 | You can give `configure' initial values for configuration parameters 78 | by setting variables in the command line or in the environment. Here 79 | is an example: 80 | 81 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 82 | 83 | *Note Defining Variables::, for more details. 84 | 85 | Compiling For Multiple Architectures 86 | ==================================== 87 | 88 | You can compile the package for more than one kind of computer at the 89 | same time, by placing the object files for each architecture in their 90 | own directory. To do this, you can use GNU `make'. `cd' to the 91 | directory where you want the object files and executables to go and run 92 | the `configure' script. `configure' automatically checks for the 93 | source code in the directory that `configure' is in and in `..'. 94 | 95 | With a non-GNU `make', it is safer to compile the package for one 96 | architecture at a time in the source code directory. After you have 97 | installed the package for one architecture, use `make distclean' before 98 | reconfiguring for another architecture. 99 | 100 | Installation Names 101 | ================== 102 | 103 | By default, `make install' installs the package's commands under 104 | `/usr/local/bin', include files under `/usr/local/include', etc. 
You 105 | can specify an installation prefix other than `/usr/local' by giving 106 | `configure' the option `--prefix=PREFIX'. 107 | 108 | You can specify separate installation prefixes for 109 | architecture-specific files and architecture-independent files. If you 110 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 111 | PREFIX as the prefix for installing programs and libraries. 112 | Documentation and other data files still use the regular prefix. 113 | 114 | In addition, if you use an unusual directory layout you can give 115 | options like `--bindir=DIR' to specify different values for particular 116 | kinds of files. Run `configure --help' for a list of the directories 117 | you can set and what kinds of files go in them. 118 | 119 | If the package supports it, you can cause programs to be installed 120 | with an extra prefix or suffix on their names by giving `configure' the 121 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 122 | 123 | Optional Features 124 | ================= 125 | 126 | Some packages pay attention to `--enable-FEATURE' options to 127 | `configure', where FEATURE indicates an optional part of the package. 128 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 129 | is something like `gnu-as' or `x' (for the X Window System). The 130 | `README' should mention any `--enable-' and `--with-' options that the 131 | package recognizes. 132 | 133 | For packages that use the X Window System, `configure' can usually 134 | find the X include and library files automatically, but if it doesn't, 135 | you can use the `configure' options `--x-includes=DIR' and 136 | `--x-libraries=DIR' to specify their locations. 137 | 138 | Specifying the System Type 139 | ========================== 140 | 141 | There may be some features `configure' cannot figure out automatically, 142 | but needs to determine by the type of machine the package will run on. 
143 | Usually, assuming the package is built to be run on the _same_ 144 | architectures, `configure' can figure that out, but if it prints a 145 | message saying it cannot guess the machine type, give it the 146 | `--build=TYPE' option. TYPE can either be a short name for the system 147 | type, such as `sun4', or a canonical name which has the form: 148 | 149 | CPU-COMPANY-SYSTEM 150 | 151 | where SYSTEM can have one of these forms: 152 | 153 | OS KERNEL-OS 154 | 155 | See the file `config.sub' for the possible values of each field. If 156 | `config.sub' isn't included in this package, then this package doesn't 157 | need to know the machine type. 158 | 159 | If you are _building_ compiler tools for cross-compiling, you should 160 | use the option `--target=TYPE' to select the type of system they will 161 | produce code for. 162 | 163 | If you want to _use_ a cross compiler, that generates code for a 164 | platform different from the build platform, you should specify the 165 | "host" platform (i.e., that on which the generated programs will 166 | eventually be run) with `--host=TYPE'. 167 | 168 | Sharing Defaults 169 | ================ 170 | 171 | If you want to set default values for `configure' scripts to share, you 172 | can create a site shell script called `config.site' that gives default 173 | values for variables like `CC', `cache_file', and `prefix'. 174 | `configure' looks for `PREFIX/share/config.site' if it exists, then 175 | `PREFIX/etc/config.site' if it exists. Or, you can set the 176 | `CONFIG_SITE' environment variable to the location of the site script. 177 | A warning: not all `configure' scripts look for a site script. 178 | 179 | Defining Variables 180 | ================== 181 | 182 | Variables not defined in a site shell script can be set in the 183 | environment passed to `configure'. However, some packages may run 184 | configure again during the build, and the customized values of these 185 | variables may be lost. 
In order to avoid this problem, you should set 186 | them in the `configure' command line, using `VAR=value'. For example: 187 | 188 | ./configure CC=/usr/local2/bin/gcc 189 | 190 | causes the specified `gcc' to be used as the C compiler (unless it is 191 | overridden in the site shell script). 192 | 193 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 194 | an Autoconf bug. Until the bug is fixed you can use this workaround: 195 | 196 | CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash 197 | 198 | `configure' Invocation 199 | ====================== 200 | 201 | `configure' recognizes the following options to control how it operates. 202 | 203 | `--help' 204 | `-h' 205 | Print a summary of the options to `configure', and exit. 206 | 207 | `--version' 208 | `-V' 209 | Print the version of Autoconf used to generate the `configure' 210 | script, and exit. 211 | 212 | `--cache-file=FILE' 213 | Enable the cache: use and save the results of the tests in FILE, 214 | traditionally `config.cache'. FILE defaults to `/dev/null' to 215 | disable caching. 216 | 217 | `--config-cache' 218 | `-C' 219 | Alias for `--cache-file=config.cache'. 220 | 221 | `--quiet' 222 | `--silent' 223 | `-q' 224 | Do not print messages saying which checks are being made. To 225 | suppress all normal output, redirect it to `/dev/null' (any error 226 | messages will still be shown). 227 | 228 | `--srcdir=DIR' 229 | Look for the package's source code in directory DIR. Usually 230 | `configure' can determine that directory automatically. 231 | 232 | `configure' also accepts some other, not widely useful, options. Run 233 | `configure --help' for more details. 
234 | 235 | -------------------------------------------------------------------------------- /Makefile.am: --------------------------------------------------------------------------------
## Process this file with automake to produce Makefile.in

# Copyright 2010 University of Helsinki
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# to silence:
# libtoolize: Consider adding `-I m4' to ACLOCAL_AMFLAGS in Makefile.am.
ACLOCAL_AMFLAGS=-I m4

# targets
# Optional programs: each variable stays empty unless the matching configure
# conditional is enabled, so bin_PROGRAMS below only lists what is built.
if EXTRA_DEMOS
CONFERENCE_DEMOS=hfst-ospell-norvig hfst-ospell-fsmnlp-2012 hfst-ospell-cicling\
				 hfst-ospell-survey hfst-ospell-lrec2013 hfst-ispell
endif # EXTRA_DEMOS

if HFST_OSPELL_OFFICE
MAYBE_HFST_OSPELL_OFFICE=hfst-ospell-office
endif # HFST_OSPELL_OFFICE
if HFST_OSPELL_PREDICT
MAYBE_HFST_OSPELL_PREDICT=hfst-ospell-predict
endif # HFST_OSPELL_PREDICT

bin_PROGRAMS=hfst-ospell $(MAYBE_HFST_OSPELL_OFFICE) $(CONFERENCE_DEMOS) \
	$(MAYBE_HFST_OSPELL_PREDICT)
lib_LTLIBRARIES=libhfstospell.la
man1_MANS=hfst-ospell.1 hfst-ospell-office.1

# Flags of the optional third-party libraries, collected once and reused by
# the library and by every program below.
PKG_LIBS=
PKG_CXXFLAGS=

if WANT_ARCHIVE
PKG_LIBS+=$(LIBARCHIVE_LIBS)
PKG_CXXFLAGS+=$(LIBARCHIVE_CFLAGS)
endif

if WANT_LIBXMLPP
PKG_LIBS+=$(LIBXMLPP_LIBS)
PKG_CXXFLAGS+=$(LIBXMLPP_CFLAGS)
endif

if WANT_TINYXML2
PKG_LIBS+=$(TINYXML2_LIBS)
PKG_CXXFLAGS+=$(TINYXML2_CFLAGS)
endif

# library parts
libhfstospell_la_SOURCES=hfst-ol.cc ospell.cc \
	ZHfstOspeller.cc ZHfstOspellerXmlMetadata.cc
libhfstospell_la_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS)
libhfstospell_la_LDFLAGS=-no-undefined -version-info 11:0:0 \
	$(PKG_LIBS)

# link sample program against library here
hfst_ospell_SOURCES=main.cc
hfst_ospell_LDADD=libhfstospell.la
hfst_ospell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

if HFST_OSPELL_PREDICT
hfst_ospell_predict_SOURCES=predict.cc
hfst_ospell_predict_LDADD=libhfstospell.la
hfst_ospell_predict_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)
endif # HFST_OSPELL_PREDICT

if HFST_OSPELL_OFFICE

hfst_ospell_office_SOURCES=office.cc
hfst_ospell_office_LDADD=libhfstospell.la
hfst_ospell_office_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS)

endif # HFST_OSPELL_OFFICE

# All conference demos (including hfst-ispell) live in one conditional
# section; they were previously split over two adjacent `if EXTRA_DEMOS'.
if EXTRA_DEMOS

hfst_ospell_norvig_SOURCES=main-norvig.cc
hfst_ospell_norvig_LDADD=libhfstospell.la
hfst_ospell_norvig_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

hfst_ospell_cicling_SOURCES=main-cicling.cc
hfst_ospell_cicling_LDADD=libhfstospell.la
hfst_ospell_cicling_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

hfst_ospell_lrec2013_SOURCES=main-lrec2013.cc
hfst_ospell_lrec2013_LDADD=libhfstospell.la
hfst_ospell_lrec2013_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

hfst_ospell_survey_SOURCES=main-survey.cc
hfst_ospell_survey_LDADD=libhfstospell.la
hfst_ospell_survey_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

hfst_ospell_fsmnlp_2012_SOURCES=main-fsmnlp-2012.cc
hfst_ospell_fsmnlp_2012_LDADD=libhfstospell.la
hfst_ospell_fsmnlp_2012_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

hfst_ispell_SOURCES=main-ispell.cc
hfst_ispell_LDADD=libhfstospell.la
hfst_ispell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
	$(PKG_CXXFLAGS)

endif # EXTRA_DEMOS

# install headers for library in hfst's includedir
include_HEADERS=hfst-ol.h ospell.h ol-exceptions.h \
	ZHfstOspeller.h ZHfstOspellerXmlMetadata.h \
	hfstol-stdafx.h

# pkgconfig
pkgconfigdir=$(libdir)/pkgconfig
pkgconfig_DATA=hfstospell.pc

# tests
TESTS=tests/basic-zhfst.sh tests/basic-edit1.sh \
	tests/empty-descriptions.sh tests/empty-titles.sh tests/empty-locale.sh \
	tests/trailing-spaces.sh tests/bad-errormodel.sh tests/empty-zhfst.sh \
	tests/analyse-spell.sh tests/no-errormodel.sh
# Without libarchive no .zhfst file can be read, so every test is expected
# to fail; with it only the malformed-metadata tests are.
if WANT_ARCHIVE
XFAIL_TESTS=tests/empty-descriptions.sh tests/empty-titles.sh tests/empty-locale.sh tests/empty-zhfst.sh
else
XFAIL_TESTS=tests/empty-descriptions.sh tests/empty-titles.sh tests/empty-locale.sh tests/empty-zhfst.sh \
	tests/basic-zhfst.sh tests/basic-edit1.sh tests/trailing-spaces.sh tests/bad-errormodel.sh \
	tests/analyse-spell.sh tests/no-errormodel.sh
endif

if CAN_DOXYGEN
doxygen:
	$(DOXYGEN)
endif

# Test scripts are taken from $(TESTS) so that the two lists cannot drift
# apart (four scripts used to be listed twice here).
EXTRA_DIST=hfst-ospell.1 hfst-ospell-office.1 \
	$(TESTS) \
	tests/acceptor.basic.txt tests/analyser.default.txt tests/errmodel.basic.txt \
	tests/errmodel.edit1.txt tests/errmodel.extrachars.txt \
	tests/test.strings \
	tests/bad_errormodel.zhfst tests/empty_descriptions.zhfst tests/empty_locale.zhfst \
	tests/empty_titles.zhfst tests/no_errormodel.zhfst \
	tests/speller_analyser.zhfst tests/speller_basic.zhfst tests/speller_edit1.zhfst \
	tests/trailing_spaces.zhfst \
	tests/basic_test.xml tests/empty_descriptions.xml tests/empty_locale.xml \
	tests/empty_titles.xml tests/no_errmodel.xml tests/trailing_spaces.xml
159 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | NEWS for hfst-ospell 2 | ==================== 3 | 4 | This file contains all noteworthy changes in HFST-ospell development between 5 | releases. For full listing of changes see ChangeLog. 6 | 7 | Noteworthy changes in 0.5.1 8 | --------------------------- 9 | 10 | * change correction finding order (fixes issue #28) 11 | * require C++1y (C++14) 12 | * observe ICU namespacing (fixes issue #42) 13 | 14 | Noteworthy changes in 0.5.0 15 | --------------------------- 16 | 17 | * rename hfst_ol namespace to hfst_ospell to avoid conflicts 18 | * improve distinguishing between lemmas and tags in analysis 19 | * fix issue #37 20 | * avoid shadowing multicharacter ascii-beginning symbols 21 | * use minimal XML parsing to get locale, title, and description for Voikko and other frontends 22 | 23 | Noteworthy changes in 0.4.5 24 | --------------------------- 25 | 26 | * this is a bugfix release 27 | 28 | Noteworthy changes in 0.4.4 29 | --------------------------- 30 | 31 | * restructure order of files 32 | * remove HFST dependency 33 | * fix issue #26 34 | * check for failures in archive extraction 35 | * fix dll issues on windows 36 | * allow building without XML support 37 | 38 | Noteworthy changes in 0.4.3 39 | --------------------------- 40 | 41 | * fixes for big endian conversions 42 | * use max version for tinyxml 43 | 44 | Noteworthy changes in 0.4.2 45 | --------------------------- 46 | 47 | * small modifications to tests and documentation 48 | 49 | Noteworthy changes in 0.4.1 50 | --------------------------- 51 | 52 | * set time cutoff
to 6.0 seconds for ospell-office 53 | * minor bug fixes 54 | 55 | Noteworthy changes in 0.4.0 56 | --------------------------- 57 | 58 | * add option --beam to hfst-ospell for restricting the search to a margin above the optimum 59 | * add option --time-cutoff to hfst-ospell for restricting the time spent on searching for better corrections 60 | * add option --enable-hfst-ospell-office to configure (defaults to yes) 61 | * use libarchive2 if libarchive3 is not available 62 | * determine and use highest supported C++ standard when compiling 63 | * support unknown and identity arcs both in the lexicon and error model 64 | 65 | Noteworthy changes in 0.3.0 66 | --------------------------- 67 | 68 | * New API for analysing and suggesting 69 | * Moved code from headers to implementation files (API change) 70 | * Added Doxygen to mark stable API 71 | * Fixes for bad and malformed metadata handling 72 | * Limiting number of suggestions now works 73 | 74 | Noteworthy changes in 0.2.5 75 | --------------------------- 76 | 77 | * optional support for tinyxml2 78 | * preliminary support for two-tape automata and *analysis* lookup 79 | * conference demos are no longer built by default 80 | * libarchive newer than 3 allowed 81 | 82 | Noteworthy changes in 0.2.4 83 | --------------------------- 84 | 85 | * renamed the package hfstospell (from hfst-ospell), the previous rename caused 86 | build issues. 87 | 88 | Noteworthy changes in 0.2.3 89 | --------------------------- 90 | 91 | * fixed a bug that caused certain types of paths with flag diacritics not to 92 | be accepted. 
93 | 94 | Noteworthy changes in 0.2.2 95 | --------------------------- 96 | 97 | * Memory and speed improvements; data structures for automaton changed 98 | 99 | * Tests and bug fixes for building 100 | 101 | Noteworthy changes in 0.2.1 102 | --------------------------- 103 | 104 | * Added support for extracting zipped transducer collections to memory instead 105 | of temporary files 106 | 107 | * Changed from libxml to libxml++ for XML parsing 108 | 109 | Noteworthy changes in 0.2.0 110 | --------------------------- 111 | 112 | * Added support for zipped XML based transducer collection format. 113 | 114 | * Few new frontends for various experiments 115 | 116 | * Lots of metadata everywhere 117 | 118 | Noteworthy changes in 0.1.1 119 | --------------------------- 120 | 121 | * Added autoconfiscation to avoid bugs like missing Makefile in tarball 122 | 123 | Noteworthy changes in 0.1 124 | ------------------------- 125 | 126 | * First release 127 | 128 | 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hfst-ospell library and toy commandline tester 2 | 3 | This is a minimal hfst optimized lookup format based spell checker library and 4 | a demonstrational implementation of command line based spell checker. The 5 | library is licenced under Apache licence version 2, other licences can be 6 | obtained from University of Helsinki. 
7 | 8 | [![Build Status](https://travis-ci.org/hfst/hfst-ospell.svg?branch=master)](https://travis-ci.org/hfst/hfst-ospell) 9 | 10 | ## Dependencies 11 | 12 | - libxml++2 13 | - libarchive 14 | 15 | ## Debian packages for dependencies 16 | 17 | - libxml++2-dev 18 | - libarchive-dev 19 | 20 | ## Usage 21 | 22 | Usage in external programs: 23 | 24 | #include <ospell.h> 25 | 26 | and compile your project with: 27 | 28 | $(pkg-config --cflags hfstospell) 29 | 30 | and link with: 31 | 32 | $(pkg-config --libs hfstospell) 33 | 34 | ## Programming examples 35 | 36 | The library lives in a namespace called hfst_ospell. Pass (weighted!) Transducer 37 | pointers to the Speller constructor, e.g.: 38 | 39 | FILE * error_source = fopen(error_filename, "r"); 40 | FILE * lexicon_file = fopen(lexicon_filename, "r"); 41 | hfst_ospell::Transducer * error; 42 | hfst_ospell::Transducer * lexicon; 43 | try { 44 | error = new hfst_ospell::Transducer(error_source); 45 | lexicon = new hfst_ospell::Transducer(lexicon_file); 46 | } catch (hfst_ospell::TransducerParsingException& e) { 47 | /* problem with transducer file, usually completely 48 | different type of file - there's no magic number 49 | in the header to check for this */ 50 | } 51 | hfst_ospell::Speller * speller; 52 | try { 53 | speller = new hfst_ospell::Speller(error, lexicon); 54 | } catch (hfst_ospell::AlphabetTranslationException& e) { 55 | /* problem with translating between the two alphabets */ 56 | } 57 | 58 | 59 | And use the functions: 60 | 61 | // returns true if line is found in lexicon 62 | bool hfst_ospell::Speller::check(char * line); 63 | 64 | // CorrectionQueue is a priority queue, sorted by weight 65 | hfst_ospell::CorrectionQueue hfst_ospell::Speller::correct(char * line); 66 | 67 | 68 | to communicate with it. See main.cc for a concrete usage example. 
69 | 70 | ## Command-line tool 71 | 72 | Main.cc provides a demo utility with the following help message: 73 | 74 | Usage: hfst-ospell [OPTIONS] ERRORSOURCE LEXICON 75 | Run a composition of ERRORSOURCE and LEXICON on standard input and 76 | print corrected output 77 | 78 | -h, --help Print this help message 79 | -V, --version Print version information 80 | -v, --verbose Be verbose 81 | -q, --quiet Don't be verbose (default) 82 | -s, --silent Same as quiet 83 | 84 | 85 | Report bugs to hfst-bugs@ling.helsinki.fi 86 | 87 | # Use in real-world applications 88 | 89 | The HFST based spellers can be used in real applications with help of 90 | [voikko](http://voikko.sf.net). Voikko in turn can be used with enchant, 91 | libreoffice, and firefox. 92 | 93 | -------------------------------------------------------------------------------- /ZHfstOspeller.cc: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++ -*- */ 2 | // Copyright 2010 University of Helsinki 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #if HAVE_CONFIG_H 17 | # include 18 | #endif 19 | 20 | // C 21 | #if HAVE_LIBARCHIVE 22 | # include 23 | # include 24 | #endif 25 | // C++ 26 | #if HAVE_LIBXML 27 | # include 28 | #endif 29 | #include 30 | #include 31 | 32 | using std::string; 33 | using std::map; 34 | 35 | // local 36 | #include "ospell.h" 37 | #include "hfst-ol.h" 38 | #include "ZHfstOspeller.h" 39 | 40 | #ifdef WIN32 41 | #include 42 | #include 43 | #include 44 | #endif 45 | 46 | namespace hfst_ospell 47 | { 48 | 49 | #if HAVE_LIBARCHIVE 50 | inline std::string extract_to_mem(archive* ar, archive_entry* entry) { 51 | size_t full_length = 0; 52 | const struct stat* st = archive_entry_stat(entry); 53 | size_t buffsize = st->st_size; 54 | if (buffsize == 0) { 55 | std::cerr << archive_error_string(ar) << std::endl; 56 | throw ZHfstZipReadingError("Reading archive resulted in zero length entry"); 57 | } 58 | 59 | std::string buff(buffsize, 0); 60 | for (;;) { 61 | auto curr = archive_read_data(ar, &buff[0] + full_length, buffsize - full_length); 62 | if (0 == curr) { 63 | break; 64 | } 65 | else if (ARCHIVE_RETRY == curr) { 66 | continue; 67 | } 68 | else if (ARCHIVE_FAILED == curr) { 69 | throw ZHfstZipReadingError("Archive broken (ARCHIVE_FAILED)"); 70 | } 71 | else if (curr < 0) { 72 | throw ZHfstZipReadingError("Archive broken..."); 73 | } 74 | else { 75 | full_length += curr; 76 | } 77 | } 78 | 79 | if (full_length == 0) { 80 | std::cerr << archive_error_string(ar) << std::endl; 81 | throw ZHfstZipReadingError("Reading archive resulted in zero length"); 82 | } 83 | 84 | return buff; 85 | } 86 | 87 | inline Transducer* transducer_to_mem(archive* ar, archive_entry* entry) { 88 | std::string buff = extract_to_mem(ar, entry); 89 | Transducer *trans = new Transducer(&buff[0]); 90 | return trans; 91 | } 92 | 93 | inline char* extract_to_tmp_dir(archive* ar) { 94 | #ifdef WIN32 95 | char rv[MAX_PATH+1]; 96 | if (!GetTempPathA(MAX_PATH, rv)) { 97 | throw ZHfstZipReadingError("Could not get 
temporary path"); 98 | } 99 | strcat(rv, "zhfstospellXXXXXX"); 100 | mktemp(rv); 101 | int temp_fd = open(rv, _O_CREAT | _O_TRUNC | _O_RDWR); 102 | #else 103 | char rv[] = "/tmp/zhfstospellXXXXXXXX"; 104 | int temp_fd = mkstemp(rv); 105 | #endif 106 | int rr = archive_read_data_into_fd(ar, temp_fd); 107 | if ((rr != ARCHIVE_EOF) && (rr != ARCHIVE_OK)) { 108 | throw ZHfstZipReadingError("Archive not EOF'd or OK'd"); 109 | } 110 | close(temp_fd); 111 | return strdup(rv); 112 | } 113 | 114 | inline Transducer* transducer_to_tmp_dir(archive* ar) { 115 | char *filename = extract_to_tmp_dir(ar); 116 | FILE* f = fopen(filename, "rb"); 117 | free(filename); 118 | if (f == nullptr) { 119 | throw ZHfstTemporaryWritingError("reading acceptor back from temp file"); 120 | } 121 | return new Transducer(f); 122 | } 123 | 124 | #endif // HAVE_LIBARCHIVE 125 | 126 | ZHfstOspeller::ZHfstOspeller() : 127 | suggestions_maximum_(0), 128 | maximum_weight_(-1.0), 129 | beam_(-1.0), 130 | time_cutoff_(0.0), 131 | can_spell_(false), 132 | can_correct_(false), 133 | can_analyse_(true), 134 | current_speller_(0), 135 | current_sugger_(0) 136 | { 137 | } 138 | 139 | ZHfstOspeller::~ZHfstOspeller() 140 | { 141 | if ((current_speller_ != NULL) && (current_sugger_ != NULL)) 142 | { 143 | if (current_speller_ != current_sugger_) 144 | { 145 | delete current_speller_; 146 | delete current_sugger_; 147 | } 148 | else 149 | { 150 | delete current_speller_; 151 | } 152 | current_sugger_ = 0; 153 | current_speller_ = 0; 154 | } 155 | for (auto& acceptor : acceptors_) 156 | { 157 | delete acceptor.second; 158 | } 159 | for (auto& errmodel : errmodels_) 160 | { 161 | delete errmodel.second; 162 | } 163 | can_spell_ = false; 164 | can_correct_ = false; 165 | } 166 | 167 | void 168 | ZHfstOspeller::inject_speller(Speller * s) 169 | { 170 | current_speller_ = s; 171 | current_sugger_ = s; 172 | can_spell_ = true; 173 | can_correct_ = true; 174 | } 175 | 176 | void 177 | 
ZHfstOspeller::set_queue_limit(unsigned long limit) 178 | { 179 | suggestions_maximum_ = limit; 180 | } 181 | 182 | void 183 | ZHfstOspeller::set_weight_limit(Weight limit) 184 | { 185 | maximum_weight_ = limit; 186 | } 187 | 188 | void 189 | ZHfstOspeller::set_beam(Weight beam) 190 | { 191 | beam_ = beam; 192 | } 193 | 194 | void 195 | ZHfstOspeller::set_time_cutoff(float time_cutoff) 196 | { 197 | time_cutoff_ = time_cutoff; 198 | } 199 | 200 | bool 201 | ZHfstOspeller::spell(const string& wordform) 202 | { 203 | if (can_spell_ && (current_speller_ != 0)) 204 | { 205 | char* wf = strdup(wordform.c_str()); 206 | bool rv = current_speller_->check(wf); 207 | free(wf); 208 | return rv; 209 | } 210 | return false; 211 | } 212 | 213 | CorrectionQueue 214 | ZHfstOspeller::suggest(const string& wordform) 215 | { 216 | CorrectionQueue rv; 217 | if ((can_correct_) && (current_sugger_ != 0)) 218 | { 219 | char* wf = strdup(wordform.c_str()); 220 | rv = current_sugger_->correct(wf, 221 | suggestions_maximum_, 222 | maximum_weight_, 223 | beam_, 224 | time_cutoff_); 225 | free(wf); 226 | return rv; 227 | } 228 | return rv; 229 | } 230 | 231 | AnalysisQueue 232 | ZHfstOspeller::analyse(const string& wordform, bool ask_sugger) 233 | { 234 | AnalysisQueue rv; 235 | char* wf = strdup(wordform.c_str()); 236 | if ((can_analyse_) && (!ask_sugger) && (current_speller_ != 0)) 237 | { 238 | rv = current_speller_->analyse(wf); 239 | } 240 | else if ((can_analyse_) && (ask_sugger) && (current_sugger_ != 0)) 241 | { 242 | rv = current_sugger_->analyse(wf); 243 | } 244 | free(wf); 245 | return rv; 246 | } 247 | 248 | AnalysisSymbolsQueue 249 | ZHfstOspeller::analyseSymbols(const string& wordform, bool ask_sugger) 250 | { 251 | AnalysisSymbolsQueue rv; 252 | char* wf = strdup(wordform.c_str()); 253 | if ((can_analyse_) && (!ask_sugger) && (current_speller_ != 0)) 254 | { 255 | rv = current_speller_->analyseSymbols(wf); 256 | } 257 | else if ((can_analyse_) && (ask_sugger) && 
(current_sugger_ != 0)) 258 | { 259 | rv = current_sugger_->analyseSymbols(wf); 260 | } 261 | free(wf); 262 | return rv; 263 | } 264 | 265 | AnalysisCorrectionQueue 266 | ZHfstOspeller::suggest_analyses(const string& wordform) 267 | { 268 | AnalysisCorrectionQueue rv; 269 | // FIXME: should be atomic 270 | CorrectionQueue cq = suggest(wordform); 271 | while (cq.size() > 0) 272 | { 273 | AnalysisQueue aq = analyse(cq.top().first, true); 274 | while (aq.size() > 0) 275 | { 276 | StringPair sp(cq.top().first, aq.top().first); 277 | StringPairWeightPair spwp(sp, aq.top().second); 278 | rv.push(spwp); 279 | aq.pop(); 280 | } 281 | cq.pop(); 282 | } 283 | return rv; 284 | } 285 | 286 | void 287 | ZHfstOspeller::read_zhfst(const string& filename) 288 | { 289 | #if HAVE_LIBARCHIVE 290 | struct archive* ar = archive_read_new(); 291 | struct archive_entry* entry = 0; 292 | 293 | #if USE_LIBARCHIVE_2 294 | archive_read_support_compression_all(ar); 295 | #else 296 | archive_read_support_filter_all(ar); 297 | #endif // USE_LIBARCHIVE_2 298 | 299 | archive_read_support_format_all(ar); 300 | int rr = archive_read_open_filename(ar, filename.c_str(), 10240); 301 | if (rr != ARCHIVE_OK) 302 | { 303 | throw ZHfstZipReadingError("Archive not OK"); 304 | } 305 | for (int rr = archive_read_next_header(ar, &entry); 306 | rr != ARCHIVE_EOF; 307 | rr = archive_read_next_header(ar, &entry)) 308 | { 309 | if (rr != ARCHIVE_OK) 310 | { 311 | throw ZHfstZipReadingError("Archive not OK"); 312 | } 313 | char* filename = strdup(archive_entry_pathname(entry)); 314 | if (strncmp(filename, "acceptor.", strlen("acceptor.")) == 0) { 315 | Transducer* trans = nullptr; 316 | 317 | #if ZHFST_EXTRACT_TO_MEM == 1 318 | // Try to memory first... 319 | try { 320 | trans = transducer_to_mem(ar, entry); 321 | } 322 | catch (...) 
{ 323 | // If that failed, try to /tmp 324 | //std::cerr << "Failed to memory - falling back to /tmp" << std::endl; 325 | trans = transducer_to_tmp_dir(ar); 326 | } 327 | #else 328 | // Try to /tmp first... 329 | try { 330 | trans = transducer_to_tmp_dir(ar); 331 | } 332 | catch (...) { 333 | // If that failed, try to memory 334 | //std::cerr << "Failed to /tmp - falling back to memory" << std::endl; 335 | trans = transducer_to_mem(ar, entry); 336 | } 337 | #endif 338 | if (trans == nullptr) { 339 | throw ZHfstZipReadingError("Failed to extract acceptor"); 340 | } 341 | 342 | char* p = filename; 343 | p += strlen("acceptor."); 344 | size_t descr_len = 0; 345 | for (const char* q = p; *q != '\0'; q++) 346 | { 347 | if (*q == '.') 348 | { 349 | break; 350 | } 351 | else 352 | { 353 | descr_len++; 354 | } 355 | } 356 | 357 | char* descr = hfst_strndup(p, descr_len); 358 | acceptors_[descr] = trans; 359 | free(descr); 360 | } 361 | else if (strncmp(filename, "errmodel.", strlen("errmodel.")) == 0) { 362 | Transducer* trans = nullptr; 363 | 364 | #if ZHFST_EXTRACT_TO_MEM == 1 365 | // Try to memory first... 366 | try { 367 | trans = transducer_to_mem(ar, entry); 368 | } 369 | catch (...) { 370 | // If that failed, try to /tmp 371 | //std::cerr << "Failed to memory - falling back to /tmp" << std::endl; 372 | trans = transducer_to_tmp_dir(ar); 373 | } 374 | #else 375 | // Try to /tmp first... 376 | try { 377 | trans = transducer_to_tmp_dir(ar); 378 | } 379 | catch (...) 
{ 380 | // If that failed, try to memory 381 | //std::cerr << "Failed to /tmp - falling back to memory" << std::endl; 382 | trans = transducer_to_mem(ar, entry); 383 | } 384 | #endif 385 | if (trans == nullptr) { 386 | throw ZHfstZipReadingError("Failed to extract error model"); 387 | } 388 | 389 | const char* p = filename; 390 | p += strlen("errmodel."); 391 | size_t descr_len = 0; 392 | for (const char* q = p; *q != '\0'; q++) 393 | { 394 | if (*q == '.') 395 | { 396 | break; 397 | } 398 | else 399 | { 400 | descr_len++; 401 | } 402 | } 403 | 404 | char* descr = hfst_strndup(p, descr_len); 405 | errmodels_[descr] = trans; 406 | free(descr); 407 | } // if acceptor or errmodel 408 | else if (strcmp(filename, "index.xml") == 0) { 409 | // Always try to memory first, as index.xml is tiny 410 | try { 411 | std::string full_data = extract_to_mem(ar, entry); 412 | metadata_.read_xml(&full_data[0], full_data.size()); 413 | } 414 | catch (...) { 415 | char* temporary = extract_to_tmp_dir(ar); 416 | metadata_.read_xml(temporary); 417 | free(temporary); 418 | } 419 | } 420 | else 421 | { 422 | fprintf(stderr, "Unknown file in archive %s\n", filename); 423 | } 424 | free(filename); 425 | } // while r != ARCHIVE_EOF 426 | archive_read_close(ar); 427 | 428 | #if USE_LIBARCHIVE_2 429 | archive_read_finish(ar); 430 | #else 431 | archive_read_free(ar); 432 | #endif // USE_LIBARCHIVE_2 433 | 434 | if ((errmodels_.find("default") != errmodels_.end()) && 435 | (acceptors_.find("default") != acceptors_.end())) 436 | { 437 | current_speller_ = new Speller( 438 | errmodels_["default"], 439 | acceptors_["default"] 440 | ); 441 | current_sugger_ = current_speller_; 442 | can_spell_ = true; 443 | can_correct_ = true; 444 | } 445 | else if ((acceptors_.size() > 0) && (errmodels_.size() > 0)) 446 | { 447 | fprintf(stderr, "Could not find default speller, using %s %s\n", 448 | acceptors_.begin()->first.c_str(), 449 | errmodels_.begin()->first.c_str()); 450 | current_speller_ = new Speller( 
451 | errmodels_.begin()->second, 452 | acceptors_.begin()->second 453 | ); 454 | current_sugger_ = current_speller_; 455 | can_spell_ = true; 456 | can_correct_ = true; 457 | } 458 | else if ((acceptors_.size() > 0) && 459 | (acceptors_.find("default") != acceptors_.end())) 460 | { 461 | current_speller_ = new Speller(0, acceptors_["default"]); 462 | current_sugger_ = current_speller_; 463 | can_spell_ = true; 464 | can_correct_ = false; 465 | } 466 | else if (acceptors_.size() > 0) 467 | { 468 | current_speller_ = new Speller(0, acceptors_.begin()->second); 469 | current_sugger_ = current_speller_; 470 | can_spell_ = true; 471 | can_correct_ = false; 472 | } 473 | else 474 | { 475 | throw ZHfstZipReadingError("No automata found in zip"); 476 | } 477 | can_analyse_ = can_spell_ | can_correct_; 478 | #else 479 | throw ZHfstZipReadingError("Zip support was disabled"); 480 | #endif // HAVE_LIBARCHIVE 481 | } 482 | 483 | 484 | const ZHfstOspellerXmlMetadata& 485 | ZHfstOspeller::get_metadata() const 486 | { 487 | return metadata_; 488 | } 489 | 490 | string 491 | ZHfstOspeller::metadata_dump() const 492 | { 493 | return metadata_.debug_dump(); 494 | 495 | } 496 | } // namespace hfst_ospell 497 | -------------------------------------------------------------------------------- /ZHfstOspeller.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++ -*- */ 2 | // Copyright 2010 University of Helsinki 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | //! @mainpage API to HFST ospell WFST spell-checking 17 | //! 18 | //! The hfst-ospell API has several layers for different end-users. A suggested 19 | //! starting point for new user is the @c ZHfstOspeller object, which reads an 20 | //! automaton set from zipped hfst file with metadata and provides high level 21 | //! access to it with generic spell-checking, correction and analysis functions. 22 | //! Second level of access is the Speller object, which can be used to 23 | //! construct spell-checker with two automata and traverse it and query 24 | //! low-level properties. The Speller is constructed with two Transducer objects 25 | //! which are the low-level access point to the automata with all the gory 26 | //! details of transition tables and symbol translations, headers and such. 27 | 28 | #ifndef HFST_OSPELL_ZHFSTOSPELLER_H_ 29 | #define HFST_OSPELL_ZHFSTOSPELLER_H_ 30 | 31 | #include "hfstol-stdafx.h" 32 | 33 | #if HAVE_CONFIG_H 34 | # include 35 | #endif 36 | 37 | #include 38 | #include 39 | 40 | #include "ospell.h" 41 | #include "hfst-ol.h" 42 | #include "ZHfstOspellerXmlMetadata.h" 43 | 44 | namespace hfst_ospell 45 | { 46 | //! @brief ZHfstOspeller class holds one speller contained in one 47 | //! zhfst file. 48 | //! Ospeller can perform all basic writer tool functionality that 49 | //! is supporte by the automata in the zhfst archive. 50 | class ZHfstOspeller 51 | { 52 | public: 53 | //! @brief create speller with default values for undefined 54 | //! language. 55 | OSPELL_API ZHfstOspeller(); 56 | //! @brief destroy all automata used by the speller. 57 | OSPELL_API ~ZHfstOspeller(); 58 | 59 | //! @brief assign a speller-suggestor circumventing the ZHFST format 60 | OSPELL_API void inject_speller(Speller * s); 61 | //! @brief set upper limit to priority queue when performing 62 | // suggestions or analyses. 
63 | OSPELL_API void set_queue_limit(unsigned long limit); 64 | //! @brief set upper limit for weights 65 | OSPELL_API void set_weight_limit(Weight limit); 66 | //! @brief set search beam 67 | OSPELL_API void set_beam(Weight beam); 68 | //! @brief set time cutoff for correcting 69 | OSPELL_API void set_time_cutoff(float time_cutoff); 70 | //! @brief construct speller from named file containing valid 71 | //! zhfst archive. 72 | OSPELL_API void read_zhfst(const std::string& filename); 73 | 74 | //! @brief check if the given word is spelled correctly 75 | OSPELL_API bool spell(const std::string& wordform); 76 | //! @brief construct an ordered set of corrections for misspelled 77 | //! word form. 78 | OSPELL_API CorrectionQueue suggest(const std::string& wordform); 79 | //! @brief analyse word form morphologically 80 | //! @param wordform the string to analyse 81 | //! @param ask_sugger whether to use the spelling correction model 82 | // instead of the detection model 83 | AnalysisQueue analyse(const std::string& wordform, 84 | bool ask_sugger = false); 85 | //! @brief analyse word form morphologically, unconcatenated output 86 | //! strings (making it easier to find Multichar_symbols of 87 | //! the FST) 88 | //! @param wordform the string to analyse 89 | //! @param ask_sugger whether to use the spelling correction model 90 | // instead of the detection model 91 | AnalysisSymbolsQueue analyseSymbols(const std::string& wordform, 92 | bool ask_sugger = false); 93 | //! @brief construct an ordered set of corrections with analyses 94 | AnalysisCorrectionQueue suggest_analyses(const std::string& 95 | wordform); 96 | //! @brief hyphenate word form 97 | HyphenationQueue hyphenate(const std::string& wordform); 98 | 99 | //! @brief get access to metadata read from XML. 100 | const ZHfstOspellerXmlMetadata& get_metadata() const; 101 | //! @brief create string representation of the speller for 102 | //! 
programmer to debug 103 | std::string metadata_dump() const; 104 | private: 105 | //! @brief file or path where the speller came from 106 | std::string filename_; 107 | //! @brief upper bound for suggestions generated and given 108 | unsigned long suggestions_maximum_; 109 | //! @brief upper bound for suggestion weight generated and given 110 | Weight maximum_weight_; 111 | //! @brief upper bound for search beam around best candidate 112 | Weight beam_; 113 | //! @brief upper bound for search time in seconds 114 | float time_cutoff_; 115 | //! @brief whether automatons loaded yet can be used to check 116 | //! spelling 117 | bool can_spell_; 118 | //! @brief whether automatons loaded yet can be used to correct 119 | //! word forms 120 | bool can_correct_; 121 | //! @brief whether automatons loaded yet can be used to analyse 122 | //! word forms 123 | bool can_analyse_; 124 | //! @brief whether automatons loaded yet can be used to hyphenate 125 | //! word forms 126 | bool can_hyphenate_; 127 | //! @brief dictionaries loaded 128 | std::map acceptors_; 129 | //! @brief error models loaded 130 | std::map errmodels_; 131 | //! @brief pointer to current speller 132 | Speller* current_speller_; 133 | //! @brief pointer to current correction model 134 | Speller* current_sugger_; 135 | //! @brief pointer to current morphological analyser 136 | Speller* current_analyser_; 137 | //! @brief pointer to current hyphenator 138 | Transducer* current_hyphenator_; 139 | //! @brief the metadata of loaded speller 140 | ZHfstOspellerXmlMetadata metadata_; 141 | }; 142 | 143 | //! @brief Top-level exception for zhfst handling. 144 | 145 | //! Contains a human-readable error message that can be displayed to 146 | //! end-user as additional info when either solving exception or exiting. 147 | class ZHfstException : public std::runtime_error 148 | { 149 | public: 150 | ZHfstException() : std::runtime_error("unknown") {} 151 | //! @brief construct error with human readable message. 
152 | //! 153 | //! the message will be displayed when recovering or dying from 154 | //! exception 155 | explicit ZHfstException(const std::string& message) : std::runtime_error(message) {} 156 | }; 157 | 158 | //! @brief Generic error in metadata parsing. 159 | // 160 | //! Gets raised if metadata is erroneous or missing. 161 | class ZHfstMetaDataParsingError : public ZHfstException 162 | { 163 | public: 164 | explicit ZHfstMetaDataParsingError(const std::string& message) : ZHfstException(message) {} 165 | }; 166 | 167 | //! @brief Exception for XML parser errors. 168 | // 169 | //! Gets raised if underlying XML parser finds an error in XML data. 170 | //! Errors include non-valid XML, missing or erroneous attributes or 171 | //! elements, etc. 172 | class ZHfstXmlParsingError : public ZHfstException 173 | { 174 | public: 175 | explicit ZHfstXmlParsingError(const std::string& message) : ZHfstException(message) {} 176 | }; 177 | 178 | //! @brief Generic error while reading zip file. 179 | //! 180 | //! Happens when libarchive is unable to proceed reading zip file or 181 | //! zip file is missing required files. 182 | class ZHfstZipReadingError : public ZHfstException 183 | { 184 | public: 185 | explicit ZHfstZipReadingError(const std::string& message) : ZHfstException(message) {} 186 | }; 187 | 188 | //! @brief Error when writing to temporary location. 189 | // 190 | //! This exception gets thrown, when e.g., zip extraction is unable to 191 | //! find or open temporary file for writing. 
192 | class ZHfstTemporaryWritingError : public ZHfstException 193 | { 194 | public: 195 | explicit ZHfstTemporaryWritingError(const std::string& message) : ZHfstException(message) {} 196 | }; 197 | 198 | } // namespace hfst_ospell 199 | 200 | 201 | #endif // HFST_OSPELL_OSPELLER_SET_H_ 202 | // vim: set ft=cpp.doxygen: 203 | -------------------------------------------------------------------------------- /ZHfstOspellerXmlMetadata.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++ -*- */ 2 | // Copyright 2010 University of Helsinki 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | #ifndef HFST_OSPELL_ZHFSTOSPELLERXMLMETADATA_H_ 17 | #define HFST_OSPELL_ZHFSTOSPELLERXMLMETADATA_H_ 1 18 | 19 | #include "hfstol-stdafx.h" 20 | 21 | #if HAVE_CONFIG_H 22 | # include 23 | #endif 24 | 25 | #include 26 | 27 | using std::map; 28 | 29 | #if HAVE_LIBXML 30 | # include 31 | #elif HAVE_TINYXML2 32 | # include 33 | #endif 34 | 35 | #include "ospell.h" 36 | #include "hfst-ol.h" 37 | 38 | namespace hfst_ospell 39 | { 40 | //! @brief data type for associating set of translations to languages. 41 | typedef std::map LanguageVersions; 42 | 43 | 44 | //! @brief ZHfstOspellerInfo represents one info block of an zhfst file. 45 | //! @see https://victorio.uit.no/langtech/trunk/plan/proof/doc/lexfile-spec.xml 46 | struct ZHfstOspellerInfoMetadata 47 | { 48 | //! 
@brief active locale of speller in BCP format 49 | std::string locale_; 50 | //! @brief translation of titles to all languages 51 | LanguageVersions title_; 52 | //! @brief translation of descriptions to all languages 53 | LanguageVersions description_; 54 | //! @brief version definition as free form string 55 | std::string version_; 56 | //! @brief vcs revision as string 57 | std::string vcsrev_; 58 | //! @brief date for age of speller as string 59 | std::string date_; 60 | //! @brief producer of the speller 61 | std::string producer_; 62 | //! @brief email address of the speller 63 | std::string email_; 64 | //! @brief web address of the speller 65 | std::string website_; 66 | }; 67 | //! @brief Represents one acceptor block in XML metadata 68 | struct ZHfstOspellerAcceptorMetadata 69 | { 70 | //! @brief unique id of acceptor 71 | std::string id_; 72 | //! @brief descr part of acceptor 73 | std::string descr_; 74 | //! @brief type of dictionary 75 | std::string type_; 76 | //! @brief type of transducer 77 | std::string transtype_; 78 | //! @brief titles of dictionary in languages 79 | LanguageVersions title_; 80 | //! @brief descriptions of dictionary in languages 81 | LanguageVersions description_; 82 | }; 83 | //! @brief Represents one errmodel block in XML metadata 84 | struct ZHfstOspellerErrModelMetadata 85 | { 86 | //! @brief id of each error model in set 87 | std::string id_; 88 | //! @brief descr part of each id 89 | std::string descr_; 90 | //! @brief title of error models in languages 91 | LanguageVersions title_; 92 | //! @brief description of error models in languages 93 | LanguageVersions description_; 94 | //! @brief types of error models 95 | std::vector type_; 96 | //! @brief models 97 | std::vector model_; 98 | }; 99 | //! @brief holds one index.xml metadata for whole ospeller 100 | class ZHfstOspellerXmlMetadata 101 | { 102 | public: 103 | //! @brief construct metadata for undefined language and other default 104 | //!
values 105 | ZHfstOspellerXmlMetadata(); 106 | //! @brief read metadata from XML file by @a filename. 107 | void read_xml(const std::string& filename); 108 | //! @brief read XML from in memory @a data pointer with given @a length 109 | //! 110 | //! Depending on the XML library compiled in, the data length may 111 | //! be omitted and the buffer may be overflown. 112 | void read_xml(const char* data, size_t data_length); 113 | //! @brief create a programmer readable dump of XML metadata. 114 | //! 115 | //! shows linear serialisation of all header data in random order. 116 | std::string debug_dump() const; 117 | 118 | public: 119 | ZHfstOspellerInfoMetadata info_; //!< The info node data 120 | //! @brief data for acceptor nodes 121 | std::map acceptor_; 122 | //! @brief data for errmodel nodes 123 | std::vector errmodel_; 124 | #if HAVE_LIBXML 125 | private: 126 | void parse_xml(const xmlpp::Document* doc); 127 | void verify_hfstspeller(xmlpp::Node* hfstspellerNode); 128 | void parse_info(xmlpp::Node* infoNode); 129 | void parse_locale(xmlpp::Node* localeNode); 130 | void parse_title(xmlpp::Node* titleNode); 131 | void parse_description(xmlpp::Node* descriptionNode); 132 | void parse_version(xmlpp::Node* versionNode); 133 | void parse_date(xmlpp::Node* dateNode); 134 | void parse_producer(xmlpp::Node* producerNode); 135 | void parse_contact(xmlpp::Node* contactNode); 136 | void parse_acceptor(xmlpp::Node* acceptorNode); 137 | void parse_title(xmlpp::Node* titleNode, const std::string& accName); 138 | void parse_description(xmlpp::Node* descriptionNode, 139 | const std::string& accName); 140 | void parse_errmodel(xmlpp::Node* errmodelNode); 141 | void parse_title(xmlpp::Node* titleNode, size_t errm_count); 142 | void parse_description(xmlpp::Node* descriptionNode, size_t errm_count); 143 | void parse_type(xmlpp::Node* typeNode, size_t errm_count); 144 | void parse_model(xmlpp::Node* modelNode, size_t errm_count); 145 | #elif HAVE_TINYXML2 146 | private: 147 | void 
parse_xml(const tinyxml2::XMLDocument& doc); 148 | void verify_hfstspeller(const tinyxml2::XMLElement& hfstspellerNode); 149 | void parse_info(const tinyxml2::XMLElement& infoNode); 150 | void parse_locale(const tinyxml2::XMLElement& localeNode); 151 | void parse_title(const tinyxml2::XMLElement& titleNode); 152 | void parse_description(const tinyxml2::XMLElement& descriptionNode); 153 | void parse_version(const tinyxml2::XMLElement& versionNode); 154 | void parse_date(const tinyxml2::XMLElement& dateNode); 155 | void parse_producer(const tinyxml2::XMLElement& producerNode); 156 | void parse_contact(const tinyxml2::XMLElement& contactNode); 157 | void parse_acceptor(const tinyxml2::XMLElement& acceptorNode); 158 | void parse_title(const tinyxml2::XMLElement& titleNode, const std::string& accName); 159 | void parse_description(const tinyxml2::XMLElement& descriptionNode, 160 | const std::string& accName); 161 | void parse_errmodel(const tinyxml2::XMLElement& errmodelNode); 162 | void parse_title(const tinyxml2::XMLElement& titleNode, size_t errm_count); 163 | void parse_description(const tinyxml2::XMLElement& descriptionNode, size_t errm_count); 164 | void parse_type(const tinyxml2::XMLElement& typeNode, size_t errm_count); 165 | void parse_model(const tinyxml2::XMLElement& modelNode, size_t errm_count); 166 | 167 | #endif 168 | }; 169 | } 170 | 171 | #endif // inclusion GUARD 172 | // vim: set ft=cpp.doxygen: 173 | -------------------------------------------------------------------------------- /authors.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Tommi A Pirinen <tommi.pirinen@helsinki.fi> 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | ## Process this file with autoconf to produce configure script 2 | 3 | ## Copyright (C) 2010 University of Helsinki 4 | # 5 | # 
Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # autoconf requirements 19 | AC_PREREQ([2.62]) 20 | AC_INIT([hfstospell],[0.5.4],[hfst-bugs@helsinki.fi],[hfstospell],[http://hfst.github.io]) 21 | 22 | LT_PREREQ([2.2.6]) 23 | 24 | # init 25 | AC_CONFIG_AUX_DIR([build-aux]) 26 | AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign check-news color-tests silent-rules]) 27 | AM_SILENT_RULES([yes]) 28 | AC_REVISION([$Revision$]) 29 | AC_CONFIG_MACRO_DIR([m4]) 30 | AC_CONFIG_SRCDIR([ospell.cc]) 31 | AC_CONFIG_HEADERS([config.h]) 32 | 33 | # Information on package 34 | HFSTOSPELL_NAME=hfstospell 35 | HFSTOSPELL_MAJOR=0 36 | HFSTOSPELL_MINOR=5 37 | HFSTOSPELL_EXTENSION=.4 38 | HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION 39 | AC_SUBST(HFSTOSPELL_MAJOR) 40 | AC_SUBST(HFSTOSPELL_MINOR) 41 | AC_SUBST(HFSTOSPELL_VERSION) 42 | AC_SUBST(HFSTOSPELL_NAME) 43 | 44 | PKG_PROG_PKG_CONFIG 45 | AC_PATH_PROG([DOXYGEN], [doxygen], [false]) 46 | AM_CONDITIONAL([CAN_DOXYGEN], [test "x$DOXYGEN" != xfalse]) 47 | 48 | # Settings 49 | AC_ARG_ENABLE([extra_demos], 50 | [AS_HELP_STRING([--enable-extra-demos], 51 | [build conference demos for science reproduction @<:@default=no@:>@])], 52 | [enable_extra_demos=$enableval], [enable_extra_demos=no]) 53 | AM_CONDITIONAL([EXTRA_DEMOS], [test x$enable_extra_demos != xno]) 54 | AC_ARG_ENABLE([hfst_ospell_office], 55 | [AS_HELP_STRING([--enable-hfst-ospell-office], 56 | [build 
hfst-ospell-office @<:@default=yes@:>@])], 57 | [enable_hfst_ospell_office=$enableval], [enable_hfst_ospell_office=yes]) 58 | AM_CONDITIONAL([HFST_OSPELL_OFFICE], [test x$enable_hfst_ospell_office != xno]) 59 | AC_ARG_ENABLE([hfst_ospell_predict], 60 | [AS_HELP_STRING([--enable-hfst-ospell-predict], 61 | [build hfst-ospell-predict @<:@default=yes@:>@])], 62 | [enable_hfst_ospell_predict=$enableval], [enable_hfst_ospell_predict=yes]) 63 | AM_CONDITIONAL([HFST_OSPELL_PREDICT], [test x$enable_hfst_ospell_predict != xno]) 64 | AC_ARG_ENABLE([zhfst], 65 | [AS_HELP_STRING([--enable-zhfst], 66 | [support zipped complex automaton sets @<:@default=check@:>@])], 67 | [enable_zhfst=$enableval], [enable_zhfst=check]) 68 | AC_ARG_WITH([libxmlpp], 69 | [AS_HELP_STRING([--with-libxmlpp], 70 | [support xml metadata for zipped automaton sets with libxml++-2.6 @<:@default=yes@:>@])], 71 | [with_libxmlpp=$withval], [with_libxmlpp=yes]) 72 | AC_ARG_WITH([tinyxml2], 73 | [AS_HELP_STRING([--with-tinyxml2], 74 | [support xml metadata for zipped automaton sets with tinyxml2 @<:@default=no@:>@])], 75 | [with_tinyxml2=$withval], [with_tinyxml2=no]) 76 | 77 | AC_ARG_WITH([extract], 78 | [AS_HELP_STRING([--with-extract=TARGET], 79 | [extract zhfst archives to tmpdir or mem @<:@default=mem@:>@])], 80 | [with_extract=$withval], [with_extract=mem]) 81 | AS_IF([test "x$with_extract" = xmem], [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [1], 82 | [Define to extract zhfst archives to char buffer first])], 83 | [AS_IF([test "x$with_extract" = xtmpdir], 84 | [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [0], 85 | [Define to extract zhfst to tmp dir first])], 86 | [AC_MSG_ERROR([Use with-extract to mem or tmpdir])])]) 87 | 88 | # Checks for programs 89 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) 90 | AC_PROG_CC 91 | AC_PROG_CXX 92 | 93 | LT_INIT 94 | AC_PROG_INSTALL 95 | AC_PROG_LN_S 96 | AC_PROG_MAKE_SET 97 | 98 | # Checks for libraries 99 | AS_IF([test x$enable_zhfst != xno], 100 | [PKG_CHECK_MODULES([LIBARCHIVE], 
[libarchive > 3], 101 | [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives]) 102 | enable_zhfst=yes], 103 | [PKG_CHECK_MODULES([LIBARCHIVE], [libarchive > 2], 104 | [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives]) 105 | AC_DEFINE([USE_LIBARCHIVE_2], [1], [Use libarchive2]) 106 | enable_zhfst=yes], 107 | [AS_IF([test x$enable_zhfst != xcheck], 108 | [AC_MSG_ERROR([zhfst support requires either libarchive or libarchive2]) 109 | enable_zhfst=no], 110 | [enable_zhfst=no])])])]) 111 | 112 | AM_CONDITIONAL([WANT_ARCHIVE], [test x$enable_zhfst != xno]) 113 | AS_IF([test x$with_libxmlpp != xno], 114 | [PKG_CHECK_MODULES([LIBXMLPP], [libxml++-2.6 >= 2.10.0], 115 | [AC_DEFINE([HAVE_LIBXML], [1], [Use libxml++]) 116 | enable_xml=libxmlpp], 117 | [AC_MSG_ERROR([libxml++ failed]) 118 | enable_xml=no])]) 119 | AM_CONDITIONAL([WANT_LIBXMLPP], [test x$enable_xml = xlibxmlpp]) 120 | AS_IF([test x$with_tinyxml2 != xno -a x$with_libxmlpp = xno], 121 | [PKG_CHECK_MODULES([TINYXML2], [tinyxml2 >= 1.0.8 tinyxml2 < 3], 122 | [AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml]) 123 | enable_xml=tinyxml2], 124 | [AC_MSG_ERROR([tinyxml missing]) 125 | enable_xml=no])]) 126 | AM_CONDITIONAL([WANT_TINYXML2], [test x$enable_xml = xtinyxml2]) 127 | 128 | # Find ICU in the new and old way 129 | PKG_CHECK_MODULES(ICU, [icu-uc >= 4], [], [ 130 | AC_PATH_PROG([ICU_CONFIG], [icu-config], [false]) 131 | AS_IF([test x$ICU_CONFIG != xfalse], [ 132 | ICU_LIBS=$($ICU_CONFIG --ldflags) 133 | ]) 134 | ]) 135 | LIBS="$LIBS $ICU_LIBS" 136 | 137 | # Checks for header files 138 | AC_CHECK_HEADERS([getopt.h error.h]) 139 | 140 | # Checks for types 141 | AC_TYPE_SIZE_T 142 | 143 | # Checks for structures 144 | 145 | # Checks for compiler characteristics 146 | AC_C_BIGENDIAN 147 | 148 | # Checks for library functions 149 | AC_CHECK_FUNCS([strndup error]) 150 | # Checks for system services 151 | 152 | # Require highest supported C++ standard 153 | AC_LANG(C++) 154 | AX_CHECK_COMPILE_FLAG([-std=c++23], 
[CXXFLAGS="$CXXFLAGS -std=c++23"], [ 155 | AX_CHECK_COMPILE_FLAG([-std=c++2b], [CXXFLAGS="$CXXFLAGS -std=c++2b"], [ 156 | AX_CHECK_COMPILE_FLAG([-std=c++20], [CXXFLAGS="$CXXFLAGS -std=c++20"], [ 157 | AX_CHECK_COMPILE_FLAG([-std=c++2a], [CXXFLAGS="$CXXFLAGS -std=c++2a"], [ 158 | AX_CHECK_COMPILE_FLAG([-std=c++17], [CXXFLAGS="$CXXFLAGS -std=c++17"], [ 159 | AX_CHECK_COMPILE_FLAG([-std=c++1z], [CXXFLAGS="$CXXFLAGS -std=c++1z"], [ 160 | AC_MSG_ERROR([Could not enable at least C++1z (C++17) - upgrade your compiler]) 161 | ]) 162 | ]) 163 | ]) 164 | ]) 165 | ]) 166 | ]) 167 | 168 | # config files 169 | AC_CONFIG_FILES([Makefile hfstospell.pc]) 170 | 171 | # output 172 | AC_OUTPUT 173 | 174 | cat < 2 | 3 | 4 | 5 | HFST ospell–Free WFST spell-checker and library 6 | 7 | 8 |

[edit distance 2 automaton] 10 | HFST ospell

11 |

12 | HFST ospell is a free open source spell-checker using weighted 13 | finite-state automata. It is a light-weight library for using 14 | combinations of two automata–a language model and an error model–for 15 | spell-checking and correction. 16 |

17 |

18 | It has optional support for XML-based metadata using libxml++2 or 19 | tinyxml2. Automata compression is supported through libarchive, 20 | currently with zip format. 21 |

22 |

23 | The API of the library is stable to support updating the shared library 24 | while keeping the automata and the plugins for enchant and LibreOffice 25 | in place. The API documentation is maintained with 26 | doxygen. 27 |

28 |

29 | You can download the library and small demo applications from 30 | HFST’s main SourceForge site. 31 |

32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /edit2-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/edit2-small.png -------------------------------------------------------------------------------- /hfst-ol.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++ -*- */ 2 | // Copyright 2010 University of Helsinki 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | 16 | /* 17 | * This file contains some classes, typedefs and constant common to all 18 | * hfst-optimized-lookup stuff. This is just to get them out of the way 19 | * of the actual ospell code. 
20 | */ 21 | 22 | #ifndef HFST_OSPELL_HFST_OL_H_ 23 | #define HFST_OSPELL_HFST_OL_H_ 24 | 25 | #include "hfstol-stdafx.h" 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "ol-exceptions.h" 37 | 38 | namespace hfst_ospell { 39 | 40 | typedef uint16_t SymbolNumber; 41 | typedef uint32_t TransitionTableIndex; 42 | typedef std::vector SymbolVector; 43 | typedef std::vector KeyTable; 44 | typedef std::map StringSymbolMap; 45 | typedef short ValueNumber; 46 | typedef float Weight; 47 | 48 | // Forward declarations to typedef some more containers 49 | class TransitionIndex; 50 | class Transition; 51 | class FlagDiacriticOperation; 52 | 53 | typedef std::vector TransitionIndexVector; 54 | typedef std::vector TransitionVector; 55 | 56 | typedef std::map OperationMap; 57 | 58 | const SymbolNumber NO_SYMBOL = USHRT_MAX; 59 | const TransitionTableIndex NO_TABLE_INDEX = UINT_MAX; 60 | const Weight INFINITE_WEIGHT = static_cast(NO_TABLE_INDEX); 61 | const unsigned int MAX_SYMBOL_BYTES = 1000; 62 | 63 | // This is 2^31, hopefully equal to UINT_MAX/2 rounded up. 64 | // For some profound reason it can't be replaced with (UINT_MAX+1)/2. 
65 | const TransitionTableIndex TARGET_TABLE = 2147483648u; 66 | 67 | // the flag diacritic operators as given in 68 | // Beesley & Karttunen, Finite State Morphology (U of C Press 2003) 69 | enum FlagDiacriticOperator {P, N, R, D, C, U}; 70 | 71 | enum HeaderFlag {Weighted, Deterministic, Input_deterministic, Minimized, 72 | Cyclic, Has_epsilon_epsilon_transitions, 73 | Has_input_epsilon_transitions, Has_input_epsilon_cycles, 74 | Has_unweighted_input_epsilon_cycles}; 75 | 76 | // Will probably turn into a compile-time constant 77 | bool is_big_endian(void); 78 | uint16_t read_uint16_flipping_endianness(FILE * f); 79 | uint16_t read_uint16_flipping_endianness(char * raw); 80 | uint32_t read_uint32_flipping_endianness(char * raw); 81 | float read_float_flipping_endianness(FILE * f); 82 | 83 | // Utility function for dealing with raw memory 84 | void skip_c_string(char ** raw); 85 | 86 | //! Internal class for Transducer processing. 87 | 88 | //! Contains low-level processing stuff. 89 | class TransducerHeader 90 | { 91 | private: 92 | SymbolNumber number_of_symbols; 93 | SymbolNumber number_of_input_symbols; 94 | TransitionTableIndex size_of_transition_index_table; 95 | TransitionTableIndex size_of_transition_target_table; 96 | 97 | TransitionTableIndex number_of_states; 98 | TransitionTableIndex number_of_transitions; 99 | 100 | bool weighted; 101 | bool deterministic; 102 | bool input_deterministic; 103 | bool minimized; 104 | bool cyclic; 105 | bool has_epsilon_epsilon_transitions; 106 | bool has_input_epsilon_transitions; 107 | bool has_input_epsilon_cycles; 108 | bool has_unweighted_input_epsilon_cycles; 109 | void read_property(bool &property, FILE * f); 110 | void read_property(bool &property, char ** raw); 111 | void skip_hfst3_header(FILE * f); 112 | void skip_hfst3_header(char ** f); 113 | 114 | public: 115 | //! 116 | //! @brief read header from file @a f 117 | TransducerHeader(FILE * f); 118 | 119 | //! 120 | //! 
read header from raw memory data @a raw 121 | TransducerHeader(char ** raw); 122 | //! 123 | //! count symbols 124 | SymbolNumber symbol_count(void); 125 | //! 126 | //! count input symbols 127 | SymbolNumber input_symbol_count(void); 128 | //! 129 | //! index table size 130 | TransitionTableIndex index_table_size(void); 131 | //! 132 | //! target table size 133 | TransitionTableIndex target_table_size(void); 134 | //! 135 | //! check for flag 136 | bool probe_flag(HeaderFlag flag); 137 | }; 138 | 139 | //! Internal class for flag diacritic processing. 140 | 141 | //! Contains low-level processing stuff. 142 | class FlagDiacriticOperation 143 | { 144 | private: 145 | const FlagDiacriticOperator operation; 146 | const SymbolNumber feature; 147 | const ValueNumber value; 148 | public: 149 | //! 150 | //! Construct flag diacritic of the form \@ @a op . @a feat . @a val \@. 151 | FlagDiacriticOperation(const FlagDiacriticOperator op, 152 | const SymbolNumber feat, 153 | const ValueNumber val): 154 | operation(op), feature(feat), value(val) {} 155 | 156 | // dummy constructor 157 | FlagDiacriticOperation(): 158 | operation(P), feature(NO_SYMBOL), value(0) {} 159 | 160 | //! 161 | //! check if flag 162 | bool isFlag(void) const; 163 | //! 164 | //! Operation something I don't understand really. 165 | FlagDiacriticOperator Operation(void) const; 166 | //! 167 | //! No clue 168 | SymbolNumber Feature(void) const; 169 | //! 170 | //! Not a slightest idea 171 | ValueNumber Value(void) const; 172 | 173 | }; 174 | 175 | //! Internal class for alphabet processing. 176 | 177 | //! Contains low-level processing stuff.
178 | class TransducerAlphabet 179 | { 180 | private: 181 | KeyTable kt; 182 | OperationMap operations; 183 | SymbolNumber unknown_symbol; 184 | SymbolNumber identity_symbol; 185 | SymbolNumber flag_state_size; 186 | SymbolNumber orig_symbol_count; 187 | StringSymbolMap string_to_symbol; 188 | void process_symbol(char * line); 189 | 190 | void read(FILE * f, SymbolNumber number_of_symbols); 191 | void read(char ** raw, SymbolNumber number_of_symbols); 192 | 193 | public: 194 | //! 195 | //! read alphabets from file @a f 196 | TransducerAlphabet(FILE *f, SymbolNumber number_of_symbols); 197 | //! 198 | //! read alphabets from raw data @a raw 199 | TransducerAlphabet(char ** raw, SymbolNumber number_of_symbols); 200 | 201 | void add_symbol(std::string & sym); 202 | void add_symbol(char * sym); 203 | //! 204 | //! get alphabet's keytable mapping 205 | KeyTable * get_key_table(void); 206 | //! 207 | //! get flag operation map stuff 208 | OperationMap * get_operation_map(void); 209 | //! 210 | //! get state's size 211 | SymbolNumber get_state_size(void); 212 | //! 213 | //! get position of unknown symbol 214 | SymbolNumber get_unknown(void) const; 215 | SymbolNumber get_identity(void) const; 216 | //! get orig symbol count 217 | SymbolNumber get_orig_symbol_count(void) const; 218 | //! 219 | //! get mapping from strings to symbols 220 | StringSymbolMap * get_string_to_symbol(void); 221 | bool has_string(std::string const & s) const; 222 | //! 223 | //! get if given symbol is a flag 224 | bool is_flag(SymbolNumber symbol); 225 | }; 226 | 227 | class LetterTrie; 228 | typedef std::vector LetterTrieVector; 229 | 230 | //! Internal class for alphabet processing. 231 | 232 | //! Contains low-level processing stuff. 233 | class LetterTrie 234 | { 235 | private: 236 | LetterTrieVector letters; 237 | SymbolVector symbols; 238 | 239 | public: 240 | LetterTrie(void): 241 | letters(UCHAR_MAX+1, static_cast(NULL)), 242 | symbols(UCHAR_MAX+1,NO_SYMBOL) 243 | {} 244 | //! 245 | //!
add a string to alphabets with a key 246 | void add_string(const char * p,SymbolNumber symbol_key); 247 | //! 248 | //! find a key for string or add it 249 | SymbolNumber find_key(char ** p); 250 | bool has_key_starting_with(const char c) const; 251 | ~LetterTrie(); 252 | }; 253 | 254 | //! Internal class for alphabet processing. 255 | 256 | //! Contains low-level processing stuff. 257 | class Encoder { 258 | 259 | private: 260 | LetterTrie letters; 261 | SymbolVector ascii_symbols; 262 | 263 | void read_input_symbols(KeyTable * kt, SymbolNumber number_of_input_symbols); 264 | 265 | public: 266 | //! 267 | //! create encoder from keytable 268 | Encoder(KeyTable * kt, SymbolNumber number_of_input_symbols); 269 | SymbolNumber find_key(char ** p); 270 | void read_input_symbol(const char * s, const int s_num); 271 | void read_input_symbol(std::string const & s, const int s_num); 272 | }; 273 | 274 | typedef std::vector FlagDiacriticState; 275 | 276 | //! Internal class for transition data. 277 | 278 | //! Contains low-level processing stuff. 279 | class TransitionIndex 280 | { 281 | protected: 282 | SymbolNumber input_symbol; //!< transition's input symbol 283 | TransitionTableIndex first_transition_index; //!< first transition location 284 | 285 | public: 286 | 287 | //! 288 | //! Each TransitionIndex has an input symbol and a target index. 289 | static const size_t SIZE = 290 | sizeof(SymbolNumber) + sizeof(TransitionTableIndex); 291 | 292 | //! 293 | //! Create transition index for symbol 294 | TransitionIndex(const SymbolNumber input, 295 | const TransitionTableIndex first_transition): 296 | input_symbol(input), 297 | first_transition_index(first_transition) 298 | {} 299 | //! 300 | //! return target of transition 301 | TransitionTableIndex target(void) const; 302 | //! 303 | //! whether it's final state 304 | bool final(void) const; 305 | //! 306 | //! retrieve final weight 307 | Weight final_weight(void) const; 308 | //! 309 | //! 
symbol number for transition's input 310 | SymbolNumber get_input(void) const; 311 | }; 312 | 313 | //! Internal class for transition processing. 314 | 315 | //! Contains low-level processing stuff. 316 | class Transition 317 | { 318 | protected: 319 | SymbolNumber input_symbol; //!< input symbol 320 | SymbolNumber output_symbol; //!< output symbol 321 | TransitionTableIndex target_index; //!< location of target of transition 322 | Weight transition_weight; //!< transition's weight 323 | 324 | public: 325 | 326 | //! Each transition has an input symbol, an output symbol, 327 | //! a target index and a weight. 328 | static const size_t SIZE = 329 | 2 * sizeof(SymbolNumber) + sizeof(TransitionTableIndex) + sizeof(Weight); 330 | 331 | //! 332 | //! Create transition with input, output, target and weight. 333 | Transition(const SymbolNumber input, 334 | const SymbolNumber output, 335 | const TransitionTableIndex target, 336 | const Weight w): 337 | input_symbol(input), 338 | output_symbol(output), 339 | target_index(target), 340 | transition_weight(w) 341 | {} 342 | 343 | Transition(): 344 | input_symbol(NO_SYMBOL), 345 | output_symbol(NO_SYMBOL), 346 | target_index(NO_TABLE_INDEX), 347 | transition_weight(INFINITE_WEIGHT) 348 | {} 349 | 350 | //! 351 | //! get transition's target 352 | TransitionTableIndex target(void) const; 353 | //! 354 | //! get output symbol 355 | SymbolNumber get_output(void) const; 356 | //! 357 | //! get input symbol 358 | SymbolNumber get_input(void) const; 359 | //! 360 | //! get transition weight 361 | Weight get_weight(void) const; 362 | //! 363 | //! whether transition is final 364 | bool final(void) const; 365 | }; 366 | 367 | //! Internal class for Transducer processing. 368 | 369 | //! Contains low-level processing stuff.
370 | class IndexTable 371 | { 372 | private: 373 | char * indices; 374 | TransitionTableIndex size; 375 | void read(FILE * f, 376 | TransitionTableIndex number_of_table_entries); 377 | void read(char ** raw, 378 | TransitionTableIndex number_of_table_entries); 379 | void convert_to_big_endian(void); 380 | 381 | public: 382 | //! 383 | //! read index table from file @a f. 384 | IndexTable(FILE * f, 385 | TransitionTableIndex number_of_table_entries); 386 | //! 387 | //! read index table from raw data @a raw. 388 | IndexTable(char ** raw, 389 | TransitionTableIndex number_of_table_entries); 390 | ~IndexTable(void); 391 | //! 392 | //! input symbol for the index 393 | SymbolNumber input_symbol(TransitionTableIndex i) const; 394 | //! 395 | //! target state location for the index 396 | TransitionTableIndex target(TransitionTableIndex i) const; 397 | //! 398 | //! whether it's final transition 399 | bool final(TransitionTableIndex i) const; 400 | //! 401 | //! transition's weight 402 | Weight final_weight(TransitionTableIndex i) const; 403 | }; 404 | 405 | //! Internal class for transition processing. 406 | 407 | //! Contains low-level processing stuff. 408 | class TransitionTable 409 | { 410 | protected: 411 | //! 412 | //! raw transition data 413 | char * transitions; 414 | TransitionTableIndex size; 415 | 416 | //! 417 | //! read known amount of transitions from file @a f 418 | void read(FILE * f, 419 | TransitionTableIndex number_of_table_entries); 420 | //! read known amount of transitions from raw data @a raw 421 | void read(char ** raw, 422 | TransitionTableIndex number_of_table_entries); 423 | void convert_to_big_endian(void); 424 | public: 425 | //! 426 | //! read transition table from file @a f 427 | TransitionTable(FILE * f, 428 | TransitionTableIndex transition_count); 429 | //! 430 | //! read transition table from raw data @a raw 431 | TransitionTable(char ** raw, 432 | TransitionTableIndex transition_count); 433 | 434 | ~TransitionTable(void); 435 | //!
436 | //! transition's input symbol 437 | SymbolNumber input_symbol(TransitionTableIndex i) const; 438 | //! 439 | //! transition's output symbol 440 | SymbolNumber output_symbol(TransitionTableIndex i) const; 441 | //! 442 | //! target node location 443 | TransitionTableIndex target(TransitionTableIndex i) const; 444 | //! 445 | //! weight of transition 446 | Weight weight(TransitionTableIndex i) const; 447 | //! 448 | //! whether it's final 449 | bool final(TransitionTableIndex i) const; 450 | 451 | 452 | }; 453 | 454 | template 455 | void debug_print(printable p) 456 | { 457 | if (0) { 458 | std::cerr << p; 459 | } 460 | } 461 | 462 | } // namespace hfst_ospell 463 | 464 | #endif // HFST_OSPELL_HFST_OL_H_ 465 | -------------------------------------------------------------------------------- /hfst-ospell-office.1: -------------------------------------------------------------------------------- 1 | .TH HFST-OSPELL-OFFICE "1" "March 2018" "hfst-ospell-office " "User Commands" 2 | .SH NAME 3 | hfst-ospell-office \- Spell checker tool based on HFST 4 | .SH SYNOPSIS 5 | .B hfst-ospell-office 6 | [\fIOPTIONS\fR] \fIZHFST-ARCHIVE\fR 7 | .SH DESCRIPTION 8 | Use automata in ZHFST\-ARCHIVE or from OPTIONS to check and correct 9 | .SH OPTIONS 10 | .TP 11 | \fB\-\-verbatim\fR 12 | Check the input as-is without any transformations 13 | .SH "REPORTING BUGS" 14 | Report bugs to mail@tinodidriksen.com and/or hfst\-bugs@helsinki.fi 15 | .PP 16 | hfstospell 0.5.0 17 | .br 18 | Mar 05 2018 13:47:00 19 | .br 20 | Copyright (C) 2009 \- 2018 University of Helsinki 21 | -------------------------------------------------------------------------------- /hfst-ospell.1: -------------------------------------------------------------------------------- 1 | .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.40.4.
2 | .TH HFST-OSPELL "1" "March 2018" "hfst-ospell " "User Commands" 3 | .SH NAME 4 | hfst-ospell \- Spell checker tool based on HFST 5 | .SH SYNOPSIS 6 | .B hfst-ospell 7 | [\fIOPTIONS\fR] [\fIZHFST-ARCHIVE\fR] 8 | .SH DESCRIPTION 9 | Use automata in ZHFST\-ARCHIVE or from OPTIONS to check and correct 10 | .TP 11 | \fB\-h\fR, \fB\-\-help\fR 12 | Print this help message 13 | .TP 14 | \fB\-V\fR, \fB\-\-version\fR 15 | Print version information 16 | .TP 17 | \fB\-v\fR, \fB\-\-verbose\fR 18 | Be verbose 19 | .TP 20 | \fB\-q\fR, \fB\-\-quiet\fR 21 | Don't be verbose (default) 22 | .TP 23 | \fB\-s\fR, \fB\-\-silent\fR 24 | Same as quiet 25 | .TP 26 | \fB\-a\fR, \fB\-\-analyse\fR 27 | Analyse strings and corrections 28 | .TP 29 | \fB\-n\fR, \fB\-\-limit\fR=\fIN\fR 30 | Show at most N suggestions 31 | .TP 32 | \fB\-w\fR, \fB\-\-max\-weight\fR=\fIW\fR 33 | Suppress corrections with weights above W 34 | .TP 35 | \fB\-b\fR, \fB\-\-beam\fR=\fIW\fR 36 | Suppress corrections worse than best candidate by more than W 37 | .TP 38 | \fB\-t\fR, \fB\-\-time\-cutoff\fR=\fIT\fR 39 | Stop trying to find better corrections after T seconds (T is a float) 40 | .TP 41 | \fB\-S\fR, \fB\-\-suggest\fR 42 | Suggest corrections to misspellings 43 | .TP 44 | \fB\-X\fR, \fB\-\-real\-word\fR 45 | Also suggest corrections to correct words 46 | .TP 47 | \fB\-m\fR, \fB\-\-error\-model\fR 48 | Use this error model (must also give lexicon as option) 49 | .TP 50 | \fB\-l\fR, \fB\-\-lexicon\fR 51 | Use this lexicon (must also give error model as option) 52 | .SH "REPORTING BUGS" 53 | Report bugs to hfst\-bugs@helsinki.fi 54 | .PP 55 | hfstospell 0.5.0 56 | copyright (C) 2009 \- 2018 University of Helsinki 57 | .SH "SEE ALSO" 58 | The full documentation for 59 | .B hfst-ospell 60 | is maintained as a Texinfo manual. 
If the 61 | .B info 62 | and 63 | .B hfst-ospell 64 | programs are properly installed at your site, the command 65 | .IP 66 | .B info hfst-ospell 67 | .PP 68 | should give you access to the complete manual. 69 | -------------------------------------------------------------------------------- /hfstol-stdafx.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C++ -*- */ 2 | // Copyright 2010 University of Helsinki 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #ifndef HFST_OSPELL_STDAFX_H_ 17 | #define HFST_OSPELL_STDAFX_H_ 18 | 19 | #ifdef _MSC_VER 20 | // warning C4512: assignment operator could not be generated 21 | #pragma warning (disable: 4512) 22 | // warning C4456: declaration hides previous local declaration 23 | #pragma warning (disable: 4456) 24 | // warning C4458: declaration hides class member 25 | #pragma warning (disable: 4458) 26 | // warning C4996: POSIX names deprecated 27 | #pragma warning (disable: 4996) 28 | #endif 29 | 30 | #ifdef _WIN32 31 | #ifdef LIBHFSTOSPELL_EXPORTS 32 | #define OSPELL_API __declspec(dllexport) 33 | #else 34 | #define OSPELL_API __declspec(dllimport) 35 | #endif 36 | #else 37 | #ifdef LIBHFSTOSPELL_EXPORTS 38 | #define OSPELL_API __attribute__ ((visibility ("default"))) 39 | #else 40 | #define OSPELL_API 41 | #endif 42 | #endif 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /hfstospell.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | 6 | Name: hfstospell 7 | Description: Finite-state transducer based spell checker library 8 | Version: @HFSTOSPELL_VERSION@ 9 | Libs: -L${libdir} -l@HFSTOSPELL_NAME@ 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /m4/ax_check_compile_flag.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check whether the given FLAG works with the current language's 
compiler 12 | # or gives an error. (Warnings, however, are ignored) 13 | # 14 | # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on 15 | # success/failure. 16 | # 17 | # If EXTRA-FLAGS is defined, it is added to the current language's default 18 | # flags (e.g. CFLAGS) when the check is done. The check is thus made with 19 | # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to 20 | # force the compiler to issue an error when a bad flag is given. 21 | # 22 | # INPUT gives an alternative input source to AC_COMPILE_IFELSE. 23 | # 24 | # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this 25 | # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. 26 | # 27 | # LICENSE 28 | # 29 | # Copyright (c) 2008 Guido U. Draheim 30 | # Copyright (c) 2011 Maarten Bosmans 31 | # 32 | # This program is free software: you can redistribute it and/or modify it 33 | # under the terms of the GNU General Public License as published by the 34 | # Free Software Foundation, either version 3 of the License, or (at your 35 | # option) any later version. 36 | # 37 | # This program is distributed in the hope that it will be useful, but 38 | # WITHOUT ANY WARRANTY; without even the implied warranty of 39 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 40 | # Public License for more details. 41 | # 42 | # You should have received a copy of the GNU General Public License along 43 | # with this program. If not, see <http://www.gnu.org/licenses/>. 44 | # 45 | # As a special exception, the respective Autoconf Macro's copyright owner 46 | # gives unlimited permission to copy, distribute and modify the configure 47 | # scripts that are the output of Autoconf when processing the Macro. You 48 | # need not follow the terms of the GNU General Public License when using 49 | # or distributing such scripts, even though portions of the text of the 50 | # Macro appear in them. 
The GNU General Public License (GPL) does govern 51 | # all other use of the material that constitutes the Autoconf Macro. 52 | # 53 | # This special exception to the GPL applies to versions of the Autoconf 54 | # Macro released by the Autoconf Archive. When you make and distribute a 55 | # modified version of the Autoconf Macro, you may extend this special 56 | # exception to the GPL to apply to your modified version as well. 57 | 58 | #serial 3 59 | 60 | AC_DEFUN([AX_CHECK_COMPILE_FLAG], 61 | [AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX 62 | AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl 63 | AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ 64 | ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS 65 | _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" 66 | AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], 67 | [AS_VAR_SET(CACHEVAR,[yes])], 68 | [AS_VAR_SET(CACHEVAR,[no])]) 69 | _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) 70 | AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], 71 | [m4_default([$2], :)], 72 | [m4_default([$3], :)]) 73 | AS_VAR_POPDEF([CACHEVAR])dnl 74 | ])dnl AX_CHECK_COMPILE_FLAGS 75 | -------------------------------------------------------------------------------- /main-cicling.cc: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright 2009 University of Helsinki 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | 17 | */ 18 | 19 | /* 20 | This is a toy commandline utility for testing spellers on standard io. 21 | */ 22 | 23 | #include "ospell.h" 24 | #include 25 | #include 26 | #include 27 | 28 | #define PACKAGE_NAME "hfst-ospell" 29 | #define PACKAGE_STRING "hfst-ospell 0.1" 30 | #define PACKAGE_BUGREPORT "hfst-bugs@ling.helsinki.fi" 31 | 32 | bool print_usage(void) 33 | { 34 | std::cerr << 35 | "\n" << 36 | "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" << 37 | "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" << 38 | "print corrected output\n" << 39 | "\n" << 40 | " -h, --help Print this help message\n" << 41 | " -V, --version Print version information\n" << 42 | " -v, --verbose Be verbose\n" << 43 | " -q, --quiet Don't be verbose (default)\n" << 44 | " -s, --silent Same as quiet\n" << 45 | "\n" << 46 | "\n" << 47 | "Report bugs to " << PACKAGE_BUGREPORT << "\n" << 48 | "\n"; 49 | return true; 50 | } 51 | 52 | bool print_version(void) 53 | { 54 | std::cerr << 55 | "\n" << 56 | PACKAGE_STRING << std::endl << 57 | "copyright (C) 2009 University of Helsinki\n"; 58 | return true; 59 | } 60 | 61 | bool print_short_help(void) 62 | { 63 | print_usage(); 64 | return true; 65 | } 66 | 67 | int main(int argc, char **argv) 68 | { 69 | 70 | FILE * mutator_file = NULL; 71 | FILE * lexicon_file = NULL; 72 | 73 | int c; 74 | bool verbose = false; 75 | 76 | while (true) 77 | { 78 | static struct option long_options[] = 79 | { 80 | // first the hfst-mandated options 81 | {"help", no_argument, 0, 'h'}, 82 | {"version", no_argument, 0, 'V'}, 83 | {"verbose", no_argument, 0, 'v'}, 84 | {"quiet", no_argument, 0, 'q'}, 85 | {"silent", no_argument, 0, 's'}, 86 | {0, 0, 0, 0 } 87 | }; 88 | 89 | int option_index = 0; 90 | c = getopt_long(argc, argv, "hVvqs", long_options, &option_index); 91 | 92 | if (c == -1) // no more options to look at 93 | break; 94 | 95 | switch (c) { 96 | case 'h': 97 | print_usage(); 98 | return EXIT_SUCCESS; 99 | break; 100 | 
101 | case 'V': 102 | print_version(); 103 | return EXIT_SUCCESS; 104 | break; 105 | 106 | case 'v': 107 | verbose = true; 108 | break; 109 | 110 | case 'q': // fallthrough 111 | case 's': 112 | break; 113 | 114 | default: 115 | std::cerr << "Invalid option\n\n"; 116 | print_short_help(); 117 | return EXIT_FAILURE; 118 | break; 119 | } 120 | } 121 | // no more options, we should now be at the input filenames 122 | if ( (optind + 2) < argc) { 123 | std::cerr << "More than two input files given\n"; 124 | return EXIT_FAILURE; 125 | } else if ( (optind + 2) > argc) 126 | { 127 | std::cerr << "Need two input files\n"; 128 | return EXIT_FAILURE; 129 | } else { 130 | mutator_file = fopen(argv[(optind)], "r"); 131 | if (mutator_file == NULL) { 132 | std::cerr << "Could not open file " << argv[(optind)] 133 | << std::endl; 134 | return 1; 135 | } 136 | lexicon_file = fopen(argv[(optind + 1)], "r"); 137 | if (lexicon_file == NULL) { 138 | std::cerr << "Could not open file " << argv[(optind + 1)] 139 | << std::endl; 140 | return 1; 141 | } 142 | } 143 | hfst_ospell::Transducer * mutator; 144 | hfst_ospell::Transducer * lexicon; 145 | mutator = new hfst_ospell::Transducer(mutator_file); 146 | if (!mutator->is_weighted()) { 147 | std::cerr << "Error source was unweighted, exiting\n\n"; 148 | return EXIT_FAILURE; 149 | } 150 | lexicon = new hfst_ospell::Transducer(lexicon_file); 151 | if (!lexicon->is_weighted()) { 152 | std::cerr << "Lexicon was unweighted, exiting\n\n"; 153 | return EXIT_FAILURE; 154 | } 155 | 156 | hfst_ospell::Speller * speller; 157 | 158 | try { 159 | speller = new hfst_ospell::Speller(mutator, lexicon); 160 | } catch (hfst_ospell::AlphabetTranslationException& e) { 161 | std::cerr << 162 | "Unable to build speller - symbol " << e.what() << " not " 163 | "present in lexicon's alphabet\n"; 164 | return EXIT_FAILURE; 165 | } 166 | 167 | char * str = (char*) malloc(65535); 168 | while (!std::cin.eof()) { 169 | std::cin.getline(str, 65535); 170 | if (str[0] == 
'\0') { 171 | continue; 172 | } 173 | // n += 1 174 | char* p = strdup(str); 175 | char* tok = strtok(p, "\t"); 176 | assert(tok != NULL); 177 | char* mispelt = strdup(tok); 178 | tok = strtok(NULL, "\t"); 179 | assert(tok != NULL); 180 | char* corr = strdup(tok); 181 | tok = strtok(NULL, "\t"); 182 | assert(tok != NULL); 183 | char* context = strdup(tok); 184 | // unknown += (corr in NWORDS) 185 | hfst_ospell::CorrectionQueue corrections = speller->correct(mispelt); 186 | if (corrections.size() == 0) 187 | { 188 | // correction too far 189 | fprintf(stdout, "%s\t%s\t%s[inf]\t%s\n", 190 | mispelt, corr, mispelt, context); 191 | } 192 | else 193 | { 194 | fprintf(stdout, "%s\t%s", mispelt, corr); 195 | if (speller->check(mispelt)) 196 | { 197 | fprintf(stdout, "\t%s[0]", mispelt); 198 | } 199 | while (corrections.size() > 0) 200 | { 201 | fprintf(stdout, "\t%s[%f]", corrections.top().first.c_str(), 202 | corrections.top().second); 203 | corrections.pop(); 204 | } 205 | fprintf(stdout, "\t%s\n", context); 206 | } // corrections size != 0 207 | } 208 | return EXIT_SUCCESS; 209 | } 210 | -------------------------------------------------------------------------------- /main-fsmnlp-2012.cc: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright 2009 University of Helsinki 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | 17 | */ 18 | 19 | /* 20 | This is a toy commandline utility for testing spellers on standard io. 21 | */ 22 | 23 | 24 | #if HAVE_CONFIG_H 25 | # include 26 | #endif 27 | #if HAVE_GETOPT_H 28 | # include 29 | #endif 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "ol-exceptions.h" 37 | #include "ospell.h" 38 | #include "ZHfstOspeller.h" 39 | 40 | 41 | using hfst_ospell::ZHfstOspeller; 42 | using hfst_ospell::Transducer; 43 | 44 | static bool quiet = false; 45 | static bool verbose = false; 46 | static FILE* profile_file; 47 | clock_t profile_start, profile_end; 48 | 49 | bool print_usage(void) 50 | { 51 | std::cerr << 52 | "\n" << 53 | "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" << 54 | " " << PACKAGE_NAME << " [OPTIONS] ZHFST-ARCHIVE\n" << 55 | "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" << 56 | "print corrected output\n" << 57 | "Second form seeks error sources and lexicons from the ZHFST-ARCHIVE\n" 58 | "\n" << 59 | " -h, --help Print this help message\n" << 60 | " -V, --version Print version information\n" << 61 | " -v, --verbose Be verbose\n" << 62 | " -q, --quiet Don't be verbose (default)\n" << 63 | " -s, --silent Same as quiet\n" << 64 | "\n" << 65 | "\n" << 66 | "Report bugs to " << PACKAGE_BUGREPORT << "\n" << 67 | "\n"; 68 | return true; 69 | } 70 | 71 | bool print_version(void) 72 | { 73 | std::cerr << 74 | "\n" << 75 | PACKAGE_STRING << std::endl << 76 | "copyright (C) 2009 - 2011 University of Helsinki\n"; 77 | return true; 78 | } 79 | 80 | bool print_short_help(void) 81 | { 82 | print_usage(); 83 | return true; 84 | } 85 | 86 | int 87 | legacy_spell(const char* errmodel_filename, const char* acceptor_filename) 88 | { 89 | FILE* mutator_file = fopen(errmodel_filename, "r"); 90 | if (mutator_file == NULL) { 91 | std::cerr << "Could not open file " << errmodel_filename 92 | << std::endl; 93 | return EXIT_FAILURE; 94 | } 95 | FILE* lexicon_file = fopen(acceptor_filename, 
"r"); 96 | if (lexicon_file == NULL) { 97 | std::cerr << "Could not open file " << acceptor_filename 98 | << std::endl; 99 | return EXIT_FAILURE; 100 | } 101 | hfst_ospell::Transducer * mutator; 102 | hfst_ospell::Transducer * lexicon; 103 | mutator = new hfst_ospell::Transducer(mutator_file); 104 | if (!mutator->is_weighted()) { 105 | std::cerr << "Error source was unweighted, exiting\n\n"; 106 | return EXIT_FAILURE; 107 | } 108 | lexicon = new hfst_ospell::Transducer(lexicon_file); 109 | if (!lexicon->is_weighted()) { 110 | std::cerr << "Lexicon was unweighted, exiting\n\n"; 111 | return EXIT_FAILURE; 112 | } 113 | 114 | hfst_ospell::Speller * speller; 115 | 116 | try { 117 | speller = new hfst_ospell::Speller(mutator, lexicon); 118 | } catch (hfst_ospell::AlphabetTranslationException& e) { 119 | std::cerr << 120 | "Unable to build speller - symbol " << e.what() << " not " 121 | "present in lexicon's alphabet\n"; 122 | return EXIT_FAILURE; 123 | } 124 | char * str = (char*) malloc(2000); 125 | 126 | while (!std::cin.eof()) { 127 | std::cin.getline(str, 2000); 128 | if (speller->check(str)) { 129 | std::cout << "\"" << str << "\" is in the lexicon\n\n"; 130 | } else { 131 | hfst_ospell::CorrectionQueue corrections = speller->correct(str); 132 | if (corrections.size() > 0) { 133 | std::cout << "Corrections for \"" << str << "\":\n"; 134 | while (corrections.size() > 0) 135 | { 136 | std::cout << corrections.top().first << " " << corrections.top().second << std::endl; 137 | corrections.pop(); 138 | } 139 | std::cout << std::endl; 140 | } else { 141 | std::cout << "Unable to correct \"" << str << "\"!\n\n"; 142 | } 143 | } 144 | } 145 | return EXIT_SUCCESS; 146 | 147 | } 148 | 149 | int 150 | fallback_spell(const char* errmodel_filename1, const char* errmodel_filename2, 151 | const char* acceptor_filename) 152 | { 153 | FILE* mutator_file1 = fopen(errmodel_filename1, "r"); 154 | if (mutator_file1 == NULL) { 155 | std::cerr << "Could not open file " << 
errmodel_filename1 156 | << std::endl; 157 | return EXIT_FAILURE; 158 | } 159 | FILE* mutator_file2 = fopen(errmodel_filename2, "r"); 160 | if (mutator_file2 == NULL) { 161 | std::cerr << "Could not open file " << errmodel_filename2 162 | << std::endl; 163 | return EXIT_FAILURE; 164 | } 165 | FILE* lexicon_file = fopen(acceptor_filename, "r"); 166 | if (lexicon_file == NULL) { 167 | std::cerr << "Could not open file " << acceptor_filename 168 | << std::endl; 169 | return EXIT_FAILURE; 170 | } 171 | hfst_ospell::Transducer * mutator1; 172 | hfst_ospell::Transducer * mutator2; 173 | hfst_ospell::Transducer * lexicon; 174 | mutator1= new hfst_ospell::Transducer(mutator_file1); 175 | if (!mutator1->is_weighted()) { 176 | std::cerr << "Error source was unweighted, exiting\n\n"; 177 | return EXIT_FAILURE; 178 | } 179 | mutator2= new hfst_ospell::Transducer(mutator_file2); 180 | if (!mutator2->is_weighted()) { 181 | std::cerr << "Error source was unweighted, exiting\n\n"; 182 | return EXIT_FAILURE; 183 | } 184 | lexicon = new hfst_ospell::Transducer(lexicon_file); 185 | if (!lexicon->is_weighted()) { 186 | std::cerr << "Lexicon was unweighted, exiting\n\n"; 187 | return EXIT_FAILURE; 188 | } 189 | 190 | hfst_ospell::Speller * speller1; 191 | hfst_ospell::Speller * speller2; 192 | 193 | try { 194 | speller1 = new hfst_ospell::Speller(mutator1, lexicon); 195 | } catch (hfst_ospell::AlphabetTranslationException& e) { 196 | std::cerr << 197 | "Unable to build speller - symbol " << e.what() << " not " 198 | "present in lexicon's alphabet\n"; 199 | return EXIT_FAILURE; 200 | } 201 | try { 202 | speller2 = new hfst_ospell::Speller(mutator2, lexicon); 203 | } catch (hfst_ospell::AlphabetTranslationException& e) { 204 | std::cerr << 205 | "Unable to build speller - symbol " << e.what() << " not " 206 | "present in lexicon's alphabet\n"; 207 | return EXIT_FAILURE; 208 | } 209 | char * str = (char*) malloc(2000); 210 | 211 | while (!std::cin.eof()) { 212 | std::cin.getline(str, 
2000); 213 | if (speller1->check(str)) { 214 | std::cout << "\"" << str << "\" is in the lexicon 1\n\n"; 215 | } else { 216 | hfst_ospell::CorrectionQueue corrections1 = speller1->correct(str); 217 | if (corrections1.size() > 0) { 218 | std::cout << "Corrections for \"" << str << "\" w/ source 1:\n"; 219 | while (corrections1.size() > 0) 220 | { 221 | std::cout << corrections1.top().first << " " << corrections1.top().second << std::endl; 222 | corrections1.pop(); 223 | } 224 | std::cout << std::endl; 225 | } else { 226 | hfst_ospell::CorrectionQueue corrections2 = speller2->correct(str); 227 | if (corrections2.size() > 0) { 228 | std::cout << "Corrections for \"" << str << "\" w/ source 2:\n"; 229 | while (corrections2.size() > 0) 230 | { 231 | std::cout << corrections2.top().first << " " << corrections2.top().second << std::endl; 232 | corrections2.pop(); 233 | } 234 | std::cout << std::endl; 235 | } else { 236 | std::cout << "Unable to correct \"" << str << "\"!\n\n"; 237 | } 238 | } 239 | } 240 | } 241 | return EXIT_SUCCESS; 242 | 243 | } 244 | // Spell-check standard input line by line against the speller in zhfst_filename; if it cannot be read as a ZHFST zip archive, retry it as a legacy automata directory. Returns EXIT_FAILURE when the legacy fallback fails too, EXIT_SUCCESS otherwise. 245 | int 246 | zhfst_spell(char* zhfst_filename) 247 | { 248 | ZHfstOspeller speller; 249 | try 250 | { 251 | speller.read_zhfst(zhfst_filename); 252 | } 253 | catch (hfst_ospell::ZHfstZipReadingError& zhzre) 254 | { 255 | std::cerr << "cannot read zhfst archive " << zhfst_filename << ":" 256 | << std::endl 257 | << zhzre.what() << "." << std::endl 258 | << "trying to read as legacy automata directory" << std::endl; 259 | try { speller.read_legacy(zhfst_filename); } // an exception thrown inside a handler is not seen by sibling handlers, so the fallback needs its own try 260 | catch (hfst_ospell::ZHfstLegacyReadingError& zhlre) 261 | { 262 | std::cerr << "cannot read legacy hfst speller dir " << zhfst_filename 263 | << ":" << std::endl 264 | << zhlre.what() << "." << std::endl; 265 | return EXIT_FAILURE; 266 | } 267 | } 
268 | 269 | if (verbose) 270 | { 271 | std::cout << "Following metadata was read from ZHFST archive:" << std::endl 272 | << speller.metadata_dump() << std::endl; 273 | } 274 | char * str = (char*) malloc(2000); 275 | 276 | while (!std::cin.eof()) { 277 | std::cin.getline(str, 2000); 278 | if (str[0] == '\0') { 279 | break; 280 | } 281 | if (speller.spell(str)) { 282 | std::cout << "\"" << str << "\" is in the lexicon\n\n"; 283 | } else { 284 | hfst_ospell::CorrectionQueue corrections = speller.suggest(str); 285 | if (corrections.size() > 0) { 286 | std::cout << "Corrections for \"" << str << "\":\n"; 287 | while (corrections.size() > 0) 288 | { 289 | std::cout << corrections.top().first << " " << corrections.top().second << std::endl; 290 | corrections.pop(); 291 | } 292 | std::cout << std::endl; 293 | } else { 294 | std::cout << "Unable to correct \"" << str << "\"!\n\n"; 295 | } 296 | } 297 | } 298 | free(str); 299 | return EXIT_SUCCESS; 300 | } 301 | 302 | void 303 | hfst_print_profile_line() 304 | { 305 | if (profile_file == 0) 306 | { 307 | return; 308 | } 309 | fprintf(profile_file, "ospell"); 310 | clock_t profile_end = clock(); 311 | fprintf(profile_file, "\t%f", ((float)(profile_end - profile_start)) 312 | / CLOCKS_PER_SEC); 313 | struct rusage* usage = static_cast 314 | (malloc(sizeof(struct rusage))); 315 | errno = 0; 316 | int rv = getrusage(RUSAGE_SELF, usage); 317 | if (rv != -1) 318 | { 319 | fprintf(profile_file, "\t%lu.%lu\t%lu.%lu" 320 | "\t%ld\t%ld\t%ld" 321 | "\t%ld" 322 | "\t%ld\t%ld\t%ld" 323 | "\t%ld\t%ld" 324 | "\t%ld\t%ld" 325 | "\t%ld" 326 | "\t%ld\t%ld", 327 | usage->ru_utime.tv_sec, usage->ru_utime.tv_usec, 328 | usage->ru_stime.tv_sec, usage->ru_stime.tv_usec, 329 | usage->ru_maxrss, usage->ru_ixrss, usage->ru_idrss, 330 | usage->ru_isrss, 331 | usage->ru_minflt, usage->ru_majflt, usage->ru_nswap, 332 | usage->ru_inblock, usage->ru_oublock, 333 | usage->ru_msgsnd, 
usage->ru_msgrcv, 334 | usage->ru_nsignals, 335 | usage->ru_nvcsw, usage->ru_nivcsw); 336 | } 337 | else 338 | { 339 | fprintf(profile_file, "\tgetrusage: %s", strerror(errno)); 340 | } 341 | fprintf(profile_file, "\n"); 342 | } 343 | 344 | 345 | int main(int argc, char **argv) 346 | { 347 | 348 | int c; 349 | 350 | #if HAVE_GETOPT_H 351 | while (true) { 352 | static struct option long_options[] = 353 | { 354 | // first the hfst-mandated options 355 | {"help", no_argument, 0, 'h'}, 356 | {"version", no_argument, 0, 'V'}, 357 | {"verbose", no_argument, 0, 'v'}, 358 | {"quiet", no_argument, 0, 'q'}, 359 | {"silent", no_argument, 0, 's'}, 360 | {"profile", required_argument, 0, 'p'}, 361 | {0, 0, 0, 0 } 362 | }; 363 | 364 | int option_index = 0; 365 | c = getopt_long(argc, argv, "hVvqsp:", long_options, &option_index); 366 | 367 | if (c == -1) // no more options to look at 368 | break; 369 | 370 | switch (c) { 371 | case 'h': 372 | print_usage(); 373 | return EXIT_SUCCESS; 374 | break; 375 | 376 | case 'V': 377 | print_version(); 378 | return EXIT_SUCCESS; 379 | break; 380 | 381 | case 'v': 382 | verbose = true; 383 | quiet = false; 384 | break; 385 | 386 | case 'p': 387 | profile_file = fopen(optarg, "a"); 388 | if (NULL == profile_file) 389 | { 390 | perror("Couldn't open profiling file for appending"); 391 | } 392 | profile_start = clock(); 393 | break; 394 | case 'q': // fallthrough 395 | case 's': 396 | quiet = true; 397 | verbose = false; 398 | break; 399 | 400 | default: 401 | std::cerr << "Invalid option\n\n"; 402 | print_short_help(); 403 | return EXIT_FAILURE; 404 | break; 405 | } 406 | } 407 | #else 408 | int optind = 1; 409 | #endif 410 | // no more options, we should now be at the input filenames 411 | int rv = EXIT_SUCCESS; 412 | if (optind == (argc - 3)) 413 | { 414 | rv = fallback_spell(argv[optind], argv[optind+1], argv[optind+2]); 415 | } 416 | else if (optind == (argc - 2)) 417 | { 418 | rv = legacy_spell(argv[optind], argv[optind+1]); 419 | } 420 | 
else if (optind == (argc - 1)) 421 | { 422 | rv = zhfst_spell(argv[optind]); 423 | } 424 | else if (optind < (argc - 3)) 425 | { 426 | std::cerr << "No more than three free parameters allowed" << std::endl; 427 | print_short_help(); 428 | return EXIT_FAILURE; 429 | } 430 | else if (optind >= argc) 431 | { 432 | std::cerr << "Give full path to zhfst spellers or two automata" 433 | << std::endl; 434 | print_short_help(); 435 | return EXIT_FAILURE; 436 | } 437 | hfst_print_profile_line(); 438 | return rv; 439 | } 440 | -------------------------------------------------------------------------------- /main-ispell.cc: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright 2009 University of Helsinki 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | */ 18 | 19 | /* 20 | This is a toy commandline utility for testing spellers on standard io. 21 | */ 22 | 23 | 24 | #if HAVE_CONFIG_H 25 | # include 26 | #endif 27 | #if HAVE_GETOPT_H 28 | # include 29 | #endif 30 | #if HAVE_ERROR_H 31 | # include 32 | #else 33 | # define error(status, errnum, fmt, ...) 
fprintf(stderr, fmt, ##__VA_ARGS__); \ 34 | if (status != 0) exit(status); 35 | #endif 36 | 37 | #include "ol-exceptions.h" 38 | #include "ospell.h" 39 | #include "ZHfstOspeller.h" 40 | 41 | using hfst_ospell::ZHfstOspeller; 42 | using hfst_ospell::Transducer; 43 | 44 | //static bool quiet = false; 45 | static bool verbose = false; 46 | // Probe, in order, ./speller-LANGCODE.zhfst, /usr/share/voikko/3/speller-LANGCODE.zhfst and $HOME/.voikko/3/speller-LANGCODE.zhfst; return the first path that can be opened for reading as a malloc'd string (caller frees), or NULL if none can, HOME is unset, or a path cannot be formatted. 47 | char* 48 | find_dicts(const char* langcode) 49 | { 50 | FILE* testhandle = NULL; 51 | char* testname = (char*)malloc(sizeof(char) * 52 | (strlen(langcode) + strlen("speller-.zhfst") + 1)); 53 | int rv = sprintf(testname, "speller-%s.zhfst", langcode); 54 | if (rv < 0) // sprintf reports failure with a negative value, never 0 55 | { 56 | perror("sprinting path"); free(testname); return NULL; 57 | } 58 | testhandle = fopen(testname, "r"); 59 | if (testhandle != NULL) 60 | { 61 | fclose(testhandle); 62 | return testname; 63 | } 64 | free(testname); 65 | testname = (char*)malloc(sizeof(char) * 66 | (strlen(langcode) + 67 | strlen("/usr/share/voikko/3/speller-.zhfst") + 1)); 68 | rv = sprintf(testname, "/usr/share/voikko/3/speller-%s.zhfst", 69 | langcode); 70 | if (rv < 0) 71 | { 72 | perror("sprinting path"); free(testname); return NULL; 73 | } 74 | testhandle = fopen(testname, "r"); 75 | if (testhandle != NULL) 76 | { 77 | fclose(testhandle); 78 | return testname; 79 | } 80 | free(testname); 81 | char* homepath = getenv("HOME"); 82 | if (homepath == NULL) 83 | { 84 | return NULL; 85 | } 86 | testname = (char*)malloc(sizeof(char) * 87 | (strlen(homepath) + strlen("/.voikko/3/speller-.zhfst") + 88 | strlen(langcode) + 1)); 89 | rv = sprintf(testname, "%s/.voikko/3/speller-%s.zhfst", homepath, 90 | langcode); 91 | if (rv < 0) 92 | { 93 | perror("sprinting path"); free(testname); return NULL; 94 | } 95 | testhandle = fopen(testname, "r"); 96 | if (testhandle != NULL) 97 | { 98 | fclose(testhandle); 99 | return testname; 100 | } 101 | free(testname); 102 | return NULL; 103 | } 104 | 105 | bool print_usage(void) 106 | { 107 | fprintf(stdout, "Usage: %s [OPTION]... [FILE]...\n" 108 | "Check spelling of each FILE. Without FILE, check standard input." 
109 | "\n\n", "hfst-ispell"); 110 | fprintf(stdout, 111 | " -1 check only first field in lines " 112 | "(delimiter = tabulator)\n" 113 | " -a Ispell's pipe interface\n" 114 | " --check-url Check URLs, email addresses and directory paths\n" 115 | " -d d[,d2,...] used d (d2 etc.) dictionaries\n" 116 | " -D show available dictionaries\n" 117 | " -G print only correct words or lines\n" 118 | " -h, --help display this help and exit\n" 119 | " -l print mispelled words\n" 120 | " -L print lines with mispelled words\n" 121 | " -v, --version print version number\n" 122 | " -vv print Ispell compatible version number\n" 123 | " -w print misspelled words (= lines) " 124 | "from one word/line input\n" 125 | "\n"); 126 | fprintf(stdout, "Examples: %s -d fi file.txt\n" 127 | " %s -l file.txt\n\n", "hfst-ispell", "hfst-ispell"); 128 | fprintf(stdout, "Report bugs to " PACKAGE_BUGREPORT "\n"); 129 | return true; 130 | } 131 | 132 | bool print_version(bool ispell_strict) 133 | { 134 | fprintf(stdout, "@(#) International Ispell Version 3.2.06 (but really " 135 | PACKAGE_STRING ")\n\n"); 136 | if (!ispell_strict) 137 | { 138 | fprintf(stdout, "Copyright (C) 2013 University of Helsinki. APL\n"); 139 | fprintf(stdout, 140 | "This is free software; see the source for copying conditions. 
" 141 | " There is NO\n" 142 | "warranty; not even for MERCHANTABILITY or FITNESS FOR A " 143 | " PARTICULAR PURPOSE,\n" 144 | "to the extent permitted by law.\n"); 145 | } 146 | return true; 147 | } 148 | 149 | bool print_short_help(void) 150 | { 151 | print_usage(); 152 | return true; 153 | } 154 | 155 | static 156 | void 157 | print_correct(const char* /*s*/) 158 | { 159 | fprintf(stdout, "*\n"); 160 | } 161 | 162 | static 163 | void 164 | print_corrections(const char* s, hfst_ospell::CorrectionQueue& c) 165 | { 166 | fprintf(stdout, "& %s %zu %d: ", s, c.size(), 0); 167 | bool comma = false; 168 | while (c.size() > 0) 169 | { 170 | if (comma) 171 | { 172 | fprintf(stdout, ", "); 173 | } 174 | fprintf(stdout, "%s", c.top().first.c_str()); 175 | comma = true; 176 | c.pop(); 177 | } 178 | fprintf(stdout, "\n"); 179 | } 180 | 181 | static 182 | void 183 | print_no_corrects(const char* s) 184 | { 185 | fprintf(stdout, "# %s %d\n", s, 0); 186 | } 187 | 188 | int 189 | legacy_spell(const char* errmodel_filename, const char* acceptor_filename) 190 | { 191 | FILE* mutator_file = fopen(errmodel_filename, "r"); 192 | if (mutator_file == NULL) 193 | { 194 | std::cerr << "Could not open file " << errmodel_filename 195 | << std::endl; 196 | return EXIT_FAILURE; 197 | } 198 | FILE* lexicon_file = fopen(acceptor_filename, "r"); 199 | if (lexicon_file == NULL) 200 | { 201 | std::cerr << "Could not open file " << acceptor_filename 202 | << std::endl; 203 | return EXIT_FAILURE; 204 | } 205 | hfst_ospell::Transducer * mutator; 206 | hfst_ospell::Transducer * lexicon; 207 | mutator = new hfst_ospell::Transducer(mutator_file); 208 | if (!mutator->is_weighted()) 209 | { 210 | std::cerr << "Error source was unweighted, exiting\n\n"; 211 | return EXIT_FAILURE; 212 | } 213 | lexicon = new hfst_ospell::Transducer(lexicon_file); 214 | if (!lexicon->is_weighted()) 215 | { 216 | std::cerr << "Lexicon was unweighted, exiting\n\n"; 217 | return EXIT_FAILURE; 218 | } 219 | hfst_ospell::Speller 
* speller; 220 | try 221 | { 222 | speller = new hfst_ospell::Speller(mutator, lexicon); 223 | } 224 | catch (hfst_ospell::AlphabetTranslationException& e) 225 | { 226 | std::cerr << 227 | "Unable to build speller - symbol " << e.what() << " not " 228 | "present in lexicon's alphabet\n"; 229 | return EXIT_FAILURE; 230 | } 231 | char * str = (char*) malloc(2000); 232 | while (!std::cin.eof()) 233 | { 234 | std::cin.getline(str, 2000); 235 | if (str[0] == '\0') 236 | { 237 | fprintf(stdout, "\n"); 238 | continue; 239 | } 240 | if (str[strlen(str) - 1] == '\r') 241 | { 242 | fprintf(stderr, "\\r is not allowed\n"); 243 | exit(1); 244 | } 245 | if (speller->check(str)) 246 | { 247 | print_correct(str); 248 | } 249 | else 250 | { 251 | hfst_ospell::CorrectionQueue corrections = speller->correct(str, 5); 252 | if (corrections.size() > 0) 253 | { 254 | print_corrections(str, corrections); 255 | } 256 | else 257 | { 258 | print_no_corrects(str); 259 | } 260 | } 261 | } 262 | return EXIT_SUCCESS; 263 | } 264 | 265 | int 266 | zhfst_spell(char* zhfst_filename, FILE* input) 267 | { 268 | ZHfstOspeller speller; 269 | try 270 | { 271 | speller.read_zhfst(zhfst_filename); 272 | } 273 | catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe) 274 | { 275 | std::cerr << "cannot finish reading zhfst archive " << zhfst_filename << 276 | ":" << zhmdpe.what() << "." << std::endl; 277 | return EXIT_FAILURE; 278 | } 279 | catch (hfst_ospell::ZHfstZipReadingError zhzre) 280 | { 281 | std::cerr << "cannot read zhfst archive " << zhfst_filename << ":" 282 | << zhzre.what() << "." << std::endl 283 | << "trying to read as legacy automata directory" << std::endl; 284 | try 285 | { 286 | speller.read_legacy(zhfst_filename); 287 | } 288 | catch (hfst_ospell::ZHfstLegacyReadingError zhlre) 289 | { 290 | std::cerr << "cannot fallback to read legacy hfst speller dir " 291 | << zhfst_filename 292 | << ":" << std::endl 293 | << zhlre.what() << "." 
<< std::endl; 294 | return EXIT_FAILURE; 295 | } 296 | } 297 | if (verbose) 298 | { 299 | std::cout << "Following metadata was read from ZHFST archive:" << std::endl 300 | << speller.metadata_dump() << std::endl; 301 | } 302 | char* str = NULL; 303 | size_t len = 0; 304 | while (getline(&str, &len, input) != -1) 305 | { 306 | if (str[0] == '\0') 307 | { 308 | break; 309 | } 310 | if (str[strlen(str) - 1] == '\r') 311 | { 312 | fprintf(stderr, "\\r is not allowed\n"); 313 | exit(1); 314 | } 315 | else if (str[strlen(str) - 1] == '\n') 316 | { 317 | str[strlen(str) - 1] = '\0'; 318 | } 319 | if (speller.spell(str)) 320 | { 321 | print_correct(str); 322 | } 323 | else 324 | { 325 | hfst_ospell::CorrectionQueue corrections = speller.suggest(str); 326 | if (corrections.size() > 0) 327 | { 328 | print_corrections(str, corrections); 329 | } 330 | else 331 | { 332 | print_no_corrects(str); 333 | } 334 | } 335 | } 336 | free(str); 337 | return EXIT_SUCCESS; 338 | } 339 | 340 | int main(int argc, char **argv) 341 | { 342 | 343 | int c; 344 | char* langcode = 0; 345 | //std::locale::global(std::locale("")); 346 | int version = 0; 347 | #if HAVE_GETOPT_H 348 | while (true) { 349 | static struct option long_options[] = 350 | { 351 | // first the hfst-mandated options 352 | {"help", no_argument, 0, 'h'}, 353 | {"version", no_argument, 0, 'v'}, 354 | {"one", no_argument, 0, '1'}, 355 | {"ispell", no_argument, 0, 'a'}, 356 | {"check-url", no_argument, 0, 'X'}, 357 | {"dictionary", required_argument, 0, 'd'}, 358 | {"list", no_argument, 0, 'D'}, 359 | {"mispelt", no_argument, 0, 'l'}, 360 | {"misslines", no_argument, 0, 'L'}, 361 | {"wordperline", no_argument, 0, 'w'}, 362 | {0, 0, 0, 0 } 363 | }; 364 | 365 | int option_index = 0; 366 | c = getopt_long(argc, argv, "1ad:DGhvlLw", long_options, &option_index); 367 | 368 | if (c == -1) // no more options to look at 369 | break; 370 | 371 | switch (c) 372 | { 373 | case 'h': 374 | print_usage(); 375 | return EXIT_SUCCESS; 376 | break; 
377 | case 'V': 378 | version += 1; 379 | break; 380 | case 'v': 381 | version += 1; 382 | break; 383 | case 'd': 384 | langcode = optarg; 385 | break; 386 | default: 387 | std::cerr << "Invalid option\n\n"; 388 | print_short_help(); 389 | return EXIT_FAILURE; 390 | break; 391 | } 392 | } 393 | if (version == 1) 394 | { 395 | print_version(false); 396 | return EXIT_SUCCESS; 397 | } 398 | else if (version == 2) 399 | { 400 | print_version(true); 401 | return EXIT_SUCCESS; 402 | } 403 | else if (version >= 3) 404 | { 405 | fprintf(stdout, "Come on, really?\n"); 406 | exit(version); 407 | } 408 | #else 409 | int optind = 1; 410 | #endif 411 | // find the dicts 412 | char* zhfst_file = 0; 413 | if (NULL == langcode) 414 | { 415 | fprintf(stderr, "Currently -d is required since I'm too lazy to check " 416 | "locale\n"); 417 | exit(1); 418 | } 419 | else 420 | { 421 | zhfst_file = find_dicts(langcode); 422 | if (NULL == zhfst_file) 423 | { 424 | fprintf(stderr, "Could not find dictionary %s in standard " 425 | "locations\n" 426 | "Please install one of:\n" 427 | " /usr/share/voikko/3/speller-%s.zhfst\n" 428 | " $HOME/.voikko/3/speller-%s.zhfst\n" 429 | " ./speller-%s.zhfst\n", 430 | langcode, langcode, langcode, langcode); 431 | exit(1); 432 | } 433 | } 434 | // no more options, we should now be at the input filenames 435 | if (optind == argc) 436 | { 437 | return zhfst_spell(zhfst_file, stdin); 438 | } 439 | else if (optind < argc) 440 | { 441 | while (optind < argc) 442 | { 443 | FILE* infile = fopen(argv[optind], "r"); 444 | if (NULL == infile) 445 | { 446 | fprintf(stderr, "Could not open %s for reading", 447 | argv[optind]); 448 | exit(1); 449 | } 450 | zhfst_spell(zhfst_file, infile); 451 | optind++; 452 | } 453 | } 454 | return EXIT_SUCCESS; 455 | } 456 | -------------------------------------------------------------------------------- /main-norvig.cc: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright 2009 
University of Helsinki 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | */ 18 | 19 | /* 20 | This is a toy commandline utility for testing spellers on standard io. 21 | */ 22 | 23 | #include "ospell.h" 24 | #include 25 | #include 26 | #include 27 | 28 | #define PACKAGE_NAME "hfst-ospell" 29 | #define PACKAGE_STRING "hfst-ospell 0.1" 30 | #define PACKAGE_BUGREPORT "hfst-bugs@ling.helsinki.fi" 31 | 32 | bool print_usage(void) 33 | { 34 | std::cerr << 35 | "\n" << 36 | "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" << 37 | "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" << 38 | "print corrected output\n" << 39 | "\n" << 40 | " -h, --help Print this help message\n" << 41 | " -V, --version Print version information\n" << 42 | " -v, --verbose Be verbose\n" << 43 | " -q, --quiet Don't be verbose (default)\n" << 44 | " -s, --silent Same as quiet\n" << 45 | "\n" << 46 | "\n" << 47 | "Report bugs to " << PACKAGE_BUGREPORT << "\n" << 48 | "\n"; 49 | return true; 50 | } 51 | 52 | bool print_version(void) 53 | { 54 | std::cerr << 55 | "\n" << 56 | PACKAGE_STRING << std::endl << 57 | "copyright (C) 2009 University of Helsinki\n"; 58 | return true; 59 | } 60 | 61 | bool print_short_help(void) 62 | { 63 | print_usage(); 64 | return true; 65 | } 66 | 67 | int main(int argc, char **argv) 68 | { 69 | 70 | FILE * mutator_file = NULL; 71 | FILE * lexicon_file = NULL; 72 | 73 | int c; 74 | bool verbose = false; 75 | 76 | 
while (true) 77 | { 78 | static struct option long_options[] = 79 | { 80 | // first the hfst-mandated options 81 | {"help", no_argument, 0, 'h'}, 82 | {"version", no_argument, 0, 'V'}, 83 | {"verbose", no_argument, 0, 'v'}, 84 | {"quiet", no_argument, 0, 'q'}, 85 | {"silent", no_argument, 0, 's'}, 86 | {0, 0, 0, 0 } 87 | }; 88 | 89 | int option_index = 0; 90 | c = getopt_long(argc, argv, "hVvqs", long_options, &option_index); 91 | 92 | if (c == -1) // no more options to look at 93 | break; 94 | 95 | switch (c) { 96 | case 'h': 97 | print_usage(); 98 | return EXIT_SUCCESS; 99 | break; 100 | 101 | case 'V': 102 | print_version(); 103 | return EXIT_SUCCESS; 104 | break; 105 | 106 | case 'v': 107 | verbose = true; 108 | break; 109 | 110 | case 'q': // fallthrough 111 | case 's': 112 | break; 113 | 114 | default: 115 | std::cerr << "Invalid option\n\n"; 116 | print_short_help(); 117 | return EXIT_FAILURE; 118 | break; 119 | } 120 | } 121 | // no more options, we should now be at the input filenames 122 | if ( (optind + 2) < argc) { 123 | std::cerr << "More than two input files given\n"; 124 | return EXIT_FAILURE; 125 | } else if ( (optind + 2) > argc) 126 | { 127 | std::cerr << "Need two input files\n"; 128 | return EXIT_FAILURE; 129 | } else { 130 | mutator_file = fopen(argv[(optind)], "r"); 131 | if (mutator_file == NULL) { 132 | std::cerr << "Could not open file " << argv[(optind)] 133 | << std::endl; 134 | return 1; 135 | } 136 | lexicon_file = fopen(argv[(optind + 1)], "r"); 137 | if (lexicon_file == NULL) { 138 | std::cerr << "Could not open file " << argv[(optind + 1)] 139 | << std::endl; 140 | return 1; 141 | } 142 | } 143 | hfst_ospell::Transducer * mutator; 144 | hfst_ospell::Transducer * lexicon; 145 | mutator = new hfst_ospell::Transducer(mutator_file); 146 | if (!mutator->is_weighted()) { 147 | std::cerr << "Error source was unweighted, exiting\n\n"; 148 | return EXIT_FAILURE; 149 | } 150 | lexicon = new hfst_ospell::Transducer(lexicon_file); 151 | if 
(!lexicon->is_weighted()) { 152 | std::cerr << "Lexicon was unweighted, exiting\n\n"; 153 | return EXIT_FAILURE; 154 | } 155 | 156 | hfst_ospell::Speller * speller; 157 | 158 | try { 159 | speller = new hfst_ospell::Speller(mutator, lexicon); 160 | } catch (hfst_ospell::AlphabetTranslationException& e) { 161 | std::cerr << 162 | "Unable to build speller - symbol " << e.what() << " not " 163 | "present in lexicon's alphabet\n"; 164 | return EXIT_FAILURE; 165 | } 166 | 167 | char * str = (char*) malloc(2000); 168 | // def spelltest(tests, bias=None, verbose=False): 169 | // n, bad, unknown, start = 0, 0, 0, time.clock() 170 | unsigned long n = 0; 171 | unsigned long bad = 0; 172 | unsigned long unknown = 0; 173 | clock_t start = clock(); 174 | // if bias: 175 | // for target in tests: NWORDS[target] += bias 176 | // for target,wrongs in tests.items(): 177 | // for wrong in wrongs.split(): 178 | while (!std::cin.eof()) { 179 | std::cin.getline(str, 2000); 180 | if (str[0] == '\0') { 181 | continue; 182 | } 183 | // n += 1 184 | n++; 185 | char* p = strdup(str); 186 | char* tok = strtok(p, "\t"); 187 | assert(tok != NULL); 188 | char* mispelt = strdup(tok); 189 | tok = strtok(NULL, "\t"); 190 | assert(tok != NULL); 191 | //w = correct(wrong) 192 | char* corr = strdup(tok); 193 | // unknown += (corr in NWORDS) 194 | if (!speller->check(corr)) 195 | { 196 | unknown++; 197 | } 198 | if (speller->check(mispelt)) 199 | { 200 | // real word spelling error 201 | bad++; 202 | if (verbose) 203 | { 204 | fprintf(stdout, "correct(%s) => %s; expected %s\n", 205 | mispelt, mispelt, corr); 206 | } 207 | } 208 | else 209 | { 210 | 211 | hfst_ospell::CorrectionQueue corrections = speller->correct(mispelt); 212 | if (corrections.size() == 0) 213 | { 214 | bad++; 215 | if (verbose) 216 | { 217 | fprintf(stdout, "correct(%s) => %s; expected %s\n", 218 | mispelt, mispelt, corr); 219 | } 220 | } 221 | else 222 | { 223 | std::string first = corrections.top().first; 224 | //if w!=target: 225 
| if (first != corr) 226 | { 227 | //bad += 1 228 | bad++; 229 | // if verbose: 230 | // print 'correct(%r) => %r (%d); expected %r (%d)' % ( 231 | // wrong, w, NWORDS[w], target, NWORDS[target]) 232 | if (verbose) 233 | { 234 | fprintf(stdout, "correct(%s) => %s; " 235 | "expected %s\n", 236 | mispelt, first.c_str(), 237 | corr); 238 | } 239 | } // first != corr 240 | else 241 | { 242 | if (verbose) 243 | { 244 | fprintf(stdout, "correct(%s) => %s " 245 | "as expected %s\n", 246 | mispelt, first.c_str(), 247 | corr); 248 | } 249 | } 250 | } // corrections size != 0 251 | } // word not in dictionary§ 252 | } 253 | //return dict(bad=bad, n=n, bias=bias, pct=int(100. - 100.*bad/n), 254 | // unknown=unknown, secs=int(time.clock()-start) ) 255 | int pct = (int)round(100.0f - 100.0f*(float)bad/(float)n); 256 | float secs = (((float)clock()-(float)start)/(float)CLOCKS_PER_SEC); 257 | fprintf(stdout, 258 | "{'bad': %lu, 'bias': None, 'unknown': %lu, " 259 | "'secs': %f, 'pct': %d, 'n': %lu}\n", 260 | bad, unknown, secs, pct, n); 261 | return EXIT_SUCCESS; 262 | } 263 | -------------------------------------------------------------------------------- /office.cc: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright 2015 University of Helsinki 4 | 5 | Bug reports for this file should go to: 6 | Tino Didriksen 7 | Code adapted from https://github.com/TinoDidriksen/trie-tools 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | 21 | */ 22 | 23 | /* 24 | Tests up to 8 variations of each input token: 25 | - Verbatim 26 | - With leading non-alphanumerics removed 27 | - With trailing non-alphanumerics removed 28 | - With leading and trailing non-alphanumerics removed 29 | - First-lower of all the above 30 | */ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #define U_CHARSET_IS_UTF8 1 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | using namespace icu; 54 | 55 | #include "ZHfstOspeller.h" 56 | 57 | using hfst_ospell::ZHfstOspeller; 58 | using hfst_ospell::Transducer; 59 | 60 | typedef std::map valid_words_t; 61 | valid_words_t valid_words; 62 | 63 | struct word_t { 64 | size_t start, count; 65 | UnicodeString buffer; 66 | }; 67 | std::vector words(16); 68 | std::string buffer, wbuf; 69 | using Alt = std::pair; 70 | std::vector alts; 71 | std::unordered_set outputs; 72 | UnicodeString ubuffer, uc_buffer; 73 | size_t cw; 74 | 75 | bool verbatim = false; 76 | bool debug = false; 77 | hfst_ospell::Weight max_weight = -1.0; 78 | hfst_ospell::Weight beam = -1.0; 79 | float time_cutoff = 6.0; 80 | bool uc_first = false; 81 | bool uc_all = true; 82 | 83 | bool find_alternatives(ZHfstOspeller& speller, size_t suggs) { 84 | outputs.clear(); 85 | alts.clear(); 86 | 87 | // Gather corrections from all the tried variants, starting with verbatim and increasing mangling from there 88 | for (size_t k=0 ; k < cw && alts.size() 0.0 && corrections.top().second > max_weight) { 100 | break; 101 | } 102 | auto w = corrections.top().second * (1.0 + k/10.0); 103 | 104 | buffer.clear(); 105 | if (k != 0) { 106 | words[0].buffer.tempSubString(0, words[k].start).toUTF8String(buffer); 107 | } 108 | if (uc_all) { 109 | 
UnicodeString::fromUTF8(corrections.top().first).toUpper().toUTF8String(buffer); 110 | } 111 | else if (uc_first) { 112 | uc_buffer.setTo(UnicodeString::fromUTF8(corrections.top().first)); 113 | ubuffer.setTo(uc_buffer, 0, 1); 114 | ubuffer.toUpper(); 115 | ubuffer.append(uc_buffer, 1, uc_buffer.length()-1); 116 | ubuffer.toUTF8String(buffer); 117 | } 118 | else { 119 | buffer.append(corrections.top().first); 120 | } 121 | if (k != 0) { 122 | words[0].buffer.tempSubString(words[k].start + words[k].count).toUTF8String(buffer); 123 | } 124 | 125 | if (debug) { 126 | wbuf.resize(64); 127 | wbuf.resize(sprintf(&wbuf[0], " (%.2f;%zu)", corrections.top().second, k)); 128 | buffer += wbuf; 129 | } 130 | 131 | if (outputs.count(buffer) == 0) { 132 | alts.push_back({w, buffer}); 133 | std::sort(alts.begin(), alts.end()); 134 | while (alts.size() > suggs) { 135 | alts.pop_back(); 136 | } 137 | } 138 | outputs.insert(buffer); 139 | corrections.pop(); 140 | } 141 | } 142 | 143 | if (!alts.empty()) { 144 | std::cout << "&"; 145 | for (auto& alt : alts) { 146 | std::cout << "\t" << alt.second; 147 | } 148 | std::cout << std::endl; 149 | return true; 150 | } 151 | 152 | return false; 153 | } 154 | 155 | bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs) { 156 | ubuffer.setTo(UnicodeString::fromUTF8(word)); 157 | 158 | if (word.size() == 13 && word[5] == 'D' && word == "nuvviDspeller") { 159 | uc_first = false; 160 | uc_all = false; 161 | words[0].start = 0; 162 | words[0].count = ubuffer.length(); 163 | words[0].buffer = ubuffer; 164 | cw = 1; 165 | return false; 166 | } 167 | 168 | uc_first = false; 169 | uc_all = true; 170 | bool has_letters = false; 171 | for (int32_t i=0 ; i 1 && !verbatim) { 200 | size_t count = cchUse; 201 | while (count && !u_isalnum(pwsz[ichStart+count-1])) { 202 | --count; 203 | } 204 | if (count != cchUse) { 205 | // If the input ended with non-alphanumerics, test input with non-alphanumerics trimmed from the end 206 | 
words[cw].buffer.remove(); 207 | words[cw].start = ichStart; 208 | words[cw].count = count; 209 | words[cw].buffer.append(pwsz, words[cw].start, words[cw].count); 210 | ++cw; 211 | } 212 | 213 | size_t start = ichStart, count2 = cchUse; 214 | while (start < ichStart+cchUse && !u_isalnum(pwsz[start])) { 215 | ++start; 216 | --count2; 217 | } 218 | if (start != ichStart) { 219 | // If the input started with non-alphanumerics, test input with non-alphanumerics trimmed from the start 220 | words[cw].buffer.remove(); 221 | words[cw].start = start; 222 | words[cw].count = count2; 223 | words[cw].buffer.append(pwsz, words[cw].start, words[cw].count); 224 | ++cw; 225 | } 226 | 227 | if (start != ichStart && count != cchUse) { 228 | // If the input both started and ended with non-alphanumerics, test input with non-alphanumerics trimmed from both sides 229 | words[cw].buffer.remove(); 230 | words[cw].start = start; 231 | words[cw].count = count - (cchUse - count2); 232 | words[cw].buffer.append(pwsz, words[cw].start, words[cw].count); 233 | ++cw; 234 | } 235 | } 236 | 237 | for (size_t i=0, e=cw ; isecond = itl->second; 265 | it = itl; 266 | } 267 | else { 268 | valid = speller.spell(buffer); 269 | it->second = valid; // Also mark the original mixed case variant as whatever the first-lower one was 270 | it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first; 271 | } 272 | } 273 | } 274 | 275 | if (it->second == true) { 276 | return true; 277 | } 278 | } 279 | 280 | return false; 281 | } 282 | 283 | int zhfst_spell(const char* zhfst_filename) { 284 | ZHfstOspeller speller; 285 | try { 286 | if (debug) { 287 | std::cout << "@@ Loading " << zhfst_filename << " with args max-weight=" << max_weight << ", beam=" << beam << ", time-cutoff=" << time_cutoff << std::endl; 288 | } 289 | speller.read_zhfst(zhfst_filename); 290 | speller.set_weight_limit(max_weight); 291 | speller.set_beam(beam); 292 | speller.set_time_cutoff(time_cutoff); 293 | } 294 | catch 
(hfst_ospell::ZHfstMetaDataParsingError zhmdpe) { 295 | fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", zhfst_filename, zhmdpe.what()); 296 | return EXIT_FAILURE; 297 | } 298 | catch (hfst_ospell::ZHfstZipReadingError zhzre) { 299 | fprintf(stderr, "cannot read zhfst archive %s:\n%s.\n", zhfst_filename, zhzre.what()); 300 | return EXIT_FAILURE; 301 | } 302 | catch (hfst_ospell::ZHfstXmlParsingError zhxpe) { 303 | fprintf(stderr, "Cannot finish reading index.xml from %s:\n%s.\n", zhfst_filename, zhxpe.what()); 304 | return EXIT_FAILURE; 305 | } 306 | 307 | std::cout << "@@ hfst-ospell-office is alive" << std::endl; 308 | 309 | std::string line; 310 | std::string word; 311 | std::istringstream ss; 312 | while (std::getline(std::cin, line)) { 313 | while (!line.empty() && std::isspace(line[line.size()-1])) { 314 | line.resize(line.size()-1); 315 | } 316 | if (line.empty()) { 317 | continue; 318 | } 319 | 320 | if (line.size() >= 5 && line[0] == '$' && line[1] == '$' && line[3] == ' ') { 321 | if (line[2] == 'd' && isdigit(line[4]) && line.size() == 5) { 322 | debug = (line[4] != '0'); 323 | std::cout << "@@ Option debug changed to " << debug << std::endl; 324 | continue; 325 | } 326 | if (line[2] == 'T' && isdigit(line[4]) && line.size() == 5) { 327 | verbatim = (line[4] != '0'); 328 | std::cout << "@@ Option verbatim changed to " << verbatim << std::endl; 329 | continue; 330 | } 331 | if (line[2] == 'w' && isdigit(line[4])) { 332 | max_weight = std::stof(&line[4]); 333 | speller.set_weight_limit(max_weight); 334 | std::cout << "@@ Option max-weight changed to " << max_weight << std::endl; 335 | continue; 336 | } 337 | if (line[2] == 'b' && isdigit(line[4])) { 338 | beam = std::stof(&line[4]); 339 | speller.set_beam(beam); 340 | std::cout << "@@ Option beam changed to " << beam << std::endl; 341 | continue; 342 | } 343 | if (line[2] == 't' && isdigit(line[4])) { 344 | time_cutoff = std::stof(&line[4]); 345 | speller.set_time_cutoff(time_cutoff); 346 
| std::cout << "@@ Option time-cutoff changed to " << time_cutoff << std::endl; 347 | continue; 348 | } 349 | } 350 | 351 | // Just in case anyone decides to use the speller for a minor eternity 352 | if (valid_words.size() > 20480) { 353 | valid_words.clear(); 354 | } 355 | 356 | ss.clear(); 357 | ss.str(line); 358 | size_t suggs = 0; 359 | char c = 0; 360 | if (!(ss >> suggs) || !ss.get(c) || !std::getline(ss, line)) { 361 | std::cout << "!" << std::endl; 362 | continue; 363 | } 364 | 365 | if (is_valid_word(speller, line, suggs)) { 366 | std::cout << "*" << std::endl; 367 | continue; 368 | } 369 | 370 | if (!suggs || !find_alternatives(speller, suggs)) { 371 | std::cout << "#" << std::endl; 372 | } 373 | } 374 | return EXIT_SUCCESS; 375 | } 376 | 377 | void print_help() { 378 | std::cout 379 | << "Usage: hfst-ospell [options] zhfst-archive\n" 380 | << "\n" 381 | << " -h, --help Shows this help\n" 382 | << " -d, --debug Debug output with weights attached to results\n" 383 | << " -T, --verbatim Disables case-folding and non-alphanumeric trimming\n" 384 | << " -w, --max-weight=W Suppress corrections with weights above W\n" 385 | << " -b, --beam=W Suppress corrections worse than best candidate by more than W\n" 386 | << " -t, --time-cutoff=T Stop trying to find better corrections after T seconds; defaults to 6.0\n" 387 | << std::flush; 388 | } 389 | 390 | int main(int argc, char **argv) { 391 | UErrorCode status = U_ZERO_ERROR; 392 | u_init(&status); 393 | if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { 394 | std::cerr << "Error: Cannot initialize ICU. 
Status = " << u_errorName(status) << std::endl; 395 | return -1; 396 | } 397 | 398 | ucnv_setDefaultName("UTF-8"); 399 | uloc_setDefault("en_US_POSIX", &status); 400 | 401 | struct option long_options[] = 402 | { 403 | {"help", no_argument, 0, 'h'}, 404 | {"debug", no_argument, 0, 'd'}, 405 | {"verbatim", no_argument, 0, 'T'}, 406 | {"max-weight", required_argument, 0, 'w'}, 407 | {"beam", required_argument, 0, 'b'}, 408 | {"time-cutoff", required_argument, 0, 't'}, 409 | {0, 0, 0, 0 } 410 | }; 411 | 412 | int c = 0; 413 | while (true) { 414 | int option_index = 0; 415 | c = getopt_long(argc, argv, "hdTw:b:t:", long_options, &option_index); 416 | 417 | if (c == -1) { 418 | break; 419 | } 420 | 421 | switch (c) { 422 | case 'h': 423 | print_help(); 424 | return EXIT_SUCCESS; 425 | 426 | case 'd': 427 | debug = true; 428 | break; 429 | 430 | case 'T': 431 | verbatim = true; 432 | break; 433 | 434 | case 'w': 435 | max_weight = std::stof(optarg); 436 | break; 437 | 438 | case 'b': 439 | beam = std::stof(optarg); 440 | break; 441 | 442 | case 't': 443 | time_cutoff = std::stof(optarg); 444 | break; 445 | } 446 | } 447 | 448 | if (optind >= argc) { 449 | throw std::invalid_argument("Must pass a zhfst as argument"); 450 | } 451 | 452 | std::cerr << std::fixed << std::setprecision(2); 453 | std::cout << std::fixed << std::setprecision(2); 454 | int rv = zhfst_spell(argv[optind]); 455 | 456 | u_cleanup(); 457 | return rv; 458 | } 459 | -------------------------------------------------------------------------------- /ol-exceptions.h: -------------------------------------------------------------------------------- 1 | #ifndef _OL_EXCEPTIONS_H 2 | #define _OL_EXCEPTIONS_H 3 | 4 | #include "hfstol-stdafx.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace hfst_ospell 10 | { 11 | 12 | // This structure is inherited from for each exception. Taken from HFST library 13 | // code. 14 | //! @brief Top level exception class for ospell related errors. 15 | 16 | //! 
//! Ospell exceptions can hold basic back-track information for programmer as
//! well as human readable explanation.
struct OspellException
{
    std::string name; //!< short description of exception
    std::string file; //!< file name of exception
    size_t line;      //!< line number of exception

    //!
    //! construct an empty exception: no name, no file, line 0
    OspellException(void): line(0) {}

    //!
    //! construct exception with name, file and location
    OspellException(const std::string &name,const std::string &file,size_t line):
        name(name),
        file(file),
        line(line)
    {}

    //!
    //! create string representation of exception for output
    std::string operator() (void) const
    {
        std::ostringstream o;
        o << "Exception: "<< name << " in file: "
          << file << " on line: " << line;
        return o.str();
    }
    //!
    //! create char array representation of exception for output,
    //! formatted as "file:line:name".
    //!
    //! The returned pointer refers to storage owned by this object: it stays
    //! valid until the next call to what() on it or until the object is
    //! destroyed, and must not be freed by the caller.
    const char* what() const
    {
        std::ostringstream o;
        o << file << ":" << line << ":" << name;
        // keep the text in the object instead of handing out a heap copy
        // that no caller ever released
        what_buffer = o.str();
        return what_buffer.c_str();
    }

private:
    //! backing storage for the pointer returned by what(); mutable so that
    //! what() can be called on const exceptions
    mutable std::string what_buffer;
};

// These macros are used instead of the normal exception facilities.
54 | 55 | #define HFSTOSPELL_THROW(E) throw E(#E,__FILE__,__LINE__) 56 | 57 | #define HFSTOSPELL_THROW_MESSAGE(E,M) throw E(std::string(#E)+": "+std::string(M)\ 58 | ,__FILE__,__LINE__) 59 | 60 | #define HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(CHILD) \ 61 | struct CHILD : public OspellException \ 62 | { CHILD(const std::string &name,const std::string &file,size_t line):\ 63 | OspellException(name,file,line) {}} 64 | 65 | #define HFST_CATCH(E) \ 66 | catch (const E &e) \ 67 | { \ 68 | std::cerr << e.file << ", line " << e.line << ": " << \ 69 | e() << std::endl; \ 70 | } 71 | 72 | // Now the exceptions themselves 73 | 74 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(HeaderParsingException); 75 | 76 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(AlphabetParsingException); 77 | 78 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(IndexTableReadingException); 79 | 80 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(TransitionTableReadingException); 81 | 82 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(UnweightedSpellerException); 83 | 84 | HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(TransducerTypeException); 85 | } // namespace 86 | #endif // _OL_EXCEPTIONS_H 87 | -------------------------------------------------------------------------------- /tests/acceptor.basic.txt: -------------------------------------------------------------------------------- 1 | 0 1 o o 2 | 1 2 l l 3 | 2 3 u u 4 | 3 4 t t 5 | 4 6 | 0 5 v e 7 | 0 5 s i 8 | -------------------------------------------------------------------------------- /tests/analyse-spell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! 
cat $srcdir/tests/test.strings | ./hfst-ospell -a $srcdir/tests/speller_analyser.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/analyser.default.txt: -------------------------------------------------------------------------------- 1 | 0 1 o o 2 | 0 6 v v 3 | 1 2 l l 4 | 2 3 u u 5 | 3 5 @_EPSILON_SYMBOL_@ +Use/-Spell 6 | 3 4 t t 7 | 4 5 @_EPSILON_SYMBOL_@ +N 8 | 5 9 | 6 7 e e 10 | 7 8 s s 11 | 8 9 i i 12 | 9 5 @_EPSILON_SYMBOL_@ +Use/SpellNoSugg 13 | -------------------------------------------------------------------------------- /tests/bad-errormodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! cat $srcdir/tests/test.strings | ./hfst-ospell -v $srcdir/tests/bad_errormodel.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/bad_errormodel.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/bad_errormodel.zhfst -------------------------------------------------------------------------------- /tests/basic-edit1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! 
cat $srcdir/tests/test.strings | ./hfst-ospell $srcdir/tests/speller_edit1.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/basic-zhfst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! cat $srcdir/tests/test.strings | ./hfst-ospell $srcdir/tests/speller_basic.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/basic_test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | qtz 5 | Example speller 6 | 7 | This example is for the automatic test suite of hfst-ospell. 8 | 9 | 1.5.73 10 | 2012-08-15 11 | Flammie 12 | 14 | 15 | 16 | Example dictionary 17 | Vuola lávlla 18 | Example dictionary recognises a word. 19 | 20 | Vuola, vuola mun aigon lási 21 | vuolas juhkaluvvat, 22 | vuola, vuola mun aigon lási 23 | vuolas mieladuvvat 24 | 25 | 26 | 27 | Sahtiwaari 28 | 29 | Example error model turns one word into another. 30 | 31 | 32 | errormodel.default.hfst 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/empty-descriptions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! 
cat $srcdir/tests/test.strings | ./hfst-ospell -v $srcdir/tests/empty_descriptions.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/empty-locale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! cat $srcdir/tests/test.strings | ./hfst-ospell -v $srcdir/tests/empty_locale.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/empty-titles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! cat $srcdir/tests/test.strings | ./hfst-ospell -v $srcdir/tests/empty_titles.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/empty-zhfst.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | rm -f empty.zhfst 5 | touch empty.zhfst 6 | if ! 
cat $srcdir/tests/test.strings | ./hfst-ospell -v empty.zhfst ; then 7 | exit 1 8 | fi 9 | else 10 | echo ./hfst-ospell not built 11 | exit 77 12 | fi 13 | 14 | -------------------------------------------------------------------------------- /tests/empty_descriptions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | qtz 5 | Example speller 6 | 7 | 1.5.73 8 | 2012-08-15 9 | Flammie 10 | 12 | 13 | 14 | Example dictionary 15 | Vuola lávlla 16 | 17 | 18 | 19 | Sahtiwaari 20 | 21 | 22 | errormodel.default.hfst 23 | 24 | 25 | -------------------------------------------------------------------------------- /tests/empty_descriptions.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/empty_descriptions.zhfst -------------------------------------------------------------------------------- /tests/empty_locale.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Example speller 6 | 7 | This example is for the automatic test suite of hfst-ospell. 8 | 9 | 1.5.73 10 | 2012-08-15 11 | Flammie 12 | 14 | 15 | 16 | Example dictionary 17 | Vuola lávlla 18 | Example dictionary recognises a word. 19 | 20 | Vuola, vuola mun aigon lási 21 | vuolas juhkaluvvat, 22 | vuola, vuola mun aigon lási 23 | vuolas mieladuvvat 24 | 25 | 26 | 27 | Sahtiwaari 28 | 29 | Example error model turns one word into another. 
30 | 31 | 32 | errormodel.default.hfst 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/empty_locale.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/empty_locale.zhfst -------------------------------------------------------------------------------- /tests/empty_titles.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | qtz 5 | 6 | <description> 7 | This example is for the automatic test suite of hfst-ospell. 8 | </description> 9 | <version vcsrev="33459">1.5.73</version> 10 | <date>2012-08-15</date> 11 | <producer>Flammie</producer> 12 | <contact email="flammie@iki.fi" 13 | website="http://flammie.dyndns.org/"/> 14 | </info> 15 | <acceptor type="general" id="acceptor.default.hfst"> 16 | <title/> 17 | <description>Example dictionary recognises a word.</description> 18 | <description xml:lang="se"> 19 | Vuola, vuola mun aigon lási 20 | vuolas juhkaluvvat, 21 | vuola, vuola mun aigon lási 22 | vuolas mieladuvvat 23 | </description> 24 | </acceptor> 25 | <errmodel id="errormodel.default.hfst"> 26 | <title/> 27 | <description> 28 | Example error model turns one word into another. 
29 | </description> 30 | <type type="default"/> 31 | <model>errormodel.default.hfst</model> 32 | </errmodel> 33 | </hfstspeller> 34 | -------------------------------------------------------------------------------- /tests/empty_titles.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/empty_titles.zhfst -------------------------------------------------------------------------------- /tests/errmodel.basic.txt: -------------------------------------------------------------------------------- 1 | 0 1 v o 2 | 1 2 e l 3 | 2 3 s u 4 | 3 4 i t 5 | 4 6 | -------------------------------------------------------------------------------- /tests/errmodel.edit1.txt: -------------------------------------------------------------------------------- 1 | 0 0.0 2 | 0 0 a a 0.0 3 | 0 0 c c 0.0 4 | 0 0 b b 0.0 5 | 0 0 e e 0.0 6 | 0 0 d d 0.0 7 | 0 0 g g 0.0 8 | 0 0 f f 0.0 9 | 0 0 i i 0.0 10 | 0 0 h h 0.0 11 | 0 0 k k 0.0 12 | 0 0 j j 0.0 13 | 0 0 m m 0.0 14 | 0 0 l l 0.0 15 | 0 0 o o 0.0 16 | 0 0 n n 0.0 17 | 0 0 q q 0.0 18 | 0 0 p p 0.0 19 | 0 0 s s 0.0 20 | 0 0 r r 0.0 21 | 0 0 u u 0.0 22 | 0 0 t t 0.0 23 | 0 0 v v 0.0 24 | 0 0 y y 0.0 25 | 0 0 x x 0.0 26 | 0 0 z z 0.0 27 | 0 1 c u 1.0 28 | 0 1 x f 1.0 29 | 0 1 i h 1.0 30 | 0 1 j v 1.0 31 | 0 1 @0@ x 1.0 32 | 0 1 h s 1.0 33 | 0 1 n o 1.0 34 | 0 1 z h 1.0 35 | 0 1 k b 1.0 36 | 0 1 i y 1.0 37 | 0 1 d q 1.0 38 | 0 1 t z 1.0 39 | 0 1 o x 1.0 40 | 0 1 d f 1.0 41 | 0 1 k @0@ 1.0 42 | 0 1 c n 1.0 43 | 0 1 j z 1.0 44 | 0 1 @0@ q 1.0 45 | 0 1 r p 1.0 46 | 0 1 e z 1.0 47 | 0 1 k y 1.0 48 | 0 1 a q 1.0 49 | 0 1 q t 1.0 50 | 0 1 g p 1.0 51 | 0 1 h @0@ 1.0 52 | 0 1 f c 1.0 53 | 0 1 y i 1.0 54 | 0 1 u v 1.0 55 | 0 1 v d 1.0 56 | 0 1 h v 1.0 57 | 0 1 f r 1.0 58 | 0 1 t k 1.0 59 | 0 1 j x 1.0 60 | 0 1 @0@ j 1.0 61 | 0 1 m i 1.0 62 | 0 1 r i 1.0 63 | 0 1 c q 1.0 64 | 0 1 x j 1.0 65 | 0 1 k p 1.0 66 | 0 1 y d 1.0 67 | 0 
1 a s 1.0 68 | 0 1 q v 1.0 69 | 0 1 z l 1.0 70 | 0 1 o u 1.0 71 | 0 1 m h 1.0 72 | 0 1 q r 1.0 73 | 0 1 s k 1.0 74 | 0 1 x k 1.0 75 | 0 1 y @0@ 1.0 76 | 0 1 e g 1.0 77 | 0 1 y v 1.0 78 | 0 1 n t 1.0 79 | 0 1 c j 1.0 80 | 0 1 t i 1.0 81 | 0 1 y p 1.0 82 | 0 1 @0@ c 1.0 83 | 0 1 r b 1.0 84 | 0 1 c x 1.0 85 | 0 1 z n 1.0 86 | 0 1 u s 1.0 87 | 0 1 i n 1.0 88 | 0 1 e f 1.0 89 | 0 1 a u 1.0 90 | 0 1 q h 1.0 91 | 0 1 g t 1.0 92 | 0 1 u r 1.0 93 | 0 1 q y 1.0 94 | 0 1 h z 1.0 95 | 0 1 p g 1.0 96 | 0 1 f n 1.0 97 | 0 1 x n 1.0 98 | 0 1 j n 1.0 99 | 0 1 t f 1.0 100 | 0 1 q j 1.0 101 | 0 1 s o 1.0 102 | 0 1 k j 1.0 103 | 0 1 s y 1.0 104 | 0 1 i a 1.0 105 | 0 1 z r 1.0 106 | 0 1 d y 1.0 107 | 0 1 d n 1.0 108 | 0 1 x o 1.0 109 | 0 1 e c 1.0 110 | 0 1 c f 1.0 111 | 0 1 y a 1.0 112 | 0 1 @_UNKNOWN_SYMBOL_@ x 1.0 113 | 0 1 c t 1.0 114 | 0 1 s c 1.0 115 | 0 1 @0@ y 1.0 116 | 0 1 p t 1.0 117 | 0 1 y b 1.0 118 | 0 1 r x 1.0 119 | 0 1 e b 1.0 120 | 0 1 z i 1.0 121 | 0 1 k a 1.0 122 | 0 1 q l 1.0 123 | 0 1 g h 1.0 124 | 0 1 i c 1.0 125 | 0 1 y m 1.0 126 | 0 1 p u 1.0 127 | 0 1 f j 1.0 128 | 0 1 t s 1.0 129 | 0 1 j p 1.0 130 | 0 1 @0@ r 1.0 131 | 0 1 m a 1.0 132 | 0 1 r q 1.0 133 | 0 1 x r 1.0 134 | 0 1 n a 1.0 135 | 0 1 j b 1.0 136 | 0 1 k x 1.0 137 | 0 1 q n 1.0 138 | 0 1 z d 1.0 139 | 0 1 m v 1.0 140 | 0 1 s u 1.0 141 | 0 1 z v 1.0 142 | 0 1 q g 1.0 143 | 0 1 x s 1.0 144 | 0 1 n b 1.0 145 | 0 1 e o 1.0 146 | 0 1 i r 1.0 147 | 0 1 c b 1.0 148 | 0 1 t q 1.0 149 | 0 1 j y 1.0 150 | 0 1 @0@ k 1.0 151 | 0 1 @_UNKNOWN_SYMBOL_@ z 1.0 152 | 0 1 r j 1.0 153 | 0 1 c p 1.0 154 | 0 1 t @0@ 1.0 155 | 0 1 p z 1.0 156 | 0 1 e n 1.0 157 | 0 1 o f 1.0 158 | 0 1 @_UNKNOWN_SYMBOL_@ t 1.0 159 | 0 1 g l 1.0 160 | 0 1 @_UNKNOWN_SYMBOL_@ s 1.0 161 | 0 1 d c 1.0 162 | 0 1 u z 1.0 163 | 0 1 v x 1.0 164 | 0 1 d h 1.0 165 | 0 1 l y 1.0 166 | 0 1 h b 1.0 167 | 0 1 i t 1.0 168 | 0 1 @_UNKNOWN_SYMBOL_@ m 1.0 169 | 0 1 @0@ d 1.0 170 | 0 1 m s 1.0 171 | 0 1 r c 1.0 172 | 0 1 j t 1.0 173 | 0 1 k v 1.0 174 | 0 1 x v 
1.0 175 | 0 1 j f 1.0 176 | 0 1 t n 1.0 177 | 0 1 o d 1.0 178 | 0 1 p a 1.0 179 | 0 1 h c 1.0 180 | 0 1 q b 1.0 181 | 0 1 z x 1.0 182 | 0 1 s q 1.0 183 | 0 1 x a 1.0 184 | 0 1 p b 1.0 185 | 0 1 j g 1.0 186 | 0 1 d a 1.0 187 | 0 1 q s 1.0 188 | 0 1 e k 1.0 189 | 0 1 z m 1.0 190 | 0 1 @_UNKNOWN_SYMBOL_@ p 1.0 191 | 0 1 p l 1.0 192 | 0 1 y j 1.0 193 | 0 1 o c 1.0 194 | 0 1 e j 1.0 195 | 0 1 z a 1.0 196 | 0 1 k i 1.0 197 | 0 1 q d 1.0 198 | 0 1 z b 1.0 199 | 0 1 s x 1.0 200 | 0 1 o @0@ 1.0 201 | 0 1 x g 1.0 202 | 0 1 i k 1.0 203 | 0 1 v t 1.0 204 | 0 1 h p 1.0 205 | 0 1 p m 1.0 206 | 0 1 q u 1.0 207 | 0 1 l u 1.0 208 | 0 1 h f 1.0 209 | 0 1 f b 1.0 210 | 0 1 x d 1.0 211 | 0 1 j h 1.0 212 | 0 1 @0@ z 1.0 213 | 0 1 r y 1.0 214 | 0 1 h q 1.0 215 | 0 1 x z 1.0 216 | 0 1 n i 1.0 217 | 0 1 o a 1.0 218 | 0 1 h g 1.0 219 | 0 1 q f 1.0 220 | 0 1 s n 1.0 221 | 0 1 m n 1.0 222 | 0 1 s m 1.0 223 | 0 1 x e 1.0 224 | 0 1 d @0@ 1.0 225 | 0 1 n j 1.0 226 | 0 1 l s 1.0 227 | 0 1 i z 1.0 228 | 0 1 c z 1.0 229 | 0 1 t y 1.0 230 | 0 1 j q 1.0 231 | 0 1 @0@ s 1.0 232 | 0 1 @_UNKNOWN_SYMBOL_@ l 1.0 233 | 0 1 k g 1.0 234 | 0 1 p i 1.0 235 | 0 1 t h 1.0 236 | 0 1 o n 1.0 237 | 0 1 d t 1.0 238 | 0 1 v m 1.0 239 | 0 1 s t 1.0 240 | 0 1 g d 1.0 241 | 0 1 p s 1.0 242 | 0 1 d k 1.0 243 | 0 1 v p 1.0 244 | 0 1 h t 1.0 245 | 0 1 q i 1.0 246 | 0 1 l q 1.0 247 | 0 1 h j 1.0 248 | 0 1 @0@ l 1.0 249 | 0 1 m k 1.0 250 | 0 1 r k 1.0 251 | 0 1 x h 1.0 252 | 0 1 j l 1.0 253 | 0 1 h u 1.0 254 | 0 1 a @0@ 1.0 255 | 0 1 t v 1.0 256 | 0 1 o l 1.0 257 | 0 1 d r 1.0 258 | 0 1 h k 1.0 259 | 0 1 v o 1.0 260 | 0 1 z p 1.0 261 | 0 1 s i 1.0 262 | 0 1 x i 1.0 263 | 0 1 d i 1.0 264 | 0 1 q k 1.0 265 | 0 1 l o 1.0 266 | 0 1 y k 1.0 267 | 0 1 @0@ e 1.0 268 | 0 1 r d 1.0 269 | 0 1 c v 1.0 270 | 0 1 j u 1.0 271 | 0 1 k u 1.0 272 | 0 1 p v 1.0 273 | 0 1 @_UNKNOWN_SYMBOL_@ h 1.0 274 | 0 1 p d 1.0 275 | 0 1 y r 1.0 276 | 0 1 o k 1.0 277 | 0 1 z y 1.0 278 | 0 1 v i 1.0 279 | 0 1 s p 1.0 280 | 0 1 z @0@ 1.0 281 | 0 1 t g 1.0 
282 | 0 1 i l 1.0 283 | 0 1 c o 1.0 284 | 0 1 h x 1.0 285 | 0 1 p e 1.0 286 | 0 1 q m 1.0 287 | 0 1 l m 1.0 288 | 0 1 m u 1.0 289 | 0 1 h n 1.0 290 | 0 1 q z 1.0 291 | 0 1 x l 1.0 292 | 0 1 g z 1.0 293 | 0 1 h y 1.0 294 | 0 1 n q 1.0 295 | 0 1 o i 1.0 296 | 0 1 k h 1.0 297 | 0 1 m t 1.0 298 | 0 1 y h 1.0 299 | 0 1 h o 1.0 300 | 0 1 v k 1.0 301 | 0 1 z t 1.0 302 | 0 1 m f 1.0 303 | 0 1 s e 1.0 304 | 0 1 x m 1.0 305 | 0 1 y e 1.0 306 | 0 1 t e 1.0 307 | 0 1 n r 1.0 308 | 0 1 l k 1.0 309 | 0 1 i b 1.0 310 | 0 1 @_UNKNOWN_SYMBOL_@ v 1.0 311 | 0 1 c r 1.0 312 | 0 1 j i 1.0 313 | 0 1 y t 1.0 314 | 0 1 f y 1.0 315 | 0 1 @_UNKNOWN_SYMBOL_@ d 1.0 316 | 0 1 l z 1.0 317 | 0 1 z k 1.0 318 | 0 1 k o 1.0 319 | 0 1 t p 1.0 320 | 0 1 o v 1.0 321 | 0 1 v e 1.0 322 | 0 1 s l 1.0 323 | 0 1 p k 1.0 324 | 0 1 y g 1.0 325 | 0 1 c k 1.0 326 | 0 1 q a 1.0 327 | 0 1 l i 1.0 328 | 0 1 i d 1.0 329 | 0 1 @0@ t 1.0 330 | 0 1 m c 1.0 331 | 0 1 r s 1.0 332 | 0 1 x p 1.0 333 | 0 1 n c 1.0 334 | 0 1 j d 1.0 335 | 0 1 k f 1.0 336 | 0 1 l x 1.0 337 | 0 1 i u 1.0 338 | 0 1 d u 1.0 339 | 0 1 v n 1.0 340 | 0 1 o t 1.0 341 | 0 1 d z 1.0 342 | 0 1 a h 1.0 343 | 0 1 v g 1.0 344 | 0 1 s a 1.0 345 | 0 1 x q 1.0 346 | 0 1 q c 1.0 347 | 0 1 l g 1.0 348 | 0 1 @0@ m 1.0 349 | 0 1 p x 1.0 350 | 0 1 @_UNKNOWN_SYMBOL_@ r 1.0 351 | 0 1 b p 1.0 352 | 0 1 j m 1.0 353 | 0 1 p n 1.0 354 | 0 1 f u 1.0 355 | 0 1 l v 1.0 356 | 0 1 z o 1.0 357 | 0 1 y z 1.0 358 | 0 1 o s 1.0 359 | 0 1 s z 1.0 360 | 0 1 z q 1.0 361 | 0 1 a j 1.0 362 | 0 1 v a 1.0 363 | 0 1 s h 1.0 364 | 0 1 b q 1.0 365 | 0 1 t o 1.0 366 | 0 1 c g 1.0 367 | 0 1 q e 1.0 368 | 0 1 @0@ f 1.0 369 | 0 1 l e 1.0 370 | 0 1 r e 1.0 371 | 0 1 k t 1.0 372 | 0 1 b r 1.0 373 | 0 1 q o 1.0 374 | 0 1 x t 1.0 375 | 0 1 l t 1.0 376 | 0 1 g r 1.0 377 | 0 1 h a 1.0 378 | 0 1 n y 1.0 379 | 0 1 o q 1.0 380 | 0 1 m l 1.0 381 | 0 1 v j 1.0 382 | 0 1 a l 1.0 383 | 0 1 v c 1.0 384 | 0 1 b s 1.0 385 | 0 1 x u 1.0 386 | 0 1 n h 1.0 387 | 0 1 t m 1.0 388 | 0 1 u e 1.0 389 | 0 1 n z 
1.0 390 | 0 1 l c 1.0 391 | 0 1 @_UNKNOWN_SYMBOL_@ y 1.0 392 | 0 1 i j 1.0 393 | 0 1 @_UNKNOWN_SYMBOL_@ n 1.0 394 | 0 1 b t 1.0 395 | 0 1 @_UNKNOWN_SYMBOL_@ g 1.0 396 | 0 1 f q 1.0 397 | 0 1 g y 1.0 398 | 0 1 l r 1.0 399 | 0 1 u d 1.0 400 | 0 1 z c 1.0 401 | 0 1 @_UNKNOWN_SYMBOL_@ u 1.0 402 | 0 1 s v 1.0 403 | 0 1 t x 1.0 404 | 0 1 z u 1.0 405 | 0 1 a n 1.0 406 | 0 1 s d 1.0 407 | 0 1 b u 1.0 408 | 0 1 p c 1.0 409 | 0 1 @_UNKNOWN_SYMBOL_@ o 1.0 410 | 0 1 v @0@ 1.0 411 | 0 1 h d 1.0 412 | 0 1 r t 1.0 413 | 0 1 l a 1.0 414 | 0 1 p j 1.0 415 | 0 1 c @0@ 1.0 416 | 0 1 j o 1.0 417 | 0 1 b v 1.0 418 | 0 1 n k 1.0 419 | 0 1 r z 1.0 420 | 0 1 k n 1.0 421 | 0 1 l p 1.0 422 | 0 1 g v 1.0 423 | 0 1 h e 1.0 424 | 0 1 j e 1.0 425 | 0 1 e u 1.0 426 | 0 1 v f 1.0 427 | 0 1 d b 1.0 428 | 0 1 x y 1.0 429 | 0 1 u a 1.0 430 | 0 1 e t 1.0 431 | 0 1 j s 1.0 432 | 0 1 @0@ u 1.0 433 | 0 1 @_UNKNOWN_SYMBOL_@ j 1.0 434 | 0 1 b x 1.0 435 | 0 1 @_UNKNOWN_SYMBOL_@ c 1.0 436 | 0 1 k e 1.0 437 | 0 1 p f 1.0 438 | 0 1 f m 1.0 439 | 0 1 l n 1.0 440 | 0 1 m @0@ 1.0 441 | 0 1 s r 1.0 442 | 0 1 z g 1.0 443 | 0 1 a b 1.0 444 | 0 1 v y 1.0 445 | 0 1 b y 1.0 446 | 0 1 @0@ n 1.0 447 | 0 1 m e 1.0 448 | 0 1 r m 1.0 449 | 0 1 n e 1.0 450 | 0 1 b z 1.0 451 | 0 1 g s 1.0 452 | 0 1 m z 1.0 453 | 0 1 g j 1.0 454 | 0 1 h i 1.0 455 | 0 1 o y 1.0 456 | 0 1 e q 1.0 457 | 0 1 m d 1.0 458 | 0 1 f @0@ 1.0 459 | 0 1 a d 1.0 460 | 0 1 n f 1.0 461 | 0 1 p q 1.0 462 | 0 1 n p 1.0 463 | 0 1 s f 1.0 464 | 0 1 t u 1.0 465 | 0 1 u m 1.0 466 | 0 1 @0@ g 1.0 467 | 0 1 r f 1.0 468 | 0 1 e p 1.0 469 | 0 1 @_UNKNOWN_SYMBOL_@ q 1.0 470 | 0 1 k s 1.0 471 | 0 1 @_UNKNOWN_SYMBOL_@ f 1.0 472 | 0 1 t d 1.0 473 | 0 1 o b 1.0 474 | 0 1 f i 1.0 475 | 0 1 g q 1.0 476 | 0 1 l j 1.0 477 | 0 1 u l 1.0 478 | 0 1 d g 1.0 479 | 0 1 d l 1.0 480 | 0 1 a f 1.0 481 | 0 1 f x 1.0 482 | 0 1 v u 1.0 483 | 0 1 c m 1.0 484 | 0 1 i p 1.0 485 | 0 1 y s 1.0 486 | 0 1 h l 1.0 487 | 0 1 t b 1.0 488 | 0 1 n s 1.0 489 | 0 1 l h 1.0 490 | 0 1 g n 1.0 491 | 0 1 
h m 1.0 492 | 0 1 i e 1.0 493 | 0 1 d e 1.0 494 | 0 1 e @0@ 1.0 495 | 0 1 d j 1.0 496 | 0 1 a x 1.0 497 | 0 1 u i 1.0 498 | 0 1 a i 1.0 499 | 0 1 j k 1.0 500 | 0 1 p h 1.0 501 | 0 1 y f 1.0 502 | 0 1 @_UNKNOWN_SYMBOL_@ b 1.0 503 | 0 1 k m 1.0 504 | 0 1 f e 1.0 505 | 0 1 g u 1.0 506 | 0 1 l f 1.0 507 | 0 1 u h 1.0 508 | 0 1 i g 1.0 509 | 0 1 q @0@ 1.0 510 | 0 1 s j 1.0 511 | 0 1 a z 1.0 512 | 0 1 f t 1.0 513 | 0 1 v q 1.0 514 | 0 1 b a 1.0 515 | 0 1 c i 1.0 516 | 0 1 a k 1.0 517 | 0 1 @0@ v 1.0 518 | 0 1 r u 1.0 519 | 0 1 n m 1.0 520 | 0 1 k d 1.0 521 | 0 1 g k 1.0 522 | 0 1 l d 1.0 523 | 0 1 m r 1.0 524 | 0 1 j @0@ 1.0 525 | 0 1 g b 1.0 526 | 0 1 e y 1.0 527 | 0 1 v z 1.0 528 | 0 1 v s 1.0 529 | 0 1 b c 1.0 530 | 0 1 i v 1.0 531 | 0 1 n x 1.0 532 | 0 1 j a 1.0 533 | 0 1 @0@ o 1.0 534 | 0 1 a m 1.0 535 | 0 1 r n 1.0 536 | 0 1 e x 1.0 537 | 0 1 @_UNKNOWN_SYMBOL_@ i 1.0 538 | 0 1 t l 1.0 539 | 0 1 b d 1.0 540 | 0 1 o j 1.0 541 | 0 1 f a 1.0 542 | 0 1 g i 1.0 543 | 0 1 l b 1.0 544 | 0 1 u t 1.0 545 | 0 1 z s 1.0 546 | 0 1 d o 1.0 547 | 0 1 f p 1.0 548 | 0 1 b e 1.0 549 | 0 1 c e 1.0 550 | 0 1 i x 1.0 551 | 0 1 @0@ h 1.0 552 | 0 1 m o 1.0 553 | 0 1 r g 1.0 554 | 0 1 c s 1.0 555 | 0 1 a o 1.0 556 | 0 1 k r 1.0 557 | 0 1 z j 1.0 558 | 0 1 t j 1.0 559 | 0 1 b f 1.0 560 | 0 1 o h 1.0 561 | 0 1 d v 1.0 562 | 0 1 g o 1.0 563 | 0 1 g f 1.0 564 | 0 1 i m 1.0 565 | 0 1 d m 1.0 566 | 0 1 a p 1.0 567 | 0 1 c l 1.0 568 | 0 1 s g 1.0 569 | 0 1 u g 1.0 570 | 0 1 b g 1.0 571 | 0 1 @0@ a 1.0 572 | 0 1 u q 1.0 573 | 0 1 p r 1.0 574 | 0 1 e d 1.0 575 | 0 1 @_UNKNOWN_SYMBOL_@ e 1.0 576 | 0 1 f s 1.0 577 | 0 1 o g 1.0 578 | 0 1 u f 1.0 579 | 0 1 b h 1.0 580 | 0 1 g m 1.0 581 | 0 1 u p 1.0 582 | 0 1 i o 1.0 583 | 0 1 s b 1.0 584 | 0 1 a r 1.0 585 | 0 1 f l 1.0 586 | 0 1 i @0@ 1.0 587 | 0 1 m y 1.0 588 | 0 1 b i 1.0 589 | 0 1 c a 1.0 590 | 0 1 b @0@ 1.0 591 | 0 1 l @0@ 1.0 592 | 0 1 y u 1.0 593 | 0 1 a c 1.0 594 | 0 1 n u 1.0 595 | 0 1 o e 1.0 596 | 0 1 k l 1.0 597 | 0 1 y l 1.0 598 | 0 1 m 
x 1.0 599 | 0 1 b j 1.0 600 | 0 1 g c 1.0 601 | 0 1 m j 1.0 602 | 0 1 n d 1.0 603 | 0 1 e a 1.0 604 | 0 1 v r 1.0 605 | 0 1 a t 1.0 606 | 0 1 n v 1.0 607 | 0 1 c h 1.0 608 | 0 1 y o 1.0 609 | 0 1 u c 1.0 610 | 0 1 b k 1.0 611 | 0 1 v b 1.0 612 | 0 1 e v 1.0 613 | 0 1 a e 1.0 614 | 0 1 q x 1.0 615 | 0 1 r v 1.0 616 | 0 1 @_UNKNOWN_SYMBOL_@ a 1.0 617 | 0 1 k c 1.0 618 | 0 1 f o 1.0 619 | 0 1 u b 1.0 620 | 0 1 b l 1.0 621 | 0 1 o r 1.0 622 | 0 1 d p 1.0 623 | 0 1 r l 1.0 624 | 0 1 g a 1.0 625 | 0 1 y c 1.0 626 | 0 1 a v 1.0 627 | 0 1 f h 1.0 628 | 0 1 b m 1.0 629 | 0 1 @_UNKNOWN_SYMBOL_@ k 1.0 630 | 0 1 @0@ p 1.0 631 | 0 1 m g 1.0 632 | 0 1 r o 1.0 633 | 0 1 a g 1.0 634 | 0 1 n g 1.0 635 | 0 1 k z 1.0 636 | 0 1 @_UNKNOWN_SYMBOL_@ @0@ 1.0 637 | 0 1 i q 1.0 638 | 0 1 j c 1.0 639 | 0 1 t r 1.0 640 | 0 1 g @0@ 1.0 641 | 0 1 b n 1.0 642 | 0 1 o p 1.0 643 | 0 1 x @0@ 1.0 644 | 0 1 e s 1.0 645 | 0 1 e m 1.0 646 | 0 1 y n 1.0 647 | 0 1 c d 1.0 648 | 0 1 u o 1.0 649 | 0 1 b o 1.0 650 | 0 1 @0@ i 1.0 651 | 0 1 r h 1.0 652 | 0 1 e r 1.0 653 | 0 1 u y 1.0 654 | 0 1 k q 1.0 655 | 0 1 a y 1.0 656 | 0 1 e l 1.0 657 | 0 1 g x 1.0 658 | 0 1 s @0@ 1.0 659 | 0 1 i s 1.0 660 | 0 1 f k 1.0 661 | 0 1 z e 1.0 662 | 0 1 u n 1.0 663 | 0 1 v l 1.0 664 | 0 1 g e 1.0 665 | 0 1 u x 1.0 666 | 0 1 f z 1.0 667 | 0 1 t c 1.0 668 | 0 1 f d 1.0 669 | 0 1 @0@ b 1.0 670 | 0 1 m q 1.0 671 | 0 1 r a 1.0 672 | 0 1 c y 1.0 673 | 0 1 x b 1.0 674 | 0 1 j r 1.0 675 | 0 1 p @0@ 1.0 676 | 0 1 p y 1.0 677 | 0 1 y x 1.0 678 | 0 1 o m 1.0 679 | 0 1 m p 1.0 680 | 0 1 u @0@ 1.0 681 | 0 1 x c 1.0 682 | 0 1 m b 1.0 683 | 0 1 n l 1.0 684 | 0 1 t a 1.0 685 | 0 1 e i 1.0 686 | 0 1 u k 1.0 687 | 0 1 y q 1.0 688 | 0 1 i f 1.0 689 | 0 1 q p 1.0 690 | 0 1 e h 1.0 691 | 0 1 r @0@ 1.0 692 | 0 1 z f 1.0 693 | 0 1 f g 1.0 694 | 0 1 d s 1.0 695 | 0 1 u j 1.0 696 | 0 1 v h 1.0 697 | 0 1 o z 1.0 698 | 0 1 d x 1.0 699 | 0 1 n @0@ 1.0 700 | 0 1 h r 1.0 701 | 0 1 p o 1.0 702 | 0 1 f v 1.0 703 | 1 1 a a 0.0 704 | 1 1 c c 0.0 705 | 1 1 b 
b 0.0 706 | 1 1 e e 0.0 707 | 1 1 d d 0.0 708 | 1 1 g g 0.0 709 | 1 1 f f 0.0 710 | 1 1 i i 0.0 711 | 1 1 h h 0.0 712 | 1 1 k k 0.0 713 | 1 1 j j 0.0 714 | 1 1 m m 0.0 715 | 1 1 l l 0.0 716 | 1 1 o o 0.0 717 | 1 1 n n 0.0 718 | 1 1 q q 0.0 719 | 1 1 p p 0.0 720 | 1 1 s s 0.0 721 | 1 1 r r 0.0 722 | 1 1 u u 0.0 723 | 1 1 t t 0.0 724 | 1 1 v v 0.0 725 | 1 1 y y 0.0 726 | 1 1 x x 0.0 727 | 1 1 z z 0.0 728 | 1 0.0 729 | -------------------------------------------------------------------------------- /tests/errmodel.extrachars.txt: -------------------------------------------------------------------------------- 1 | 0 1 v z 2 | 1 2 e l 3 | 2 3 s u 4 | 3 4 q t 5 | 4 6 | -------------------------------------------------------------------------------- /tests/no-errormodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! cat $srcdir/tests/test.strings | ./hfst-ospell $srcdir/tests/no_errormodel.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/no_errmodel.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <hfstspeller dtdversion="1.0" hfstversion="3"> 3 | <info> 4 | <locale>qtz</locale> 5 | <title>Example speller 6 | 7 | This example is for the automatic test suite of hfst-ospell. 8 | 9 | 1.5.73 10 | 2012-08-15 11 | Flammie 12 | 14 | 15 | 16 | Example dictionary 17 | Vuola lávlla 18 | Example dictionary recognises a word. 
19 | 20 | Vuola, vuola mun aigon lási 21 | vuolas juhkaluvvat, 22 | vuola, vuola mun aigon lási 23 | vuolas mieladuvvat 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /tests/no_errormodel.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/no_errormodel.zhfst -------------------------------------------------------------------------------- /tests/speller_analyser.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/speller_analyser.zhfst -------------------------------------------------------------------------------- /tests/speller_basic.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/speller_basic.zhfst -------------------------------------------------------------------------------- /tests/speller_edit1.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/speller_edit1.zhfst -------------------------------------------------------------------------------- /tests/test.strings: -------------------------------------------------------------------------------- 1 | olut 2 | vesi 3 | sivolutesi 4 | olu 5 | ßþ”×\ 6 | -------------------------------------------------------------------------------- /tests/trailing-spaces.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if test -x ./hfst-ospell ; then 4 | if ! 
cat $srcdir/tests/test.strings | ./hfst-ospell -v $srcdir/tests/trailing_spaces.zhfst ; then 5 | exit 1 6 | fi 7 | else 8 | echo ./hfst-ospell not built 9 | exit 77 10 | fi 11 | 12 | -------------------------------------------------------------------------------- /tests/trailing_spaces.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | qtz 5 | Example speller 6 | 7 | This example is for the automatic test suite of hfst-ospell. 8 | 9 | 1.5.73 10 | 2012-08-15 11 | Flammie 12 | 15 | 16 | 18 | Example dictionary 19 | Vuola lávlla 20 | Example dictionary recognises a word. 21 | 22 | Vuola, vuola mun aigon lási 23 | vuolas juhkaluvvat, 24 | vuola, vuola mun aigon lási 25 | vuolas mieladuvvat 26 | 27 | 28 | 29 | Sahtiwaari 30 | 31 | Example error model turns one word into another. 32 | 33 | 34 | errormodel.default.hfst 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/trailing_spaces.zhfst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfst/hfst-ospell/85edd77e959df213d2e713cbd3ca0ff9a600f462/tests/trailing_spaces.zhfst -------------------------------------------------------------------------------- /windows-Makefile.am: -------------------------------------------------------------------------------- 1 | ## Process this file with automake to produce Makefile.in 2 | 3 | # Copyright 2010 University of Helsinki 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # to silence: 18 | # libtoolize: Consider adding `-I m4' to ACLOCAL_AMFLAGS in Makefile.am. 19 | ACLOCAL_AMFLAGS=-I m4 20 | 21 | # targets 22 | if EXTRA_DEMOS 23 | CONFERENCE_DEMOS=hfst-ospell-norvig hfst-ospell-fsmnlp-2012 hfst-ospell-cicling\ 24 | hfst-ospell-survey hfst-ospell-lrec2013 hfst-ispell 25 | endif # EXTRA_DEMOS 26 | 27 | bin_PROGRAMS=hfst-ospell $(CONFERENCE_DEMOS) 28 | #lib_LTLIBRARIES=libhfstospell.la 29 | man1_MANS=hfst-ospell.1 30 | 31 | PKG_LIBS= 32 | PKG_CXXFLAGS= 33 | 34 | if WANT_ARCHIVE 35 | PKG_LIBS+=$(LIBARCHIVE_LIBS) 36 | PKG_CXXFLAGS+=$(LIBARCHIVE_CFLAGS) 37 | endif 38 | 39 | #if WANT_LIBXMLPP 40 | #PKG_LIBS+=$(LIBXMLPP_LIBS) 41 | #PKG_CXXFLAGS+=$(LIBXMLPP_CFLAGS) 42 | #endif 43 | 44 | if WANT_TINYXML2 45 | PKG_LIBS+=$(TINYXML2_LIBS) 46 | PKG_CXXFLAGS+=$(TINYXML2_CFLAGS) 47 | endif 48 | 49 | # library parts 50 | #libhfstospell_la_SOURCES= 51 | 52 | #libhfstospell_la_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS) 53 | #libhfstospell_la_LDFLAGS=-no-undefined -version-info 4:0:0 \ 54 | $(PKG_LIBS) 55 | 56 | # link sample program against library here 57 | hfst_ospell_SOURCES=main.cc hfst-ol.cc ospell.cc \ 58 | ZHfstOspeller.cc ZHfstOspellerXmlMetadata.cc \ 59 | tinyxml2.cc \ 60 | libarchive/archive_acl.c \ 61 | libarchive/archive_acl_private.h \ 62 | libarchive/archive_check_magic.c \ 63 | libarchive/archive_crc32.h \ 64 | libarchive/archive_crypto.c \ 65 | libarchive/archive_crypto_private.h \ 66 | libarchive/archive_endian.h \ 67 | libarchive/archive_entry.c \ 68 | libarchive/archive_entry.h \ 69 | libarchive/archive_entry_copy_stat.c \ 70 | libarchive/archive_entry_link_resolver.c \ 71 | libarchive/archive_entry_locale.h \ 72 | libarchive/archive_entry_private.h \ 73 | libarchive/archive_entry_sparse.c \ 74 | libarchive/archive_entry_stat.c \ 75 | libarchive/archive_entry_strmode.c \ 76 | 
libarchive/archive_entry_xattr.c \ 77 | libarchive/archive_options.c \ 78 | libarchive/archive_options_private.h \ 79 | libarchive/archive_platform.h \ 80 | libarchive/archive_ppmd_private.h \ 81 | libarchive/archive_ppmd7.c \ 82 | libarchive/archive_ppmd7_private.h \ 83 | libarchive/archive_private.h \ 84 | libarchive/archive_rb.c \ 85 | libarchive/archive_rb.h \ 86 | libarchive/archive_read.c \ 87 | libarchive/archive_read_data_into_fd.c \ 88 | libarchive/archive_read_disk_entry_from_file.c \ 89 | libarchive/archive_read_disk_posix.c \ 90 | libarchive/archive_read_disk_private.h \ 91 | libarchive/archive_read_disk_set_standard_lookup.c \ 92 | libarchive/archive_read_extract.c \ 93 | libarchive/archive_read_open_fd.c \ 94 | libarchive/archive_read_open_file.c \ 95 | libarchive/archive_read_open_filename.c \ 96 | libarchive/archive_read_open_memory.c \ 97 | libarchive/archive_read_private.h \ 98 | libarchive/archive_read_set_options.c \ 99 | libarchive/archive_read_support_filter_all.c \ 100 | libarchive/archive_read_support_filter_compress.c \ 101 | libarchive/archive_read_support_filter_gzip.c \ 102 | libarchive/archive_read_support_filter_none.c \ 103 | libarchive/archive_read_support_filter_program.c \ 104 | libarchive/archive_read_support_filter_rpm.c \ 105 | libarchive/archive_read_support_filter_uu.c \ 106 | libarchive/archive_read_support_filter_xz.c \ 107 | libarchive/archive_read_support_format_7zip.c \ 108 | libarchive/archive_read_support_format_all.c \ 109 | libarchive/archive_read_support_format_ar.c \ 110 | libarchive/archive_read_support_format_by_code.c \ 111 | libarchive/archive_read_support_format_cab.c \ 112 | libarchive/archive_read_support_format_cpio.c \ 113 | libarchive/archive_read_support_format_empty.c \ 114 | libarchive/archive_read_support_format_iso9660.c \ 115 | libarchive/archive_read_support_format_lha.c \ 116 | libarchive/archive_read_support_format_mtree.c \ 117 | libarchive/archive_read_support_format_rar.c \ 118 | 
libarchive/archive_read_support_format_raw.c \ 119 | libarchive/archive_read_support_format_tar.c \ 120 | libarchive/archive_read_support_format_xar.c \ 121 | libarchive/archive_read_support_format_zip.c \ 122 | libarchive/archive_string.c \ 123 | libarchive/archive_string.h \ 124 | libarchive/archive_string_composition.h \ 125 | libarchive/archive_string_sprintf.c \ 126 | libarchive/archive_util.c \ 127 | libarchive/archive_virtual.c \ 128 | libarchive/archive_write.c \ 129 | libarchive/archive_write_disk_posix.c \ 130 | libarchive/archive_write_disk_private.h \ 131 | libarchive/archive_write_disk_set_standard_lookup.c \ 132 | libarchive/archive_write_open_fd.c \ 133 | libarchive/archive_write_open_file.c \ 134 | libarchive/archive_write_open_filename.c \ 135 | libarchive/archive_write_open_memory.c \ 136 | libarchive/archive_write_private.h \ 137 | libarchive/archive_write_add_filter_compress.c \ 138 | libarchive/archive_write_add_filter_gzip.c \ 139 | libarchive/archive_write_add_filter_none.c \ 140 | libarchive/archive_write_add_filter_program.c \ 141 | libarchive/archive_write_add_filter_xz.c \ 142 | libarchive/archive_write_set_format.c \ 143 | libarchive/archive_write_set_format_7zip.c \ 144 | libarchive/archive_write_set_format_ar.c \ 145 | libarchive/archive_write_set_format_by_name.c \ 146 | libarchive/archive_write_set_format_cpio.c \ 147 | libarchive/archive_write_set_format_cpio_newc.c \ 148 | libarchive/archive_write_set_format_iso9660.c \ 149 | libarchive/archive_write_set_format_mtree.c \ 150 | libarchive/archive_write_set_format_pax.c \ 151 | libarchive/archive_write_set_format_shar.c \ 152 | libarchive/archive_write_set_format_ustar.c \ 153 | libarchive/archive_write_set_format_gnutar.c \ 154 | libarchive/archive_write_set_format_xar.c \ 155 | libarchive/archive_write_set_format_zip.c \ 156 | libarchive/archive_write_set_options.c \ 157 | libarchive/config_freebsd.h \ 158 | libarchive/archive_read_support_filter_bzip2.c \ 159 | 
libarchive/archive_write_add_filter_bzip2.c \ 160 | libarchive/filter_fork.c \ 161 | libarchive/filter_fork.h \ 162 | libarchive/archive_entry_copy_bhfi.c \ 163 | libarchive/archive_read_disk_windows.c \ 164 | libarchive/archive_windows.h \ 165 | libarchive/archive_windows.c \ 166 | libarchive/archive_write_disk_windows.c \ 167 | libarchive/filter_fork_windows.c 168 | 169 | #hfst_ospell_LDADD=libhfstospell.la 170 | hfst_ospell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 171 | $(PKG_CXXFLAGS) 172 | 173 | if EXTRA_DEMOS 174 | hfst_ospell_norvig_SOURCES=main-norvig.cc 175 | #hfst_ospell_norvig_LDADD=libhfstospell.la 176 | hfst_ospell_norvig_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 177 | $(PKG_CXXFLAGS) 178 | 179 | hfst_ospell_cicling_SOURCES=main-cicling.cc 180 | #hfst_ospell_cicling_LDADD=libhfstospell.la 181 | hfst_ospell_cicling_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 182 | $(PKG_CXXFLAGS) 183 | 184 | hfst_ospell_lrec2013_SOURCES=main-lrec2013.cc 185 | #hfst_ospell_lrec2013_LDADD=libhfstospell.la 186 | hfst_ospell_lrec2013_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 187 | $(PKG_CXXFLAGS) 188 | 189 | hfst_ospell_survey_SOURCES=main-survey.cc 190 | #hfst_ospell_survey_LDADD=libhfstospell.la 191 | hfst_ospell_survey_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 192 | $(PKG_CXXFLAGS) 193 | 194 | hfst_ospell_fsmnlp_2012_SOURCES=main-fsmnlp-2012.cc 195 | #hfst_ospell_fsmnlp_2012_LDADD=libhfstospell.la 196 | hfst_ospell_fsmnlp_2012_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 197 | $(PKG_CXXFLAGS) 198 | endif # EXTRA_DEMOS 199 | 200 | hfst_ispell_SOURCES=main-ispell.cc 201 | #hfst_ispell_LDADD=libhfstospell.la 202 | hfst_ispell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ 203 | $(PKG_CXXFLAGS) 204 | 205 | # install headers for library in hfst's includedir 206 | include_HEADERS=hfst-ol.h ospell.h ol-exceptions.h \ 207 | ZHfstOspeller.h ZHfstOspellerXmlMetadata.h tinyxml2.h 208 | 209 | # pkgconfig 210 | pkgconfigdir=$(libdir)/pkgconfig 211 | pkgconfig_DATA=hfstospell.pc 212 | 213 | # tests 214 | if CAN_TEST 215 
| TXTS=acceptor.default.txt errmodel.default.txt errmodel.extrachars.txt 216 | check_DATA=speller_basic.zhfst empty_descriptions.zhfst \ 217 | empty_titles.zhfst empty_locale.zhfst \ 218 | trailing_spaces.zhfst \ 219 | spl.hfstol sug.hfstol err.hfstol \ 220 | acceptor.default.hfst errmodel.default.hfst \ 221 | errmodel.extrachars.hfst bad_errormodel.zhfst 222 | # Actual test scripts: 223 | TESTS=basic-zhfst.sh basic-legacy.sh basic-zhfst-fallback.sh \ 224 | empty-descriptions.sh empty-titles.sh empty-locale.sh \ 225 | trailing-spaces.sh bad-errormodel.sh empty-zhfst.sh 226 | XFAIL_TESTS=empty-descriptions.sh empty-titles.sh empty-locale.sh empty-zhfst.sh 227 | EXTRA_DIST=$(TXTS) $(TESTS) \ 228 | basic_test.xml empty_descriptions.xml empty_titles.xml \ 229 | empty_locale.xml trailing_spaces.xml \ 230 | test.strings 231 | 232 | clean-local: 233 | -rm -rf $(check_DATA) index.xml 234 | endif # CAN_TEST 235 | 236 | # N.B. Do not parallel test, race condition exists 237 | empty_descriptions.zhfst: acceptor.default.hfst errmodel.default.hfst empty_descriptions.xml 238 | cp -f $(srcdir)/empty_descriptions.xml index.xml 239 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 240 | 241 | empty_titles.zhfst: acceptor.default.hfst errmodel.default.hfst empty_titles.xml 242 | cp -f $(srcdir)/empty_titles.xml index.xml 243 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 244 | 245 | empty_locale.zhfst: acceptor.default.hfst errmodel.default.hfst empty_locale.xml 246 | cp -f $(srcdir)/empty_locale.xml index.xml 247 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 248 | 249 | trailing_spaces.zhfst: acceptor.default.hfst errmodel.default.hfst trailing_spaces.xml 250 | cp -f $(srcdir)/trailing_spaces.xml index.xml 251 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 252 | 253 | # N.B. 
Do not parallel test, race condition exists 254 | speller_basic.zhfst: acceptor.default.hfst errmodel.default.hfst basic_test.xml 255 | cp $(srcdir)/basic_test.xml index.xml 256 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 257 | # errmodel.extrachars.hfst is generated by the .txt.hfst rule, so it is read from the build directory 258 | bad_errormodel.zhfst: acceptor.default.hfst errmodel.extrachars.hfst index.xml 259 | cp -f errmodel.extrachars.hfst errmodel.default.hfst 260 | $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml 261 | 262 | sug.hfstol: acceptor.default.hfst 263 | -ln -sf $< $@ 264 | 265 | spl.hfstol: acceptor.default.hfst 266 | -ln -sf $< $@ 267 | 268 | err.hfstol: errmodel.default.hfst 269 | -ln -sf $< $@ 270 | # Convert AT&T text automata to optimized-lookup weighted format with the tools located by configure 271 | .txt.hfst: 272 | $(HFST_TXT2FST) $< | $(HFST_FST2FST) -f olw -o $@ 273 | 274 | .txt.hfstol: 275 | $(HFST_TXT2FST) $< | $(HFST_FST2FST) -f olw -o $@ 276 | 277 | hfst-ospell.1: hfst-ospell 278 | help2man --no-discard-stderr $< > $@ 279 | -------------------------------------------------------------------------------- /windows-configure.ac: -------------------------------------------------------------------------------- 1 | ## Process this file with autoconf to produce configure script 2 | 3 | ## Copyright (C) 2010 University of Helsinki 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | 18 | # autoconf requirements 19 | AC_PREREQ([2.62]) 20 | LT_PREREQ([2.2.6]) 21 | 22 | # init 23 | AC_INIT([hfstospell], [0.3.0], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.sf.net]) 24 | AC_CONFIG_AUX_DIR([build-aux]) 25 | AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign check-news color-tests silent-rules]) 26 | AM_SILENT_RULES([yes]) 27 | AC_REVISION([$Revision: 3876 $]) 28 | AC_CONFIG_MACRO_DIR([m4]) 29 | AC_CONFIG_SRCDIR([ospell.cc]) 30 | AC_CONFIG_HEADERS([config.h]) 31 | 32 | # Information on package 33 | HFSTOSPELL_NAME=hfstospell 34 | HFSTOSPELL_MAJOR=0 35 | HFSTOSPELL_MINOR=3 36 | HFSTOSPELL_EXTENSION=.0 37 | HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION 38 | AC_SUBST(HFSTOSPELL_MAJOR) 39 | AC_SUBST(HFSTOSPELL_MINOR) 40 | AC_SUBST(HFSTOSPELL_VERSION) 41 | AC_SUBST(HFSTOSPELL_NAME) 42 | 43 | # Check for pkg-config first - the configuration won't work if it isn't available: 44 | #AC_PATH_PROG([PKGCONFIG], [pkg-config], [no]) 45 | #AS_IF([test "x$PKGCONFIG" = xno], [AC_MSG_ERROR([pkg-config is required - please install])]) 46 | #AC_PATH_PROG([DOXYGEN], [doxygen], [false]) 47 | #AM_CONDITIONAL([CAN_DOXYGEN], [test "x$DOXYGEN" != xfalse]) 48 | 49 | 50 | # Settings 51 | AC_ARG_ENABLE([extra_demos], 52 | [AS_HELP_STRING([--enable-extra-demos], 53 | [build conference demos for science reproduction @<:@default=no@:>@])], 54 | [enable_extra_demos=$enableval], [enable_extra_demos=no]) 55 | AM_CONDITIONAL([EXTRA_DEMOS], [test x$enable_extra_demos != xno]) 56 | AC_ARG_ENABLE([zhfst], 57 | [AS_HELP_STRING([--enable-zhfst], 58 | [support zipped complex automaton sets @<:@default=check@:>@])], 59 | [enable_zhfst=$enableval], [enable_zhfst=check]) 60 | AC_ARG_ENABLE([xml], 61 | [AS_HELP_STRING([--enable-xml=LIBXML], 62 | [support xml metadata for zipped automaton sets with library LIBXML @<:@default=libxmlpp@:>@])], 63 | [enable_xml=$enableval], [enable_xml=libxmlpp]) 64 | AC_ARG_WITH([extract], 65 | 
[AS_HELP_STRING([--with-extract=TARGET], 66 | [extract zhfst archives to tmpdir or mem @<:@default=mem@:>@])], 67 | [with_extract=$withval], [with_extract=mem]) 68 | AS_IF([test "x$with_extract" = xmem], [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [1], 69 | [Define to extract zhfst archives to char buffer])], 70 | [AS_IF([test "x$with_extract" = xtmpdir], 71 | [AC_DEFINE([ZHFST_EXTRACT_TO_TMPDIR], [1], 72 | [Define to extract zhfst to tmp dir])], 73 | [AC_MSG_ERROR([Use with-extract to mem or tmpdir])])]) 74 | 75 | # Checks for programs 76 | m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) 77 | AC_PROG_CC 78 | AC_PROG_CXX 79 | AC_LIBTOOL_WIN32_DLL 80 | LT_INIT 81 | AC_PROG_INSTALL 82 | AC_PROG_LN_S 83 | AC_PROG_MAKE_SET 84 | AC_PATH_PROG([HFST_TXT2FST], [hfst-txt2fst], [false]) 85 | AC_PATH_PROG([HFST_FST2FST], [hfst-fst2fst], [false]) 86 | AC_PATH_PROG([ZIP], [zip], [false]) 87 | AM_CONDITIONAL([CAN_TEST], 88 | [test "x$HFST_TXT2FST" != xfalse && test "x$HFST_FST2FST" != xfalse && test "x$ZIP" != xfalse]) 89 | # CAN_TEST chains separate test commands because test -a and -o are not portable 90 | # Checks for libraries 91 | #AS_IF([test x$enable_zhfst != xno], 92 | # [PKG_CHECK_MODULES([LIBARCHIVE], [libarchive > 3], 93 | # [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives]) 94 | # enable_zhfst=yes], 95 | # [enable_zhfst=no])]) 96 | 97 | AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives]) 98 | AM_CONDITIONAL([WANT_ARCHIVE], [test x$enable_zhfst != xno]) 99 | 100 | #AS_IF([test x$enable_xml = xlibxmlpp], 101 | # [PKG_CHECK_MODULES([LIBXMLPP], [libxml++-2.6 >= 2.10.0], 102 | # [AC_DEFINE([HAVE_LIBXML], [1], [Use libxml++])], 103 | # [AC_MSG_WARN([libxml++ failed, disabling xml]) 104 | # enable_xml=no])]) 105 | #AM_CONDITIONAL([WANT_LIBXMLPP], [test x$enable_xml = xlibxmlpp]) 106 | 107 | #AS_IF([test x$enable_xml = xtinyxml2], 108 | # [PKG_CHECK_MODULES([TINYXML2], [tinyxml2 >= 1.0.8], 109 | # [AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml])], 110 | # [AC_MSG_WARN([tinyxml missing, xml disabled]) 111 | # enable_xml=no])]) 112 | 113 | AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml]) 114 |
AM_CONDITIONAL([WANT_TINYXML2], [test x$enable_xml = xtinyxml2]) 115 | 116 | # Checks for header files 117 | AC_CHECK_HEADERS([getopt.h error.h]) 118 | 119 | # Checks for types 120 | AC_TYPE_SIZE_T 121 | 122 | # Checks for structures 123 | 124 | # Checks for compiler characteristics 125 | 126 | # Checks for library functions 127 | AC_FUNC_MALLOC 128 | AC_CHECK_FUNCS([strndup error]) 129 | # Checks for system services 130 | 131 | # config files 132 | AC_CONFIG_FILES([Makefile hfstospell.pc]) 133 | 134 | # output 135 | AC_OUTPUT 136 | 137 | cat <