├── .gitattributes ├── .github └── workflows │ └── build.yaml ├── .gitignore ├── AUTHORS ├── COPYING ├── ChangeLog ├── Makefile.am ├── NEWS ├── README ├── README.md ├── Usage.md ├── autogen.sh ├── configure.ac ├── imgs ├── lexc.dot ├── lexc.png ├── twoc.dot └── twoc.png ├── lexd.pc.in ├── m4 └── ax_check_compile_flag.m4 ├── src ├── Makefile.am ├── help2man.sh ├── icu-iter.cc ├── icu-iter.h ├── lexd.1 ├── lexd.cc ├── lexdcompiler.cc └── lexdcompiler.h └── tests ├── README.md ├── comp.lexd ├── feature ├── Makefile ├── negtest-col0.lexd ├── negtest-trailing-bracket.lexd ├── test-alt.lexd ├── test-alt.lexd.txt.strings.gold ├── test-anonlex-modifier.lexd ├── test-anonlex-modifier.lexd.txt.strings.gold ├── test-anonlex.lexd ├── test-anonlex.lexd.txt.strings.gold ├── test-anonpat-filter-ops.lexd ├── test-anonpat-filter-ops.lexd.txt.strings.gold ├── test-anonpat-filter.lexd ├── test-anonpat-filter.lexd.txt.strings.gold ├── test-anonpat-modifier.lexd ├── test-anonpat-modifier.lexd.txt.strings.gold ├── test-anonpat-nospaces.lexd ├── test-anonpat-nospaces.lexd.txt.strings.gold ├── test-anonpat-ops.lexd ├── test-anonpat-ops.lexd.txt.strings.gold ├── test-anonpat.lexd ├── test-anonpat.lexd.txt.strings.gold ├── test-conflicting-tags.lexd ├── test-conflicting-tags.lexd.txt.strings.gold ├── test-diacritic.lexd ├── test-diacritic.lexd.txt.strings.gold ├── test-disjoint-opt.lexd ├── test-disjoint-opt.lexd.txt.strings.gold ├── test-empty-patterns.lexd ├── test-empty-patterns.lexd.txt.strings.gold ├── test-empty.lexd ├── test-empty.lexd.txt.strings.gold ├── test-filter-crosstalk.lexd ├── test-filter-crosstalk.lexd.txt.strings.gold ├── test-lexdeftag.lexd ├── test-lexdeftag.lexd.txt.strings.gold ├── test-lexicon-side-tags.lexd ├── test-lexicon-side-tags.lexd.txt.strings.gold ├── test-lexname-space.lexd ├── test-lexname-space.lexd.txt.strings.gold ├── test-lexnegtag.lexd ├── test-lexnegtag.lexd.txt.strings.gold ├── test-lextag.lexd ├── test-lextag.lexd.txt.strings.gold ├── 
test-nontree.lexd ├── test-nontree.lexd.txt.strings.gold ├── test-oneside.lexd ├── test-oneside.lexd.txt.strings.gold ├── test-opt.lexd ├── test-opt.lexd.txt.strings.gold ├── test-or-filter.lexd ├── test-or-filter.lexd.txt.strings.gold ├── test-pairs.lexd ├── test-pairs.lexd.txt.strings.gold ├── test-pattag-coherent.lexd ├── test-pattag-coherent.lexd.txt.strings.gold ├── test-pattag-details.lexd ├── test-pattag-details.lexd.txt.strings.gold ├── test-pattag.lexd ├── test-pattag.lexd.txt.strings.gold ├── test-pattern-independence.lexd ├── test-pattern-independence.lexd.txt.strings.gold ├── test-regex.lexd ├── test-regex.lexd.txt.strings.gold ├── test-revsieve.lexd ├── test-revsieve.lexd.txt.strings.gold ├── test-sieve.lexd ├── test-sieve.lexd.txt.strings.gold ├── test-sieveopt.lexd ├── test-sieveopt.lexd.txt.strings.gold ├── test-slots-and-operators-nospace.lexd ├── test-slots-and-operators-nospace.lexd.txt.strings.gold ├── test-xor-filter.lexd ├── test-xor-filter.lexd.txt.strings.gold ├── test-xor-multi.lexd └── test-xor-multi.lexd.txt.strings.gold ├── heb.lexc ├── heb.lexd ├── heb.sh2 ├── heb.twoc ├── heb_vow.twoc ├── kik.lexc ├── kik.lexd ├── kik.sh2 ├── kik.twoc ├── lin.lexc ├── lin.lexd ├── lin.sh2 ├── lin.twoc ├── timing.sh ├── trilit.lexd ├── wad.lexc ├── wad.lexd ├── wad.sh2 └── wad.twoc /.gitattributes: -------------------------------------------------------------------------------- 1 | * text eol=lf 2 | *.png binary 3 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: lexd CI Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: dependencies 11 | run: | 12 | sudo apt-get -qy update 13 | sudo apt-get -qfy install apt-utils wget ca-certificates 14 | wget -q https://apertium.projectjj.com/apt/install-nightly.sh -O - | 
sudo bash 15 | sudo apt-get -qfy install --no-install-recommends build-essential automake autotools-dev pkg-config lttoolbox-dev hfst 16 | - name: autoreconf 17 | run: autoreconf -fvi 18 | - name: configure 19 | run: ./configure 20 | - name: build 21 | run: make -j4 V=1 VERBOSE=1 22 | - name: tests 23 | run: make test 24 | - name: make install 25 | run: sudo make install 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hfst 2 | *.att 3 | *.o 4 | *Makefile 5 | *Makefile.in 6 | src/.deps 7 | src/lexd 8 | tests/feature/*.lexd.* 9 | !tests/feature/*.gold 10 | !tests/feature/Makefile 11 | INSTALL 12 | aclocal.m4 13 | autom4te.cache 14 | config.log 15 | config.status 16 | configure 17 | depcomp 18 | install-sh 19 | missing 20 | stamp-h1 21 | lexd.pc 22 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | 2019-2021, Daniel Swanson 2 | 2020, Jonathan North Washington 3 | 2020, Nick Howell 4 | 2020, Tino Didriksen 5 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/ChangeLog -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | SUBDIRS = src 3 | 4 | EXTRA_DIST=autogen.sh 5 | check_targets = check-plain check-flags check-minimize-tags check-tags check-minimize check-single 6 | pkgconfigdir = $(libdir)/pkgconfig 7 | dist_pkgconfig_DATA = lexd.pc 8 | 9 | timing-test: all 10 | (cd tests || exit && ./timing.sh wad && ./timing.sh heb) 11 | check: $(check_targets) 12 | test: check 13 | check-clean: 14 | + $(MAKE) -C tests/feature clean 15 | 16 | $(check_targets): check-%: all tests/feature 17 | + $(MAKE) -C tests/feature O=$* LEXD_TEST_FLAGS="$$(echo '$*' | grep -v plain | sed 's/^\|-/ --/g')" check 18 | + $(MAKE) -C tests/feature O=$* clean 19 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/NEWS -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | # Lexd 2 | 3 | A lexicon compiler specialising in non-suffixational morphologies. 4 | 5 | This module compiles lexicons in a format loosely based on hfst-lexc and produces transducers in ATT format which are equivalent to those produced using the overgenerate-and-constrain approach with hfst-twolc (see [here](https://wiki.apertium.org/wiki/Morphotactic_constraints_with_twol) and [here](https://wiki.apertium.org/wiki/Replacement_for_flag_diacritics)).
However, it is much faster (see below). 6 | 7 | See [Usage.md](Usage.md) for the rule file syntax. 8 | 9 | ## Installation 10 | 11 | First, clone this repository. 12 | 13 | To build, do 14 | ```bash 15 | ./autogen.sh 16 | make 17 | make install 18 | ``` 19 | 20 | If installing to a system-wide path, you may want to run `sudo make install` instead for the last step. 21 | 22 | To compile a lexicon file into a transducer, do 23 | ```bash 24 | lexd lexicon_file att_file 25 | ``` 26 | 27 | To get a speed comparison, do 28 | ```bash 29 | make timing-test 30 | ``` 31 | 32 | To run basic feature smoke-tests (fast), do 33 | ```bash 34 | make check 35 | ``` 36 | 37 | ## Why is it faster? 38 | 39 | When dealing with prefixes, the overgenerate-and-constrain approach initially builds a transducer like this: 40 | 41 | ![transducer that overgenerates](https://github.com/apertium/lexd/raw/main/imgs/lexc.png) 42 | 43 | Then composes that with a twolc rule to turn it into something like this: 44 | 45 | ![correct transducer](https://github.com/apertium/lexd/raw/main/imgs/twoc.png) 46 | 47 | But compiling the rule needed to do that can take hundreds of times longer than compiling the lexicon. 48 | 49 | Lexd, meanwhile, makes multiple copies of the lexical portion and attaches one to each prefix, thus generating the second transducer directly in a similar amount of time to what is required to generate the first one.
50 | 51 | | Language | Wamesa | Hebrew | Navajo | Lingala | 52 | |---|---:|---:|---:|---:| 53 | | Stems | 262 | 127 | 19 | 1470 54 | | Total forms | 12576 | 2540 | 473 | 1649496 55 | | Path restrictions | 14 | 10 | 17 | 19 56 | | **Lexc + Twolc** 57 | | Lexc compilation | 25ms | 15ms | 25ms | 230ms 58 | | Twolc compilation | 10245ms | 1360ms | 8460ms | 275525ms 59 | | Rule composition | 2050ms | 225ms | 1705ms | 45550ms 60 | | Minimization | 65ms | 5ms | 20ms | 155ms | 61 | | Total time | 12385ms | 1605ms | 10210ms | 321460ms | 62 | | **Lexd** 63 | | Lexd compilation | 210ms | 85ms | 10ms | 490ms | 64 | | Format conversion | 30ms | 5ms | 5ms | 55ms | 65 | | Total time | 240ms | 90ms | 15ms | 545ms | 66 | | **Speedup** | 52x | 18x | 681x | 590x | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | README -------------------------------------------------------------------------------- /Usage.md: -------------------------------------------------------------------------------- 1 | # Lexd Syntax 2 | 3 | ## Invocation 4 | 5 | The `lexd` binary generates [AT&T format] transducers. 
6 | 7 | [AT&T Format]: https://wiki.apertium.org/wiki/ATT_format 8 | 9 | Sample, save to `verb.lexd`: 10 | ```verb.lexd 11 | PATTERNS 12 | VerbRoot VerbInfl 13 | 14 | LEXICON VerbRoot 15 | sing 16 | walk 17 | dance 18 | 19 | LEXICON VerbInfl 20 | : 21 | :s 22 | ``` 23 | 24 | Compile it (without flag diacritics) to ATT transducer format: 25 | ``` 26 | $ lexd verb.lexd > verb-generator.att 27 | ``` 28 | 29 | To compile to an `lttoolbox` transducer binary dictionary, use 30 | `lt-comp`; this can be used for lookup with `lt-proc`: 31 | ``` 32 | $ lt-comp rl verb-generator.att verb-analyser.bin 33 | main@standard 17 19 34 | $ echo 'sings' | lt-proc verb-analyser.bin 35 | ^sings/sing$ 36 | ``` 37 | 38 | To extract forms, use the [HFST] to first compile to `hfst` binary 39 | format: 40 | 41 | [HFST]: https://hfst.github.io/ 42 | 43 | ``` 44 | $ hfst-txt2fst verb-generator.att -o verb-generator.hfst 45 | ``` 46 | 47 | Then you can use `hfst-fst2strings`: 48 | ``` 49 | $ hfst-fst2strings verb-generator.hfst 50 | sing:sing 51 | sing:sings 52 | walk:walk 53 | walk:walks 54 | dance:dance 55 | dance:dances 56 | ``` 57 | 58 | ## Basic Syntax 59 | 60 | A Lexd rule file defines lexicons and patterns. Each lexicon consists of a list of entries which have an analysis side and a generation side, similar to lexicons in HFST Lexc. Patterns, meanwhile, replace Lexc's continuation lexicons. Each pattern consists of a list of lexicons or named patterns which the compiler concatenates in that order. 
61 | 62 | ``` 63 | PATTERNS 64 | VerbRoot VerbInfl 65 | 66 | LEXICON VerbRoot 67 | sing 68 | walk 69 | dance 70 | 71 | LEXICON VerbInfl 72 | : 73 | :s 74 | ``` 75 | 76 | forms generated: 77 | ``` 78 | sing/sing 79 | sings/sing 80 | walk/walk 81 | walks/walk 82 | dance/dance 83 | dances/dance 84 | ``` 85 | 86 | Symbols enclosed in angle brackets or braces will be automatically interpreted as multicharacter symbols (presumably tags and archiphonemes, respectively): 87 | 88 | ``` 89 | PATTERNS 90 | X 91 | 92 | LEXICON X 93 | x<ij>:x{i} 94 | ``` 95 | 96 | resulting ATT file: 97 | ``` 98 | 0 1 x x 0.000000 99 | 1 2 <ij> {i} 0.000000 100 | 2 0.000000 101 | ``` 102 | 103 | Any character can be escaped with a backslash: 104 | 105 | ``` 106 | PATTERNS 107 | X 108 | 109 | LEXICON X 110 | x\<ij\>:x{i} 111 | ``` 112 | 113 | resulting ATT file: 114 | ``` 115 | 0 1 x x 0.000000 116 | 1 2 < {i} 0.000000 117 | 2 3 i @0@ 0.000000 118 | 3 4 j @0@ 0.000000 119 | 4 5 > @0@ 0.000000 120 | 5 0.000000 121 | ``` 122 | 123 | And comments begin with `#`. 124 | 125 | ## Alignment 126 | 127 | Patterns can list different sides of each lexicon in different places. When the compiler encounters a one-sided lexicon reference in a pattern, it attaches all entries from that side of that lexicon to the transducer and then builds the rest of the pattern, attaching a separate copy for each entry. However, in these copies, for any subsequent mentions of that lexicon, only the corresponding segment of that entry will be attached, thus avoiding over-generation. The same lexicon can be mentioned arbitrarily many times, making it straightforward to write rules for phenomena such as reduplication.
128 | 129 | ``` 130 | PATTERNS 131 | :VerbInfl VerbRoot VerbInfl: 132 | :VerbInfl :VerbRoot VerbRoot VerbInfl: Redup 133 | 134 | LEXICON VerbRoot 135 | bloop 136 | vroom 137 | 138 | LEXICON VerbInfl 139 | :en 140 | 141 | LEXICON Redup 142 | : 143 | ``` 144 | 145 | forms generated: 146 | ``` 147 | enbloop/bloop 148 | envroom/vroom 149 | enbloopbloop/bloop 150 | envroomvroom/vroom 151 | ``` 152 | 153 | To handle more complex cases, such as infixation and Semitic triliteral roots, lexicon entries can have multiple segments which patterns can refer to independently. 154 | 155 | ``` 156 | PATTERNS 157 | C(1) :V(1) C(2) :V(2) C(3) V(2): 158 | 159 | LEXICON C(3) 160 | sh m r 161 | y sh v 162 | 163 | LEXICON V(2) 164 | :a :a 165 | :o :e 166 | ``` 167 | 168 | forms generated: 169 | ``` 170 | shamar/shmr 171 | shomer/shmr 172 | yashav/yshv 173 | yoshev/yshv 174 | ``` 175 | 176 | It is also possible to give lexicons multiple names using the `ALIAS` command, which allows patterns to refer to multiple independent copies, which can then be used for productive compounding. 177 | 178 | ``` 179 | PATTERNS 180 | NounStem NounInfl 181 | NounStem NounInflComp Comp NounStem2 NounInfl 182 | 183 | LEXICON Comp 184 | +: 185 | 186 | LEXICON NounStem 187 | shoop 188 | blarg 189 | 190 | ALIAS NounStem NounStem2 191 | 192 | LEXICON NounInfl 193 | : 194 | :ah 195 | 196 | LEXICON NounInflComp 197 | :a 198 | ``` 199 | 200 | forms generated: 201 | ``` 202 | shoop/shoop 203 | shoopah/shoop 204 | shoopashoop/shoop+shoop 205 | shoopashoopah/shoop+shoop 206 | shoopablarg/shoop+blarg 207 | shoopablargah/shoop+blarg 208 | blarg/blarg 209 | blargah/blarg 210 | blargashoop/blarg+shoop 211 | blargashoopah/blarg+shoop 212 | blargablarg/blarg+blarg 213 | blargablargah/blarg+blarg 214 | ``` 215 | 216 | Patterns can be named and included in other patterns. In addition to being less repetitive to write, it also compiles faster. 
217 | 218 | ``` 219 | PATTERN VerbStem 220 | VerbRoot 221 | VerbRoot Causative 222 | AuxRoot 223 | 224 | PATTERNS 225 | VerbStem Tense PersonNumber 226 | ``` 227 | This is equivalent to 228 | ``` 229 | PATTERNS 230 | VerbRoot Tense PersonNumber 231 | VerbRoot Causative Tense PersonNumber 232 | AuxRoot Tense PersonNumber 233 | ``` 234 | 235 | ## Pattern Operators 236 | 237 | Some simple operators are supported to help write patterns concisely: 238 | - the option quantifier `?` can be applied to a single token 239 | ``` 240 | PATTERNS 241 | Negation? Adjective 242 | # equivalent to: 243 | # Negation Adjective 244 | # Adjective 245 | ``` 246 | 247 | Placing the `?` quantifier between a lexicon name and the segment number will 248 | make that lexicon as a whole optional in that pattern: 249 | 250 | ``` 251 | PATTERNS 252 | OptionalCircumfix?(1) Stem OptionalCircumfix?(2) 253 | # equivalent to: 254 | # OptionalCircumfix(1) Stem OptionalCircumfix(2) 255 | # Stem 256 | ``` 257 | 258 | Note that in this case the column specification after the `?` is required. 259 | 260 | ``` 261 | PATTERNS 262 | :Prefix?(1) Stem Prefix?(1): 263 | # equivalent to 264 | # :Prefix(1) Stem Prefix(1): => :Prefix Stem Prefix: 265 | # Stem 266 | :Prefix? Stem Prefix:? 267 | # equivalent to 268 | # :Prefix Stem Prefix: 269 | # :Prefix Stem 270 | # Stem Prefix: 271 | # Stem 272 | ``` 273 | 274 | The quantifiers `*` (repeat 0 or more times) and `+` (repeat 1 or more times) 275 | function similarly, though they only support modification of a single token 276 | and not distributed modification of a lexicon across an entire line. 
277 | 278 | - the alternation operator `|` between two tokens causes one pattern 279 | for each alternate 280 | ``` 281 | PATTERNS 282 | VerbStem Case 283 | 284 | PATTERN Case 285 | Absolutive 286 | Oblique Ergative|Genitive 287 | # equivalent to: 288 | # Oblique Ergative 289 | # Oblique Genitive 290 | ``` 291 | 292 | - the sieve operators `<` and `>` allow left and right extensions 293 | ``` 294 | PATTERNS 295 | VerbStem > Nominalisation > Case 296 | # equivalent to: 297 | # VerbStem 298 | # VerbStem Nominalisation 299 | # VerbStem Nominalisation Case 300 | ``` 301 | 302 | ## Anonymous Lexicons and Patterns 303 | 304 | Patterns can contain anonymous lexicons to avoid needing to explicitly 305 | declare lexicons for very simple things. 306 | 307 | ``` 308 | PATTERNS 309 | NounStem [:] NounNumber 310 | 311 | LEXICON NounStem 312 | sock 313 | ninja 314 | 315 | LEXICON NounNumber 316 | : 317 | :s 318 | ``` 319 | 320 | forms generated: 321 | ``` 322 | ninja/ninja 323 | ninjas/ninja 324 | sock/sock 325 | socks/sock 326 | ``` 327 | 328 | Anonymous patterns function similarly: 329 | ``` 330 | PATTERNS 331 | (VerbRoot Causative?) | AuxRoot Tense PersonNumber 332 | # equivalent to: 333 | # PATTERN VerbStem 334 | # VerbRoot Causative? 335 | # PATTERNS 336 | # VerbStem|AuxRoot Tense PersonNumber 337 | ``` 338 | 339 | Anonymous patterns can be nested and both patterns and lexicons can be quantified: 340 | ``` 341 | PATTERNS 342 | NounRoot ([:] (Number Case)?) 
| (Verbalizer Tense) 343 | ``` 344 | 345 | ## Tags 346 | Lexicon entries can be tagged using square brackets: 347 | 348 | ``` 349 | LEXICON NounRoot 350 | sock[count] 351 | rice[mass] 352 | sand[count,mass] 353 | ``` 354 | 355 | Or tags can be applied-by-default to an entire block: 356 | 357 | ``` 358 | LEXICON NounRoot[count] 359 | sock 360 | rice[mass,-count] 361 | sand[mass] 362 | ``` 363 | 364 | When referring to the lexicon, these tags can then be selected for: 365 | 366 | ``` 367 | PATTERNS 368 | NounRoot[count] [:] Number # 'sock' and 'sand', but not 'rice' 369 | NounRoot[mass] [:] # 'rice' and 'sand', but not 'sock' 370 | ``` 371 | 372 | The absence of a tag can also be selected for: 373 | 374 | ``` 375 | PATTERNS 376 | NounRoot[-count] [:] # 'rice' only 377 | ``` 378 | 379 | Tag selectors can also be applied to patterns: 380 | 381 | ``` 382 | PATTERN NounStem 383 | NounRoot [:] 384 | 385 | PATTERNS 386 | NounStem[count] Number 387 | NounStem[mass] 388 | ``` 389 | 390 | Distribution rules are as follows: 391 | 392 | ``` 393 | (A B)[x] = (A[x] B) | (A B[x]) 394 | (A B)[-x] = A[-x] B[-x] 395 | ``` 396 | 397 | Union and symmetric difference are implemented with the following 398 | syntax: 399 | 400 | ``` 401 | A[|[x,y]] = A[x] | A[y] # union / logical or 402 | A[^[x,y]] = A[x,-y] | A[-x,y] # symmetric difference / exclusive-or 403 | ``` 404 | 405 | These can be useful with lexically conditioned patterns.
Here's an 406 | example showing declension paradigms and noun class: 407 | 408 | ``` 409 | PATTERNS 410 | (NounStem CaseEnding)[^[Decl1,Decl2],^[N,M,F]] 411 | 412 | LEXICON NounStem 413 | mensa:mens[Decl1,F] # table 414 | poeta:poet[Decl1,M] # poet 415 | dominus:domin[Decl2,M] # master 416 | bellum:bell[Decl2,N] # war 417 | 418 | LEXICON CaseEnding[Decl2] 419 | :>us[M] 420 | :>um[N] 421 | :>um # M or N 422 | 423 | LEXICON CaseEnding[Decl1] 424 | :>a # any gender 425 | :>am # any gender 426 | ``` 427 | 428 | produces the forms (through `lexd | hfst-txt2fst | hfst-fst2strings`) 429 | 430 | ``` 431 | poeta:poet>a 432 | poeta:poet>am 433 | mensa:mens>a 434 | mensa:mens>am 435 | bellum:bell>um 436 | bellum:bell>um 437 | dominus:domin>us 438 | dominus:domin>um 439 | ``` 440 | 441 | The exclusive or filter in this example will produce pairs of stems and case 442 | endings such that between them, one has a declension tag and one (possibly the 443 | same one) has a gender tag, and neither has any other declension or gender tag. 444 | Thus, if we were to add a third declension such as 445 | 446 | ``` 447 | arbor:arbor[Decl3,F] 448 | ``` 449 | 450 | but we then forgot to add `Decl3` to the filter in the pattern, then from the 451 | perspective of the filter, it wouldn't have a declension tag, so it would get 452 | paired with every case ending that does have a declension tag (which in this 453 | example is all of them). 454 | 455 | ## Regular Expressions 456 | 457 | If a lexicon entry begins with a forward slash, it is interpreted as a regular expression. 
458 | 459 | ``` 460 | PATTERNS 461 | SomeLexicon 462 | 463 | LEXICON SomeLexicon 464 | /x(y|zz)?[n-p]/ 465 | ``` 466 | 467 | produces the forms 468 | 469 | ``` 470 | xn 471 | xo 472 | xp 473 | xyn 474 | xyo 475 | xyp 476 | xzzn 477 | xzzo 478 | xzzp 479 | ``` 480 | 481 | ### Currently Supported Syntax 482 | 483 | - Grouping with `()` 484 | - Quantification with `?`, `*`, and `+` 485 | - Currently, quantifiers may only be applied to fully parenthesized groups, so `x+` is an error and must be written as `(x)+`. 486 | - Alternation with `|` 487 | - Character classes with `[]` 488 | - Character ranges such as `[a-z]` 489 | - Multichar symbols, following the same rules as normal lexicon entries 490 | - Two-sided strings using `:` 491 | - `a:b` is the same as in a normal entry 492 | - `[ab]:c` is equivalent to `(a:c)|(b:c)` 493 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # If the user specified a --prefix, take that, otherwise /usr/local/ 4 | # is the default. 5 | PREFIX=/usr/local 6 | prefixnext=false 7 | for i in "$@"; do 8 | case $i in 9 | --prefix=*) # equals separated: 10 | PREFIX="${i#*=}" 11 | ;; 12 | --prefix) # space separated: 13 | prefixnext=true 14 | ;; 15 | *) 16 | $prefixnext && PREFIX="$i" && prefixnext=false 17 | ;; 18 | esac 19 | done 20 | 21 | # Set the paths needed by libtool/pkg-config/aclocal etc. By inferring 22 | # them based on --prefix , users don't have to edit ~/.bashrc. We only 23 | # append, so if a user has some other preference, that will override.
24 | PATH="${PATH}:/usr/local/bin" 25 | export PATH 26 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${PREFIX}/lib" 27 | export LD_LIBRARY_PATH 28 | PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${PREFIX}/share/pkgconfig:${PREFIX}/lib/pkgconfig" 29 | export PKG_CONFIG_PATH 30 | ACLOCAL_PATH="${ACLOCAL_PATH}:${PREFIX}/share/aclocal" 31 | export ACLOCAL_PATH 32 | 33 | 34 | # Pass on all args to configure 35 | autoreconf -fi && ./configure "$@" 36 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ(2.61) 2 | 3 | AC_INIT([lexd], [1.3.5], [awesomeevildudes@gmail.com]) 4 | AM_INIT_AUTOMAKE 5 | AC_CONFIG_MACRO_DIR([m4]) 6 | 7 | AC_PROG_CXX 8 | AM_SANITY_CHECK 9 | AC_LANG_CPLUSPLUS 10 | 11 | CFLAGS="-Wall -Wextra -Wsign-conversion $CFLAGS" 12 | CXXFLAGS="-Wall -Wextra -Werror=missing-field-initializers -Wsign-conversion $CXXFLAGS" 13 | 14 | AC_ARG_ENABLE(debug, 15 | [ --enable-debug Enable "-g" compiler options], 16 | [CXXFLAGS="-g $CXXFLAGS";CFLAGS="-g $CFLAGS"]) 17 | 18 | PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= 3.7.1]) 19 | PKG_CHECK_MODULES([ICU_UC], [icu-uc]) 20 | PKG_CHECK_MODULES([ICU_IO], [icu-io]) 21 | 22 | AC_CHECK_FUNCS([getopt_long]) 23 | 24 | CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $ICU_CFLAGS $ICU_UC_CFLAGS $ICU_IO_CFLAGS" 25 | LIBS="$LIBS $LTTOOLBOX_LIBS $ICU_LIBS $ICU_UC_LIBS $ICU_IO_LIBS" 26 | 27 | # Checks for highest supported C++ standard 28 | AC_LANG(C++) 29 | for version in 23 2b 20 2a 17; do 30 | version_flag="-std=c++${version}" 31 | AX_CHECK_COMPILE_FLAG([${version_flag}], [break], [version_flag=none]) 32 | done 33 | AS_IF([test "$version_flag" = none], [ 34 | AC_MSG_ERROR([Could not enable at least C++17 - upgrade your compiler]) 35 | ]) 36 | CXXFLAGS="$CXXFLAGS ${version_flag}" 37 | 38 | AC_CHECK_HEADER([utf8cpp/utf8.h], [CPPFLAGS="-I/usr/include/utf8cpp/ $CPPFLAGS"], [ 39 | AC_CHECK_HEADER([utf8.h], [],
[AC_MSG_ERROR([You don't have utfcpp installed.])]) 40 | ]) 41 | 42 | AC_CONFIG_FILES([ 43 | lexd.pc 44 | Makefile 45 | src/Makefile 46 | ]) 47 | AC_OUTPUT 48 | -------------------------------------------------------------------------------- /imgs/lexc.dot: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR; 3 | n0 [label=""]; 4 | n1 [label=""]; 5 | n2 [label=""]; 6 | n3 [label=""]; 7 | n0 -> n1 [label="prefix1"]; 8 | n0 -> n1 [label="prefix2"]; 9 | n0 -> n1 [label="prefix3"]; 10 | n1 -> n2 [label="lemma1"]; 11 | n1 -> n2 [label="lemma2"]; 12 | n1 -> n2 [label="lemma3"]; 13 | n2 -> n3 [label="tag1"]; 14 | n2 -> n3 [label="tag2"]; 15 | n2 -> n3 [label="tag3"]; 16 | } 17 | -------------------------------------------------------------------------------- /imgs/lexc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/imgs/lexc.png -------------------------------------------------------------------------------- /imgs/twoc.dot: -------------------------------------------------------------------------------- 1 | digraph { 2 | rankdir=LR; 3 | n0 [label=""]; 4 | n1 [label=""]; 5 | n2 [label=""]; 6 | n3 [label=""]; 7 | n4 [label=""]; 8 | n5 [label=""]; 9 | n6 [label=""]; 10 | n7 [label=""]; 11 | n0 -> n1 [label="prefix1"]; 12 | n0 -> n2 [label="prefix2"]; 13 | n0 -> n3 [label="prefix3"]; 14 | n1 -> n4 [label="lemma1"]; 15 | n1 -> n4 [label="lemma2"]; 16 | n1 -> n4 [label="lemma3"]; 17 | n2 -> n5 [label="lemma1"]; 18 | n2 -> n5 [label="lemma2"]; 19 | n2 -> n5 [label="lemma3"]; 20 | n3 -> n6 [label="lemma1"]; 21 | n3 -> n6 [label="lemma2"]; 22 | n3 -> n6 [label="lemma3"]; 23 | n4 -> n7 [label="tag1"]; 24 | n5 -> n7 [label="tag2"]; 25 | n6 -> n7 [label="tag3"]; 26 | } 27 | -------------------------------------------------------------------------------- /imgs/twoc.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/imgs/twoc.png -------------------------------------------------------------------------------- /lexd.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | 3 | Name: lexd 4 | Description: lexd lexicon compiler specialising in non-suffixational morphologies 5 | Version: @VERSION@ 6 | -------------------------------------------------------------------------------- /m4/ax_check_compile_flag.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check whether the given FLAG works with the current language's compiler 12 | # or gives an error. (Warnings, however, are ignored) 13 | # 14 | # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on 15 | # success/failure. 16 | # 17 | # If EXTRA-FLAGS is defined, it is added to the current language's default 18 | # flags (e.g. CFLAGS) when the check is done. The check is thus made with 19 | # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to 20 | # force the compiler to issue an error when a bad flag is given. 21 | # 22 | # INPUT gives an alternative input source to AC_COMPILE_IFELSE. 23 | # 24 | # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this 25 | # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. 26 | # 27 | # LICENSE 28 | # 29 | # Copyright (c) 2008 Guido U. 
Draheim 30 | # Copyright (c) 2011 Maarten Bosmans 31 | # 32 | # This program is free software: you can redistribute it and/or modify it 33 | # under the terms of the GNU General Public License as published by the 34 | # Free Software Foundation, either version 3 of the License, or (at your 35 | # option) any later version. 36 | # 37 | # This program is distributed in the hope that it will be useful, but 38 | # WITHOUT ANY WARRANTY; without even the implied warranty of 39 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 40 | # Public License for more details. 41 | # 42 | # You should have received a copy of the GNU General Public License along 43 | # with this program. If not, see <http://www.gnu.org/licenses/>. 44 | # 45 | # As a special exception, the respective Autoconf Macro's copyright owner 46 | # gives unlimited permission to copy, distribute and modify the configure 47 | # scripts that are the output of Autoconf when processing the Macro. You 48 | # need not follow the terms of the GNU General Public License when using 49 | # or distributing such scripts, even though portions of the text of the 50 | # Macro appear in them. The GNU General Public License (GPL) does govern 51 | # all other use of the material that constitutes the Autoconf Macro. 52 | # 53 | # This special exception to the GPL applies to versions of the Autoconf 54 | # Macro released by the Autoconf Archive. When you make and distribute a 55 | # modified version of the Autoconf Macro, you may extend this special 56 | # exception to the GPL to apply to your modified version as well. 
57 | 58 | #serial 3 59 | 60 | AC_DEFUN([AX_CHECK_COMPILE_FLAG], 61 | [AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX 62 | AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl 63 | AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ 64 | ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS 65 | _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" 66 | AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], 67 | [AS_VAR_SET(CACHEVAR,[yes])], 68 | [AS_VAR_SET(CACHEVAR,[no])]) 69 | _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) 70 | AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], 71 | [m4_default([$2], :)], 72 | [m4_default([$3], :)]) 73 | AS_VAR_POPDEF([CACHEVAR])dnl 74 | ])dnl AX_CHECK_COMPILE_FLAGS 75 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_LDFLAGS=$(LIBS) 2 | 3 | bin_PROGRAMS = lexd 4 | 5 | lexd_SOURCES = lexd.cc lexdcompiler.cc icu-iter.cc 6 | 7 | lexd.1: 8 | $(abs_srcdir)/help2man.sh $(PACKAGE_VERSION) 9 | 10 | man_MANS = lexd.1 11 | -------------------------------------------------------------------------------- /src/help2man.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | help2man -N -n 'lexd' --version-string "$1" ./lexd > lexd.1 3 | -------------------------------------------------------------------------------- /src/icu-iter.cc: -------------------------------------------------------------------------------- 1 | #include "icu-iter.h" 2 | #include 3 | #include 4 | #include 5 | using namespace std; 6 | using namespace icu; 7 | 8 | charspan_iter::charspan_iter(const UnicodeString &s) 9 | : _status(U_ZERO_ERROR), s(&s) 10 | { 11 | it = BreakIterator::createCharacterInstance(Locale::getDefault(), _status); 12 | if(U_FAILURE(_status)) 13 | { 14 | cerr << "Failed to create character iterator with code " << _status << endl; 15 | exit(1); 16 
| } 17 | it->setText(s); 18 | _span.first = it->first(); 19 | _span.second = it->next(); 20 | } 21 | 22 | charspan_iter::charspan_iter(const charspan_iter &other) 23 | : _status(other._status), s(other.s), _span(other._span) 24 | { 25 | it = other.it->clone(); 26 | } 27 | 28 | charspan_iter::~charspan_iter() 29 | { 30 | delete it; 31 | } 32 | 33 | charspan_iter rev_charspan_iter(const UnicodeString &s) 34 | { 35 | return --charspan_iter(s).end(); 36 | } 37 | 38 | const UErrorCode &charspan_iter::status() const 39 | { 40 | return _status; 41 | } 42 | 43 | const pair &charspan_iter::operator*() const 44 | { 45 | return _span; 46 | } 47 | 48 | charspan_iter charspan_iter::operator++(int) 49 | { 50 | if (at_end()) return *this; 51 | auto other = charspan_iter(*this); 52 | other._span = make_pair(other._span.second, other.it->next()); 53 | if(other._span.first == other._span.second) 54 | other._span.second = other.it->next(); 55 | return other; 56 | } 57 | 58 | charspan_iter &charspan_iter::operator++() 59 | { 60 | if (!at_end()) 61 | { 62 | _span = make_pair(_span.second, it->next()); 63 | if(_span.first == _span.second) 64 | _span.second = it->next(); 65 | } 66 | return *this; 67 | } 68 | 69 | charspan_iter &charspan_iter::operator--() 70 | { 71 | if(*this != begin()) 72 | { 73 | _span = make_pair(it->previous(), _span.first); 74 | if(_span.first == _span.second) 75 | _span.first = it->previous(); 76 | } 77 | return *this; 78 | } 79 | 80 | charspan_iter charspan_iter::operator--(int) 81 | { 82 | if(*this == begin()) 83 | return *this; 84 | auto other = charspan_iter(*this); 85 | other._span = make_pair(other.it->previous(), other._span.first); 86 | if(other._span.first == other._span.second) 87 | other._span.first = other.it->previous(); 88 | return other; 89 | } 90 | 91 | const UnicodeString &charspan_iter::string() const 92 | { 93 | return *s; 94 | } 95 | 96 | const pair &charspan_iter::span() const 97 | { 98 | return _span; 99 | } 100 | 101 | bool 
charspan_iter::operator!=(const charspan_iter &other) const 102 | { 103 | return s != other.s || _span != other._span; 104 | } 105 | 106 | bool charspan_iter::operator==(const charspan_iter &other) const 107 | { 108 | return s == other.s && _span == other._span; 109 | } 110 | 111 | charspan_iter charspan_iter::begin() 112 | { 113 | return charspan_iter(*s); 114 | } 115 | 116 | charspan_iter charspan_iter::end() 117 | { 118 | charspan_iter cs_it(*s); 119 | cs_it._span.first = cs_it.it->last(); 120 | cs_it._span.second = BreakIterator::DONE; 121 | return cs_it; 122 | } 123 | 124 | bool charspan_iter::at_end() const 125 | { 126 | return _span.second == BreakIterator::DONE; 127 | } 128 | 129 | 130 | char_iter::char_iter(const UnicodeString &s) : it(s) 131 | { 132 | } 133 | char_iter::char_iter(const charspan_iter &it) : it(it) 134 | { 135 | } 136 | char_iter::char_iter(const char_iter &cit) : it(cit.it) 137 | { 138 | } 139 | 140 | char_iter rev_char_iter(const UnicodeString &s) { 141 | return char_iter(rev_charspan_iter(s)); 142 | } 143 | 144 | char_iter &char_iter::operator++() { 145 | ++it; 146 | return *this; 147 | } 148 | char_iter char_iter::operator++(int) { 149 | char_iter it(*this); 150 | ++this->it; 151 | return it; 152 | } 153 | char_iter &char_iter::operator--() { 154 | --it; 155 | return *this; 156 | } 157 | char_iter char_iter::operator--(int) { 158 | char_iter it = *this; 159 | --this->it; 160 | return it; 161 | } 162 | char_iter char_iter::begin() 163 | { 164 | return char_iter(string()); 165 | } 166 | char_iter char_iter::end() 167 | { 168 | return char_iter(it.end()); 169 | } 170 | bool char_iter::at_end() 171 | { 172 | return it.at_end(); 173 | } 174 | const UnicodeString &char_iter::string() const 175 | { 176 | return it.string(); 177 | } 178 | bool char_iter::operator!=(const char_iter &other) const 179 | { 180 | return it != other.it; 181 | } 182 | bool char_iter::operator==(const char_iter &other) const 183 | { 184 | return it == other.it; 185 | } 
186 | pair char_iter::span() const 187 | { 188 | return it.span(); 189 | } 190 | 191 | UString to_ustring(const UnicodeString &str) 192 | { 193 | UString temp; 194 | temp.append(str.getBuffer(), (unsigned int)str.length()); 195 | return temp; 196 | } 197 | -------------------------------------------------------------------------------- /src/icu-iter.h: -------------------------------------------------------------------------------- 1 | #ifndef _LEXD_ICU_ITER_H_ 2 | #define _LEXD_ICU_ITER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | class charspan_iter 13 | { 14 | private: 15 | icu::BreakIterator* it; 16 | UErrorCode _status; 17 | const icu::UnicodeString *s; 18 | std::pair _span; 19 | public: 20 | charspan_iter(const icu::UnicodeString &s); 21 | charspan_iter(const charspan_iter &other); 22 | friend charspan_iter rev_charspan_iter(const icu::UnicodeString &s); 23 | 24 | ~charspan_iter(); 25 | 26 | const UErrorCode &status() const; 27 | const std::pair &operator*() const; 28 | charspan_iter operator++(int); 29 | charspan_iter &operator++(); 30 | charspan_iter &operator--(); 31 | charspan_iter operator--(int); 32 | const icu::UnicodeString &string() const; 33 | const std::pair &span() const; 34 | bool operator!=(const charspan_iter &other) const; 35 | bool operator==(const charspan_iter &other) const; 36 | charspan_iter begin(); 37 | charspan_iter end(); 38 | // checks (*this == end()), but without explicitly constructing 39 | // end(), so it's much faster 40 | bool at_end() const; 41 | }; 42 | charspan_iter rev_charspan_iter(const icu::UnicodeString &s); 43 | 44 | class char_iter 45 | { 46 | private: 47 | charspan_iter it; 48 | public: 49 | char_iter(const icu::UnicodeString &s); 50 | char_iter(const charspan_iter &it); 51 | char_iter(const char_iter &cit); 52 | friend char_iter rev_char_iter(const icu::UnicodeString &s); 53 | 54 | char_iter &operator++(); 55 | char_iter operator++(int); 56 | char_iter 
&operator--(); 57 | char_iter operator--(int); 58 | char_iter begin(); 59 | char_iter end(); 60 | bool at_end(); 61 | const icu::UnicodeString &string() const; 62 | inline icu::UnicodeString operator*() const { return string().tempSubStringBetween(it.span().first, it.span().second); } 63 | bool operator!=(const char_iter &other) const; 64 | bool operator==(const char_iter &other) const; 65 | std::pair span() const; 66 | }; 67 | 68 | char_iter rev_char_iter(const icu::UnicodeString &s); 69 | 70 | UString to_ustring(const icu::UnicodeString &str); 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/lexd.1: -------------------------------------------------------------------------------- 1 | .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.13. 2 | .TH LEXD "1" "January 2021" "lexd 1.0.0" "User Commands" 3 | .SH NAME 4 | lexd \- lexd 5 | .SH DESCRIPTION 6 | lexd: compile lexd files to transducers 7 | USAGE: lexd [\-abcfmx] [rule_file [output_file]] 8 | .TP 9 | \fB\-a\fR, \fB\-\-align\fR: 10 | align labels (prefer a:0 b:b to a:b b:0) 11 | .TP 12 | \fB\-b\fR, \fB\-\-bin\fR: 13 | output as Lttoolbox binary file (default is AT&T format) 14 | .TP 15 | \fB\-c\fR, \fB\-\-compress\fR: 16 | condense labels (prefer a:b to 0:b a:0 \- sets \fB\-\-align\fR) 17 | .TP 18 | \fB\-f\fR, \fB\-\-flags\fR: 19 | compile using flag diacritics 20 | .TP 21 | \fB\-m\fR, \fB\-\-minimize\fR: 22 | do hyperminimization (sets \fB\-f\fR) 23 | .TP 24 | \fB\-t\fR, \fB\-\-tags\fR: 25 | compile tags and filters with flag diacritics (sets \fB\-f\fR) 26 | .HP 27 | \fB\-x\fR, \fB\-\-statistics\fR: print lexicon and pattern sizes to stderr 28 | -------------------------------------------------------------------------------- /src/lexd.cc: -------------------------------------------------------------------------------- 1 | #include "lexdcompiler.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | 
void endProgram(char *name) 11 | { 12 | if(name != NULL) 13 | { 14 | cout << basename(name) << " v" << VERSION << ": compile lexd files to transducers" << endl; 15 | cout << "USAGE: " << basename(name) << " [-abcfmtvxUV] [rule_file [output_file]]" << endl; 16 | cout << " -a, --align: align labels (prefer a:0 b:b to a:b b:0)" << endl; 17 | cout << " -b, --bin: output as Lttoolbox binary file (default is AT&T format)" << endl; 18 | cout << " -c, --compress: condense labels (prefer a:b to 0:b a:0 - sets --align)" << endl; 19 | cout << " -f, --flags: compile using flag diacritics" << endl; 20 | cout << " -m, --minimize: do hyperminimization (sets -f)" << endl; 21 | cout << " -t, --tags: compile tags and filters with flag diacritics (sets -f)" << endl; 22 | cout << " -v, --verbose: compile verbosely" << endl; 23 | cout << " -U, --no-combine: represent multi-codepoint glyphs as multiple transitions" << endl; 24 | cout << " -V, --version: print version string" << endl; 25 | cout << " -x, --statistics: print lexicon and pattern sizes to stderr" << endl; 26 | } 27 | exit(EXIT_FAILURE); 28 | } 29 | 30 | int main(int argc, char *argv[]) 31 | { 32 | LtLocale::tryToSetLocale(); 33 | 34 | bool bin = false; 35 | bool flags = false; 36 | bool single = false; 37 | bool stats = false; 38 | UFILE* input = u_finit(stdin, NULL, NULL); 39 | UFILE* output = u_finit(stdout, NULL, NULL); 40 | LexdCompiler comp; 41 | 42 | #if HAVE_GETOPT_LONG 43 | int option_index=0; 44 | #endif 45 | 46 | while (true) { 47 | #if HAVE_GETOPT_LONG 48 | static struct option long_options[] = 49 | { 50 | {"align", no_argument, 0, 'a'}, 51 | {"bin", no_argument, 0, 'b'}, 52 | {"compress", no_argument, 0, 'c'}, 53 | {"flags", no_argument, 0, 'f'}, 54 | {"help", no_argument, 0, 'h'}, 55 | {"minimize", no_argument, 0, 'm'}, 56 | {"single", no_argument, 0, 's'}, 57 | {"tags", no_argument, 0, 't'}, 58 | {"verbose", no_argument, 0, 'v'}, 59 | {"no-combine",no_argument, 0, 'U'}, 60 | {"version", no_argument, 0, 'V'}, 61 
| {"statistics",no_argument, 0, 'x'}, 62 | {0, 0, 0, 0} 63 | }; 64 | 65 | int cnt=getopt_long(argc, argv, "abcfhmstvUVx", long_options, &option_index); 66 | #else 67 | int cnt=getopt(argc, argv, "abcfhmstvUVx"); 68 | #endif 69 | if (cnt==-1) 70 | break; 71 | 72 | switch (cnt) 73 | { 74 | case 'a': 75 | comp.setShouldAlign(true); 76 | break; 77 | 78 | case 'b': 79 | bin = true; 80 | break; 81 | 82 | case 'c': 83 | comp.setShouldAlign(true); 84 | comp.setShouldCompress(true); 85 | break; 86 | 87 | case 'f': 88 | flags = true; 89 | break; 90 | 91 | case 'm': 92 | flags = true; 93 | comp.setShouldHypermin(true); 94 | break; 95 | 96 | case 's': 97 | single = true; 98 | break; 99 | 100 | case 't': 101 | flags = true; 102 | comp.setTagsAsFlags(true); 103 | break; 104 | 105 | case 'v': 106 | comp.setVerbose(true); 107 | break; 108 | 109 | case 'U': 110 | comp.setShouldCombine(false); 111 | break; 112 | 113 | case 'V': 114 | cout << "lexd " << VERSION << endl; 115 | return 0; 116 | break; 117 | 118 | case 'x': 119 | stats = true; 120 | break; 121 | 122 | case 'h': // fallthrough 123 | default: 124 | endProgram(argv[0]); 125 | break; 126 | } 127 | } 128 | 129 | string infile; 130 | string outfile; 131 | switch(argc - optind) 132 | { 133 | case 0: 134 | break; 135 | 136 | case 1: 137 | infile = argv[argc-1]; 138 | break; 139 | 140 | case 2: 141 | infile = argv[argc-2]; 142 | outfile = argv[argc-1]; 143 | break; 144 | 145 | default: 146 | endProgram(argv[0]); 147 | break; 148 | } 149 | 150 | if(infile != "" && infile != "-") 151 | { 152 | input = u_fopen(infile.c_str(), "rb", NULL, NULL); 153 | if(!input) 154 | { 155 | cerr << "Error: Cannot open file '" << infile << "' for reading." << endl; 156 | exit(EXIT_FAILURE); 157 | } 158 | } 159 | 160 | if(outfile != "" && outfile != "-") 161 | { 162 | output = u_fopen(outfile.c_str(), "wb", NULL, NULL); 163 | if(!output) 164 | { 165 | cerr << "Error: Cannot open file '" << outfile << "' for writing." 
<< endl; 166 | exit(EXIT_FAILURE); 167 | } 168 | } 169 | 170 | comp.readFile(input); 171 | u_fclose(input); 172 | Transducer* transducer = (single ? comp.buildTransducerSingleLexicon() : comp.buildTransducer(flags)); 173 | if(stats) 174 | comp.printStatistics(); 175 | if(!transducer) 176 | cerr << "Warning: output is empty transducer." << endl; 177 | else if(bin) 178 | { 179 | // TODO: finish this! 180 | //fwrite(HEADER_LTTOOLBOX, 1, 4, output); 181 | //uint64_t features = 0; 182 | //write_le(output, features); 183 | 184 | // letters 185 | //Compression::string_write(""_u, output); 186 | //comp.alphabet.write(output); 187 | //Compression::multibyte_write(1, output); 188 | //Compression::string_write("main"_u, output); 189 | //transducer->write(output); 190 | } 191 | else 192 | { 193 | transducer->show(comp.alphabet, output, 0, true); 194 | } 195 | u_fclose(output); 196 | delete transducer; 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- /src/lexdcompiler.h: -------------------------------------------------------------------------------- 1 | #ifndef __LEXDCOMPILER__ 2 | #define __LEXDCOMPILER__ 3 | 4 | #include "icu-iter.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace std; 21 | using namespace icu; 22 | 23 | struct string_ref { 24 | unsigned int i; 25 | string_ref() : i(0) {} 26 | explicit string_ref(unsigned int _i) : i(_i) {} 27 | explicit operator unsigned int() const { return i; } 28 | bool operator == (string_ref other) const { return i == other.i; } 29 | bool operator != (string_ref other) const { return !(*this == other); } 30 | bool operator < (string_ref other) const { return i < other.i; } 31 | bool operator !() const { return empty(); } 32 | string_ref operator || (string_ref other) const { 33 | return i ? 
*this : other; 34 | } 35 | bool empty() const { return i == 0; } 36 | bool valid() const { return i != 0; } 37 | }; 38 | 39 | template<> 40 | struct std::hash { 41 | size_t operator()(const string_ref &t) const 42 | { 43 | return std::hash()(t.i); 44 | } 45 | }; 46 | 47 | template 48 | bool subset(const set &xs, const set &ys) 49 | { 50 | if(xs.size() > ys.size()) 51 | return false; 52 | for(auto x: xs) 53 | if(ys.find(x) == ys.end()) 54 | return false; 55 | return true; 56 | } 57 | 58 | template 59 | bool subset_strict(const set &xs, const set &ys) 60 | { 61 | if(xs.size() >= ys.size()) 62 | return false; 63 | return subset(xs, ys); 64 | } 65 | 66 | template 67 | set unionset(const set &xs, const set &ys) 68 | { 69 | set u = xs; 70 | unionset_inplace(u, ys); 71 | return u; 72 | } 73 | 74 | template 75 | void unionset_inplace(set &xs, const set &ys) 76 | { 77 | xs.insert(ys.begin(), ys.end()); 78 | return; 79 | } 80 | 81 | template 82 | set intersectset(const set &xs, const set &ys) 83 | { 84 | set i = xs; 85 | for(auto x: xs) 86 | if(ys.find(x) == ys.end()) 87 | i.erase(x); 88 | return i; 89 | } 90 | 91 | template 92 | set subtractset(const set &xs, const set &ys) 93 | { 94 | set i = xs; 95 | subtractset_inplace(i, ys); 96 | return i; 97 | } 98 | 99 | template 100 | void subtractset_inplace(set &xs, const set &ys) 101 | { 102 | for(const auto &y: ys) 103 | xs.erase(y); 104 | } 105 | 106 | struct lex_token_t; 107 | 108 | class tags_t : public set 109 | { 110 | using set::set; 111 | public: 112 | tags_t(const set &s) : set(s) { } 113 | }; 114 | class pos_tag_filter_t : public set 115 | { 116 | using set::set; 117 | public: 118 | pos_tag_filter_t(const set &s) : set(s) { } 119 | }; 120 | class neg_tag_filter_t : public set 121 | { 122 | using set::set; 123 | public: 124 | neg_tag_filter_t(const set &s) : set(s) { } 125 | }; 126 | 127 | class tag_filter_t; 128 | class op_tag_filter_t : public set 129 | { 130 | using set::set; 131 | public: 132 | op_tag_filter_t(const 
set &s) : set(s) { } 133 | virtual std::vector distribute(const tag_filter_t &tags) const = 0; 134 | UChar sigil = '?'; 135 | }; 136 | struct tag_filter_t { 137 | tag_filter_t() = default; 138 | tag_filter_t(const pos_tag_filter_t &pos, const neg_tag_filter_t &neg, const vector> &ops) : _pos(pos), _neg(neg), _ops(ops) { } 139 | tag_filter_t(const pos_tag_filter_t &pos, const neg_tag_filter_t &neg) : _pos(pos), _neg(neg) { } 140 | tag_filter_t(const pos_tag_filter_t &pos) : _pos(pos) {} 141 | tag_filter_t(const neg_tag_filter_t &neg) : _neg(neg) {} 142 | tag_filter_t(const vector> &ops) : _ops(ops) {} 143 | bool empty() const { return pos().empty() && neg().empty() && ops().empty(); } 144 | bool operator<(const tag_filter_t &t) const 145 | { 146 | return _pos < t._pos || (_pos == t._pos && _neg < t._neg) || (_pos == t._pos && _neg == t._neg && _ops < t._ops); 147 | } 148 | bool operator==(const tag_filter_t &t) const 149 | { 150 | return _pos == t._pos && _neg == t._neg && _ops == t._ops; 151 | } 152 | bool compatible(const tags_t &tags) const; 153 | bool combinable(const tag_filter_t &other) const; 154 | bool applicable(const tags_t &tags) const; 155 | bool try_apply(tags_t &tags) const; 156 | const pos_tag_filter_t &pos() const { return _pos; } 157 | const neg_tag_filter_t &neg() const { return _neg; } 158 | const vector> &ops() const { return _ops; } 159 | const tags_t tags() { return unionset(tags_t(_pos), tags_t(_neg)); } 160 | 161 | bool combine(const tag_filter_t &other); 162 | 163 | vector distribute() const 164 | { 165 | vector filters = { tag_filter_t(_pos, _neg) }; 166 | for (auto op : ops()) 167 | { 168 | vector next_filters; 169 | for (auto &f : filters) 170 | for (auto &nf : op->distribute(f)) 171 | next_filters.push_back(nf); 172 | filters = next_filters; 173 | } 174 | return filters; 175 | } 176 | 177 | private: 178 | pos_tag_filter_t _pos; 179 | neg_tag_filter_t _neg; 180 | vector> _ops; 181 | }; 182 | class or_tag_filter_t : public op_tag_filter_t 
183 | { 184 | using op_tag_filter_t::op_tag_filter_t; 185 | public: 186 | or_tag_filter_t(const set &s) : op_tag_filter_t(s) { } 187 | virtual std::vector distribute(const tag_filter_t &tags) const { 188 | std::vector res; 189 | for (auto &tag : *this) 190 | { 191 | tag_filter_t tags_ = tags; 192 | if(!tags_.combine(tag_filter_t(pos_tag_filter_t { tag }))) 193 | continue; 194 | res.push_back(tags_); 195 | } 196 | return res; 197 | } 198 | UChar sigil = '|'; 199 | }; 200 | class xor_tag_filter_t : public op_tag_filter_t 201 | { 202 | using op_tag_filter_t::op_tag_filter_t; 203 | public: 204 | xor_tag_filter_t(const set &s) : op_tag_filter_t(s) { } 205 | virtual std::vector distribute(const tag_filter_t &tags) const { 206 | std::vector res; 207 | for (auto &tag : *this) 208 | { 209 | tag_filter_t tags_ = tags; 210 | if(!tags_.combine(tag_filter_t(pos_tag_filter_t { tag }))) 211 | continue; 212 | neg_tag_filter_t neg(*this); 213 | subtractset_inplace(neg, { tag }); 214 | if(!tags_.combine(tag_filter_t(neg))) 215 | continue; 216 | res.push_back(tags_); 217 | } 218 | return res; 219 | } 220 | UChar sigil = '^'; 221 | }; 222 | 223 | 224 | struct token_t { 225 | string_ref name; 226 | unsigned int part; 227 | bool optional; 228 | bool operator<(const token_t &t) const 229 | { 230 | return name < t.name || (name == t.name && part < t.part) || (name == t.name && part == t.part && optional < t.optional) ; 231 | } 232 | bool operator==(const token_t &t) const 233 | { 234 | return name == t.name && part == t.part && optional == t.optional; 235 | } 236 | }; 237 | 238 | struct trans_sym_t { 239 | int i; 240 | trans_sym_t() : i(0) {} 241 | explicit trans_sym_t(int _i) : i(_i) {} 242 | explicit operator int() const { return i; } 243 | bool operator == (trans_sym_t other) const { return i == other.i; } 244 | bool operator < (trans_sym_t other) const { return i < other.i; } 245 | trans_sym_t operator || (trans_sym_t other) const { 246 | return i ? 
*this : other; 247 | } 248 | }; 249 | 250 | struct lex_token_t { 251 | vector symbols; 252 | tags_t tags; 253 | bool operator ==(const lex_token_t &other) const { return symbols == other.symbols && tags == other.tags; } 254 | }; 255 | 256 | struct lex_seg_t { 257 | lex_token_t left, right; 258 | Transducer* regex = nullptr; 259 | tags_t tags; 260 | bool operator == (const lex_seg_t &t) const 261 | { 262 | return left == t.left && right == t.right && tags == t.tags; 263 | } 264 | }; 265 | 266 | enum RepeatMode 267 | { 268 | Optional = 1, 269 | Repeated = 2, 270 | 271 | Normal = 0, 272 | Question = 1, 273 | Plus = 2, 274 | Star = 3 275 | }; 276 | 277 | struct pattern_element_t { 278 | token_t left, right; 279 | tag_filter_t tag_filter; 280 | RepeatMode mode; 281 | 282 | bool operator<(const pattern_element_t& o) const 283 | { 284 | return left < o.left || (left == o.left && right < o.right) || (left == o.left && right == o.right && mode < o.mode) || (left == o.left && right == o.right && mode == o.mode && tag_filter < o.tag_filter); 285 | } 286 | 287 | bool operator==(const pattern_element_t& o) const 288 | { 289 | return left == o.left && right == o.right && mode == o.mode && tag_filter == o.tag_filter; 290 | } 291 | 292 | bool compatible(const lex_seg_t &tok) const; 293 | 294 | bool optional() const 295 | { 296 | return ((left.name.valid() && left.optional) || 297 | (right.name.valid() && right.optional)); 298 | } 299 | 300 | }; 301 | 302 | typedef vector pattern_t; 303 | typedef vector entry_t; 304 | typedef int line_number_t; 305 | 306 | enum FlagDiacriticType 307 | { 308 | Unification, 309 | Positive, 310 | Negative, 311 | Require, 312 | Disallow, 313 | Clear 314 | }; 315 | 316 | class LexdCompiler 317 | { 318 | private: 319 | bool shouldAlign = false; 320 | bool shouldCompress = false; 321 | bool shouldCombine = true; 322 | bool tagsAsFlags = false; 323 | bool shouldHypermin = false; 324 | bool tagsAsMinFlags = false; 325 | bool verbose = false; 326 | 327 | map 
name_to_id; 328 | vector id_to_name; 329 | 330 | const UnicodeString &name(string_ref r) const; 331 | 332 | map> lexicons; 333 | // { id => [ ( line, [ pattern ] ) ] } 334 | map>> patterns; 335 | map patternTransducers; 336 | map lexiconTransducers; 337 | map> entryTransducers; 338 | map> flagsUsed; 339 | map> transducerLocs; 340 | map lexiconFreedom; 341 | 342 | UFILE* input = nullptr; 343 | bool inLex = false; 344 | bool inPat = false; 345 | vector currentLexicon; 346 | tags_t currentLexicon_tags; 347 | string_ref currentLexiconId; 348 | unsigned int currentLexiconPartCount; 349 | string_ref currentPatternId; 350 | line_number_t lineNumber = 0; 351 | bool doneReading = false; 352 | unsigned int anonymousCount = 0; 353 | unsigned int transitionCount = 0; 354 | 355 | Transducer* hyperminTrans; 356 | 357 | string_ref left_sieve_name; 358 | string_ref right_sieve_name; 359 | vector left_sieve_tok; 360 | vector right_sieve_tok; 361 | 362 | void die(const char* msg, ...); 363 | UnicodeString printPattern(const pattern_element_t& pat); 364 | UnicodeString printFilter(const tag_filter_t& filter); 365 | void finishLexicon(); 366 | string_ref internName(const UnicodeString& name); 367 | string_ref checkName(UnicodeString& name); 368 | RepeatMode readModifier(char_iter& iter); 369 | tag_filter_t readTagFilter(char_iter& iter, UnicodeString& line); 370 | tags_t readTags(char_iter& iter, UnicodeString& line); 371 | void appendSymbol(const UnicodeString& s, lex_token_t& tok); 372 | void readSymbol(char_iter& iter, UnicodeString& line, lex_token_t& tok); 373 | int processRegexTokenSeq(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state); 374 | int processRegexGroup(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state, unsigned int depth); 375 | lex_seg_t processLexiconSegment(char_iter& iter, UnicodeString& line, unsigned int part_count); 376 | token_t readToken(char_iter& iter, UnicodeString& line); 377 | pattern_element_t 
readPatternElement(char_iter& iter, UnicodeString& line); 378 | void processPattern(char_iter& iter, UnicodeString& line); 379 | void processNextLine(); 380 | 381 | bool isLexiconToken(const pattern_element_t& tok); 382 | vector determineFreedom(pattern_t& pat); 383 | map matchedParts; 384 | void applyMode(Transducer* trans, RepeatMode mode); 385 | void insertEntry(Transducer* trans, const lex_seg_t &seg); 386 | void appendLexicon(string_ref lexicon_id, const vector &to_append); 387 | Transducer* getLexiconTransducer(pattern_element_t tok, unsigned int entry_index, bool free); 388 | void buildPattern(int state, Transducer* t, const pattern_t& pat, vector is_free, unsigned int pos); 389 | Transducer* buildPattern(const pattern_element_t &tok); 390 | Transducer* buildPatternWithFlags(const pattern_element_t &tok, int pattern_start_state); 391 | trans_sym_t alphabet_lookup(const UnicodeString &symbol); 392 | trans_sym_t alphabet_lookup(trans_sym_t l, trans_sym_t r); 393 | 394 | int insertPreTags(Transducer* t, int state, tag_filter_t &tags); 395 | int insertPostTags(Transducer* t, int state, tag_filter_t &tags); 396 | void encodeFlag(UnicodeString& str, int flag); 397 | trans_sym_t getFlag(FlagDiacriticType type, string_ref flag, unsigned int value); 398 | Transducer* getLexiconTransducerWithFlags(pattern_element_t& tok, bool free); 399 | 400 | void buildAllLexicons(); 401 | int buildPatternSingleLexicon(pattern_element_t tok, int start_state); 402 | 403 | public: 404 | LexdCompiler(); 405 | ~LexdCompiler(); 406 | Alphabet alphabet; 407 | void setShouldAlign(bool val) 408 | { 409 | shouldAlign = val; 410 | } 411 | void setShouldCompress(bool val) 412 | { 413 | shouldCompress = val; 414 | } 415 | void setShouldCombine(bool val) 416 | { 417 | shouldCombine = val; 418 | } 419 | void setTagsAsFlags(bool val) 420 | { 421 | tagsAsFlags = val; 422 | } 423 | void setShouldHypermin(bool val) 424 | { 425 | shouldHypermin = val; 426 | } 427 | void setVerbose(bool val) 428 | { 
429 | verbose = val; 430 | } 431 | Transducer* buildTransducer(bool usingFlags); 432 | Transducer* buildTransducerSingleLexicon(); 433 | void readFile(UFILE* infile); 434 | void printStatistics() const; 435 | }; 436 | 437 | #endif 438 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Lexd Test Data 2 | 3 | The example files in this directory are adapted from the following repositories: 4 | 5 | - `heb` 6 | - These were created specifically as test data, though the list of roots was extracted from https://github.com/apertium/apertium-heb 7 | - `kik` 8 | - `.lexc` and `.twoc` files were copied from https://github.com/ksteimel/apertium-kik 9 | - `lin` 10 | - `.lexc` and `.twoc` files were copied from https://github.com/apertium/apertium-lin 11 | - `wad` 12 | - These files are a simplified version of https://github.com/apertium/apertium-wad 13 | 14 | Timing data for each of these languages can be obtained by running the following command from this directory. 15 | 16 | ```bash 17 | ./timing.sh [repetitions] code 18 | ``` 19 | 20 | This will print each command run along with execution time and maximum memory usage. Specifying a number of repetitions will repeat each command and report the total time. 
21 | -------------------------------------------------------------------------------- /tests/comp.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | NounStem NounInfl 3 | NounStem NounInflComp Comp NounStem2 NounInfl 4 | 5 | LEXICON Comp 6 | +: 7 | 8 | LEXICON NounStem 9 | #bloop 10 | shoop 11 | blarg 12 | 13 | ALIAS NounStem NounStem2 14 | 15 | LEXICON NounInfl 16 | : 17 | :ah 18 | 19 | LEXICON NounInflComp 20 | :a 21 | -------------------------------------------------------------------------------- /tests/feature/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: check check-pos check-neg clean 2 | O=. 3 | tests = \ 4 | alt \ 5 | anonlex \ 6 | anonlex-modifier \ 7 | anonpat \ 8 | anonpat-filter \ 9 | anonpat-filter-ops \ 10 | anonpat-modifier \ 11 | anonpat-nospaces \ 12 | anonpat-ops \ 13 | conflicting-tags \ 14 | diacritic \ 15 | disjoint-opt \ 16 | empty \ 17 | empty-patterns \ 18 | filter-crosstalk \ 19 | lexdeftag \ 20 | lexicon-side-tags \ 21 | lexname-space \ 22 | lextag \ 23 | lexnegtag \ 24 | nontree \ 25 | oneside \ 26 | opt \ 27 | or-filter \ 28 | pairs \ 29 | pattag \ 30 | pattag-coherent \ 31 | pattag-details \ 32 | pattern-independence \ 33 | regex \ 34 | revsieve \ 35 | sieve \ 36 | sieveopt \ 37 | slots-and-operators-nospace \ 38 | xor-filter \ 39 | xor-multi \ 40 | 41 | sources = $(foreach test,$(tests),test-$(test).lexd) 42 | # Positive tests: one .check stamp per test-<name>.lexd source. 43 | check-pos: $(foreach src,$(sources),$(O)/$(src).txt.strings.check) 44 | 45 | negtests = \ 46 | col0 \ 47 | trailing-bracket \ 48 | 49 | negsources = $(foreach test,$(negtests),negtest-$(test).lexd) 50 | # Negative tests: one .error file per negtest-<name>.lexd source. 51 | check-neg: $(foreach src,$(negsources),$(O)/$(src).txt.error) 52 | 53 | check: check-pos check-neg 54 | 55 | # O (output directory) is assigned on line 2: prerequisite lists are expanded as they are read, so it must be set before check-pos/check-neg. 
56 | # Rules: compile each .lexd, expand its strings, diff them against the .gold file, then touch a .check stamp; negative tests must make lexd exit with status 1. 57 | $(O): 58 | mkdir -p $(O) 59 | $(O)/%.lexd.txt: ../../src/lexd %.lexd | $(O) 60 | $^ $(LEXD_TEST_FLAGS) > $@ 61 | $(O)/%.lexd.txt.strings: $(O)/%.lexd.txt 62 | hfst-txt2fst $< | hfst-fst2strings -X obey-flags -c 10 | LC_ALL=C sort -u > $@ 63 | $(O)/%.strings.diff: $(O)/%.strings %.strings.gold 64 | diff -U0 $^ > $@; [ $$? != 2 ] 65 | $(O)/%.strings.check: $(O)/%.strings.diff 66 | [ -s "$<" ] && cat "$<" && exit 1; touch $@ 67 | $(O)/%.lexd.txt.error: ../../src/lexd %.lexd | $(O) 68 | $^ $(LEXD_TEST_FLAGS) > /dev/null 2> $@; [ $$? = 1 ] 69 | clean: 70 | rm -f $(foreach src,$(sources),$(O)/$(src).txt.strings.check) 71 | rm -f $(foreach src,$(negsources),$(O)/$(src).txt.error) 72 | rmdir $(O) 2>/dev/null || true 73 | -------------------------------------------------------------------------------- /tests/feature/negtest-col0.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X(0) 3 | 4 | LEXICON X 5 | a:b 6 | -------------------------------------------------------------------------------- /tests/feature/negtest-trailing-bracket.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X 3 | 4 | LEXICON X 5 | a] 6 | -------------------------------------------------------------------------------- /tests/feature/test-alt.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | pattern1 3 | 4 | PATTERN pattern1 5 | A | B 6 | C:|:D 7 | 8 | LEXICON A 9 | a 10 | 11 | LEXICON B 12 | b 13 | 14 | LEXICON C 15 | c 16 | 17 | LEXICON D 18 | d 19 | -------------------------------------------------------------------------------- /tests/feature/test-alt.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | :d 2 | a 3 | b 4 | c: 5 | -------------------------------------------------------------------------------- /tests/feature/test-anonlex-modifier.lexd: 
-------------------------------------------------------------------------------- 1 | PATTERNS 2 | [a] [b]? [c] 3 | 4 | -------------------------------------------------------------------------------- /tests/feature/test-anonlex-modifier.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | abc 2 | ac 3 | -------------------------------------------------------------------------------- /tests/feature/test-anonlex.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [ a ] 3 | -------------------------------------------------------------------------------- /tests/feature/test-anonlex.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-filter-ops.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | (A)[^[a,b]] 3 | 4 | LEXICON A 5 | apple[a] 6 | banana[b] 7 | orange[a,b] 8 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-filter-ops.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | apple 2 | banana 3 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-filter.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | (Adj Noun)[nofruit,-nocolor] 3 | (Adj Noun)[-nofruit,nocolor] 4 | 5 | LEXICON Adj 6 | bright[nofruit] 7 | green 8 | tasty[nocolor] 9 | impetuous[nofruit,nocolor] 10 | 11 | LEXICON Noun 12 | apple[nocolor] 13 | orange 14 | green[nofruit] 15 | cat[nofruit,nocolor] 16 | 17 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-filter.lexd.txt.strings.gold: 
-------------------------------------------------------------------------------- 1 | brightgreen 2 | brightorange 3 | greenapple 4 | greengreen 5 | tastyapple 6 | tastyorange 7 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-modifier.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [a] ([b])? [c] 3 | 4 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-modifier.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | abc 2 | ac 3 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-nospaces.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | (A) 3 | 4 | LEXICON A 5 | a 6 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-nospaces.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-ops.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A|(C?) 
3 | 4 | LEXICON A 5 | a 6 | 7 | LEXICON C 8 | c 9 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat-ops.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | 2 | a 3 | c 4 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | ( a b ) | c 3 | 4 | LEXICON a 5 | a 6 | 7 | LEXICON b 8 | b 9 | 10 | LEXICON c 11 | c 12 | -------------------------------------------------------------------------------- /tests/feature/test-anonpat.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | ab 2 | c 3 | -------------------------------------------------------------------------------- /tests/feature/test-conflicting-tags.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | Verbs-IV 3 | 4 | PATTERN VerbStemBase 5 | V-IV [:>[nonpunct]] V-Aspect-Hab 6 | V-IV [:>[punct]] V-Aspect-Punct # error reported here 7 | 8 | PATTERN VerbStem 9 | VerbStemBase[^[A1,A2]] 10 | 11 | PATTERN Verbs-IV 12 | :V-Agent VerbStem[-nonpunct] V-Agent: 13 | :V-Agent VerbStem[stat] V-Agent: 14 | 15 | LEXICON V-IV 16 | stem 17 | 18 | LEXICON V-Aspect-Hab 19 | :{a}haʔ[A1] 20 | :{a}s[A2] 21 | 22 | LEXICON V-Aspect-Punct 23 | :{a}{ʔ}[A1] 24 | :{a}{ʔ}[A2] 25 | 26 | LEXICON V-Agent 27 | :{G}{e} 28 | :{y}ag{n}{I} -------------------------------------------------------------------------------- /tests/feature/test-conflicting-tags.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | stem:{y}ag{n}{I}stem>{a}{ʔ} 2 | stem:{G}{e}stem>{a}{ʔ} 3 | -------------------------------------------------------------------------------- /tests/feature/test-diacritic.lexd: 
-------------------------------------------------------------------------------- 1 | PATTERNS 2 | X 3 | Y(2) 4 | 5 | LEXICON X 6 | \ַ 7 | :ֶ 8 | :\ֻ 9 | x\ַ 10 | 11 | LEXICON Y(2) 12 | a ַ 13 | -------------------------------------------------------------------------------- /tests/feature/test-diacritic.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | :ֶ 2 | :ֻ 3 | xַ 4 | ַ 5 | -------------------------------------------------------------------------------- /tests/feature/test-disjoint-opt.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A?(1) B A?(1) 3 | 4 | LEXICON A 5 | a 6 | aa 7 | 8 | LEXICON B 9 | b 10 | bb 11 | -------------------------------------------------------------------------------- /tests/feature/test-disjoint-opt.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | aabaa 2 | aabbaa 3 | aba 4 | abba 5 | b 6 | bb 7 | -------------------------------------------------------------------------------- /tests/feature/test-empty-patterns.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | Case[t] 3 | Case[s] # comment to get the correct answer 4 | 5 | LEXICON Obl 6 | 7 | 8 | LEXICON OblCase 9 | [t] 10 | 11 | PATTERN Case 12 | Obl OblCase 13 | 14 | -------------------------------------------------------------------------------- /tests/feature/test-empty-patterns.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/feature/test-empty.lexd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/tests/feature/test-empty.lexd 
-------------------------------------------------------------------------------- /tests/feature/test-empty.lexd.txt.strings.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lexd/342ace457be8ba86c80e04a3d6bc666715692076/tests/feature/test-empty.lexd.txt.strings.gold -------------------------------------------------------------------------------- /tests/feature/test-filter-crosstalk.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | Phrase[nofruit,-nocolor] 3 | Phrase[-nofruit,nocolor] 4 | 5 | PATTERN Phrase 6 | Adj Noun 7 | 8 | LEXICON Adj 9 | bright[nofruit] 10 | 11 | LEXICON Noun 12 | apple[nocolor] 13 | orange[nofruit] 14 | 15 | -------------------------------------------------------------------------------- /tests/feature/test-filter-crosstalk.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | brightorange 2 | -------------------------------------------------------------------------------- /tests/feature/test-lexdeftag.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A[x] 3 | B[x] 4 | 5 | LEXICON A[x] 6 | apple 7 | banana[-x] 8 | 9 | LEXICON A 10 | orange 11 | pear[x] 12 | 13 | LEXICON B[x]:[y] 14 | nope[-x] 15 | yep:yep[-y,x] # left side gets x 16 | -------------------------------------------------------------------------------- /tests/feature/test-lexdeftag.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | apple 2 | pear 3 | yep 4 | -------------------------------------------------------------------------------- /tests/feature/test-lexicon-side-tags.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X(1):X(2)[tag] 3 | 4 | 5 | LEXICON X(2) 6 | a[tag] b 7 | 8 | 
-------------------------------------------------------------------------------- /tests/feature/test-lexicon-side-tags.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a:b 2 | -------------------------------------------------------------------------------- /tests/feature/test-lexname-space.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [a] 3 | 4 | LEXICON X 5 | blah 6 | 7 | LEXICON Y 8 | bloop 9 | 10 | LEXICON Z(2) 11 | x y 12 | 13 | LEXICON W(3) 14 | a b c 15 | -------------------------------------------------------------------------------- /tests/feature/test-lexname-space.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /tests/feature/test-lexnegtag.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [a] A[s,-r] 3 | [b] A[-t,s] 4 | [c] A[-s] 5 | 6 | LEXICON A 7 | a[t,s] 8 | b[s,r] 9 | c[t,r] 10 | -------------------------------------------------------------------------------- /tests/feature/test-lexnegtag.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | aa 2 | bb 3 | cc 4 | -------------------------------------------------------------------------------- /tests/feature/test-lextag.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [a] A[t] 3 | [b] A[r] 4 | [ab] A[s] 5 | 6 | LEXICON A 7 | a[t,s] 8 | b[s,r] 9 | -------------------------------------------------------------------------------- /tests/feature/test-lextag.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | aa 2 | aba 3 | abb 4 | bb 5 | -------------------------------------------------------------------------------- 
/tests/feature/test-nontree.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | parentpat 3 | parentpat2 4 | 5 | PATTERN parentpat 6 | childpat 7 | 8 | PATTERN parentpat2 9 | childpat 10 | 11 | PATTERN childpat 12 | child 13 | 14 | LEXICON child 15 | x 16 | -------------------------------------------------------------------------------- /tests/feature/test-nontree.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | x 2 | -------------------------------------------------------------------------------- /tests/feature/test-oneside.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A: 3 | :A 4 | 5 | LEXICON A 6 | a1:a2 7 | -------------------------------------------------------------------------------- /tests/feature/test-oneside.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | :a2 2 | a1: 3 | -------------------------------------------------------------------------------- /tests/feature/test-opt.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | pattern1 3 | 4 | PATTERN pattern1 5 | A? B? 6 | C:? :D? 
7 | 8 | LEXICON A 9 | a 10 | 11 | LEXICON B 12 | b 13 | 14 | LEXICON C 15 | c 16 | 17 | LEXICON D 18 | d 19 | -------------------------------------------------------------------------------- /tests/feature/test-opt.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | 2 | :d 3 | a 4 | ab 5 | b 6 | c: 7 | c:d 8 | -------------------------------------------------------------------------------- /tests/feature/test-or-filter.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A[|[a,b]] 3 | 4 | LEXICON A 5 | apple[a] 6 | banana[b] 7 | orange 8 | delaware[notafruit] 9 | -------------------------------------------------------------------------------- /tests/feature/test-or-filter.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | apple 2 | banana 3 | -------------------------------------------------------------------------------- /tests/feature/test-pairs.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | pattern 3 | 4 | PATTERN pattern 5 | x(1):y(2) x(2):y(1) 6 | 7 | LEXICON x(2) 8 | x X 9 | 10 | LEXICON y(2) 11 | y Y 12 | -------------------------------------------------------------------------------- /tests/feature/test-pairs.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | xX:Yy 2 | -------------------------------------------------------------------------------- /tests/feature/test-pattag-coherent.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | 3 | B(1)[x] A(1)[x] B(1) 4 | B(1)[-x] A(1)[-x] B(1) 5 | 6 | LEXICON A 7 | 8 | a-no-x 9 | a-x[x] 10 | 11 | LEXICON B 12 | 13 | b-no-x 14 | b-x[x] 15 | 16 | -------------------------------------------------------------------------------- /tests/feature/test-pattag-coherent.lexd.txt.strings.gold: 
-------------------------------------------------------------------------------- 1 | b-no-xa-no-xb-no-x 2 | b-xa-xb-x 3 | -------------------------------------------------------------------------------- /tests/feature/test-pattag-details.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X[t,-s] 3 | 4 | PATTERN X 5 | A B 6 | C 7 | 8 | LEXICON A 9 | a 10 | at[t] 11 | as[s] 12 | ast[s,t] 13 | 14 | LEXICON B 15 | b 16 | bt[t] 17 | bs[s] 18 | bst[t,s] 19 | 20 | LEXICON C 21 | ct[t] 22 | cs[s] 23 | cst[t,s] 24 | -------------------------------------------------------------------------------- /tests/feature/test-pattag-details.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | abt 2 | atb 3 | atbt 4 | ct 5 | -------------------------------------------------------------------------------- /tests/feature/test-pattag.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [t] A[t] 3 | [nott] A[-t] 4 | 5 | PATTERN A 6 | B 7 | C 8 | 9 | LEXICON B 10 | a[t] 11 | 12 | LEXICON C 13 | b[s] 14 | -------------------------------------------------------------------------------- /tests/feature/test-pattag.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | nottb 2 | ta 3 | -------------------------------------------------------------------------------- /tests/feature/test-pattern-independence.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X A 3 | B B 4 | 5 | PATTERN X 6 | A A 7 | 8 | LEXICON A 9 | a1 10 | a2 11 | 12 | LEXICON B 13 | b1 14 | b2 15 | -------------------------------------------------------------------------------- /tests/feature/test-pattern-independence.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a1a1a1 2 | a1a1a2 3 | a2a2a1 4 | a2a2a2 5 | 
b1b1 6 | b2b2 7 | -------------------------------------------------------------------------------- /tests/feature/test-regex.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | [/a/] 3 | RE 4 | COLRE(1) COLRE(2) 5 | TWOSIDED 6 | 7 | LEXICON RE 8 | // 9 | /c[d-f]g/ 10 | /h(i)?/ 11 | /j|k/ 12 | /(l(m)?)?/ 13 | 14 | LEXICON COLRE(2) 15 | /n[op]/ q 16 | r /[s-u]v/ 17 | 18 | LEXICON TWOSIDED 19 | /w:x[yz]/ 20 | -------------------------------------------------------------------------------- /tests/feature/test-regex.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | 2 | 3 | a 4 | cdg 5 | ceg 6 | cfg 7 | h 8 | hi 9 | j 10 | k 11 | l 12 | lm 13 | noq 14 | npq 15 | rsv 16 | rtv 17 | ruv 18 | w:xy 19 | w:xz 20 | -------------------------------------------------------------------------------- /tests/feature/test-revsieve.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | b(1) < b(2) < b(3) 3 | 4 | LEXICON b(3) 5 | b1 b2 b3 6 | -------------------------------------------------------------------------------- /tests/feature/test-revsieve.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | b1b2b3 2 | b2b3 3 | b3 4 | -------------------------------------------------------------------------------- /tests/feature/test-sieve.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | pattern 3 | 4 | LEXICON f(3) 5 | f1 f2 f3 6 | g1 g2 g3 7 | 8 | LEXICON b(3) 9 | b1 b2 b3 10 | c1 c2 c3 11 | 12 | LEXICON m 13 | m 14 | 15 | PATTERN pattern 16 | f(1) > f(2) > f(3) 17 | b(1) < b(2) < b(3) 18 | b(1) < m > f(1) 19 | -------------------------------------------------------------------------------- /tests/feature/test-sieve.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | 
b1b2b3 2 | b1m 3 | b1mf1 4 | b1mg1 5 | b2b3 6 | b3 7 | c1c2c3 8 | c1m 9 | c1mf1 10 | c1mg1 11 | c2c3 12 | c3 13 | f1 14 | f1f2 15 | f1f2f3 16 | g1 17 | g1g2 18 | g1g2g3 19 | m 20 | mf1 21 | mg1 22 | -------------------------------------------------------------------------------- /tests/feature/test-sieveopt.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A > B|C 3 | 4 | LEXICON A 5 | a 6 | 7 | LEXICON B 8 | b 9 | 10 | LEXICON C 11 | c 12 | -------------------------------------------------------------------------------- /tests/feature/test-sieveopt.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | a 2 | ab 3 | ac 4 | -------------------------------------------------------------------------------- /tests/feature/test-slots-and-operators-nospace.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | X(1)|Y(1) [z] 3 | 4 | LEXICON X(2) 5 | x1 x2 6 | 7 | LEXICON Y(2) 8 | y1 y2 9 | -------------------------------------------------------------------------------- /tests/feature/test-slots-and-operators-nospace.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | x1z 2 | y1z 3 | -------------------------------------------------------------------------------- /tests/feature/test-xor-filter.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | A[^[a,b]] 3 | 4 | LEXICON A 5 | apple[a] 6 | banana[b] 7 | orange[a,b] 8 | -------------------------------------------------------------------------------- /tests/feature/test-xor-filter.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | apple 2 | banana 3 | -------------------------------------------------------------------------------- /tests/feature/test-xor-multi.lexd: 
-------------------------------------------------------------------------------- 1 | PATTERNS 2 | Phrase[^[nofruit,nocolor]] 3 | 4 | PATTERN Phrase 5 | Adj Noun 6 | 7 | LEXICON Adj 8 | bright[nofruit] 9 | green 10 | tasty[nocolor] 11 | impetuous[nofruit,nocolor] 12 | 13 | LEXICON Noun 14 | apple[nocolor] 15 | orange 16 | green[nofruit] 17 | cat[nofruit,nocolor] 18 | 19 | -------------------------------------------------------------------------------- /tests/feature/test-xor-multi.lexd.txt.strings.gold: -------------------------------------------------------------------------------- 1 | brightgreen 2 | brightorange 3 | greenapple 4 | greengreen 5 | tastyapple 6 | tastyorange 7 | -------------------------------------------------------------------------------- /tests/heb.lexc: -------------------------------------------------------------------------------- 1 | Multichar_Symbols 2 | % 3 | % 4 | % 5 | % 6 | % 7 | % 8 | % 9 | % 10 | % 11 | % 12 | 13 | %{V1%} 14 | %{V2%} 15 | 16 | 17 | %[%+1sg%] %[%+1pl%] %[%+2sgm%] %[%+2sgf%] %[%+2plm%] %[%+2plf%] %[%+3sgm%] %[%+3sgf%] %[%+3plm%] %[%+3plf%] 18 | %[%-1sg%] %[%-1pl%] %[%-2sgm%] %[%-2sgf%] %[%-2plm%] %[%-2plf%] %[%-3sgm%] %[%-3sgf%] %[%-3plm%] %[%-3plf%] 19 | 20 | %[perf%] 21 | %[impf%] 22 | 23 | LEXICON Root 24 | %[%+1sg%]:%[impf%]אֶ VerbRoot ; 25 | %[%+1pl%]:%[impf%]נִ VerbRoot ; 26 | %[%+2sgm%]:%[impf%]תִ VerbRoot ; 27 | %[%+2sgf%]:%[impf%]תִ VerbRoot ; 28 | %[%+2plm%]:%[impf%]תִ VerbRoot ; 29 | %[%+2plf%]:%[impf%]תִ VerbRoot ; 30 | %[%+3sgm%]:%[impf%]יִ VerbRoot ; 31 | %[%+3sgf%]:%[impf%]תִ VerbRoot ; 32 | %[%+3plm%]:%[impf%]יִ VerbRoot ; 33 | %[%+3plf%]:%[impf%]תִ VerbRoot ; 34 | VerbRoot ; 35 | 36 | LEXICON VerbInflSuf 37 | %%%%%[%-1sg%]:%[impf%] # ; 38 | %%%%%[%-1pl%]:%[impf%] # ; 39 | %%%%%%[%-2sgm%]:%[impf%] # ; 40 | %%%%%%[%-2sgf%]:ִי%[impf%] # ; 41 | %%%%%%[%-2plm%]:ו%[impf%] # ; 42 | %%%%%%[%-2plf%]:נָה%[impf%] # ; 43 | %%%%%%[%-3sgm%]:%[impf%] # ; 44 | %%%%%%[%-3sgf%]:%[impf%] # ; 45 | %%%%%%[%-3plm%]:ו%[impf%] 
# ; 46 | %%%%%%[%-3plf%]:נָה%[impf%] # ; 47 | 48 | %%%%:תִי%[perf%] # ; 49 | %%%%:נו%[perf%] # ; 50 | %%%%%:תָ%[perf%] # ; 51 | %%%%%:ת%[perf%] # ; 52 | %%%%%:תֶמ%[perf%] # ; 53 | %%%%%:תֶנ%[perf%] # ; 54 | %%%%%:%[perf%] # ; 55 | %%%%%:ָה%[perf%] # ; 56 | %%%%%:ו%[perf%] # ; 57 | %%%%%:ו%[perf%] # ; 58 | 59 | 60 | LEXICON VerbRoot 61 | אכל:א%{V1%}כ%{V2}ל VerbInflSuf ; 62 | אכף:א%{V1%}כ%{V2}ף VerbInflSuf ; 63 | אמר:א%{V1%}מ%{V2}ר VerbInflSuf ; 64 | אסף:א%{V1%}ס%{V2}ף VerbInflSuf ; 65 | אסר:א%{V1%}ס%{V2}ר VerbInflSuf ; 66 | ארך:א%{V1%}ר%{V2}ך VerbInflSuf ; 67 | אשר:א%{V1%}ש%{V2}ר VerbInflSuf ; 68 | בחר:ב%{V1%}ח%{V2}ר VerbInflSuf ; 69 | בלע:ב%{V1%}ל%{V2}ע VerbInflSuf ; 70 | ברח:ב%{V1%}ר%{V2}ח VerbInflSuf ; 71 | גאל:ג%{V1%}א%{V2}ל VerbInflSuf ; 72 | גדל:ג%{V1%}ד%{V2}ל VerbInflSuf ; 73 | גזז:ג%{V1%}ז%{V2}ז VerbInflSuf ; 74 | גזר:ג%{V1%}ז%{V2}ר VerbInflSuf ; 75 | גנב:ג%{V1%}נ%{V2}ב VerbInflSuf ; 76 | גנח:ג%{V1%}נ%{V2}ח VerbInflSuf ; 77 | גרד:ג%{V1%}ר%{V2}ד VerbInflSuf ; 78 | דחס:ד%{V1%}ח%{V2}ס VerbInflSuf ; 79 | דעך:ד%{V1%}ע%{V2}ך VerbInflSuf ; 80 | היה:ה%{V1%}י%{V2}ה VerbInflSuf ; 81 | הלך:ה%{V1%}ל%{V2}ך VerbInflSuf ; 82 | הפך:ה%{V1%}פ%{V2}ך VerbInflSuf ; 83 | הפר:ה%{V1%}פ%{V2}ר VerbInflSuf ; 84 | הרג:ה%{V1%}ר%{V2}ג VerbInflSuf ; 85 | הרס:ה%{V1%}ר%{V2}ס VerbInflSuf ; 86 | זכה:ז%{V1%}כ%{V2}ה VerbInflSuf ; 87 | זרח:ז%{V1%}ר%{V2}ח VerbInflSuf ; 88 | זרק:ז%{V1%}ר%{V2}ק VerbInflSuf ; 89 | חבר:ח%{V1%}ב%{V2}ר VerbInflSuf ; 90 | חדר:ח%{V1%}ד%{V2}ר VerbInflSuf ; 91 | חלב:ח%{V1%}ל%{V2}ב VerbInflSuf ; 92 | חלה:ח%{V1%}ל%{V2}ה VerbInflSuf ; 93 | חלם:ח%{V1%}ל%{V2}ם VerbInflSuf ; 94 | חנק:ח%{V1%}נ%{V2}ק VerbInflSuf ; 95 | חסם:ח%{V1%}ס%{V2}ם VerbInflSuf ; 96 | חפר:ח%{V1%}פ%{V2}ר VerbInflSuf ; 97 | חפש:ח%{V1%}פ%{V2}ש VerbInflSuf ; 98 | חשב:ח%{V1%}ש%{V2}ב VerbInflSuf ; 99 | חשף:ח%{V1%}ש%{V2}ף VerbInflSuf ; 100 | טחן:ט%{V1%}ח%{V2}ן VerbInflSuf ; 101 | טעם:ט%{V1%}ע%{V2}ם VerbInflSuf ; 102 | טרף:ט%{V1%}ר%{V2}ף VerbInflSuf ; 103 | ידע:י%{V1%}ד%{V2}ע VerbInflSuf ; 104 | יכל:י%{V1%}כ%{V2}ל 
VerbInflSuf ; 105 | יצא:י%{V1%}צ%{V2}א VerbInflSuf ; 106 | יצר:י%{V1%}צ%{V2}ר VerbInflSuf ; 107 | ירד:י%{V1%}ר%{V2}ד VerbInflSuf ; 108 | ירק:י%{V1%}ר%{V2}ק VerbInflSuf ; 109 | ישב:י%{V1%}ש%{V2}ב VerbInflSuf ; 110 | ישן:י%{V1%}ש%{V2}ן VerbInflSuf ; 111 | כלל:כ%{V1%}ל%{V2}ל VerbInflSuf ; 112 | כתב:כ%{V1%}ת%{V2}ב VerbInflSuf ; 113 | לחץ:ל%{V1%}ח%{V2}ץ VerbInflSuf ; 114 | לקח:ל%{V1%}ק%{V2}ח VerbInflSuf ; 115 | מדד:מ%{V1%}ד%{V2}ד VerbInflSuf ; 116 | מחא:מ%{V1%}ח%{V2}א VerbInflSuf ; 117 | מכר:מ%{V1%}כ%{V2}ר VerbInflSuf ; 118 | מסר:מ%{V1%}ס%{V2}ר VerbInflSuf ; 119 | מצא:מ%{V1%}צ%{V2}א VerbInflSuf ; 120 | מרח:מ%{V1%}ר%{V2}ח VerbInflSuf ; 121 | משך:מ%{V1%}ש%{V2}ך VerbInflSuf ; 122 | נגע:נ%{V1%}ג%{V2}ע VerbInflSuf ; 123 | נזף:נ%{V1%}ז%{V2}ף VerbInflSuf ; 124 | נצץ:נ%{V1%}צ%{V2}ץ VerbInflSuf ; 125 | נשא:נ%{V1%}ש%{V2}א VerbInflSuf ; 126 | נשך:נ%{V1%}ש%{V2}ך VerbInflSuf ; 127 | נתן:נ%{V1%}ת%{V2}ן VerbInflSuf ; 128 | סגר:ס%{V1%}ג%{V2}ר VerbInflSuf ; 129 | סדק:ס%{V1%}ד%{V2}ק VerbInflSuf ; 130 | סחט:ס%{V1%}ח%{V2}ט VerbInflSuf ; 131 | סלח:ס%{V1%}ל%{V2}ח VerbInflSuf ; 132 | ספר:ס%{V1%}פ%{V2}ר VerbInflSuf ; 133 | סרח:ס%{V1%}ר%{V2}ח VerbInflSuf ; 134 | סתם:ס%{V1%}ת%{V2}ם VerbInflSuf ; 135 | עבד:ע%{V1%}ב%{V2}ד VerbInflSuf ; 136 | עזב:ע%{V1%}ז%{V2}ב VerbInflSuf ; 137 | עזר:ע%{V1%}ז%{V2}ר VerbInflSuf ; 138 | עכר:ע%{V1%}כ%{V2}ר VerbInflSuf ; 139 | עמד:ע%{V1%}מ%{V2}ד VerbInflSuf ; 140 | עצר:ע%{V1%}צ%{V2}ר VerbInflSuf ; 141 | עשה:ע%{V1%}ש%{V2}ה VerbInflSuf ; 142 | עשק:ע%{V1%}ש%{V2}ק VerbInflSuf ; 143 | פגע:פ%{V1%}ג%{V2}ע VerbInflSuf ; 144 | פנה:פ%{V1%}נ%{V2}ה VerbInflSuf ; 145 | פרם:פ%{V1%}ר%{V2}ם VerbInflSuf ; 146 | פרץ:פ%{V1%}ר%{V2}ץ VerbInflSuf ; 147 | פרק:פ%{V1%}ר%{V2}ק VerbInflSuf ; 148 | פשט:פ%{V1%}ש%{V2}ט VerbInflSuf ; 149 | פתח:פ%{V1%}ת%{V2}ח VerbInflSuf ; 150 | צבט:צ%{V1%}ב%{V2}ט VerbInflSuf ; 151 | צבע:צ%{V1%}ב%{V2}ע VerbInflSuf ; 152 | צחק:צ%{V1%}ח%{V2}ק VerbInflSuf ; 153 | צלל:צ%{V1%}ל%{V2}ל VerbInflSuf ; 154 | צעק:צ%{V1%}ע%{V2}ק VerbInflSuf ; 155 | קבל:ק%{V1%}ב%{V2}ל 
VerbInflSuf ; 156 | קבר:ק%{V1%}ב%{V2}ר VerbInflSuf ; 157 | קפץ:ק%{V1%}פ%{V2}ץ VerbInflSuf ; 158 | קצץ:ק%{V1%}צ%{V2}ץ VerbInflSuf ; 159 | קרא:ק%{V1%}ר%{V2}א VerbInflSuf ; 160 | קרב:ק%{V1%}ר%{V2}ב VerbInflSuf ; 161 | קרה:ק%{V1%}ר%{V2}ה VerbInflSuf ; 162 | קשר:ק%{V1%}ש%{V2}ר VerbInflSuf ; 163 | ראה:ר%{V1%}א%{V2}ה VerbInflSuf ; 164 | רזה:ר%{V1%}ז%{V2}ה VerbInflSuf ; 165 | רכב:ר%{V1%}כ%{V2}ב VerbInflSuf ; 166 | רעד:ר%{V1%}ע%{V2}ד VerbInflSuf ; 167 | רצה:ר%{V1%}צ%{V2}ה VerbInflSuf ; 168 | רקד:ר%{V1%}ק%{V2}ד VerbInflSuf ; 169 | רתח:ר%{V1%}ת%{V2}ח VerbInflSuf ; 170 | שאל:ש%{V1%}א%{V2}ל VerbInflSuf ; 171 | שבר:ש%{V1%}ב%{V2}ר VerbInflSuf ; 172 | שחה:ש%{V1%}ח%{V2}ה VerbInflSuf ; 173 | שטף:ש%{V1%}ט%{V2}ף VerbInflSuf ; 174 | שלח:ש%{V1%}ל%{V2}ח VerbInflSuf ; 175 | שלל:ש%{V1%}ל%{V2}ל VerbInflSuf ; 176 | שמח:ש%{V1%}מ%{V2}ח VerbInflSuf ; 177 | שמע:ש%{V1%}מ%{V2}ע VerbInflSuf ; 178 | שנא:ש%{V1%}נ%{V2}א VerbInflSuf ; 179 | שקט:ש%{V1%}ק%{V2}ט VerbInflSuf ; 180 | שקל:ש%{V1%}ק%{V2}ל VerbInflSuf ; 181 | שרף:ש%{V1%}ר%{V2}ף VerbInflSuf ; 182 | שרץ:ש%{V1%}ר%{V2}ץ VerbInflSuf ; 183 | שתה:ש%{V1%}ת%{V2}ה VerbInflSuf ; 184 | תלה:ת%{V1%}ל%{V2}ה VerbInflSuf ; 185 | תסס:ת%{V1%}ס%{V2}ס VerbInflSuf ; 186 | תפס:ת%{V1%}פ%{V2}ס VerbInflSuf ; 187 | תקף:ת%{V1%}ק%{V2}ף VerbInflSuf ; 188 | 189 | -------------------------------------------------------------------------------- /tests/heb.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | :VerbInfl(1) VerbRoot(1) :VerbInfl(2) VerbRoot(2) :VerbInfl(3) VerbRoot(3) :VerbInfl(4) VerbInfl(1): 3 | 4 | LEXICON VerbInfl(4) 5 | :אֶ : ֹ : 6 | :נִ : ֹ : 7 | :תִ : ֹ : 8 | :תִ : ֹ ִי 9 | :תִ : ֹ ו 10 | :תִ : ֹ נָה 11 | :יִ : ֹ : 12 | :תִ : ֹ : 13 | :יִ : ֹ ו 14 | :תִ : ֹ נָה 15 | : ָ ַ תִי 16 | : ָ ַ נו 17 | : ָ ַ תָ 18 | : ָ ַ ת 19 | : ָ ַ תֶמ 20 | : ָ ַ תֶנ 21 | : ָ ַ : 22 | : ָ ַ ָה 23 | : ָ ַ ו 24 | : ָ ַ ו 25 | 26 | 27 | LEXICON VerbRoot(3) 28 | א כ ל 29 | א כ ף 30 | א מ ר 31 | א ס ף 32 | א ס ר 33 | א ר ך 34 | א ש ר 
35 | ב ח ר 36 | ב ל ע 37 | ב ר ח 38 | ג א ל 39 | ג ד ל 40 | ג ז ז 41 | ג ז ר 42 | ג נ ב 43 | ג נ ח 44 | ג ר ד 45 | ד ח ס 46 | ד ע ך 47 | ה י ה 48 | ה ל ך 49 | ה פ ך 50 | ה פ ר 51 | ה ר ג 52 | ה ר ס 53 | ז כ ה 54 | ז ר ח 55 | ז ר ק 56 | ח ב ר 57 | ח ד ר 58 | ח ל ב 59 | ח ל ה 60 | ח ל ם 61 | ח נ ק 62 | ח ס ם 63 | ח פ ר 64 | ח פ ש 65 | ח ש ב 66 | ח ש ף 67 | ט ח ן 68 | ט ע ם 69 | ט ר ף 70 | י ד ע 71 | י כ ל 72 | י צ א 73 | י צ ר 74 | י ר ד 75 | י ר ק 76 | י ש ב 77 | י ש ן 78 | כ ל ל 79 | כ ת ב 80 | ל ח ץ 81 | ל ק ח 82 | מ ד ד 83 | מ ח א 84 | מ כ ר 85 | מ ס ר 86 | מ צ א 87 | מ ר ח 88 | מ ש ך 89 | נ ג ע 90 | נ ז ף 91 | נ צ ץ 92 | נ ש א 93 | נ ש ך 94 | נ ת ן 95 | ס ג ר 96 | ס ד ק 97 | ס ח ט 98 | ס ל ח 99 | ס פ ר 100 | ס ר ח 101 | ס ת ם 102 | ע ב ד 103 | ע ז ב 104 | ע ז ר 105 | ע כ ר 106 | ע מ ד 107 | ע צ ר 108 | ע ש ה 109 | ע ש ק 110 | פ ג ע 111 | פ נ ה 112 | פ ר ם 113 | פ ר ץ 114 | פ ר ק 115 | פ ש ט 116 | פ ת ח 117 | צ ב ט 118 | צ ב ע 119 | צ ח ק 120 | צ ל ל 121 | צ ע ק 122 | ק ב ל 123 | ק ב ר 124 | ק פ ץ 125 | ק צ ץ 126 | ק ר א 127 | ק ר ב 128 | ק ר ה 129 | ק ש ר 130 | ר א ה 131 | ר ז ה 132 | ר כ ב 133 | ר ע ד 134 | ר צ ה 135 | ר ק ד 136 | ר ת ח 137 | ש א ל 138 | ש ב ר 139 | ש ח ה 140 | ש ט ף 141 | ש ל ח 142 | ש ל ל 143 | ש מ ח 144 | ש מ ע 145 | ש נ א 146 | ש ק ט 147 | ש ק ל 148 | ש ר ף 149 | ש ר ץ 150 | ש ת ה 151 | ת ל ה 152 | ת ס ס 153 | ת פ ס 154 | ת ק ף 155 | 156 | -------------------------------------------------------------------------------- /tests/heb.sh2: -------------------------------------------------------------------------------- 1 | hfst-lexc -s heb.lexc -o heb.lexc.hfst 2 | hfst-twolc -s heb.twoc -o heb.twoc.hfst 3 | hfst-twolc -s heb_vow.twoc -o heb_vow.twoc.hfst 4 | hfst-invert heb.lexc.hfst | hfst-compose-intersect -1 - -2 heb.twoc.hfst | hfst-invert -o heb.lexc-twoc.hfst 5 | hfst-compose-intersect -1 heb.lexc-twoc.hfst -2 heb_vow.twoc.hfst -o heb.nomin.hfst 6 | hfst-minimize heb.nomin.hfst -o heb.hfst 7 | ../src/lexd heb.lexd heb.att 8 | hfst-txt2fst 
heb.att -o heb_d.hfst 9 | -------------------------------------------------------------------------------- /tests/heb.twoc: -------------------------------------------------------------------------------- 1 | Alphabet 2 | 3 | ם כ ה ג ש ז צ נ ל פ ע י ב ף ץ מ ך ת ח ד ס ר א ט ק ן 4 | 5 | %[%+1sg%]:0 %[%+1pl%]:0 %[%+2sgm%]:0 %[%+2sgf%]:0 %[%+2plm%]:0 %[%+2plf%]:0 %[%+3sgm%]:0 %[%+3sgf%]:0 %[%+3plm%]:0 %[%+3plf%]:0 6 | %[%-1sg%]:0 %[%-1pl%]:0 %[%-2sgm%]:0 %[%-2sgf%]:0 %[%-2plm%]:0 %[%-2plf%]:0 %[%-3sgm%]:0 %[%-3sgf%]:0 %[%-3plm%]:0 %[%-3plf%]:0 7 | 8 | % 9 | % 10 | % 11 | % 12 | % 13 | % 14 | % 15 | % 16 | % 17 | % 18 | 19 | ; 20 | 21 | Sets 22 | 23 | Prefix = %[%+1sg%] %[%+1pl%] %[%+2sgm%] %[%+2sgf%] %[%+2plm%] %[%+2plf%] %[%+3sgm%] %[%+3sgf%] %[%+3plm%] %[%+3plf%] ; 24 | Suffix = %[%-1sg%] %[%-1pl%] %[%-2sgm%] %[%-2sgf%] %[%-2plm%] %[%-2plf%] %[%-3sgm%] %[%-3sgf%] %[%-3plm%] %[%-3plf%] ; 25 | 26 | Rules 27 | 28 | "Remove paths without matching suffix feature" 29 | Fx:0 /<= _ ; 30 | except 31 | _ :* Fy:0 ; 32 | where Fx in Prefix 33 | Fy in Suffix 34 | matched ; 35 | 36 | "Remove paths without matching prefix feature" 37 | Fy:0 /<= _ ; 38 | except 39 | Fx:0 :* _ ; 40 | where Fx in Prefix 41 | Fy in Suffix 42 | matched ; 43 | 44 | -------------------------------------------------------------------------------- /tests/heb_vow.twoc: -------------------------------------------------------------------------------- 1 | Alphabet 2 | 3 | ם כ ה ג ש ז צ נ ל פ ע י ב ף ץ מ ך ת ח ד ס ר א ט ק ן 4 | ֶ ָ ִ ַ 5 | 6 | %[perf%]:0 7 | %[impf%]:0 8 | 9 | %{V1%}:ָ 10 | %{V2%}:ַ 11 | 12 | ; 13 | 14 | Sets 15 | 16 | blah = %[perf%] %[impf%] ; 17 | 18 | Rules 19 | 20 | "imperfect vowel 1" 21 | %{V1%}:0 <=> _ :* %[impf%]:0 ; 22 | 23 | "imperfect vowel 2" 24 | %{V2%}:ֹ <=> _ :* %[impf%]:0 ; 25 | -------------------------------------------------------------------------------- /tests/kik.lexd: -------------------------------------------------------------------------------- 1 | 
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2 | #!!! L E X I C O N !!! 3 | #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 4 | 5 | LEXICON NounLocative 6 | :inĩ 7 | :-inĩ 8 | : 9 | 10 | LEXICON NounRootCl1 11 | ndũ:nd{Ũ} # person 12 | arimũ:arim{Ũ} # teacher 13 | rĩmi:{R}ĩm{I} # farmer 14 | timia:{T}imi{A} # woman 15 | tumia:{T}umi{A} # woman, alternate spelling 16 | nyina:nyin{A} # mother 17 | thomi:thom{I} # student 18 | thuri:thur{I} # man 19 | arĩ:{A}r{Ĩ} # daughter 20 | 21 | LEXICON NounRootCl1Irr 22 | wagui:wagu{I} 23 | 24 | LEXICON Cl1 25 | :{NULL} 26 | 27 | LEXICON NounRootCl3 28 | aki:{A}k{I} # fire 29 | ithikiri:{I}thikir{I} # bicycle 30 | bira:bir{A} # ball 31 | rango:rang{O} # door 32 | ciĩ:{C}i{Ĩ} # home 33 | tĩ:{T}{Ĩ} # tree 34 | cuha:{C}uh{A} # swing 35 | rũthi:{R}ũth{I} # lion 36 | thanga:thang{A} # sand 37 | ĩkĩre:{Ĩ}kĩr{E} # method 38 | thenya:theny{A} # day; also gap 39 | gũnda:gũnd{A} # garden 40 | gambĩre:gambĩre # voice 41 | 42 | LEXICON Cl3 43 | :{NULL} 44 | 45 | LEXICON NounRootCl5 46 | hũa:hũ{A} # flower 47 | tunda:{T}und{A} # fruit 48 | rima:{R}im{A} # hole 49 | ria:{R}i{A} # milk, or lake/pond 50 | bera:ber{A} # pear 51 | gũrũ:gũr{Ũ} # sky 52 | aĩ:{BLOCK}{A}{BLOCK}{Ĩ} # water 53 | higa:hig{A} # stone 54 | embe:{E}mb{E} # mango 55 | rigũ:{R}ig{Ũ} # banana 56 | inabu:{I}nab{U} 57 | ĩciria:{Ĩ}ciri{A} 58 | taha:{T}ah{A} # mataha (a food made with corn and beans!) 
59 | thangũ:thang{Ũ} # leaf 60 | mithimithi:mithimith{I} # twilight, always plural 61 | twa:{T}w{A} # name 62 | thaga:thag{A} # 63 | itho:{I}th{O} # eye 64 | rwa:{R}w{A} # letter 65 | 66 | LEXICON Cl5 67 | :{NULL} 68 | 69 | LEXICON NounRootCl7 70 | rĩma:{R}ĩm{A} # mountain 71 | tĩ:{T}{Ĩ} # chair 72 | rio:{R}i{O} # food 73 | kombe:{K}omb{E} # cup 74 | ũra:{Ũ}r{A} # frog 75 | rot:{R}ot # dream 76 | rathi:{R}ath{A} # class 77 | boko:bok{O} 78 | imba:{I}mb{A} 79 | ana:{A}n{A} # child 80 | ambĩrĩria:{A}mbĩrĩri{A} # beginning 81 | ndũ:nd{Ũ} # nothing 82 | tũmi:{T}ũm{I} # reason 83 | ũria:{Ũ}ri{A} # problem 84 | rumi:rum{I} # curse 85 | thama:tham{A} # island 86 | roto:rot{O} # dream (nominalized) 87 | ongo:{O}ng{O} # head 88 | roko:{R}ok{O} # morning 89 | ndo:nd{O} # thing 90 | 91 | LEXICON Cl7 92 | :{NULL} 93 | 94 | LEXICON NounRootCl9p10 95 | bata:bat{A} 96 | buku:buk{U} # book 97 | bembe:bemb{E} # corn 98 | gathĩ:gath{Ĩ} # ladder 99 | gui:gu{I} # dog, plural category uncertain 100 | nyamũ:nyam{Ũ} # animal, plural category uncertain 101 | cukuru:{C}ukur{U} # school, plural category uncertain 102 | ca:{C}{A} # outside, plural category uncertain 103 | kari:{K}ar{I} # car, plural category uncertain 104 | benjũ:benj{Ũ} # pencil 105 | timũ:{T}im{Ũ} # lime 106 | koma:{K}om{A} # spirit, singular category uncertain 107 | metha:meth{A} # table 108 | tũmĩrĩri:{T}ũmĩrĩr{I} # message, plural category uncertain 109 | thimũ:thim{Ũ} # phone, plural category uncertain 110 | thitembũ:thitemb{Ũ} # stamp, plural category uncertain 111 | bahaca:bahac{A} # envelope, plural category uncertain 112 | yama:yam{A} # meat, plural category uncertain 113 | gũkũ:gũk{Ũ} # chicken, plural category uncertain 114 | thaa:tha{A} # hour 115 | bakũri:bakũr{I} 116 | dahi:dah{I} 117 | thia:thi{A} 118 | thĩ:thĩ # not sure if this actually belongs here but it's my best guess based on agreement 119 | tuma:{T}um{A} # darkness 120 | yoka:y{O}k{A} # snake 121 | goga:gog{A} # grace 122 | goro:gor{O} # heart 123 | 
guo:gu{O} # garment 124 | muoy{O}:muoy{O} # life 125 | tiihũ:{T}iih{Ũ} # wave 126 | tihũ:{T}ih{Ũ} # wave 127 | doo:do{O} # bucket 128 | nuthu:{N}uth{U} # half 129 | hĩndĩ:hĩnd{Ĩ} # time (instance) 130 | dagĩka:dagĩk{A} # minute 131 | bathi:bath{I} # bus 132 | 133 | LEXICON NounRootCl9p10Irr 134 | rumu:rumu # room 135 | batĩ:bat{Ĩ} 136 | 137 | 138 | LEXICON Cl9p10 139 | :{NULL} 140 | 141 | LEXICON NounRootCl11p10 142 | rũigi:{R}ũig{I} 143 | gĩrĩ:gĩr{Ĩ} # fence, plural category uncertain 144 | imbo:{I}mb{O} # song, plural category uncertain 145 | ũĩ:{Ũ}{BLOCK}{Ĩ} # river, plural category uncertain 146 | ĩgĩ:{Ĩ}g{Ĩ} # eagle, plural category uncertain 147 | teto:tet{O} # word 148 | gano:gan{O} # story 149 | 150 | LEXICON Cl11p10 151 | :{NULL} 152 | 153 | LEXICON NounRootCl11p6 154 | guoya:guoy{A} 155 | 156 | LEXICON Cl11p6 157 | :{NULL} 158 | 159 | LEXICON NounRootCl12p13 160 | nua:nu{A} 161 | ana:{A}n{A} # child 162 | ihũri:{I}hũr{I} # gourd 163 | hũa:hũ{A} # coffee 164 | hĩĩ:hĩ{Ĩ} # boy 165 | hiũ:hi{Ũ} # knife 166 | gui:gu{I} # dog 167 | hinda:hind{A} # time (as in one time) 168 | 169 | LEXICON Cl12p13 170 | :{NULL} 171 | 172 | LEXICON NounRootCl14 173 | wendi:wend{I} 174 | cũrũ:{C}ũr{Ũ} # porridge 175 | goro:gor{O} # expense 176 | menyi:menyi # information 177 | 178 | LEXICON Cl14 179 | :{NULL} 180 | 181 | LEXICON NounRootCl15 182 | tũ:{T}{Ũ} 183 | rug:{R}ug # cook/cooking 184 | theka:theka # laugh 185 | oko:{O}k{O} 186 | gũrũ:gũr{Ũ} # leg 187 | 188 | LEXICON Cl15 189 | :{NULL} 190 | 191 | PATTERNS 192 | NounRootCl16 193 | NounRootCl17 194 | NounSg NounLocative 195 | NounPl NounLocative 196 | NounDimSg(1) NounSg NounDimSg(2) NounLocative 197 | NounDimPl(1) NounPl NounDimPl(2) NounLocative 198 | NounDimSg(1) NounSg-NoPre NounDimSg(2) NounLocative 199 | NounDimPl(1) NounPl-NoPre NounDimPl(2) NounLocative 200 | 201 | LEXICON NounRootCl16 202 | ndũ:hand{Ũ}{NULL} # place 203 | 204 | LEXICON NounRootCl17 205 | ndũ:kũnd{Ũ}{NULL} 206 | 207 | LEXICON NounDimPl(2) 208 | 
:{T}{Ũ} :{NULL} 209 | 210 | LEXICON NounDimSg(2) 211 | :{K}{A} :{NULL} 212 | 213 | LEXICON NounNumberSg 214 | :{NULL} 215 | 216 | LEXICON NounNumberPl 217 | :{NULL} 218 | 219 | PATTERN NounSg 220 | NumberCl1-Sg NounRootCl1 Cl1 NounNumberSg 221 | NumberCl1Irr-Sg NounRootCl1Irr Cl1 NounNumberSg 222 | NumberCl3-Sg NounRootCl3 Cl3 NounNumberSg 223 | NumberCl5-Sg NounRootCl5 Cl5 NounNumberSg 224 | NumberCl7-Sg NounRootCl7 Cl7 NounNumberSg 225 | NumberCl9p10-Sg NounRootCl9p10 Cl9p10 NounNumberSg 226 | NumberCl9p10Irr-Sg NounRootCl9p10Irr Cl9p10 NounNumberSg 227 | NumberCl11p10-Sg NounRootCl11p10 Cl11p10 NounNumberSg 228 | NumberCl11p6-Sg NounRootCl11p6 Cl11p6 NounNumberSg 229 | NumberCl12p13-Sg NounRootCl12p13 Cl12p13 NounNumberSg 230 | NumberCl14-Sg NounRootCl14 Cl14 NounNumberSg 231 | NumberCl15-Sg NounRootCl15 Cl15 NounNumberSg 232 | 233 | PATTERN NounSg-NoPre 234 | NounRootCl1 Cl1 NounNumberSg 235 | NounRootCl1Irr Cl1 NounNumberSg 236 | NounRootCl3 Cl3 NounNumberSg 237 | NounRootCl5 Cl5 NounNumberSg 238 | NounRootCl7 Cl7 NounNumberSg 239 | NounRootCl9p10 Cl9p10 NounNumberSg 240 | NounRootCl9p10Irr Cl9p10 NounNumberSg 241 | NounRootCl11p10 Cl11p10 NounNumberSg 242 | NounRootCl11p6 Cl11p6 NounNumberSg 243 | NounRootCl12p13 Cl12p13 NounNumberSg 244 | NounRootCl14 Cl14 NounNumberSg 245 | NounRootCl15 Cl15 NounNumberSg 246 | 247 | PATTERN NounPl 248 | NumberCl1-Pl NounRootCl1 Cl1 NounNumberPl 249 | NumberCl1Irr-Pl NounRootCl1Irr Cl1 NounNumberPl 250 | NumberCl3-Pl NounRootCl3 Cl3 NounNumberPl 251 | NumberCl5-Pl NounRootCl5 Cl5 NounNumberPl 252 | NumberCl7-Pl NounRootCl7 Cl7 NounNumberPl 253 | NumberCl9p10-Pl NounRootCl9p10 Cl9p10 NounNumberPl 254 | NumberCl9p10Irr-Pl NounRootCl9p10Irr Cl9p10 NounNumberPl 255 | NumberCl11p10-Pl NounRootCl11p10 Cl11p10 NounNumberPl 256 | NumberCl11p6-Pl NounRootCl11p6 Cl11p6 NounNumberPl 257 | NumberCl12p13-Pl NounRootCl12p13 Cl12p13 NounNumberPl 258 | NumberCl14-Pl NounRootCl14 Cl14 NounNumberPl 259 | NumberCl15-Pl NounRootCl15 Cl15 
NounNumberPl 260 | 261 | PATTERN NounPl-NoPre 262 | NounRootCl1 Cl1 NounNumberPl 263 | NounRootCl1Irr Cl1 NounNumberPl 264 | NounRootCl3 Cl3 NounNumberPl 265 | NounRootCl5 Cl5 NounNumberPl 266 | NounRootCl7 Cl7 NounNumberPl 267 | NounRootCl9p10 Cl9p10 NounNumberPl 268 | NounRootCl9p10Irr Cl9p10 NounNumberPl 269 | NounRootCl11p10 Cl11p10 NounNumberPl 270 | NounRootCl11p6 Cl11p6 NounNumberPl 271 | NounRootCl12p13 Cl12p13 NounNumberPl 272 | NounRootCl14 Cl14 NounNumberPl 273 | NounRootCl15 Cl15 NounNumberPl 274 | 275 | LEXICON NumberCl1-Sg 276 | :m{Ũ}> 277 | LEXICON NumberCl1-Pl 278 | :{A}> 279 | 280 | LEXICON NumberCl1Irr-Sg 281 | :{NULL}> 282 | LEXICON NumberCl1Irr-Pl 283 | :{NULL}> 284 | 285 | LEXICON NumberCl3-Sg 286 | :m{Ũ}> 287 | LEXICON NumberCl3-Pl 288 | :m{Ĩ}> 289 | 290 | LEXICON NumberCl5-Sg 291 | :{RRI}{IRI}> 292 | LEXICON NumberCl5-Pl 293 | :m{A}> 294 | 295 | LEXICON NumberCl7-Sg 296 | :{K}{Ĩ}> 297 | LEXICON NumberCl7-Pl 298 | :{C}{I}> 299 | 300 | LEXICON NumberCl9p10-Sg 301 | :{N}{N2}{N3}> 302 | LEXICON NumberCl9p10-Pl 303 | :{N}{N2}{N3}> 304 | 305 | LEXICON NumberCl9p10Irr-Sg 306 | :{NULL} 307 | LEXICON NumberCl9p10Irr-Pl 308 | :{NULL} 309 | 310 | LEXICON NumberCl11p10-Sg 311 | :{R}{Ũ}> 312 | LEXICON NumberCl11p10-Pl 313 | :{N}{N2}{N3}> 314 | 315 | LEXICON NumberCl11p6-Sg 316 | :{R}{Ũ}> 317 | LEXICON NumberCl11p6-Pl 318 | :m{A}> 319 | 320 | LEXICON NumberCl12p13-Sg 321 | :{K}{A}> 322 | LEXICON NumberCl12p13-Pl 323 | :{T}{Ũ}> 324 | 325 | LEXICON NumberCl14-Sg 326 | :{NULL} 327 | :{Ũ} 328 | LEXICON NumberCl14-Pl 329 | :m{A}> 330 | 331 | LEXICON NumberCl15-Sg 332 | :{K}{Ũ}> 333 | LEXICON NumberCl15-Pl 334 | :m{A}> 335 | 336 | PATTERNS 337 | IrregCopNoClass 338 | IrregularCopula IrregularCopulaClass IrregularCopulaNumber 339 | 340 | LEXICON IrregularCopula 341 | rĩ:nĩ 342 | rĩ:ti 343 | 344 | LEXICON IrregCopNoClass 345 | rĩ:ndĩ 346 | 347 | LEXICON IrregularCopulaNumber 348 | :{NULL} 349 | :{NULL} 350 | 351 | LEXICON IrregularCopulaClass 352 | :{NULL} 353 | 
:{NULL} 354 | :{NULL} 355 | :{NULL} 356 | :{NULL} 357 | :{NULL} 358 | :{NULL} 359 | :{NULL} 360 | :{NULL} 361 | :{NULL} 362 | :{NULL} 363 | :{NULL} 364 | :{NULL} 365 | :{NULL} 366 | 367 | PATTERNS 368 | CopVerbNeg1(1) CopPersNum(1) CopVerbTense CopVerbNeg1(2) CopPersNum(2) 369 | CopVerbNeg1(1) CopPersNumNoNeg(1) CopVerbTense CopVerbNeg1(2) CopPersNumNoNeg(2) 370 | CopPersNum(1) CopVerbNeg2(1) CopVerbTense CopVerbNeg2(2) CopPersNum(2) 371 | 372 | LEXICON CopVerbNeg1(2) 373 | :n{D}> :{NULL} 374 | : : 375 | 376 | LEXICON CopPersNumNoNeg(2) 377 | :{Ũ}> :{NULL}{NULL} 378 | :{A}> :{NULL}{NULL} 379 | :{Ũ}> :{NULL}{NULL} 380 | :{yĨ}> :{NULL}{NULL} 381 | :{Ĩ}> :{NULL}{NULL} 382 | :{Ĩ}> :{NULL}{NULL} 383 | :{Ũ}> :{NULL}{NULL} 384 | 385 | LEXICON CopPersNum(2) 386 | :{N}{N2}{N3}> :{NULL}{NULL} 387 | :{T}{Ũ}> :{NULL}{NULL} 388 | :m{Ũ}> :{NULL}{NULL} 389 | :m{A}> :{NULL}{NULL} 390 | :r{Ĩ}> :{NULL}{NULL} 391 | :m{A}> :{NULL}{NULL} 392 | :{K}{Ĩ}> :{NULL}{NULL} 393 | :{C}{I}> :{NULL}{NULL} 394 | :m{A}> :{NULL}{NULL} 395 | :{C}{I}> :{NULL}{NULL} 396 | :{R}{Ũ}> :{NULL}{NULL} 397 | :m{A}> :{NULL}{NULL} 398 | :{R}{Ũ}> :{NULL}{NULL} 399 | :{C}{I}> :{NULL}{NULL} 400 | :{K}{A}> :{NULL}{NULL} 401 | :m{A}> :{NULL}{NULL} 402 | :{K}{A}> :{NULL}{NULL} 403 | :{T}{Ũ}> :{NULL}{NULL} 404 | :m{A}> :{NULL}{NULL} 405 | :{K}{Ũ}> :{NULL}{NULL} 406 | :m{A}> :{NULL}{NULL} 407 | :ha> :{NULL}{NULL} 408 | :k{Ũ}> :{NULL}{NULL} 409 | 410 | LEXICON CopVerbNeg2(2) 411 | :{T}{I}> :{NULL} 412 | 413 | LEXICON CopVerbTense 414 | ri:{NULL}{U}ma{NULL} 415 | rĩ:{NULL}{Ĩ} 416 | rĩ:{A}>{NULL} 417 | rĩ:r{A}>{NULL} 418 | rĩ:{NULL}{NULL} 419 | 420 | PATTERNS 421 | VerbInfImp(1) VerbStem VerbInfImp(2) 422 | VerbFoc(1) VerbNeg1(1) VerbSubject(1) VerbWithTense VerbNeg1(2) VerbSubject(2) VerbFoc(2) 423 | VerbFoc(1) VerbNeg1(1) VerbSubjectNoNeg(1) VerbWithTense VerbNeg1(2) VerbSubjectNoNeg(2) VerbFoc(2) 424 | VerbFoc(1) VerbSubject(1) VerbNeg2(1) VerbWithTense VerbNeg2(2) VerbSubject(2) VerbFoc(2) 425 | 426 | PATTERN 
VerbWithTense 427 | VerbTenseAspect-A(1) VerbReflexive(1) VerbStem VerbTenseAspect-A(2) VerbSuffix VerbTenseAspect-A(3) VerbTrans VerbFinalVowel-A VerbReflexive(2) 428 | VerbTenseAspect-E(1) VerbReflexive(1) VerbStem VerbTenseAspect-E(2) VerbSuffix VerbTenseAspect-E(3) VerbTrans VerbFinalVowel-E VerbReflexive(2) 429 | 430 | LEXICON VerbInfImp(2) 431 | :{NULL}> :{A} 432 | :{K}{Ũ}> :{A} 433 | 434 | LEXICON VerbFoc(2) 435 | :n{Ĩ}> :{NULL} 436 | : : 437 | 438 | LEXICON VerbSubjectNoNeg(2) 439 | :{Ũ}> :{NULL}{NULL} 440 | :{A}> :{NULL}{NULL} 441 | :{Ũ}> :{NULL}{NULL} 442 | :{Ĩ}> :{NULL}{NULL} 443 | :{Ĩ}> :{NULL}{NULL} 444 | :{Ĩ}> :{NULL}{NULL} 445 | :{Ũ}> :{NULL}{NULL} 446 | 447 | LEXICON VerbSubject(2) 448 | :{N}{N2}{N3}> :{NULL}{NULL} 449 | :{T}{Ũ}> :{NULL}{NULL} 450 | :m{Ũ}> :{NULL}{NULL} 451 | :m{A}> :{NULL}{NULL} 452 | :r{Ĩ}> :{NULL}{NULL} 453 | :m{A}> :{NULL}{NULL} 454 | :{K}{Ĩ}> :{NULL}{NULL} 455 | :{C}{I}> :{NULL}{NULL} 456 | :m{A}> :{NULL}{NULL} 457 | :{C}{I}> :{NULL}{NULL} 458 | :{R}{Ũ}> :{NULL}{NULL} 459 | :m{A}> :{NULL}{NULL} 460 | :{R}{Ũ}> :{NULL}{NULL} 461 | :{C}{I}> :{NULL}{NULL} 462 | :{K}{A}> :{NULL}{NULL} 463 | :m{A}> :{NULL}{NULL} 464 | :{K}{A}> :{NULL}{NULL} 465 | :{T}{Ũ}> :{NULL}{NULL} 466 | :m{A}> :{NULL}{NULL} 467 | :{K}{Ũ}> :{NULL}{NULL} 468 | :m{A}> :{NULL}{NULL} 469 | 470 | LEXICON VerbNeg2(2) 471 | :{T}{I}> :{NULL} 472 | 473 | LEXICON VerbTenseAspect-A(3) 474 | :{A}> :{NULL} :>{A}g 475 | :{BLOCK}{A}> :{NULL} :>{A}g 476 | :{R}{A}> :{NULL} :>{A}g 477 | :{R}{A}> :{NULL} :{NULL} 478 | :{BLOCK}{A}> :{NULL} :{NULL} 479 | :{K}{A}> :{NULL} :{NULL} 480 | :{RAR}{A}> :{NULL} :{NULL} 481 | :{RAR}{A}> :{NULL} :>{Ĩ}r 482 | :{NULL} :{NULL} :>{A}g 483 | :{K}{Ũ}> :{NULL} :{NULL} 484 | :{K}{A}> :{NULL} :{NULL} 485 | :{K}{Ĩ}> :{NULL} :{NULL} 486 | :{K}{I}> :{NULL} :{NULL} 487 | 488 | LEXICON VerbTenseAspect-E(3) 489 | :{A}> :{NULL} :>{Ĩ}t 490 | :{A}> :{NULL} :>{I}r 491 | :{BLOCK}{A}> :{NULL} :>{Ĩ}t 492 | :{BLOCK}{A}> :{NULL} :>{I}r 493 | :{R}{A}> :{NULL} :>{Ĩ}t 
494 | :{R}{A}> :{NULL} :>{I}r 495 | :{K}{Ũ}> :{NULL} :>{Ĩ}t 496 | :{NULL} :{NULL} :>{I}r 497 | :{NULL} :{NULL} :>{Ĩ}t 498 | 499 | LEXICON VerbFinalVowel-A 500 | :>{AA} 501 | :>wo 502 | 503 | LEXICON VerbFinalVowel-E 504 | :>{EE} 505 | :>wo 506 | 507 | LEXICON VerbReflexive(2) 508 | :{Ĩ}> :{NULL} 509 | : : 510 | 511 | PATTERN VerbSuffix 512 | VerbRecip VerbIntens VerbMid VerbCaus VerbApp 513 | 514 | LEXICON VerbRecip 515 | :>{A}n 516 | : 517 | 518 | LEXICON VerbIntens 519 | :>{Ĩ}rĩr 520 | :>{Ũ}rũr 521 | : 522 | 523 | LEXICON VerbMid 524 | :>{Ĩ}k 525 | :>{Ĩ}k>{Ũ}r 526 | :>{Ũ}k 527 | :>{Ũ}r 528 | : 529 | 530 | LEXICON VerbCaus 531 | :>{I}th 532 | : 533 | 534 | LEXICON VerbApp 535 | :>{Ĩ}r 536 | : 537 | 538 | LEXICON VerbTrans 539 | :>{I} 540 | : 541 | 542 | LEXICON VerbNeg1(2) 543 | :n{D}> :{NULL} 544 | : : 545 | 546 | LEXICON VerbStem 547 | cab:cab # print 548 | har:har # prepare 549 | ũkĩr:{Ũ}kĩr # wake.up 550 | tega:tega # 551 | gan:gan # narrate 552 | rũg:{R}ũg # jump 553 | ag:{A}g # fail 554 | um:{U}m # come.out 555 | kiny:{K}iny # arrive 556 | kũr:{K}ũr # grow 557 | ũk:{Ũ}k # come 558 | in:{I}n # sing 559 | thi:th{II} # go 560 | thi:th{I} # go 561 | thom:thom # read 562 | thamb:thamb 563 | rĩm:{R}ĩm # cultivate 564 | haic:haic 565 | on:{O}n 566 | ror:{R}or 567 | gũ:g{Ũ} # fall 568 | ũr:{Ũ}r # run away 569 | cok:{C}ok # return 570 | cooker:{C}ooker # say(?) 
(meaning is fairly confusing, but common in Bible) 571 | ik:{I}k # throw 572 | nyit:nyit # catch 573 | tin:tin # cut 574 | hanyũk:hanyũk # run 575 | ikar:ikar # stay/sit 576 | rĩ:{R}{Ĩ} # eat 577 | ririkan:{R}irikan # remember 578 | ĩtĩk:{Ĩ}tĩk # believe 579 | iy:{I}y # steal 580 | igu:{I}g{U} # hear / obey 581 | ug:{U}g # say 582 | ĩr:{Ĩ}r # tell 583 | dũm:dũm # send 584 | tuĩk:{T}uĩk # become 585 | end:{E}nd # want 586 | thambĩr:thambĩr # swim 587 | ringĩr:{R}ingĩr # persuade 588 | hing:hing # close 589 | amb:{A}mb # start 590 | ĩrĩgĩr:{Ĩ}rĩgĩr # intend 591 | ĩtĩkĩr:{Ĩ}tĩkĩr # allow 592 | rut:{R}ut # learn, teach, or remove 593 | ribot:{R}ibot # report 594 | anĩrĩr:{A}nĩrĩr # announce 595 | ĩcir:{Ĩ}cir # think 596 | ĩtu:{Ĩ}t{U} # pretend 597 | rot:{R}ot # dream 598 | ĩrir:{Ĩ}rir # regret 599 | ken:{K}en # be happy 600 | rakar:{R}akar # be angry 601 | ĩhok:{Ĩ}hok # hope 602 | no:n{O} # can/must (?) 603 | hot:hot # might 604 | tũm:{T}ũm 605 | ũm:{Ũ}m # dry 606 | h[-fvE]:h # give 607 | hith:hith # hide 608 | humb:humb # cover 609 | ĩk:{tĨ}k # put 610 | thek:thek # laugh 611 | tah:tah # fetch 612 | ihũg:{I}hũg # to get wet 613 | heenere:heenere # deceive 614 | tũũr:tũũr # live (or live somewhere) 615 | nyu:nyu # drink 616 | it:{I}t # pour 617 | ur:{U}r # rain 618 | hũr:hũr # beat 619 | tig:{T}ig # leave 620 | on:{O}n # see 621 | ĩt:{Ĩ}t # call 622 | hĩtũki:hĩtũk{I} # pass (exceed) 623 | twar:{T}war # take, or drive 624 | ku:k{U} # die 625 | ũri:{Ũ}r{I} # ask 626 | maki:mak{I} # scare 627 | hot:hot # can (as in be able) 628 | ig:ig # put 629 | rath:rath # predict 630 | ĩk:{Ĩ}k # do 631 | bay:bay # wrap 632 | thir:thir # fail 633 | onek:{O}nek # appear 634 | meny:meny # know 635 | ambatĩri:{A}mbatĩr{I} # advance (in status) 636 | rũgam:rũgam # stand 637 | ken:ken # rejoice 638 | maki:mak{I} # surprise 639 | cari:car{I} # search 640 | amũkĩr:{A}mũkĩr # accept 641 | reki:rek{I} # send 642 | ũmb:{Ũ}mb # create (make) 643 | ari:{A}r{I} # discuss 644 | tu:t{U} # decide 645 
| ĩhũg:{Ĩ}hũg # watch 646 | reg:reg # refuse 647 | iki:{I}k{I} # throw 648 | tigithani:tigithan{I} # separate 649 | buĩri:buĩr{I} # disappear 650 | end:{E}nd # love 651 | ĩti:{Ĩ}t{I} # to plead (as in plead guilty) 652 | niin:niin # finish 653 | ĩhumb:{Ĩ}humb # wear 654 | kuu:ku{U} # carry 655 | thuurũkani:thuurũkan{I} # analyse 656 | haragani:haragan{I} # disperse 657 | kaani:kaan{I} # prohibit 658 | geri:ger{I} # try 659 | cay:{C}ay # complain 660 | rathim:rathim # bless 661 | rĩmi:rĩm{I} # earn 662 | giũr:giũr # trample 663 | rot:rot # dream 664 | kor:{K}or # find 665 | 666 | PATTERNS 667 | AdjPrefix(1) AdjRoot AdjAgree AdjPrefix(2) 668 | 669 | LEXICON AdjPrefix(2) 670 | :m{Ũ} :{NULL}{NULL} 671 | :a :{NULL}{NULL} 672 | :m{Ũ} :{NULL}{NULL} 673 | :mĩ :{NULL}{NULL} 674 | :{RRI}{IRI} :{NULL}{NULL} 675 | :ma :{NULL}{NULL} 676 | :{K}ĩ :{NULL}{NULL} 677 | :{N}{N2}{N3} :{NULL}{NULL} 678 | :{N}{N2}{N3} :{NULL}{NULL} 679 | :ma :{NULL}{NULL} 680 | :{N}{N2}{N3} :{NULL}{NULL} 681 | :{N}{N2}{N3} :{NULL}{NULL} 682 | :rũ :{NULL}{NULL} 683 | :ma :{NULL}{NULL} 684 | :rũ :{NULL}{NULL} 685 | :{N}{N2}{N3} :{NULL}{NULL} 686 | :ka :{NULL}{NULL} 687 | :ma :{NULL}{NULL} 688 | :{K}a :{NULL}{NULL} 689 | :t{Ũ} :{NULL}{NULL} 690 | :m{Ũ} :{NULL}{NULL} 691 | :{Ũ} :{NULL}{NULL} 692 | :ma :{NULL}{NULL} 693 | :{K}{Ũ} :{NULL}{NULL} 694 | :ma :{NULL}{NULL} 695 | :h{A} :{NULL}{NULL} 696 | :{K}{Ũ} :{NULL}{NULL} 697 | :g{U} :{NULL}{NULL} 698 | 699 | LEXICON AdjRoot 700 | nene:nene # big 701 | kũrũ:{K}ũrũ # old 702 | ega:{E}ga # good 703 | tune:{T}une # red 704 | rũarũ:rũarũ # sick 705 | nini:nini # small/young 706 | erũ:{E}rũ # neq 707 | ũru:{Ũ}ru # bad 708 | erũ:{E}rũ # white 709 | athĩki:{A}thĩki # obedient 710 | kuhĩ:{K}uhĩ # short 711 | kenge:{K}enge # baby 712 | thaka:thaka # beautiful 713 | irũ:{I}rũ # black 714 | raihu:raihu # tall 715 | mata:mata # thick, creamy (possibly sour?) 
716 | ingĩ:{I}ngĩ # many 717 | othe:{O}the # all 718 | 719 | LEXICON AdjAgree 720 | :{NULL} 721 | 722 | LEXICON Assoc 723 | 724 | a:wa 725 | a:a 726 | a:wa 727 | a:ya 728 | a:rĩa 729 | a:ma 730 | a:ma 731 | a:ma 732 | a:kĩa 733 | a:cia 734 | a:ya 735 | a:ya 736 | a:cia 737 | a:cia 738 | a:rũa 739 | a:rwa # alternate, possibly biblical form 740 | a:rũa 741 | a:ka 742 | a:tũa 743 | a:wa 744 | a:ma 745 | a:kũa 746 | a:ma 747 | 748 | PATTERNS 749 | Assoc 750 | PersPro # personal pronouns 751 | DepPro DepProEnds # dependent pronouns ("and/with X") 752 | PossPro PossProOwner # possessive pronouns (follow nouns) 753 | RelPro # relative pronouns (relativize nouns) 754 | #DemPro # demonstrative pronouns (that one, etc.) 755 | DistDem 756 | ProxDem 757 | AnaDem 758 | 759 | LEXICON PersPro 760 | 761 | pro:niũ 762 | pro:ithuĩ 763 | pro:weũ 764 | pro:inyuũ 765 | pro:we 766 | pro:mo 767 | pro:guo 768 | pro:yo 769 | pro:rĩo 770 | pro:mo 771 | pro:kĩo 772 | pro:cio 773 | pro:yo 774 | pro:cio 775 | pro:ruo 776 | pro:mo 777 | pro:ruo 778 | pro:cio 779 | pro:ko 780 | pro:tuo 781 | pro:guo 782 | pro:mo 783 | pro:kuo 784 | pro:mo 785 | 786 | LEXICON DepProEnds 787 | 788 | :niĩ 789 | :ithuĩ 790 | :we 791 | :inyuĩ 792 | :ke 793 | :ake # alternate form, possibly biblical 794 | :o 795 | :guo 796 | :yo 797 | :rĩo 798 | :mo 799 | :kĩo 800 | :cio 801 | :yo 802 | :cio 803 | :ruo 804 | :mo 805 | :ruo 806 | :cio 807 | :ko 808 | :tuo 809 | :guo 810 | :mo 811 | :kuo 812 | :mo 813 | 814 | LEXICON DepPro 815 | 816 | pro:na 817 | 818 | LEXICON PossPro 819 | 820 | pro:w{A}> 821 | pro:{A}> 822 | pro:w{A}> 823 | pro:y{A}> 824 | pro:rĩ{A}> 825 | pro:m{A}> 826 | pro:gĩ{A}> 827 | pro:ci{A}> 828 | pro:y{A}> 829 | pro:m{A}> 830 | pro:y{A}> 831 | pro:ci{A}> 832 | pro:rũ{A}> 833 | pro:m{A}> 834 | pro:rũ{A}> 835 | pro:ci{A}> 836 | pro:g{A}> 837 | pro:m{A}> 838 | pro:g{A}> 839 | pro:tũ{A}> 840 | pro:w{A}> 841 | pro:m{A}> 842 | pro:gũ{A}> 843 | pro:m{A}> 844 | 845 | LEXICON PossProOwner 846 | 847 | :{A}kwa 848 | 
:{I}tũ 849 | :{A}ku 850 | :{A}nyu 851 | :{A}ke 852 | :{A}o 853 | 854 | LEXICON RelPro 855 | 856 | pro:ũrĩa 857 | pro:{A}rĩa 858 | pro:ũrĩa 859 | pro:ĩrĩa 860 | pro:rĩrĩa 861 | pro:m{A}rĩa 862 | pro:kĩrĩa 863 | pro:irĩa 864 | pro:ĩria 865 | pro:m{A}rĩa 866 | pro:irĩa 867 | pro:iria 868 | pro:rũrĩa 869 | pro:m{A}rĩa 870 | pro:rũrĩa 871 | pro:iria 872 | pro:k{A}rĩa 873 | pro:m{A}rĩa 874 | pro:k{A}rĩa 875 | pro:tũrĩa 876 | pro:ũrĩa 877 | pro:m{A}rĩa 878 | pro:kũrĩa 879 | pro:m{A}rĩa 880 | 881 | LEXICON DistDem 882 | 883 | pro:ũrĩa 884 | pro:{A}rĩa 885 | pro:ũrĩa 886 | pro:ĩrĩa 887 | pro:rĩrĩa 888 | pro:m{A}rĩa 889 | pro:kĩrĩa 890 | pro:irĩa 891 | pro:ĩria 892 | pro:m{A}rĩa 893 | pro:irĩa 894 | pro:iria 895 | pro:rũrĩa 896 | pro:m{A}rĩa 897 | pro:rũrĩa 898 | pro:iria 899 | pro:k{A}rĩa 900 | pro:m{A}rĩa 901 | pro:k{A}rĩa 902 | pro:tũrĩa 903 | pro:ũrĩa 904 | pro:m{A}rĩa 905 | pro:kũrĩa 906 | pro:m{A}rĩa 907 | 908 | LEXICON ProxDem 909 | 910 | pro:ũyũ 911 | pro:aya 912 | pro:ũyũ 913 | pro:ĩno 914 | pro:rĩrĩ 915 | pro:maya 916 | pro:gĩkĩ 917 | pro:ici 918 | pro:ĩno 919 | pro:maya 920 | pro:ĩno 921 | pro:ici 922 | pro:rũrũ 923 | pro:maya 924 | pro:rũrũ 925 | pro:ici 926 | pro:gaka 927 | pro:maya 928 | pro:gaka 929 | pro:tũtũ 930 | pro:ũyũ 931 | pro:maya 932 | pro:gũkũ 933 | pro:maya 934 | 935 | LEXICON AnaDem 936 | 937 | pro:ũcio 938 | pro:ũu 939 | pro:acio 940 | pro:au 941 | pro:ũcio 942 | pro:ũu 943 | pro:ĩyo 944 | pro:ĩu 945 | pro:rĩu 946 | pro:macia 947 | pro:mau 948 | pro:kĩu 949 | pro:icio 950 | pro:iu 951 | pro:ĩyo 952 | pro:ĩu 953 | pro:macio 954 | pro:mau 955 | pro:ĩyo 956 | pro:macio 957 | pro:icio 958 | pro:iu 959 | pro:rũu 960 | pro:macio 961 | pro:mau 962 | pro:rũu 963 | pro:icio 964 | pro:iu 965 | pro:kau 966 | pro:macio 967 | pro:mau 968 | pro:kau 969 | pro:tũu 970 | pro:ũcio 971 | pro:ũu 972 | pro:macio 973 | pro:mau 974 | pro:kũu 975 | pro:macio 976 | pro:mau 977 | 978 | PATTERNS 979 | SpecialOrdinal 980 | LargeNumber 981 | NumeralPrefixSg(1) NumRoot 
NumeralPrefixSg(2) 982 | NumeralPrefixPl(1) NumRootPl NumeralPrefixPl(2) 983 | ExceptNumPre1 ExceptionalNumeral ExceptNumPre2 984 | NumIrregular NumIrregularAgreement 985 | 986 | # Default noun class for a number is 12p13 for some reason, but it agrees with a noun if there is one 987 | 988 | LEXICON SpecialOrdinal 989 | 990 | # Most ordinals are just assoc + cardinal, but first is special and has different words to replace the cardinal kĩmwe 991 | 992 | mbere:mbere # first 993 | kĩambĩrĩria:kĩambĩrĩria # first 994 | 995 | mũthia:mũthia # last 996 | 997 | LEXICON LargeNumber 998 | gana:igana # one hundred 999 | gana:magana # hundreds 1000 | giri:ngiri # one thousand 1001 | giri:ngiri # thousands 1002 | kũmi:ikũmi # ten 1003 | rongo:mĩrongo # tens 1004 | 1005 | LEXICON NumeralPrefixSg(2) 1006 | :ũ :{NULL}{NULL} 1007 | :ũ :{NULL}{NULL} 1008 | :rĩ :{NULL}{NULL} 1009 | :kĩ :{NULL}{NULL} 1010 | :ĩ :{NULL}{NULL} 1011 | :ĩ :{NULL}{NULL} 1012 | :rũ :{NULL}{NULL} 1013 | :rũ :{NULL}{NULL} 1014 | :k{A} :{NULL}{NULL} 1015 | :w{A} :{NULL}{NULL} 1016 | :kũ :{NULL}{NULL} 1017 | 1018 | LEXICON NumeralPrefixPl(2) 1019 | :{A} :{NULL}{NULL} 1020 | :y{A} :{NULL}{NULL} 1021 | :m{A} :{NULL}{NULL} 1022 | :m{A} :{NULL}{NULL} 1023 | :m{A} :{NULL}{NULL} 1024 | :i :{NULL}{NULL} 1025 | :tũ :{NULL}{NULL} 1026 | :m{A} :{NULL}{NULL} 1027 | :m{A} :{NULL}{NULL} 1028 | 1029 | LEXICON ExceptNumPre1 1030 | :{NULL} 1031 | 1032 | LEXICON ExceptNumPre2 1033 | :{NULL}{NULL} 1034 | :{NULL}{NULL} 1035 | :{NULL}{NULL} 1036 | 1037 | LEXICON ExceptionalNumeral 1038 | ĩrĩ:igĩrĩ # two 1039 | atũ:ithatũ # three 1040 | na:inya # four 1041 | ano:ithano # five 1042 | tandatũ:ithathatũ # six 1043 | nana:inyanya # eight 1044 | 1045 | LEXICON NumRoot 1046 | mwe:mwe # one 1047 | kũmi:kũmi # ten 1048 | 1049 | LEXICON NumRootPl 1050 | ĩrĩ:{Ĩ}rĩ # two 1051 | tatũ:tatũ # three 1052 | na:na # four 1053 | tano:tano # five 1054 | tandatũ:tandatũ # six 1055 | nana:nana # eight 1056 | rongo:rongo # tens 1057 | 1058 | LEXICON 
NumIrregular 1059 | mũgwanja:mũgwanja # seven 1060 | kenda:kenda # nine 1061 | 1062 | LEXICON NumIrregularAgreement 1063 | :{NULL} 1064 | :{NULL} 1065 | :{NULL} 1066 | :{NULL} 1067 | :{NULL} 1068 | :{NULL} 1069 | :{NULL} 1070 | :{NULL} 1071 | :{NULL} 1072 | :{NULL} 1073 | :{NULL} 1074 | :{NULL} 1075 | 1076 | LEXICON LittleWord 1077 | 1078 | na:na # with 1079 | na:na # and 1080 | o:o # all 1081 | kuma:kuma # from 1082 | kana:kana # if 1083 | kana:kana # or # potential disambiguation 1084 | no:no # but 1085 | mũno:mũno # very 1086 | atĩ:atĩ # that 1087 | atĩrĩ:atĩrĩ # a discourse marker, means sort of then, as a consequence 1088 | ũndũ:ũndũ # because 1089 | 1090 | LEXICON ThusWords 1091 | 1092 | guo:ũguo # thus 1093 | guo:nĩguo # thus 1094 | 1095 | LEXICON ProperName 1096 | 1097 | Kenya:Kenya # Kenya 1098 | Abĩrika:Abĩrika # Africa 1099 | Abirika:Abirika # Africa 1100 | Mũtiiri:Mũtiiri # Mũtiiri 1101 | Gĩgĩkũyũ:Gĩgĩkũyũ # Gĩgĩkũyũ 1102 | Mwathani:Mwathani # Lord (as in Lord God) 1103 | Edeni:Edeni # Eden 1104 | Pishoni:Pishoni # Pison 1105 | Havila:Havila # a biblical river 1106 | Gihoni:Gihoni # a biblical river 1107 | Farati:Farati # a biblical river 1108 | Adamu:Adamu # Adam (biblical) 1109 | Ngai:Ngai # God 1110 | Roho:Roho # Spirit (as in God's Spirit) 1111 | Jehova:Jehova # Jehova 1112 | Musa:Musa # 1113 | 1114 | LEXICON ProperNameEnding 1115 | 1116 | :{NULL} 1117 | 1118 | LEXICON Miscellany 1119 | 1120 | makĩria:makĩria # exceeding(?) 1121 | 1122 | LEXICON Punctuation 1123 | 1124 | .:. 1125 | ;:; 1126 | !:! 
1127 | ,:, 1128 | ":" 1129 | 1130 | LEXICON Adverb # there aren't many of these 1131 | 1132 | ira:ira # yesterday 1133 | ũmũthĩ:ũmũthĩ # today 1134 | 1135 | PATTERNS 1136 | Adverb 1137 | Punctuation 1138 | ProperName ProperNameEnding 1139 | Miscellany 1140 | Assoc 1141 | LittleWord 1142 | ThusWords 1143 | -------------------------------------------------------------------------------- /tests/kik.sh2: -------------------------------------------------------------------------------- 1 | hfst-lexc -s kik.lexc -o kik.lexc.hfst 2 | hfst-twolc -s kik.twoc -o kik.twoc.hfst 3 | hfst-invert kik.lexc.hfst | hfst-compose-intersect -s -1 - -2 kik.twoc.hfst 2>/dev/null | hfst-invert -o kik.nomin.hfst 4 | hfst-minimize kik.nomin.hfst -o kik.hfst 5 | ../src/lexd kik.lexd kik.att 6 | hfst-txt2fst kik.att -o kik_d.hfst 7 | -------------------------------------------------------------------------------- /tests/kik.twoc: -------------------------------------------------------------------------------- 1 | Alphabet 2 | %[%-sg%]:0 %[%-pl%]:0 3 | %[%+sg%]:0 %[%+pl%]:0 4 | 5 | %[%-foc%]:0 6 | %[%+foc%]:0 7 | %[%-seq%]:0 8 | %[%+seq%]:0 9 | 10 | %[%-rempast%]:0 11 | %[%-nearpast%]:0 12 | %[%-currpast%]:0 13 | %[%-pres%]:0 14 | %[%-currfut%]:0 15 | %[%-remfut%]:0 16 | 17 | %[%-inf%]:0 18 | %[%+inf%]:0 19 | 20 | %[%+rempast%]:0 21 | %[%+nearpast%]:0 22 | %[%+currpast%]:0 23 | %[%+pres%]:0 24 | %[%+currfut%]:0 25 | %[%+remfut%]:0 26 | 27 | %[%-p1%]:0 28 | %[%-p2%]:0 29 | %[%+p1%]:0 30 | %[%+p2%]:0 31 | 32 | %[%-cl1%]:0 ! Noun class 1 33 | %[%-cl3%]:0 ! Noun class 3 34 | %[%-cl5%]:0 ! Noun class 6 35 | %[%-cl7%]:0 ! Noun class 7 36 | %[%-cl9p6%]:0 ! Noun class 9p6 37 | %[%-cl9p1%0%]:0 ! Noun class 9p1%0 38 | %[%-cl11p6%]:0 ! Noun class 11p6 39 | %[%-cl11p1%0%]:0 ! Noun class 11p1%0 40 | %[%-cl12p6%]:0 ! Noun class 12p6 41 | %[%-cl12p13%]:0 ! Noun class 12p13 42 | %[%-cl14%]:0 ! Noun class 14 43 | %[%-cl15%]:0 ! Noun class 15 44 | 45 | %[%+cl1%]:0 ! Noun class 1 46 | %[%+cl3%]:0 ! 
Noun class 3 47 | %[%+cl5%]:0 ! Noun class 6 48 | %[%+cl7%]:0 ! Noun class 7 49 | %[%+cl9p6%]:0 ! Noun class 9p6 50 | %[%+cl9p1%0%]:0 ! Noun class 9p1%0 51 | %[%+cl11p6%]:0 ! Noun class 11p6 52 | %[%+cl11p1%0%]:0 ! Noun class 11p1%0 53 | %[%+cl12p6%]:0 ! Noun class 12p6 54 | %[%+cl12p13%]:0 ! Noun class 12p13 55 | %[%+cl14%]:0 ! Noun class 14 56 | %[%+cl15%]:0 ! Noun class 15 57 | %[%+cl16%]:0 58 | %[%-cl16%]:0 59 | %[%+cl17%]:0 60 | %[%-cl17%]:0 61 | 62 | %[%-dim%]:0 63 | %[%+dim%]:0 64 | 65 | %[%-refl%]:0 66 | %[%+refl%]:0 67 | %[%-imp%]:0 68 | %[%+imp%]:0 69 | %[%-neg%]:0 70 | %[%+neg%]:0 71 | 72 | %[%-hab%]:0 73 | %[%-perf%]:0 74 | %[%-noasp%]:0 75 | %[%-compl%]:0 76 | %[%-proc%]:0 77 | %[%-fvA%]:0 78 | %[%-fvE%]:0 79 | 80 | %[%+hab%]:0 81 | %[%+perf%]:0 82 | %[%+noasp%]:0 83 | %[%+compl%]:0 84 | %[%+proc%]:0 85 | %[%+fvA%]:0 86 | %[%+fvE%]:0 87 | !%[%+fvĨ%]:0 88 | !%[%-fvĨ%]:0 89 | ; 90 | 91 | Rules 92 | 93 | "Match number prefixes and person" 94 | Tx:0 /<= _ ; 95 | except 96 | _ (:*) Ty:0 ; 97 | where Tx in ( %[%-sg%] ) 98 | Ty in ( %[%+sg%] ) 99 | matched ; 100 | 101 | "Match number prefixes and person 2" 102 | Tx:0 /<= _ ; 103 | except 104 | _ (:*) Ty:0 ; 105 | where Tx in ( %[%-pl%] ) 106 | Ty in ( %[%+pl%] ) 107 | matched ; 108 | 109 | 110 | "Match number prefixes and person 2a" 111 | Tx:0 /<= _ ; 112 | except 113 | _ (:*) Ty:0 ; 114 | where Tx in ( %[%-p2%] ) 115 | Ty in ( %[%+p2%] ) 116 | matched ; 117 | 118 | "Match number prefixes and person 2b" 119 | Tx:0 /<= _ ; 120 | except 121 | _ (:*) Ty:0 ; 122 | where Tx in ( %[%-p1%] ) 123 | Ty in ( %[%+p1%] ) 124 | matched ; 125 | 126 | "Match number prefixes and person s2a" 127 | Tx:0 /<= _ ; 128 | except 129 | _ (:*) Ty:0 ; 130 | where Tx in ( %[%-p2%] ) 131 | Ty in ( %[%+p2%] ) 132 | matched ; 133 | 134 | "Match number prefixes and person s2b" 135 | Tx:0 /<= _ ; 136 | except 137 | _ (:*) Ty:0 ; 138 | where Tx in ( %[%-p1%] ) 139 | Ty in ( %[%+p1%] ) 140 | matched ; 141 | 142 | "Match tense and focus, part 
1a" 143 | Tx:0 /<= _ ; 144 | except 145 | _ (:*) Ty:0 ; 146 | where Tx in ( %[%-currpast%] %[%-pres%] ) 147 | Ty in ( %[%+currpast%] %[%+pres%] ) 148 | matched ; 149 | 150 | "Match tense and focus, part 1b" 151 | Tx:0 /<= _ ; 152 | except 153 | _ (:*) Ty:0 ; 154 | where Tx in ( %[%-currfut%] %[%-remfut%] ) 155 | Ty in ( %[%+currfut%] %[%+remfut%] ) 156 | matched ; 157 | 158 | "Match tense and focus, part 2a" 159 | Tx:0 /<= _ ; 160 | except 161 | _ (:*) Ty:0 ; 162 | where Tx in ( %[%-foc%] %[%-seq%] ) 163 | Ty in ( %[%+foc%] %[%+seq%] ) 164 | matched ; 165 | 166 | "Match tense and focus, part 2a.5" 167 | Ty:0 /<= _ ; 168 | except 169 | Tx:0 (:*) _ ; 170 | where Tx in ( %[%-foc%] %[%-seq%] ) 171 | Ty in ( %[%+foc%] %[%+seq%] ) 172 | matched ; 173 | 174 | "Match tense and focus, part 2b" 175 | Tx:0 /<= _ ; 176 | except! 177 | _ (:*) Ty:0 ; 178 | where Tx in ( %[%-rempast%] %[%-nearpast%] ) 179 | Ty in ( %[%+rempast%] %[%+nearpast%] ) 180 | matched ; 181 | 182 | 183 | !"Match noun class and person, part s1a" 184 | !Tx:0 /<= _ ; 185 | ! except 186 | ! _ (:*) Ty:0 ; 187 | ! where Tx in ( %[%-scl3%] ) 188 | ! Ty in ( %[%+scl3%] ) 189 | ! matched ; 190 | 191 | 192 | !"Match noun class and person, part s1a2" 193 | !Tx:0 /<= _ ; 194 | ! except 195 | ! _ (:*) Ty:0 ; 196 | ! where Tx in ( %[%-scl1%] ) 197 | ! Ty in ( %[%+scl1%] ) 198 | ! matched ; 199 | 200 | 201 | !"Match noun class and person, part s2a" 202 | !Tx:0 /<= _ ; 203 | ! except 204 | ! _ (:*) Ty:0 ; 205 | ! where Tx in ( %[%-scl9p1%0%] ) 206 | ! Ty in (%[%+scl9p1%0%] ) 207 | ! matched ; 208 | 209 | !"Match noun class and person, part s2a2" 210 | !Tx:0 /<= _ ; 211 | ! except 212 | ! _ (:*) Ty:0 ; 213 | ! where Tx in ( %[%-scl9p6%] ) 214 | ! Ty in ( %[%+scl9p6%] ) 215 | ! matched ; 216 | 217 | 218 | !"Match noun class and person, part s3a" 219 | !Tx:0 /<= _ ; 220 | ! except 221 | ! _ (:*) Ty:0 ; 222 | ! where Tx in ( %[%-scl12p13%] ) 223 | ! Ty in ( %[%+scl12p13%] ) 224 | ! 
matched ; 225 | 226 | !"Match noun class and person, part s3a2" 227 | !Tx:0 /<= _ ; 228 | ! except 229 | ! _ (:*) Ty:0 ; 230 | ! where Tx in ( %[%-cl12p6%] ) 231 | ! Ty in ( %[%+cl12p6%] ) 232 | ! matched ; 233 | 234 | 235 | !"Match noun class and person, part s1b" 236 | !Tx:0 /<= _ ; 237 | ! except 238 | ! _ (:*) Ty:0 ; 239 | ! where Tx in ( %[%-scl7%] ) 240 | ! Ty in ( %[%+scl7%] ) 241 | ! matched ; 242 | 243 | !"Match noun class and person, part s1b2" 244 | !Tx:0 /<= _ ; 245 | ! except 246 | ! _ (:*) Ty:0 ; 247 | ! where Tx in ( %[%-scl5%] ) 248 | ! Ty in ( %[%+scl5%] ) 249 | ! matched ; 250 | 251 | 252 | !"Match noun class and person, part s2b" 253 | !Tx:0 /<= _ ; 254 | ! except 255 | ! _ (:*) Ty:0 ; 256 | ! where Tx in ( %[%-scl11p6%] ) 257 | ! Ty in ( %[%+scl11p6%] ) 258 | ! matched ; 259 | 260 | !"Match noun class and person, part s2b2" 261 | !Tx:0 /<= _ ; 262 | 263 | ! except 264 | ! _ (:*) Ty:0 ; 265 | ! where Tx in ( %[%-scl11p1%0%] ) 266 | ! Ty in ( %[%+scl11p1%0%] ) 267 | ! matched ; 268 | 269 | 270 | !"Match noun class and person, part s3b" 271 | !Tx:0 /<= _ ; 272 | ! except 273 | ! _ (:*) Ty:0 ; 274 | ! where Tx in ( %[%-scl14%] ) 275 | ! Ty in ( %[%+scl14%] ) 276 | ! matched ; 277 | 278 | 279 | !"Match noun class and person, part s3b2" 280 | !Tx:0 /<= _ ; 281 | ! except 282 | ! _ (:*) Ty:0 ; 283 | ! where Tx in (%[%-scl15%] ) 284 | ! Ty in (%[%+scl15%] ) 285 | ! 
matched ; 286 | 287 | 288 | "Match noun class and person, parts1a" 289 | Tx:0 /<= _ ; 290 | except 291 | _ (:*) Ty:0 ; 292 | where Tx in ( %[%-scl3%] ) 293 | Ty in ( %[%+scl3%] ) 294 | matched ; 295 | 296 | 297 | "Match noun class and person, part 1a2" 298 | Tx:0 /<= _ ; 299 | except 300 | _ (:*) Ty:0 ; 301 | where Tx in ( %[%-cl1%] ) 302 | Ty in ( %[%+cl1%] ) 303 | matched ; 304 | 305 | 306 | "Match noun class and person, part 2a" 307 | Tx:0 /<= _ ; 308 | except 309 | _ (:*) Ty:0 ; 310 | where Tx in ( %[%-cl9p1%0%] ) 311 | Ty in (%[%+cl9p1%0%] ) 312 | matched ; 313 | 314 | "Match noun class and person, part 2a2" 315 | Tx:0 /<= _ ; 316 | except 317 | _ (:*) Ty:0 ; 318 | where Tx in ( %[%-cl9p6%] ) 319 | Ty in ( %[%+cl9p6%] ) 320 | matched ; 321 | 322 | 323 | "Match noun class and person, part 3a" 324 | Tx:0 /<= _ ; 325 | except 326 | _ (:*) Ty:0 ; 327 | where Tx in ( %[%-cl12p13%] ) 328 | Ty in ( %[%+cl12p13%] ) 329 | matched ; 330 | 331 | "Match noun class and person, part 3a2" 332 | Tx:0 /<= _ ; 333 | except 334 | _ (:*) Ty:0 ; 335 | where Tx in ( %[%-cl12p6%] ) 336 | Ty in ( %[%+cl12p6%] ) 337 | matched ; 338 | 339 | 340 | "Match noun class and person, part 1b" 341 | Tx:0 /<= _ ; 342 | except 343 | _ (:*) Ty:0 ; 344 | where Tx in ( %[%-cl7%] ) 345 | Ty in ( %[%+cl7%] ) 346 | matched ; 347 | 348 | "Match noun class and person, part 1b2" 349 | Tx:0 /<= _ ; 350 | except 351 | _ (:*) Ty:0 ; 352 | where Tx in ( %[%-cl5%] ) 353 | Ty in ( %[%+cl5%] ) 354 | matched ; 355 | 356 | 357 | "Match noun class and person, part 2b" 358 | Tx:0 /<= _ ; 359 | except 360 | _ (:*) Ty:0 ; 361 | where Tx in ( %[%-cl11p6%] ) 362 | Ty in ( %[%+cl11p6%] ) 363 | matched ; 364 | 365 | "Match noun class and person, part 2b2" 366 | Tx:0 /<= _ ; 367 | except 368 | _ (:*) Ty:0 ; 369 | where Tx in ( %[%-cl11p1%0%] ) 370 | Ty in ( %[%+cl11p1%0%] ) 371 | matched ; 372 | 373 | 374 | "Match noun class and person, part 3b" 375 | Tx:0 /<= _ ; 376 | except 377 | _ (:*) Ty:0 ; 378 | where Tx in ( 
%[%-cl14%]) 379 | Ty in ( %[%+cl14%] ) 380 | matched ; 381 | 382 | 383 | "Match noun class and person, part 3b2" 384 | Tx:0 /<= _ ; 385 | except 386 | _ (:*) Ty:0 ; 387 | where Tx in (%[%-cl15%] ) 388 | Ty in (%[%+cl15%] ) 389 | matched ; 390 | 391 | "Match noun class and person, part 4" 392 | Tx:0 /<= _ ; 393 | except 394 | _ (:*) Ty:0 ; 395 | where Tx in (%[%-cl16%] %[%-cl17%] ) 396 | Ty in (%[%+cl16%] %[%+cl17%] ) 397 | matched ; 398 | 399 | "Match no object" 400 | Tx:0 /<= _ ; 401 | except 402 | _ (:*) Ty:0 ; 403 | where Tx in (%[%-noobj%] ) 404 | Ty in (%[%+noobj%] ) 405 | matched ; 406 | 407 | 408 | "Match reflexive, imperative, and negative part 1" 409 | Tx:0 /<= _ ; 410 | except 411 | _ (:*) Ty:0 ; 412 | where Tx in ( %[%-refl%] %[%-imp%] ) 413 | Ty in ( %[%+refl%] %[%+imp%] ) 414 | matched ; 415 | 416 | "Match reflexive, imperative, and negative part 1.5" 417 | Tx:0 /<= _ ; 418 | except 419 | Ty:0 (:*) _ ; 420 | where Ty in ( %[%-refl%] %[%-imp%] ) 421 | Tx in ( %[%+refl%] %[%+imp%] ) 422 | matched ; 423 | 424 | "Match infinitive" 425 | Tx:0 /<= _ ; 426 | except 427 | Ty:0 (:*) _ ; 428 | where Ty in ( %[%-inf%] %[%-dim%] ) 429 | Tx in ( %[%+inf%] %[%+dim%] ) 430 | matched ; 431 | 432 | "Match diminutive" 433 | Tx:0 /<= _ ; 434 | except 435 | Tx:0 (:*) _ ; 436 | where Tx in ( %[%-dim%] ) 437 | Ty in ( %[%+dim%] ) 438 | matched ; 439 | 440 | "Match reflexive, imperative, and negative part 2" 441 | Tx:0 /<= _ ; 442 | except 443 | _ (:*) Ty:0 ; 444 | where Tx in ( %[%-neg%] ) 445 | Ty in ( %[%+neg%] ) 446 | matched ; 447 | 448 | "Match reflexive, imperative, and negative part 2" 449 | Tx:0 /<= _ ; 450 | except 451 | Ty:0 (:*) _ ; 452 | where Tx in ( %[%+neg%] ) 453 | Ty in ( %[%-neg%] ) 454 | matched ; 455 | 456 | 457 | "Match aspect part 1" 458 | Tx:0 /<= _ ; 459 | except 460 | Ty:0 (:*) _ ; 461 | where Tx in ( %[%+compl%] %[%+hab%] ) 462 | Ty in ( %[%-compl%] %[%-hab%] ) 463 | matched ; 464 | 465 | "Match aspect part 2" 466 | Tx:0 /<= _ ; 467 | except 468 | 
Ty:0 (:*) _ ; 469 | where Tx in ( %[%+noasp%] %[%+perf%] ) 470 | Ty in ( %[%-noasp%] %[%-perf%] ) 471 | matched ; 472 | 473 | "Match aspect part 3" 474 | Tx:0 /<= _ ; 475 | except 476 | Ty:0 (:*) _ ; 477 | where Tx in ( %[%+proc%] ) 478 | Ty in ( %[%-proc%] ) 479 | matched ; 480 | 481 | "Coordinate final vowels" 482 | Tx:0 /<= _ ; 483 | except 484 | Ty:0 (:*) _ ; 485 | where Tx in ( %[%+fvA%] ) 486 | Ty in ( %[%-fvA%] ) 487 | ; 488 | 489 | "Coordinate final vowels 2" 490 | Tx:0 /<= _ ; 491 | except 492 | Ty:0 (:*) _ ; 493 | where Tx in ( %[%+fvE%] ) 494 | Ty in ( %[%-fvE%] ) 495 | matched ; 496 | 497 | 498 | "Coordinate final vowels (reverse) " 499 | Tx:0 /<= _ ; 500 | except 501 | _ (:*) Ty:0 ; 502 | where Ty in ( %[%+fvA%] ) 503 | Tx in ( %[%-fvA%] ) 504 | matched ; 505 | 506 | "Coordinate final vowels (reverse) 2 " 507 | Tx:0 /<= _ ; 508 | except 509 | _ (:*) Ty:0 ; 510 | where Ty in ( %[%+fvE%] ) 511 | Tx in ( %[%-fvE%] ) 512 | matched ; 513 | 514 | ! "Coordinate final vowels 2" 515 | ! Tx:0 /<= _ ; 516 | ! except 517 | ! Ty:0 (:*) _ ; 518 | ! where Tx in ( %[%+fvĨ%] ) 519 | ! Ty in ( %[%-fvĨ%] ) 520 | ! matched ; 521 | 522 | ! "Coordinate final vowels 2 (reverse) " 523 | ! Tx:0 /<= _ ; 524 | ! except 525 | ! _ (:*) Ty:0 ; 526 | ! where Ty in ( %[%+fvĨ%] ) 527 | ! Tx in ( %[%-fvĨ%] ) 528 | ! 
matched ; 529 | 530 | 531 | 532 | -------------------------------------------------------------------------------- /tests/lin.sh2: -------------------------------------------------------------------------------- 1 | hfst-lexc -s lin.lexc -o lin.lexc.hfst 2 | hfst-twolc -s lin.twoc -o lin.twoc.hfst 3 | hfst-invert lin.lexc.hfst | hfst-compose-intersect -s -1 - -2 lin.twoc.hfst 2>/dev/null | hfst-invert -o lin.nomin.hfst 4 | hfst-minimize lin.nomin.hfst -o lin.hfst 5 | ../src/lexd lin.lexd lin.att 6 | hfst-txt2fst lin.att -o lin_d.hfst 7 | -------------------------------------------------------------------------------- /tests/lin.twoc: -------------------------------------------------------------------------------- 1 | Alphabet 2 | 3 | %[%+sg%]:0 %[%+pl%]:0 %[%+abs%]:0 %[%+def%]:0 %[%+inst%]:0 %[%+p1%]:0 %[%+p2%]:0 %[%+p3%]:0 %[%+aa%]:0 %[%+nn%]:0 %[%+fut%]:0 %[%+ref%]:0 %[%+inf%]:0 %[%+imp%]:0 %[%+cl1%]:0 %[%+cl5%]:0 %[%+cl6%]:0 %[%+cl7%]:0 %[%+cl8%]:0 4 | 5 | %[%-sg%]:0 %[%-pl%]:0 %[%-abs%]:0 %[%-def%]:0 %[%-inst%]:0 %[%-p1%]:0 %[%-p2%]:0 %[%-p3%]:0 %[%-aa%]:0 %[%-nn%]:0 %[%-fut%]:0 %[%-ref%]:0 %[%-inf%]:0 %[%-imp%]:0 %[%-cl1%]:0 %[%-cl5%]:0 %[%-cl6%]:0 %[%-cl7%]:0 %[%-cl8%]:0 6 | 7 | 8 | ; 9 | 10 | Rules 11 | 12 | "Match prefix" 13 | Sx:0 /<= _ ; 14 | except 15 | _ (:*) Sy:0 ; 16 | where Sx in ( %[%-sg%] %[%-pl%] %[%-abs%] %[%-def%] %[%-inst%] %[%-p1%] %[%-p2%] %[%-p3%] %[%-aa%] %[%-nn%] %[%-fut%] %[%-ref%] %[%-inf%] %[%-imp%] %[%-cl1%] %[%-cl5%] %[%-cl6%] %[%-cl7%] %[%-cl8%]) 17 | Sy in ( %[%+sg%] %[%+pl%] %[%+abs%] %[%+def%] %[%+inst%] %[%+p1%] %[%+p2%] %[%+p3%] %[%+aa%] %[%+nn%] %[%+fut%] %[%+ref%] %[%+inf%] %[%+imp%] %[%+cl1%] %[%+cl5%] %[%+cl6%] %[%+cl7%] %[%+cl8%]) 18 | matched ; 19 | 20 | "Inverse of Match prefix for use with mismatched tags" 21 | Sx:0 /<= _ ; 22 | except 23 | Sy:0 :* _ ; 24 | where Sy in ( %[%-abs%] %[%-def%] %[%-inst%] %[%-fut%] %[%-ref%] %[%-sg%] %[%-pl%]) 25 | Sx in ( %[%+abs%] %[%+def%] %[%+inst%] %[%+fut%] %[%+ref%] %[%+sg%] %[%+pl%]) 
26 | matched ;
27 |
28 |
29 |
30 | !Sets
31 | !
32 | !Features = %[%+sg%] %[%+pl%] %[%+abs%] %[%+def%] %[%+inst%] %[%+p1%] %[%+p2%] %[%+p3%] %[%+aa%] %[%+nn%];
33 | !
34 | !Rules
35 | !
36 | !"Match prefixes with agreement tags"
37 | !Fx:0 <=> _ :* Fx:0 ;
38 | ! Fx:0 :* _ ;
39 | ! where Fx in Features ;
40 |
--------------------------------------------------------------------------------
/tests/timing.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Time the lexc/twolc build of a language against its lexd build and check
4 | # that both produce the same set of strings.
5 | #
6 | # Usage: timing.sh [N] LANG
7 | #   N     number of times each build step is repeated inside one timed run
8 | #         (default: 1)
9 | #   LANG  prefix of the test files; LANG.sh2 lists the build steps
10 | #
11 | # Each line of LANG.sh2 is run as a shell command under /usr/bin/time.
12 | # Afterwards LANG.hfst and LANG_d.hfst are expanded, sorted and diffed.
13 | #
14 | # Exit status: 0 if the expansions are identical, 1 if they differ,
15 | # 2 on bad arguments or an unreadable LANG.sh2.
16 |
17 | if [ $# -lt 1 ]
18 | then
19 |     echo "Usage: $0 [N] LANG" >&2
20 |     exit 2
21 | fi
22 |
23 | if [ $# -gt 1 ]
24 | then
25 |     N=$1
26 |     F=$2
27 | else
28 |     N="1"
29 |     F=$1
30 | fi
31 |
32 | # {1..N} only counts up for a positive integer: a non-number is taken
33 | # literally and {1..0} expands to "1 0", so reject both up front.
34 | case $N in
35 |     ''|*[!0-9]*)
36 |         echo "$0: N must be a positive integer, got '$N'" >&2
37 |         exit 2
38 |         ;;
39 | esac
40 | if [ "$N" -lt 1 ]
41 | then
42 |     echo "$0: N must be a positive integer, got '$N'" >&2
43 |     exit 2
44 | fi
45 |
46 | # Without the step list nothing gets built and the comparison below would
47 | # see two empty expansions and report success.
48 | if [ ! -r "$F.sh2" ]
49 | then
50 |     echo "$0: cannot read $F.sh2" >&2
51 |     exit 2
52 | fi
53 |
54 | echo ""
55 | echo "Timing comparison for language $F"
56 | echo ""
57 |
58 | # read -r keeps backslashes intact; the extra test picks up a final line
59 | # that lacks a trailing newline.
60 | while read -r line || [ -n "$line" ]
61 | do
62 |     # Blank and comment lines cannot be wrapped in the for loop below.
63 |     case $line in
64 |         ''|'#'*) continue ;;
65 |     esac
66 |     echo "$line"
67 |     # stdin is /dev/null so a step cannot swallow the rest of the step list.
68 |     /usr/bin/time -f " time: %e seconds, maximum memory usage: %M KB" bash -c "for x in {1..$N}; do $line; done" < /dev/null
69 | done < "$F.sh2"
70 |
71 | hfst-expand "$F.hfst" | sort > "$F.txt"
72 | hfst-expand "${F}_d.hfst" | sort > "${F}_d.txt"
73 |
74 | wc < "$F.txt"
75 |
76 | # Clear out every intermediate transducer in this directory; -f keeps rm
77 | # quiet when a build step failed and produced nothing.
78 | rm -f -- *.hfst *.att
79 |
80 | # Run diff once: it prints the differences itself, and any non-zero status
81 | # (differences or a diff error) counts as a failed comparison.
82 | if diff "$F.txt" "${F}_d.txt"
83 | then
84 |     status=0
85 | else
86 |     status=1
87 | fi
88 |
89 | rm -f -- "$F.txt" "${F}_d.txt"
90 | exit $status
91 |
--------------------------------------------------------------------------------
/tests/trilit.lexd:
--------------------------------------------------------------------------------
1 | LEXICON C(3)
2 | x y z
3 | n m l
4 |
5 | LEXICON V(2)
6 | :a :e
7 | :o :e
8 |
9 | LEXICON redup
10 | :
11 |
12 | PATTERNS
13 | C(1) :V(1) C(2) :V(2) C(3) V(2):
14 | # xayez/xyz
15 | # xoyez/xyz
16 | # namel/nml
17 | # nomel/nml
18 |
19 | :C(1) :V(1) C(1) :V(1) C(2) :V(2) C(3) V(2): redup
20 | # xaxayez/xyz
21 | # xoxoyez/xyz
22 | # nanamel/nml
23 | # nonomel/nml
24 |
--------------------------------------------------------------------------------
/tests/wad.lexc:
--------------------------------------------------------------------------------
1 | !
Morphological Transducer for wad 2 | 3 | Multichar_Symbols 4 | 5 | ! Part of speech categories 6 | % 7 | % 8 | % 9 | % 10 | % 11 | % 12 | % 13 | % 14 | % 15 | % 16 | % 17 | % 18 | % 19 | % 20 | % 21 | % 22 | % 23 | % 24 | % 25 | % 26 | % 27 | % 28 | % 29 | % 30 | % 31 | % 32 | % 33 | % 34 | % 35 | % 36 | % 37 | % 38 | % 39 | % 40 | % 41 | 42 | ! Number morphology 43 | % 44 | % 45 | 46 | ! Other symbols 47 | %> 48 | %{b%} 49 | %{B%} 50 | %{D%} 51 | %{A%} 52 | 53 | %[%+1incdu%] %[%+1incpl%] %[%+1sg%] %[%+1excdu%] %[%+1excpl%] %[%+2sg%] %[%+2du%] %[%+2pl%] %[%+3sg%] %[%+3du%] %[%+3plhum%] %[%+3plnh%] 54 | 55 | %[%-1incdu%] %[%-1incpl%] %[%-1sg%] %[%-1excdu%] %[%-1excpl%] %[%-2sg%] %[%-2du%] %[%-2pl%] %[%-3sg%] %[%-3du%] %[%-3plhum%] %[%-3plnh%] 56 | 57 | %[%+appl%] %[%-appl%] 58 | %[%+caus%] %[%-caus%] 59 | 60 | LEXICON Root 61 | 62 | VerbPrefix ; 63 | 64 | LEXICON VerbPrefix 65 | 66 | %[%+1incdu%]:tur VerbApplicative ; 67 | %[%+1incpl%]:tat VerbApplicative ; 68 | %[%+1sg%]:i VerbApplicative ; 69 | %[%+1excdu%]:amur VerbApplicative ; 70 | %[%+1excpl%]:amat VerbApplicative ; 71 | %[%+2sg%]:%{B%}u VerbApplicative ; 72 | %[%+2du%]:mur VerbApplicative ; 73 | %[%+2pl%]:met VerbApplicative ; 74 | %[%+3sg%]:%{D%}i VerbApplicative ; 75 | %[%+3du%]:sur VerbApplicative ; 76 | %[%+3plhum%]:set VerbApplicative ; 77 | %[%+3plnh%]:si VerbApplicative ; 78 | 79 | LEXICON VerbApplicative 80 | 81 | %[%+appl%]:it VerbCausative ; 82 | VerbCausative ; 83 | 84 | LEXICON VerbCausative 85 | 86 | %[%+caus%]:on VerbRoot ; 87 | VerbRoot ; 88 | 89 | LEXICON VerbInfl 90 | 91 | %%%%%[%-1incdu%]: VerbInflAppl ; 92 | %%%%%[%-1incpl%]: VerbInflAppl ; 93 | %%%%[%-1sg%]: VerbInflAppl ; 94 | %%%%%[%-1excdu%]: VerbInflAppl ; 95 | %%%%%[%-1excpl%]: VerbInflAppl ; 96 | %%%%[%-2sg%]: VerbInflAppl ; 97 | %%%%[%-2du%]: VerbInflAppl ; 98 | %%%%[%-2pl%]: VerbInflAppl ; 99 | %%%%[%-3sg%]: VerbInflAppl ; 100 | %%%%[%-3du%]: VerbInflAppl ; 101 | %%%%%[%-3plhum%]: VerbInflAppl ; 102 | %%%%%[%-3plnh%]: VerbInflAppl 
; 103 | 104 | LEXICON VerbInflAppl 105 | 106 | %%[%-appl%]: VerbInflCaus ; 107 | VerbInflCaus ; 108 | 109 | LEXICON VerbInflCaus 110 | 111 | %%[%-caus%]: # ; 112 | # ; 113 | 114 | LEXICON VerbRoot 115 | 116 | a:a VerbInfl ; 117 | adiat:%{A%}diat VerbInfl ; 118 | adiava:%{A%}diava VerbInfl ; 119 | adu:%{A%}du VerbInfl ; 120 | amise:%{A%}mise VerbInfl ; 121 | ana:%{A%}na VerbInfl ; 122 | ananar:%{A%}nanar VerbInfl ; 123 | ane:%{A%}ne VerbInfl ; 124 | ania:%{A%}nia VerbInfl ; 125 | aniau:%{A%}niau VerbInfl ; 126 | aniwar:%{A%}niwar VerbInfl ; 127 | anota:%{A%}nota VerbInfl ; 128 | anu:%{A%}nu VerbInfl ; 129 | apai:%{A%}pai VerbInfl ; 130 | ape:%{A%}pe VerbInfl ; 131 | api:%{A%}pi VerbInfl ; 132 | ara:%{A%}ra VerbInfl ; 133 | ariarir:%{A%}riarir VerbInfl ; 134 | aririarir:%{A%}ririarir VerbInfl ; 135 | as:%{A%}s VerbInfl ; 136 | asiei:%{A%}siei VerbInfl ; 137 | asiwar:%{A%}siwar VerbInfl ; 138 | atore:%{A%}tore VerbInfl ; 139 | avakir:%{A%}vakir VerbInfl ; 140 | avi:%{A%}vi VerbInfl ; 141 | avute:%{A%}vute VerbInfl ; 142 | aware:%{A%}ware VerbInfl ; 143 | awawai:%{A%}wawai VerbInfl ; 144 | awer:%{A%}wer VerbInfl ; 145 | awia:%{A%}wia VerbInfl ; 146 | bai:b%{A%}i VerbInfl ; 147 | bajor:b%{A%}jor VerbInfl ; 148 | deriasi:deriasi VerbInfl ; 149 | ena:ena VerbInfl ; 150 | iravesie:iravesie VerbInfl ; 151 | iri:iri VerbInfl ; 152 | isa:isa VerbInfl ; 153 | isani:isani VerbInfl ; 154 | isi:isi VerbInfl ; 155 | ka:k%{A%} VerbInfl ; 156 | kadadiri:k%{A%}dadiri VerbInfl ; 157 | kais:k%{A%}is VerbInfl ; 158 | kamadiwa:k%{A%}madiwa VerbInfl ; 159 | kamanam:k%{A%}manam VerbInfl ; 160 | kamaru:k%{A%}maru VerbInfl ; 161 | kambarai:k%{A%}mbarai VerbInfl ; 162 | kanisu:k%{A%}nisu VerbInfl ; 163 | kapori:k%{A%}pori VerbInfl ; 164 | karamut:k%{A%}ramut VerbInfl ; 165 | karier:k%{A%}rier VerbInfl ; 166 | karieri:k%{A%}rieri VerbInfl ; 167 | karipe:k%{A%}ripe VerbInfl ; 168 | kariria:k%{A%}riria VerbInfl ; 169 | karotui:k%{A%}rotui VerbInfl ; 170 | karutarera:k%{A%}rutarera VerbInfl ; 171 
| kase:k%{A%}se VerbInfl ; 172 | kasie:k%{A%}sie VerbInfl ; 173 | kasiou:k%{A%}siou VerbInfl ; 174 | kavera:k%{A%}vera VerbInfl ; 175 | kavio:k%{A%}vio VerbInfl ; 176 | kaye:k%{A%}ye VerbInfl ; 177 | kerat:kerat VerbInfl ; 178 | keri:keri VerbInfl ; 179 | kivari:kivari VerbInfl ; 180 | kopa:kopa VerbInfl ; 181 | kotar:kotar VerbInfl ; 182 | kubira:kubira VerbInfl ; 183 | kutu:kutu VerbInfl ; 184 | mamaya:m%{A%}maya VerbInfl ; 185 | mamu:m%{A%}mu VerbInfl ; 186 | manau:m%{A%}nau VerbInfl ; 187 | manggar:m%{A%}ŋgar VerbInfl ; 188 | marandi:m%{A%}randi VerbInfl ; 189 | marariat:m%{A%}rariat VerbInfl ; 190 | mararieti:m%{A%}rarieti VerbInfl ; 191 | marera:m%{A%}rera VerbInfl ; 192 | marera:m%{A%}rera VerbInfl ; 193 | mari:m%{A%}ri VerbInfl ; 194 | marior:m%{A%}rior VerbInfl ; 195 | marova:m%{A%}rova VerbInfl ; 196 | maroya:m%{A%}roya VerbInfl ; 197 | marura:m%{A%}rura VerbInfl ; 198 | marutu:m%{A%}rutu VerbInfl ; 199 | masa:m%{A%}sa VerbInfl ; 200 | masivaveri:m%{A%}sivaveri VerbInfl ; 201 | maso:m%{A%}so VerbInfl ; 202 | masop:m%{A%}sop VerbInfl ; 203 | masubat:m%{A%}subat VerbInfl ; 204 | matai:m%{A%}tai VerbInfl ; 205 | matitiotap:m%{A%}titiotap VerbInfl ; 206 | maye:m%{A%}ye VerbInfl ; 207 | mayo:m%{A%}yo VerbInfl ; 208 | meire:meire VerbInfl ; 209 | mesari:mesari VerbInfl ; 210 | mosi:mosi VerbInfl ; 211 | mun:mun VerbInfl ; 212 | musane:musane VerbInfl ; 213 | nai:n%{A%}i VerbInfl ; 214 | nanari:n%{A%}nari VerbInfl ; 215 | nanaripi:n%{A%}naripi VerbInfl ; 216 | naya:n%{A%}ya VerbInfl ; 217 | nei:nei VerbInfl ; 218 | newe:newe VerbInfl ; 219 | newi:newi VerbInfl ; 220 | nganggat:ŋ%{A%}ŋgat VerbInfl ; 221 | nomu:nomu VerbInfl ; 222 | nunu:nunu VerbInfl ; 223 | o:o VerbInfl ; 224 | one:one VerbInfl ; 225 | osa:osa VerbInfl ; 226 | otabia:otabia VerbInfl ; 227 | otara:otara VerbInfl ; 228 | otara:otara VerbInfl ; 229 | otu:otu VerbInfl ; 230 | oyo:oyo VerbInfl ; 231 | pape:p%{A%}pe VerbInfl ; 232 | papo:p%{A%}po VerbInfl ; 233 | pera:pera VerbInfl ; 234 | peya:peya 
VerbInfl ; 235 | poriri:poriri VerbInfl ; 236 | posasa:posasa VerbInfl ; 237 | posasiei:posasiei VerbInfl ; 238 | pota:pota VerbInfl ; 239 | pote:pote VerbInfl ; 240 | poya:poya VerbInfl ; 241 | pu:pu VerbInfl ; 242 | pui:pui VerbInfl ; 243 | ra:r%{A%} VerbInfl ; 244 | rama:r%{A%}ma VerbInfl ; 245 | rariorar:r%{A%}riorar VerbInfl ; 246 | rau:r%{A%}u VerbInfl ; 247 | rekamani:rekamani VerbInfl ; 248 | repe:repe VerbInfl ; 249 | rera:rera VerbInfl ; 250 | resasi:resasi VerbInfl ; 251 | rewas:rewas VerbInfl ; 252 | ri:ri VerbInfl ; 253 | riesa:riesa VerbInfl ; 254 | rina:rina VerbInfl ; 255 | rira:rira VerbInfl ; 256 | ririau:ririau VerbInfl ; 257 | riwan:riwan VerbInfl ; 258 | ro:ro VerbInfl ; 259 | roi:roi VerbInfl ; 260 | rora:rora VerbInfl ; 261 | rotu:rotu VerbInfl ; 262 | royare:royare VerbInfl ; 263 | ruase:ruase VerbInfl ; 264 | rubatai:rubatai VerbInfl ; 265 | rukusare:rukusare VerbInfl ; 266 | rure:rure VerbInfl ; 267 | ruri:ruri VerbInfl ; 268 | rusasa:rusasa VerbInfl ; 269 | rut:rut VerbInfl ; 270 | sa:s%{A%} VerbInfl ; 271 | sa:s%{A%} VerbInfl ; 272 | sabaya:s%{A%}baya VerbInfl ; 273 | sairara:s%{A%}irara VerbInfl ; 274 | samariari:s%{A%}mariari VerbInfl ; 275 | samuai:s%{A%}muai VerbInfl ; 276 | sanana:s%{A%}nana VerbInfl ; 277 | sanepai:s%{A%}nepai VerbInfl ; 278 | sanepaya:s%{A%}nepaya VerbInfl ; 279 | sanepaya:s%{A%}nepaya VerbInfl ; 280 | sanevesie:s%{A%}nevesie VerbInfl ; 281 | sarai:s%{A%}rai VerbInfl ; 282 | sario:s%{A%}rio VerbInfl ; 283 | sasara:s%{A%}sara VerbInfl ; 284 | sase:s%{A%}se VerbInfl ; 285 | sasera:s%{A%}sera VerbInfl ; 286 | sasi:s%{A%}si VerbInfl ; 287 | sasie:s%{A%}sie VerbInfl ; 288 | sasiri:s%{A%}siri VerbInfl ; 289 | saviori:s%{A%}viori VerbInfl ; 290 | sayor:s%{A%}yor VerbInfl ; 291 | se:se VerbInfl ; 292 | sera:sera VerbInfl ; 293 | sesa:sesa VerbInfl ; 294 | setawa:setawa VerbInfl ; 295 | seva:seva VerbInfl ; 296 | siar:siar VerbInfl ; 297 | simarai:simarai VerbInfl ; 298 | siorap:siorap VerbInfl ; 299 | siwara:siwara 
VerbInfl ; 300 | so:so VerbInfl ; 301 | so:so VerbInfl ; 302 | sobata:sobata VerbInfl ; 303 | soive:soive VerbInfl ; 304 | sokiare:sokiare VerbInfl ; 305 | sokiase:sokiase VerbInfl ; 306 | som:som VerbInfl ; 307 | sonini:sonini VerbInfl ; 308 | sora:sora VerbInfl ; 309 | sori:sori VerbInfl ; 310 | sosa:sosa VerbInfl ; 311 | sosoya:sosoya VerbInfl ; 312 | sovera:sovera VerbInfl ; 313 | su:su VerbInfl ; 314 | suetawana:suetawana VerbInfl ; 315 | sum:sum VerbInfl ; 316 | sume:sume VerbInfl ; 317 | suvi:suvi VerbInfl ; 318 | suvuan:suvuan VerbInfl ; 319 | suwasi:suwasi VerbInfl ; 320 | ta:t%{A%} VerbInfl ; 321 | taisu:t%{A%}isu VerbInfl ; 322 | tandor:t%{A%}ndor VerbInfl ; 323 | tapu:t%{A%}pu VerbInfl ; 324 | tara:t%{A%}ra VerbInfl ; 325 | tare:t%{A%}re VerbInfl ; 326 | tarisu:t%{A%}risu VerbInfl ; 327 | tata:t%{A%}ta VerbInfl ; 328 | tatare:t%{A%}tare VerbInfl ; 329 | tatira:t%{A%}tira VerbInfl ; 330 | tatopa:t%{A%}topa VerbInfl ; 331 | tavavu:t%{A%}vavu VerbInfl ; 332 | tavera:t%{A%}vera VerbInfl ; 333 | tavi:t%{A%}vi VerbInfl ; 334 | taviar:t%{A%}viar VerbInfl ; 335 | tawa:t%{A%}wa VerbInfl ; 336 | tenam:tenam VerbInfl ; 337 | tera:tera VerbInfl ; 338 | tere:tere VerbInfl ; 339 | tipira:tipira VerbInfl ; 340 | tire:tire VerbInfl ; 341 | tisa:tisa VerbInfl ; 342 | tita:tita VerbInfl ; 343 | titiai:titiai VerbInfl ; 344 | tob:tob VerbInfl ; 345 | topa:topa VerbInfl ; 346 | tota:tota VerbInfl ; 347 | tupar:tupar VerbInfl ; 348 | turu:turu VerbInfl ; 349 | tut:tut VerbInfl ; 350 | tuwa:tuwa VerbInfl ; 351 | ubira:ubira VerbInfl ; 352 | unusar:unusar VerbInfl ; 353 | ur:ur VerbInfl ; 354 | urusara:urusara VerbInfl ; 355 | usar:usar VerbInfl ; 356 | uta:uta VerbInfl ; 357 | utanusara:utanusara VerbInfl ; 358 | uvietu:uvietu VerbInfl ; 359 | vabara:v%{A%}bara VerbInfl ; 360 | varakare:v%{A%}rakare VerbInfl ; 361 | varo:v%{A%}ro VerbInfl ; 362 | vata:v%{A%}ta VerbInfl ; 363 | vaune:v%{A%}une VerbInfl ; 364 | vavou:v%{A%}vou VerbInfl ; 365 | vawatatar:v%{A%}watatar VerbInfl 
; 366 | ve:ve VerbInfl ; 367 | vediadi:vediadi VerbInfl ; 368 | vedior:vedior VerbInfl ; 369 | vekapo:vekapo VerbInfl ; 370 | venadi:venadi VerbInfl ; 371 | veprenta:veprenta VerbInfl ; 372 | vereri:vereri VerbInfl ; 373 | vesikop:vesikop VerbInfl ; 374 | vesusa:vesusa VerbInfl ; 375 | via:via VerbInfl ; 376 | viviare:viviare VerbInfl ; 377 | vo:vo VerbInfl ; 378 | vori:vori VerbInfl ; 379 | voru:voru VerbInfl ; 380 | vove:vove VerbInfl ; 381 | vui:vui VerbInfl ; 382 | vuvu:vuvu VerbInfl ; 383 | -------------------------------------------------------------------------------- /tests/wad.lexd: -------------------------------------------------------------------------------- 1 | PATTERNS 2 | :VerbInfl :VerbApplicative :VerbCausative VerbLemma VerbInfl: VerbApplicative: VerbCausative: 3 | 4 | LEXICON VerbLemma 5 | a:a 6 | adiat:{A}diat 7 | adiava:{A}diava 8 | adu:{A}du 9 | amise:{A}mise 10 | ana:{A}na 11 | ananar:{A}nanar 12 | ane:{A}ne 13 | ania:{A}nia 14 | aniau:{A}niau 15 | aniwar:{A}niwar 16 | anota:{A}nota 17 | anu:{A}nu 18 | apai:{A}pai 19 | ape:{A}pe 20 | api:{A}pi 21 | ara:{A}ra 22 | ariarir:{A}riarir 23 | aririarir:{A}ririarir 24 | as:{A}s 25 | asiei:{A}siei 26 | asiwar:{A}siwar 27 | atore:{A}tore 28 | avakir:{A}vakir 29 | avi:{A}vi 30 | avute:{A}vute 31 | aware:{A}ware 32 | awawai:{A}wawai 33 | awer:{A}wer 34 | awia:{A}wia 35 | bai:b{A}i 36 | bajor:b{A}jor 37 | deriasi:deriasi 38 | ena:ena 39 | iravesie:iravesie 40 | iri:iri 41 | isa:isa 42 | isani:isani 43 | isi:isi 44 | ka:k{A} 45 | kadadiri:k{A}dadiri 46 | kais:k{A}is 47 | kamadiwa:k{A}madiwa 48 | kamanam:k{A}manam 49 | kamaru:k{A}maru 50 | kambarai:k{A}mbarai 51 | kanisu:k{A}nisu 52 | kapori:k{A}pori 53 | karamut:k{A}ramut 54 | karier:k{A}rier 55 | karieri:k{A}rieri 56 | karipe:k{A}ripe 57 | kariria:k{A}riria 58 | karotui:k{A}rotui 59 | karutarera:k{A}rutarera 60 | kase:k{A}se 61 | kasie:k{A}sie 62 | kasiou:k{A}siou 63 | kavera:k{A}vera 64 | kavio:k{A}vio 65 | kaye:k{A}ye 66 | kerat:kerat 67 | keri:keri 68 
| kivari:kivari 69 | kopa:kopa 70 | kotar:kotar 71 | kubira:kubira 72 | kutu:kutu 73 | mamaya:m{A}maya 74 | mamu:m{A}mu 75 | manau:m{A}nau 76 | manggar:m{A}ŋgar 77 | marandi:m{A}randi 78 | marariat:m{A}rariat 79 | mararieti:m{A}rarieti 80 | marera:m{A}rera 81 | marera:m{A}rera 82 | mari:m{A}ri 83 | marior:m{A}rior 84 | marova:m{A}rova 85 | maroya:m{A}roya 86 | marura:m{A}rura 87 | marutu:m{A}rutu 88 | masa:m{A}sa 89 | masivaveri:m{A}sivaveri 90 | maso:m{A}so 91 | masop:m{A}sop 92 | masubat:m{A}subat 93 | matai:m{A}tai 94 | matitiotap:m{A}titiotap 95 | maye:m{A}ye 96 | mayo:m{A}yo 97 | meire:meire 98 | mesari:mesari 99 | mosi:mosi 100 | mun:mun 101 | musane:musane 102 | nai:n{A}i 103 | nanari:n{A}nari 104 | nanaripi:n{A}naripi 105 | naya:n{A}ya 106 | nei:nei 107 | newe:newe 108 | newi:newi 109 | nganggat:ŋ{A}ŋgat 110 | nomu:nomu 111 | nunu:nunu 112 | o:o 113 | one:one 114 | osa:osa 115 | otabia:otabia 116 | otara:otara 117 | otara:otara 118 | otu:otu 119 | oyo:oyo 120 | pape:p{A}pe 121 | papo:p{A}po 122 | pera:pera 123 | peya:peya 124 | poriri:poriri 125 | posasa:posasa 126 | posasiei:posasiei 127 | pota:pota 128 | pote:pote 129 | poya:poya 130 | pu:pu 131 | pui:pui 132 | ra:r{A} 133 | rama:r{A}ma 134 | rariorar:r{A}riorar 135 | rau:r{A}u 136 | rekamani:rekamani 137 | repe:repe 138 | rera:rera 139 | resasi:resasi 140 | rewas:rewas 141 | ri:ri 142 | riesa:riesa 143 | rina:rina 144 | rira:rira 145 | ririau:ririau 146 | riwan:riwan 147 | ro:ro 148 | roi:roi 149 | rora:rora 150 | rotu:rotu 151 | royare:royare 152 | ruase:ruase 153 | rubatai:rubatai 154 | rukusare:rukusare 155 | rure:rure 156 | ruri:ruri 157 | rusasa:rusasa 158 | rut:rut 159 | sa:s{A} 160 | sa:s{A} 161 | sabaya:s{A}baya 162 | sairara:s{A}irara 163 | samariari:s{A}mariari 164 | samuai:s{A}muai 165 | sanana:s{A}nana 166 | sanepai:s{A}nepai 167 | sanepaya:s{A}nepaya 168 | sanepaya:s{A}nepaya 169 | sanevesie:s{A}nevesie 170 | sarai:s{A}rai 171 | sario:s{A}rio 172 | sasara:s{A}sara 173 | sase:s{A}se 174 | 
sasera:s{A}sera 175 | sasi:s{A}si 176 | sasie:s{A}sie 177 | sasiri:s{A}siri 178 | saviori:s{A}viori 179 | sayor:s{A}yor 180 | se:se 181 | sera:sera 182 | sesa:sesa 183 | setawa:setawa 184 | seva:seva 185 | siar:siar 186 | simarai:simarai 187 | siorap:siorap 188 | siwara:siwara 189 | so:so 190 | so:so 191 | sobata:sobata 192 | soive:soive 193 | sokiare:sokiare 194 | sokiase:sokiase 195 | som:som 196 | sonini:sonini 197 | sora:sora 198 | sori:sori 199 | sosa:sosa 200 | sosoya:sosoya 201 | sovera:sovera 202 | su:su 203 | suetawana:suetawana 204 | sum:sum 205 | sume:sume 206 | suvi:suvi 207 | suvuan:suvuan 208 | suwasi:suwasi 209 | ta:t{A} 210 | taisu:t{A}isu 211 | tandor:t{A}ndor 212 | tapu:t{A}pu 213 | tara:t{A}ra 214 | tare:t{A}re 215 | tarisu:t{A}risu 216 | tata:t{A}ta 217 | tatare:t{A}tare 218 | tatira:t{A}tira 219 | tatopa:t{A}topa 220 | tavavu:t{A}vavu 221 | tavera:t{A}vera 222 | tavi:t{A}vi 223 | taviar:t{A}viar 224 | tawa:t{A}wa 225 | tenam:tenam 226 | tera:tera 227 | tere:tere 228 | tipira:tipira 229 | tire:tire 230 | tisa:tisa 231 | tita:tita 232 | titiai:titiai 233 | tob:tob 234 | topa:topa 235 | tota:tota 236 | tupar:tupar 237 | turu:turu 238 | tut:tut 239 | tuwa:tuwa 240 | ubira:ubira 241 | unusar:unusar 242 | ur:ur 243 | urusara:urusara 244 | usar:usar 245 | uta:uta 246 | utanusara:utanusara 247 | uvietu:uvietu 248 | vabara:v{A}bara 249 | varakare:v{A}rakare 250 | varo:v{A}ro 251 | vata:v{A}ta 252 | vaune:v{A}une 253 | vavou:v{A}vou 254 | vawatatar:v{A}watatar 255 | ve:ve 256 | vediadi:vediadi 257 | vedior:vedior 258 | vekapo:vekapo 259 | venadi:venadi 260 | veprenta:veprenta 261 | vereri:vereri 262 | vesikop:vesikop 263 | vesusa:vesusa 264 | via:via 265 | viviare:viviare 266 | vo:vo 267 | vori:vori 268 | voru:voru 269 | vove:vove 270 | vui:vui 271 | vuvu:vuvu 272 | 273 | LEXICON VerbInfl 274 | :tur 275 | :tat 276 | :i 277 | :amur 278 | :amat 279 | :{B}u 280 | :mur 281 | :met 282 | :{D}i 283 | :sur 284 | :set 285 | :si 286 | 287 | LEXICON VerbApplicative 
288 | :it
289 | :
290 |
291 | LEXICON VerbCausative
292 | :on
293 | :
294 |
--------------------------------------------------------------------------------
/tests/wad.sh2:
--------------------------------------------------------------------------------
1 | hfst-lexc -s wad.lexc -o wad.lexc.hfst
2 | hfst-twolc -s wad.twoc -o wad.twoc.hfst
3 | hfst-invert wad.lexc.hfst | hfst-compose-intersect -s -1 - -2 wad.twoc.hfst 2>/dev/null | hfst-invert -o wad.nomin.hfst
4 | hfst-minimize wad.nomin.hfst -o wad.hfst
5 | ../src/lexd wad.lexd wad.att
6 | hfst-txt2fst wad.att -o wad_d.hfst
7 |
--------------------------------------------------------------------------------
/tests/wad.twoc:
--------------------------------------------------------------------------------
1 | Alphabet
2 |
3 | ! A B D E G I J K L M N O P R S T U V W Y Z Ŋ
4 | ! a b d e g i j k l m n o p r s t u v w y z ŋ
5 | ! %{A%}:a %{A%}:0 ! %>:0
6 |
7 | %[%+1incdu%]:0 %[%+1incpl%]:0 %[%+1sg%]:0 %[%+1excdu%]:0 %[%+1excpl%]:0 %[%+2sg%]:0 %[%+2du%]:0 %[%+2pl%]:0 %[%+3sg%]:0 %[%+3du%]:0 %[%+3plhum%]:0 %[%+3plnh%]:0 %[%-1incdu%]:0 %[%-1incpl%]:0 %[%-1sg%]:0 %[%-1excdu%]:0 %[%-1excpl%]:0 %[%-2sg%]:0 %[%-2du%]:0 %[%-2pl%]:0 %[%-3sg%]:0 %[%-3du%]:0 %[%-3plhum%]:0 %[%-3plnh%]:0
8 |
9 | %[%+appl%]:0 %[%-appl%]:0
10 | %[%+caus%]:0 %[%-caus%]:0
11 |
12 | %
13 |
14 | !
Number morphology 15 | % 16 | % 17 | % 18 | 19 | !% 20 | !% 21 | !% 22 | 23 | !% 24 | !% 25 | 26 | !% 27 | !% 28 | ; 29 | 30 | Sets 31 | 32 | Prefix = %[%+1incdu%] %[%+1incpl%] %[%+1sg%] %[%+1excdu%] %[%+1excpl%] %[%+2sg%] %[%+2du%] %[%+2pl%] %[%+3sg%] %[%+3du%] %[%+3plhum%] %[%+3plnh%] %[%+appl%] %[%+caus%] ; 33 | 34 | Suffix = %[%-1incdu%] %[%-1incpl%] %[%-1sg%] %[%-1excdu%] %[%-1excpl%] %[%-2sg%] %[%-2du%] %[%-2pl%] %[%-3sg%] %[%-3du%] %[%-3plhum%] %[%-3plnh%] %[%-appl%] %[%-caus%] ; 35 | 36 | Rules 37 | 38 | "Remove paths without matching suffix feature" 39 | Fx:0 /<= _ ; 40 | except 41 | _ :* Fy:0 ; 42 | where Fx in Prefix 43 | Fy in Suffix 44 | matched ; 45 | 46 | "Remove paths without matching prefix feature" 47 | Fy:0 /<= _ ; 48 | except 49 | Fx:0 :* _ ; 50 | where Fx in Prefix 51 | Fy in Suffix 52 | matched ; 53 | --------------------------------------------------------------------------------