├── tests
│   ├── legacy
│   │   ├── nosuggest.sug
│   │   ├── IJ.sug
│   │   ├── IJ.wrong
│   │   ├── arabic.wrong
│   │   ├── onlyincompound.sug
│   │   ├── reputf.sug
│   │   ├── reputf.wrong
│   │   ├── 1975530.wrong
│   │   ├── 2970242.wrong
│   │   ├── IJ.good
│   │   ├── alias.dic
│   │   ├── arabic.dic
│   │   ├── flag.dic
│   │   ├── i35725.good
│   │   ├── i53643.dic
│   │   ├── korean.aff
│   │   ├── korean.wrong
│   │   ├── needaffix.wrong
│   │   ├── needaffix3.wrong
│   │   ├── reputf.dic
│   │   ├── alias2.dic
│   │   ├── alias3.dic
│   │   ├── checksharps.wrong
│   │   ├── checksharpsutf.wrong
│   │   ├── compoundrule5.wrong
│   │   ├── flagutf8.dic
│   │   ├── i68568.dic
│   │   ├── i68568utf.dic
│   │   ├── needaffix3.dic
│   │   ├── ngram_utf_fix.good
│   │   ├── ngram_utf_fix.sug
│   │   ├── oconv.good
│   │   ├── phone.wrong
│   │   ├── utf8_nonbmp.aff
│   │   ├── warn.good
│   │   ├── 2970240.good
│   │   ├── 2970240.wrong
│   │   ├── 2999225.good
│   │   ├── IJ.dic
│   │   ├── allcaps2.sug
│   │   ├── allcaps2.wrong
│   │   ├── checkcompoundcase2.wrong
│   │   ├── checksharps.sug
│   │   ├── compoundrule.good
│   │   ├── condition_utf.dic
│   │   ├── digits_in_words.wrong
│   │   ├── encoding.aff
│   │   ├── flaglong.dic
│   │   ├── flagnum.dic
│   │   ├── i54633.good
│   │   ├── i54633.sug
│   │   ├── i54633.wrong
│   │   ├── korean.good
│   │   ├── needaffix3.good
│   │   ├── needaffix5.dic
│   │   ├── simplifiedtriple.wrong
│   │   ├── utf8.dic
│   │   ├── utf8_bom.dic
│   │   ├── utf8_bom2.dic
│   │   ├── warn.dic
│   │   ├── 1463589_utf.dic
│   │   ├── 1592880.good
│   │   ├── 1695964.dic
│   │   ├── 1695964.sug
│   │   ├── 1975530.dic
│   │   ├── 1975530.good
│   │   ├── checkcompoundcaseutf.wrong
│   │   ├── checkcompoundpattern2.wrong
│   │   ├── checksharpsutf.sug
│   │   ├── circumfix.dic
│   │   ├── complexprefixesutf.dic
│   │   ├── dotless_i.aff
│   │   ├── forbiddenword.good
│   │   ├── forceucase.sug
│   │   ├── korean.dic
│   │   ├── nosuggest.dic
│   │   ├── oconv.sug
│   │   ├── oconv.wrong
│   │   ├── slash.wrong
│   │   ├── 1695964.wrong
│   │   ├── breakoff.good
│   │   ├── checkcompoundcase2.good
│   │   ├── circumfix.wrong
│   │   ├── complexprefixes.dic
│   │   ├── forceucase.wrong
│   │   ├── ignore.dic
│   │   ├── map.sug
│   │   ├── needaffix.dic
│   │   ├── needaffix.good
│   │   ├── nepali.wrong
│   │   ├── ngram_utf_fix.wrong
│   │   ├── nosuggest.good
│   │   ├── oconv.dic
│   │   ├── onlyincompound.wrong
│   │   ├── opentaal_cpdpat.good
│   │   ├── opentaal_cpdpat.wrong
│   │   ├── opentaal_cpdpat2.good
│   │   ├── opentaal_cpdpat2.wrong
│   │   ├── opentaal_forbiddenword1.sug
│   │   ├── opentaal_forbiddenword2.sug
│   │   ├── utf8_bom.good
│   │   ├── utf8_bom2.good
│   │   ├── utf8_nonbmp.sug
│   │   ├── utf8_nonbmp.wrong
│   │   ├── zeroaffix.good
│   │   ├── 1592880.dic
│   │   ├── 2970242.dic
│   │   ├── 2999225.dic
│   │   ├── affixes.dic
│   │   ├── alias.good
│   │   ├── alias2.good
│   │   ├── allcaps.dic
│   │   ├── checkcompoundcaseutf.dic
│   │   ├── checkcompoundcaseutf.good
│   │   ├── checkcompounddup.dic
│   │   ├── checkcompoundrep.good
│   │   ├── checkcompoundtriple.wrong
│   │   ├── colons_in_words.dic
│   │   ├── complexprefixes.wrong
│   │   ├── compoundaffix.dic
│   │   ├── compoundaffix2.dic
│   │   ├── compoundaffix3.dic
│   │   ├── conditionalprefix.wrong
│   │   ├── dotless_i.dic
│   │   ├── fogemorpheme.dic
│   │   ├── i53643.wrong
│   │   ├── i54980.good
│   │   ├── i58202.dic
│   │   ├── iconv.dic
│   │   ├── iconv2.good
│   │   ├── keepcase.good
│   │   ├── map.wrong
│   │   ├── maputf.sug
│   │   ├── maputf.wrong
│   │   ├── needaffix2.aff
│   │   ├── needaffix4.aff
│   │   ├── nepali.good
│   │   ├── ngram_utf_fix.dic
│   │   ├── nosuggest.wrong
│   │   ├── onlyincompound.dic
│   │   ├── onlyincompound2.dic
│   │   ├── simplifiedtriple.dic
│   │   ├── 2970240.dic
│   │   ├── allcaps.sug
│   │   ├── allcaps2.dic
│   │   ├── allcaps2.good
│   │   ├── allcaps_utf.dic
│   │   ├── checkcompoundpattern2.dic
│   │   ├── checkcompoundpattern2.good
│   │   ├── checkcompoundpattern4.good
│   │   ├── complexprefixes2.dic
│   │   ├── complexprefixesutf.wrong
│   │   ├── compoundflag.good
│   │   ├── compoundrule.dic
│   │   ├── compoundrule2.dic
│   │   ├── compoundrule3.dic
│   │   ├── compoundrule6.dic
│   │   ├── encoding.good
│   │   ├── forceucase.dic
│   │   ├── fullstrip.dic
│   │   ├── iconv2.dic
│   │   ├── maputf.dic
│   │   ├── nepali.dic
│   │   ├── simplifiedtriple.good
│   │   ├── zeroaffix.dic
│   │   ├── 1706659.dic
│   │   ├── allcaps.wrong
│   │   ├── allcaps3.wrong
│   │   ├── allcaps_utf.sug
│   │   ├── allcaps_utf.wrong
│   │   ├── checkcompoundcase.wrong
│   │   ├── checkcompoundpattern.good
│   │   ├── checkcompoundpattern4.wrong
│   │   ├── complexprefixes.good
│   │   ├── complexprefixesutf.good
│   │   ├── compoundflag.aff
│   │   ├── compoundflag.dic
│   │   ├── compoundflag.wrong
│   │   ├── fogemorpheme.good
│   │   ├── forbiddenword.wrong
│   │   ├── needaffix2.good
│   │   ├── needaffix4.good
│   │   ├── onlyincompound2.good
│   │   ├── slash.good
│   │   ├── utf8_nonbmp.good
│   │   ├── alias3.good
│   │   ├── break.dic
│   │   ├── checkcompoundcase.dic
│   │   ├── checkcompounddup.wrong
│   │   ├── compoundrule3.good
│   │   ├── fogemorpheme.wrong
│   │   ├── hu.wrong
│   │   ├── keepcase.dic
│   │   ├── onlyincompound2.wrong
│   │   ├── 1706659.wrong
│   │   ├── 2970242.good
│   │   ├── breakoff.dic
│   │   ├── checkcompoundrep.wrong
│   │   ├── circumfix.good
│   │   ├── compoundaffix.wrong
│   │   ├── compoundrule4.wrong
│   │   ├── compoundrule6.good
│   │   ├── compoundrule7.wrong
│   │   ├── compoundrule8.wrong
│   │   ├── conditionalprefix.dic
│   │   ├── i68568utf.wrong
│   │   ├── iconv.good
│   │   ├── onlyincompound.good
│   │   ├── opentaal_cpdpat.dic
│   │   ├── opentaal_forbiddenword1.good
│   │   ├── utfcompound.aff
│   │   ├── breakdefault.dic
│   │   ├── breakoff.wrong
│   │   ├── checkcompoundpattern3.dic
│   │   ├── checkcompoundtriple.dic
│   │   ├── complexprefixes2.good
│   │   ├── compoundaffix3.good
│   │   ├── compoundrule.aff
│   │   ├── compoundrule6.wrong
│   │   ├── i53643.aff
│   │   ├── i68568.wrong
│   │   ├── keepcase.aff
│   │   ├── opentaal_cpdpat2.dic
│   │   ├── opentaal_keepcase.good
│   │   ├── slash.dic
│   │   ├── utfcompound.good
│   │   ├── allcaps.good
│   │   ├── allcaps_utf.good
│   │   ├── checkcompoundcaseutf.aff
│   │   ├── compoundrule2.aff
│   │   ├── compoundrule3.aff
│   │   ├── dotless_i.wrong
│   │   ├── needaffix4.dic
│   │   ├── needaffix5.wrong
│   │   ├── opentaal_forbiddenword2.dic
│   │   ├── phone.sug
│   │   ├── utf8_bom.aff
│   │   ├── utf8_bom2.aff
│   │   ├── affixes.good
│   │   ├── allcaps3.dic
│   │   ├── breakdefault.sug
│   │   ├── breakdefault.wrong
│   │   ├── checkcompoundcase.good
│   │   ├── compoundrule5.good
│   │   ├── keepcase.sug
│   │   ├── keepcase.wrong
│   │   ├── map.aff
│   │   ├── map.dic
│   │   ├── needaffix.aff
│   │   ├── opentaal_forbiddenword1.dic
│   │   ├── opentaal_forbiddenword2.good
│   │   ├── rep.aff
│   │   ├── 1975530.aff
│   │   ├── allcaps_utf.aff
│   │   ├── checkcompoundcase2.aff
│   │   ├── checkcompounddup.good
│   │   ├── checkcompoundpattern4.dic
│   │   ├── compoundaffix.good
│   │   ├── forceucase.good
│   │   ├── germancompounding.dic
│   │   ├── germancompoundingold.dic
│   │   ├── i58202.aff
│   │   ├── ignore.good
│   │   ├── opentaal_keepcase.dic
│   │   ├── phone.aff
│   │   ├── slash.aff
│   │   ├── utfcompound.wrong
│   │   ├── 1463589.dic
│   │   ├── 1463589.sug
│   │   ├── 1463589.wrong
│   │   ├── 1463589_utf.sug
│   │   ├── 2970242.aff
│   │   ├── compoundrule2.wrong
│   │   ├── dotless_i.good
│   │   ├── encoding.dic
│   │   ├── forceucase.aff
│   │   ├── i54633.aff
│   │   ├── i54633.dic
│   │   ├── i54980.dic
│   │   ├── needaffix2.dic
│   │   ├── 1463589_utf.wrong
│   │   ├── 2999225.aff
│   │   ├── arabic.aff
│   │   ├── breakdefault.good
│   │   ├── checkcompoundtriple.good
│   │   ├── condition.aff
│   │   ├── condition.dic
│   │   ├── flag.good
│   │   ├── utf8.good
│   │   ├── utf8_nonbmp.dic
│   │   ├── checkcompoundpattern.wrong
│   │   ├── checksharps.aff
│   │   ├── checksharps.dic
│   │   ├── compoundrule6.aff
│   │   ├── conditionalprefix.good
│   │   ├── flaglong.good
│   │   ├── flagnum.good
│   │   ├── flagutf8.good
│   │   ├── i58202.good
│   │   ├── opentaal_forbiddenword1.wrong
│   │   ├── opentaal_forbiddenword2.wrong
│   │   ├── checkcompounddup.aff
│   │   ├── checksharpsutf.aff
│   │   ├── i54980.aff
│   │   ├── rep.sug
│   │   ├── utfcompound.dic
│   │   ├── checkcompoundcase2.dic
│   │   ├── checkcompoundrep.aff
│   │   ├── checkcompoundrep.dic
│   │   ├── checkcompoundtriple.aff
│   │   ├── checksharpsutf.dic
│   │   ├── fullstrip.good
│   │   ├── germancompounding.aff
│   │   ├── checkcompoundpattern.dic
│   │   ├── compoundaffix3.wrong
│   │   ├── forbiddenword.dic
│   │   ├── germancompoundingold.aff
│   │   ├── ignoreutf.good
│   │   ├── opentaal_forbiddenword2.aff
│   │   ├── checkcompoundcase.aff
│   │   ├── checkcompoundpattern3.wrong
│   │   ├── ignoreutf.dic
│   │   ├── 2970240.aff
│   │   ├── breakdefault.aff
│   │   ├── opentaal_keepcase.aff
│   │   ├── opentaal_keepcase.wrong
│   │   ├── compoundaffix2.good
│   │   ├── digits_in_words.dic
│   │   ├── hu.good
│   │   ├── ignore.aff
│   │   ├── needaffix3.aff
│   │   ├── sugutf.dic
│   │   ├── 1463589.aff
│   │   ├── checkcompoundpattern3.good
│   │   ├── compoundaffix.aff
│   │   ├── allcaps3.aff
│   │   ├── i68568.aff
│   │   ├── IJ.aff
│   │   ├── base.wrong
│   │   ├── colons_in_words.aff
│   │   ├── onlyincompound.aff
│   │   ├── rep.wrong
│   │   ├── allcaps.aff
│   │   ├── i35725.wrong
│   │   ├── affixes.aff
│   │   ├── breakoff.aff
│   │   ├── i58202.wrong
│   │   ├── i68568utf.aff
│   │   ├── sug.dic
│   │   ├── allcaps2.aff
│   │   ├── rep.dic
│   │   ├── allcaps3.good
│   │   ├── base.sug
│   │   ├── compoundaffix2.aff
│   │   ├── compoundaffix3.aff
│   │   ├── compoundrule5.aff
│   │   ├── i58202.sug
│   │   ├── opentaal_forbiddenword1.aff
│   │   ├── 1463589_utf.aff
│   │   ├── base_utf.wrong
│   │   ├── compoundrule4.aff
│   │   ├── utf8.aff
│   │   ├── phone.dic
│   │   ├── sugutf.sug
│   │   ├── sugutf.wrong
│   │   ├── alias.aff
│   │   ├── break.good
│   │   ├── iconv2.aff
│   │   ├── oconv.aff
│   │   ├── opentaal_keepcase.sug
│   │   ├── simplifiedtriple.aff
│   │   ├── sug.sug
│   │   ├── checkcompoundpattern.aff
│   │   ├── nepali.aff
│   │   ├── base_utf.sug
│   │   ├── complexprefixes.aff
│   │   ├── flag.aff
│   │   ├── i35725.sug
│   │   ├── hu.dic
│   │   ├── nosuggest.aff
│   │   ├── needaffix5.good
│   │   ├── reputf.aff
│   │   ├── checksharps.good
│   │   ├── conditionalprefix.aff
│   │   ├── flaglong.aff
│   │   ├── sug.wrong
│   │   ├── zeroaffix.aff
│   │   ├── checksharpsutf.good
│   │   ├── needaffix5.aff
│   │   ├── compoundrule7.aff
│   │   ├── flagutf8.aff
│   │   ├── maputf.aff
│   │   ├── checkcompoundpattern3.aff
│   │   ├── flagnum.aff
│   │   ├── iconv.aff
│   │   ├── i53643.good
│   │   ├── ignoreutf.aff
│   │   ├── break.wrong
│   │   ├── checkcompoundpattern4.aff
│   │   ├── compoundrule8.aff
│   │   ├── compoundrule5.dic
│   │   ├── break.aff
│   │   ├── complexprefixes2.aff
│   │   ├── condition_utf.wrong
│   │   ├── 1695964.aff
│   │   ├── condition_utf.good
│   │   ├── alias3.morph
│   │   ├── i35725.dic
│   │   ├── warn.aff
│   │   ├── 1706659.aff
│   │   ├── checkcompoundpattern2.aff
│   │   ├── compoundrule4.dic
│   │   ├── fogemorpheme.aff
│   │   ├── morph.aff
│   │   ├── complexprefixesutf.aff
│   │   ├── opentaal_cpdpat.aff
│   │   ├── forbiddenword.aff
│   │   ├── alias2.aff
│   │   ├── alias3.aff
│   │   ├── compoundrule4.good
│   │   ├── compoundrule7.good
│   │   ├── compoundrule8.good
│   │   ├── condition.wrong
│   │   ├── digits_in_words.aff
│   │   ├── compoundrule7.dic
│   │   ├── base.good
│   │   ├── base_utf.good
│   │   ├── germancompoundingold.good
│   │   ├── onlyincompound2.aff
│   │   ├── condition.good
│   │   ├── morph.dic
│   │   ├── needaffix2.morph
│   │   ├── base_utf.dic
│   │   ├── base.dic
│   │   ├── 1592880.aff
│   │   ├── circumfix.aff
│   │   ├── compoundrule2.good
│   │   ├── alias2.morph
│   │   ├── compoundrule.wrong
│   │   ├── circumfix.morph
│   │   ├── compoundrule8.dic
│   │   ├── zeroaffix.morph
│   │   ├── compoundrule3.wrong
│   │   ├── morph.good
│   │   ├── ngram_utf_fix.aff
│   │   ├── fullstrip.aff
│   │   ├── germancompounding.good
│   │   ├── sug.aff
│   │   ├── sugutf.aff
│   │   ├── opentaal_cpdpat2.aff
│   │   ├── hu.aff
│   │   ├── conditionalprefix.morph
│   │   ├── compoundrule5.morph
│   │   ├── germancompounding.wrong
│   │   ├── germancompoundingold.wrong
│   │   ├── condition_utf.aff
│   │   ├── morph.morph
│   │   ├── license.hunspell
│   │   ├── base.aff
│   │   ├── base_utf.aff
│   │   └── i35725.aff
│   └── legacy.rs
├── .gitignore
├── .envrc
├── vendor
│   └── en_US
│       ├── README.txt
│       ├── WordNet_license.txt
│       └── en_US.aff
├── shell.nix
├── benches
│   ├── compilation.rs
│   └── check.rs
├── examples
│   ├── load-dictionary.rs
│   ├── check.rs
│   ├── suggest.rs
│   └── prose.rs
├── Cargo.toml
├── flake.nix
├── flake.lock
├── Cargo.lock
├── .github
│   └── workflows
│       └── ci.yml
├── CHANGELOG.md
├── docs
│   ├── CONTRIBUTING.md
│   ├── internals.md
│   └── compare.md
├── README.md
└── src
    └── hash_bag.rs
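The `tests/legacy` fixtures follow the Hunspell test-suite convention: each case `<name>` pairs an affix file (`<name>.aff`) with a dictionary (`<name>.dic`), while `<name>.good` lists words that must be accepted, `<name>.wrong` lists words that must be rejected, and `<name>.sug` holds the comma-separated suggestions expected for each rejected word (for example, `IJ.wrong` contains `Ijs` and `IJ.sug` expects `IJs, ijs`). The harness that drives these fixtures lives in `tests/legacy.rs`; below is only a minimal illustrative sketch of how such a case could be exercised. It assumes the `Dictionary::new`/`check` API shown in spellbook's README, and `run_legacy_case` together with the hard-coded fixture paths are hypothetical.

```rust
// Illustrative sketch — not the repo's actual harness (that is tests/legacy.rs).
// Assumes spellbook's `Dictionary::new(aff, dic)` and `check(word)` API.
use std::fs;

fn run_legacy_case(case: &str) {
    let aff = fs::read_to_string(format!("tests/legacy/{case}.aff")).expect("missing .aff");
    let dic = fs::read_to_string(format!("tests/legacy/{case}.dic")).expect("missing .dic");
    let dict = spellbook::Dictionary::new(&aff, &dic).expect("fixture should compile");

    // Every word listed in `<case>.good` must be accepted by the checker.
    if let Ok(good) = fs::read_to_string(format!("tests/legacy/{case}.good")) {
        for word in good.lines().map(str::trim).filter(|w| !w.is_empty()) {
            assert!(dict.check(word), "{case}: expected `{word}` to be accepted");
        }
    }

    // Every word listed in `<case>.wrong` must be rejected.
    if let Ok(wrong) = fs::read_to_string(format!("tests/legacy/{case}.wrong")) {
        for word in wrong.lines().map(str::trim).filter(|w| !w.is_empty()) {
            assert!(!dict.check(word), "{case}: expected `{word}` to be rejected");
        }
    }
}

fn main() {
    // Example: the IJ case pairs IJ.aff/IJ.dic with IJ.good ("ijs", "IJs") and IJ.wrong ("Ijs").
    run_legacy_case("IJ");
}
```
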
/tests/legacy/nosuggest.sug: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tests/legacy/IJ.sug: -------------------------------------------------------------------------------- 1 | IJs, ijs 2 | --------------------------------------------------------------------------------
/tests/legacy/IJ.wrong: -------------------------------------------------------------------------------- 1 | Ijs 2 | --------------------------------------------------------------------------------
/tests/legacy/arabic.wrong: -------------------------------------------------------------------------------- 1 | ـ 2 | --------------------------------------------------------------------------------
/tests/legacy/onlyincompound.sug: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/tests/legacy/reputf.sug: -------------------------------------------------------------------------------- 1 | főő 2 | --------------------------------------------------------------------------------
/tests/legacy/reputf.wrong: -------------------------------------------------------------------------------- 1 | foo 2 | --------------------------------------------------------------------------------
/tests/legacy/1975530.wrong: -------------------------------------------------------------------------------- 1 | تيار 2 | --------------------------------------------------------------------------------
/tests/legacy/2970242.wrong: -------------------------------------------------------------------------------- 1 | foobaz 2 | --------------------------------------------------------------------------------
/tests/legacy/IJ.good: -------------------------------------------------------------------------------- 1 | ijs 2 | IJs 3 | --------------------------------------------------------------------------------
/tests/legacy/alias.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/1 3 | --------------------------------------------------------------------------------
/tests/legacy/arabic.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ب 3 | --------------------------------------------------------------------------------
/tests/legacy/flag.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/A3 3 | --------------------------------------------------------------------------------
/tests/legacy/i35725.good: -------------------------------------------------------------------------------- 1 | permanent 2 | --------------------------------------------------------------------------------
/tests/legacy/i53643.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo 3 | --------------------------------------------------------------------------------
/tests/legacy/korean.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | --------------------------------------------------------------------------------
/tests/legacy/korean.wrong: -------------------------------------------------------------------------------- 1 | 들어오세 2 | --------------------------------------------------------------------------------
/tests/legacy/needaffix.wrong:
-------------------------------------------------------------------------------- 1 | foo 2 | -------------------------------------------------------------------------------- /tests/legacy/needaffix3.wrong: -------------------------------------------------------------------------------- 1 | foos 2 | -------------------------------------------------------------------------------- /tests/legacy/reputf.dic: -------------------------------------------------------------------------------- 1 | 1 2 | főő 3 | -------------------------------------------------------------------------------- /tests/legacy/alias2.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/1 3 3 | -------------------------------------------------------------------------------- /tests/legacy/alias3.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ouro/BC 4 3 | -------------------------------------------------------------------------------- /tests/legacy/checksharps.wrong: -------------------------------------------------------------------------------- 1 | MÜßIG 2 | -------------------------------------------------------------------------------- /tests/legacy/checksharpsutf.wrong: -------------------------------------------------------------------------------- 1 | MÜßIG 2 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule5.wrong: -------------------------------------------------------------------------------- 1 | .25 2 | -------------------------------------------------------------------------------- /tests/legacy/flagutf8.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/AÜ 3 | -------------------------------------------------------------------------------- /tests/legacy/i68568.dic: -------------------------------------------------------------------------------- 1 | 1 2 | Elia/a 3 | -------------------------------------------------------------------------------- /tests/legacy/i68568utf.dic: -------------------------------------------------------------------------------- 1 | 1 2 | Bár/a 3 | -------------------------------------------------------------------------------- /tests/legacy/needaffix3.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | -------------------------------------------------------------------------------- /tests/legacy/ngram_utf_fix.good: -------------------------------------------------------------------------------- 1 | человек 2 | -------------------------------------------------------------------------------- /tests/legacy/ngram_utf_fix.sug: -------------------------------------------------------------------------------- 1 | человек 2 | -------------------------------------------------------------------------------- /tests/legacy/oconv.good: -------------------------------------------------------------------------------- 1 | bébé 2 | dádá 3 | -------------------------------------------------------------------------------- /tests/legacy/phone.wrong: -------------------------------------------------------------------------------- 1 | Brasillian 2 | -------------------------------------------------------------------------------- /tests/legacy/utf8_nonbmp.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | -------------------------------------------------------------------------------- 
/tests/legacy/warn.good: -------------------------------------------------------------------------------- 1 | foo 2 | foos 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | /result 3 | /.direnv/ 4 | -------------------------------------------------------------------------------- /tests/legacy/2970240.good: -------------------------------------------------------------------------------- 1 | firstmiddlelast 2 | -------------------------------------------------------------------------------- /tests/legacy/2970240.wrong: -------------------------------------------------------------------------------- 1 | lastmiddlefirst 2 | -------------------------------------------------------------------------------- /tests/legacy/2999225.good: -------------------------------------------------------------------------------- 1 | foobar 2 | foobaz 3 | -------------------------------------------------------------------------------- /tests/legacy/IJ.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ijs/i 3 | Ijs/* 4 | -------------------------------------------------------------------------------- /tests/legacy/allcaps2.sug: -------------------------------------------------------------------------------- 1 | iPod 2 | ipodos 3 | -------------------------------------------------------------------------------- /tests/legacy/allcaps2.wrong: -------------------------------------------------------------------------------- 1 | ipod 2 | iPodos 3 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundcase2.wrong: -------------------------------------------------------------------------------- 1 | áoóÓoá 2 | -------------------------------------------------------------------------------- /tests/legacy/checksharps.sug: -------------------------------------------------------------------------------- 1 | MÜSSIG, müßig 2 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule.good: -------------------------------------------------------------------------------- 1 | abc 2 | acc 3 | -------------------------------------------------------------------------------- /tests/legacy/condition_utf.dic: -------------------------------------------------------------------------------- 1 | 1 2 | óőó/SP 3 | -------------------------------------------------------------------------------- /tests/legacy/digits_in_words.wrong: -------------------------------------------------------------------------------- 1 | -jährig 2 | -------------------------------------------------------------------------------- /tests/legacy/encoding.aff: -------------------------------------------------------------------------------- 1 | SET ISO-8859-15 2 | -------------------------------------------------------------------------------- /tests/legacy/flaglong.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/zx09 3 | -------------------------------------------------------------------------------- /tests/legacy/flagnum.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/999,54321 3 | -------------------------------------------------------------------------------- /tests/legacy/i54633.good: -------------------------------------------------------------------------------- 1 | éditer 
2 | Éditer 3 | -------------------------------------------------------------------------------- /tests/legacy/i54633.sug: -------------------------------------------------------------------------------- 1 | éditer 2 | Éditer 3 | -------------------------------------------------------------------------------- /tests/legacy/i54633.wrong: -------------------------------------------------------------------------------- 1 | editer 2 | Editer 3 | -------------------------------------------------------------------------------- /tests/legacy/korean.good: -------------------------------------------------------------------------------- 1 | 들어오세요 2 | 안녕하세요 3 | -------------------------------------------------------------------------------- /tests/legacy/needaffix3.good: -------------------------------------------------------------------------------- 1 | foo 2 | foosbaz 3 | -------------------------------------------------------------------------------- /tests/legacy/needaffix5.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/AC 3 | -------------------------------------------------------------------------------- /tests/legacy/simplifiedtriple.wrong: -------------------------------------------------------------------------------- 1 | glasssko 2 | -------------------------------------------------------------------------------- /tests/legacy/utf8.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | foó/A 4 | -------------------------------------------------------------------------------- /tests/legacy/utf8_bom.dic: -------------------------------------------------------------------------------- 1 | 1 2 | apéritif 3 | -------------------------------------------------------------------------------- /tests/legacy/utf8_bom2.dic: -------------------------------------------------------------------------------- 1 | 1 2 | apéritif 3 | -------------------------------------------------------------------------------- /tests/legacy/warn.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/WA 3 | bar 4 | -------------------------------------------------------------------------------- /tests/legacy/1463589_utf.dic: -------------------------------------------------------------------------------- 1 | 1 2 | Kühlschrank 3 | -------------------------------------------------------------------------------- /tests/legacy/1592880.good: -------------------------------------------------------------------------------- 1 | weg 2 | wege 3 | wegen 4 | -------------------------------------------------------------------------------- /tests/legacy/1695964.dic: -------------------------------------------------------------------------------- 1 | 2 2 | Mull/he 3 | Mull/S 4 | -------------------------------------------------------------------------------- /tests/legacy/1695964.sug: -------------------------------------------------------------------------------- 1 | Mull 2 | Mulle 3 | Mulls 4 | -------------------------------------------------------------------------------- /tests/legacy/1975530.dic: -------------------------------------------------------------------------------- 1 | 2 2 | أرى/x 3 | أيار/x 4 | -------------------------------------------------------------------------------- /tests/legacy/1975530.good: -------------------------------------------------------------------------------- 1 | أرى 2 | أيار 3 | ترى 4 | 
-------------------------------------------------------------------------------- /tests/legacy/checkcompoundcaseutf.wrong: -------------------------------------------------------------------------------- 1 | áoóÓoá 2 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern2.wrong: -------------------------------------------------------------------------------- 1 | foobar 2 | -------------------------------------------------------------------------------- /tests/legacy/checksharpsutf.sug: -------------------------------------------------------------------------------- 1 | MÜSSIG, müßig 2 | -------------------------------------------------------------------------------- /tests/legacy/circumfix.dic: -------------------------------------------------------------------------------- 1 | 1 2 | nagy/C po:adj 3 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixesutf.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ⲟⲩⲣⲟ/B 3 | -------------------------------------------------------------------------------- /tests/legacy/dotless_i.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | LANG tr 3 | -------------------------------------------------------------------------------- /tests/legacy/forbiddenword.good: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | 4 | -------------------------------------------------------------------------------- /tests/legacy/forceucase.sug: -------------------------------------------------------------------------------- 1 | Foobaz 2 | Foobarbaz 3 | -------------------------------------------------------------------------------- /tests/legacy/korean.dic: -------------------------------------------------------------------------------- 1 | 2 2 | 들어오세요 3 | 안녕하세요 4 | -------------------------------------------------------------------------------- /tests/legacy/nosuggest.dic: -------------------------------------------------------------------------------- 1 | 1 2 | foo/AB 3 | bar/B 4 | -------------------------------------------------------------------------------- /tests/legacy/oconv.sug: -------------------------------------------------------------------------------- 1 | BÉBÉ 2 | DÁDÁ 3 | AÁBCDEÉ 4 | -------------------------------------------------------------------------------- /tests/legacy/oconv.wrong: -------------------------------------------------------------------------------- 1 | béb 2 | dád 3 | aábcde 4 | -------------------------------------------------------------------------------- /tests/legacy/slash.wrong: -------------------------------------------------------------------------------- 1 | \/usr\/share\/nuspell\/ 2 | -------------------------------------------------------------------------------- /tests/legacy/1695964.wrong: -------------------------------------------------------------------------------- 1 | Mall 2 | Malle 3 | Malls 4 | -------------------------------------------------------------------------------- /tests/legacy/breakoff.good: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | scot-free 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundcase2.good: -------------------------------------------------------------------------------- 1 | áoóáoó 2 | Óoááoó 3 | 
-------------------------------------------------------------------------------- /tests/legacy/circumfix.wrong: -------------------------------------------------------------------------------- 1 | legnagy 2 | legeslegnagy 3 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixes.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ouro/B 3 | 4 | -------------------------------------------------------------------------------- /tests/legacy/forceucase.wrong: -------------------------------------------------------------------------------- 1 | foobaz 2 | foobarbaz 3 | -------------------------------------------------------------------------------- /tests/legacy/ignore.dic: -------------------------------------------------------------------------------- 1 | 2 2 | xmpl 3 | expression/A 4 | -------------------------------------------------------------------------------- /tests/legacy/map.sug: -------------------------------------------------------------------------------- 1 | Frühstück 2 | tükörfúró 3 | groß 4 | -------------------------------------------------------------------------------- /tests/legacy/needaffix.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/YXA 3 | bar/Y 4 | -------------------------------------------------------------------------------- /tests/legacy/needaffix.good: -------------------------------------------------------------------------------- 1 | bar 2 | foos 3 | barfoos 4 | -------------------------------------------------------------------------------- /tests/legacy/nepali.wrong: -------------------------------------------------------------------------------- 1 | र्य 2 | क्‍यार 3 | अलम्‌ 4 | -------------------------------------------------------------------------------- /tests/legacy/ngram_utf_fix.wrong: -------------------------------------------------------------------------------- 1 | времячко 2 | человеко 3 | -------------------------------------------------------------------------------- /tests/legacy/nosuggest.good: -------------------------------------------------------------------------------- 1 | foo 2 | foobar 3 | barfoo 4 | -------------------------------------------------------------------------------- /tests/legacy/oconv.dic: -------------------------------------------------------------------------------- 1 | 3 2 | bébé 3 | dádá 4 | aábcdeé 5 | -------------------------------------------------------------------------------- /tests/legacy/onlyincompound.wrong: -------------------------------------------------------------------------------- 1 | pseudo 2 | pseudos 3 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat.good: -------------------------------------------------------------------------------- 1 | schoonheidssport 2 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat.wrong: -------------------------------------------------------------------------------- 1 | schoonheidsport 2 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat2.good: -------------------------------------------------------------------------------- 1 | zout-suikertest 2 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat2.wrong: 
-------------------------------------------------------------------------------- 1 | zoutsuikertest 2 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword1.sug: -------------------------------------------------------------------------------- 1 | barwordfoo 2 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword2.sug: -------------------------------------------------------------------------------- 1 | barwordfoo 2 | -------------------------------------------------------------------------------- /tests/legacy/utf8_bom.good: -------------------------------------------------------------------------------- 1 | apéritif 2 | APÉRITIF 3 | -------------------------------------------------------------------------------- /tests/legacy/utf8_bom2.good: -------------------------------------------------------------------------------- 1 | apéritif 2 | APÉRITIF 3 | -------------------------------------------------------------------------------- /tests/legacy/utf8_nonbmp.sug: -------------------------------------------------------------------------------- 1 | 𐏒𐏑, 𐏒𐏒 2 | 𐏒𐏑, 𐏒𐏒 3 | -------------------------------------------------------------------------------- /tests/legacy/utf8_nonbmp.wrong: -------------------------------------------------------------------------------- 1 | 𐏑𐏒𐏒 2 | 𐏑𐏒𐏒 3 | -------------------------------------------------------------------------------- /tests/legacy/zeroaffix.good: -------------------------------------------------------------------------------- 1 | bar 2 | foo 3 | barbaz 4 | -------------------------------------------------------------------------------- /tests/legacy/1592880.dic: -------------------------------------------------------------------------------- 1 | 3 2 | weg/Qoz 3 | weg/P 4 | wege 5 | -------------------------------------------------------------------------------- /tests/legacy/2970242.dic: -------------------------------------------------------------------------------- 1 | 3 2 | foo/ac 3 | bar/c 4 | baz/bc 5 | -------------------------------------------------------------------------------- /tests/legacy/2999225.dic: -------------------------------------------------------------------------------- 1 | 3 2 | foo/aA 3 | bar/b 4 | baz/B 5 | -------------------------------------------------------------------------------- /tests/legacy/affixes.dic: -------------------------------------------------------------------------------- 1 | 3 2 | hello 3 | try/B 4 | work/AB 5 | -------------------------------------------------------------------------------- /tests/legacy/alias.good: -------------------------------------------------------------------------------- 1 | foo 2 | foox 3 | fooy 4 | fooyx 5 | -------------------------------------------------------------------------------- /tests/legacy/alias2.good: -------------------------------------------------------------------------------- 1 | foo 2 | foox 3 | fooy 4 | fooyx 5 | -------------------------------------------------------------------------------- /tests/legacy/allcaps.dic: -------------------------------------------------------------------------------- 1 | 2 2 | OpenOffice.org 3 | UNICEF/S 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundcaseutf.dic: -------------------------------------------------------------------------------- 1 | 2 2 | áoó/A 3 | Óoá/A 4 | 
-------------------------------------------------------------------------------- /tests/legacy/checkcompoundcaseutf.good: -------------------------------------------------------------------------------- 1 | áoóáoó 2 | Óoááoó 3 | -------------------------------------------------------------------------------- /tests/legacy/checkcompounddup.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | bar/A 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundrep.good: -------------------------------------------------------------------------------- 1 | vízszer 2 | szerkocsi 3 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundtriple.wrong: -------------------------------------------------------------------------------- 1 | fooopera 2 | bareeel 3 | -------------------------------------------------------------------------------- /tests/legacy/colons_in_words.dic: -------------------------------------------------------------------------------- 1 | 2 2 | c:a 3 | S:t 4 | foo 5 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixes.wrong: -------------------------------------------------------------------------------- 1 | tekouro 2 | mettekouro 3 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/XPS 3 | bar/XPS 4 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix2.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/XPS 3 | bar/XPS 4 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix3.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/XPS 3 | bar/XPS 4 | -------------------------------------------------------------------------------- /tests/legacy/conditionalprefix.wrong: -------------------------------------------------------------------------------- 1 | undrink 2 | undrinks 3 | -------------------------------------------------------------------------------- /tests/legacy/dotless_i.dic: -------------------------------------------------------------------------------- 1 | 3 2 | iç 3 | ışık 4 | Diyarbakır 5 | -------------------------------------------------------------------------------- /tests/legacy/fogemorpheme.dic: -------------------------------------------------------------------------------- 1 | 2 2 | gata/A 3 | kontoret/X 4 | -------------------------------------------------------------------------------- /tests/legacy/i53643.wrong: -------------------------------------------------------------------------------- 1 | 1..2 2 | 1,,2 3 | 1.,2 4 | 1,.2 5 | -------------------------------------------------------------------------------- /tests/legacy/i54980.good: -------------------------------------------------------------------------------- 1 | cœur 2 | œuvre 3 | CŒUR 4 | ŒUVRE 5 | -------------------------------------------------------------------------------- /tests/legacy/i58202.dic: -------------------------------------------------------------------------------- 1 | 4 2 | foo 3 | bar 4 | Baz 5 | Boo 6 | -------------------------------------------------------------------------------- 
/tests/legacy/iconv.dic: -------------------------------------------------------------------------------- 1 | 4 2 | Chișinău 3 | Țepes 4 | ț 5 | Ș 6 | -------------------------------------------------------------------------------- /tests/legacy/iconv2.good: -------------------------------------------------------------------------------- 1 | GaNa 2 | Gag 3 | GaggNa 4 | NanDa 5 | -------------------------------------------------------------------------------- /tests/legacy/keepcase.good: -------------------------------------------------------------------------------- 1 | foo 2 | Bar 3 | baz. 4 | Quux. 5 | -------------------------------------------------------------------------------- /tests/legacy/map.wrong: -------------------------------------------------------------------------------- 1 | Fruhstuck 2 | tukorfuro 3 | gross 4 | -------------------------------------------------------------------------------- /tests/legacy/maputf.sug: -------------------------------------------------------------------------------- 1 | Frühstück 2 | tükörfúró 3 | groß 4 | -------------------------------------------------------------------------------- /tests/legacy/maputf.wrong: -------------------------------------------------------------------------------- 1 | Fruhstuck 2 | tukorfuro 3 | gross 4 | -------------------------------------------------------------------------------- /tests/legacy/needaffix2.aff: -------------------------------------------------------------------------------- 1 | NEEDAFFIX X 2 | COMPOUNDFLAG Y 3 | -------------------------------------------------------------------------------- /tests/legacy/needaffix4.aff: -------------------------------------------------------------------------------- 1 | NEEDAFFIX X 2 | COMPOUNDFLAG Y 3 | -------------------------------------------------------------------------------- /tests/legacy/nepali.good: -------------------------------------------------------------------------------- 1 | न्न 2 | न्‌न 3 | अलम्‍ 4 | र्‌य 5 | -------------------------------------------------------------------------------- /tests/legacy/ngram_utf_fix.dic: -------------------------------------------------------------------------------- 1 | 1 2 | человек/2022,2000,101 3 | -------------------------------------------------------------------------------- /tests/legacy/nosuggest.wrong: -------------------------------------------------------------------------------- 1 | foox 2 | foobarx 3 | barfoox 4 | -------------------------------------------------------------------------------- /tests/legacy/onlyincompound.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | pseudo/OAB 4 | -------------------------------------------------------------------------------- /tests/legacy/onlyincompound2.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | pseudo/AB 4 | -------------------------------------------------------------------------------- /tests/legacy/simplifiedtriple.dic: -------------------------------------------------------------------------------- 1 | 2 2 | glass/A 3 | sko/A 4 | -------------------------------------------------------------------------------- /tests/legacy/2970240.dic: -------------------------------------------------------------------------------- 1 | 3 2 | first/c 3 | middle/c 4 | last/c 5 | -------------------------------------------------------------------------------- /tests/legacy/allcaps.sug: 
-------------------------------------------------------------------------------- 1 | OpenOffice.org 2 | UNICEF 3 | UNICEF's 4 | -------------------------------------------------------------------------------- /tests/legacy/allcaps2.dic: -------------------------------------------------------------------------------- 1 | 3 2 | iPod/s 3 | iPodos/* 4 | ipodos 5 | -------------------------------------------------------------------------------- /tests/legacy/allcaps2.good: -------------------------------------------------------------------------------- 1 | iPod 2 | IPOD 3 | ipodos 4 | IPODOS 5 | -------------------------------------------------------------------------------- /tests/legacy/allcaps_utf.dic: -------------------------------------------------------------------------------- 1 | 2 2 | OpenOffice.org 3 | UNICEF/S 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern2.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/A 3 | bar/A 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern2.good: -------------------------------------------------------------------------------- 1 | barfoo 2 | fozar 3 | fur 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern4.good: -------------------------------------------------------------------------------- 1 | sUryOdayaM 2 | pErunna 3 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixes2.dic: -------------------------------------------------------------------------------- 1 | 1 2 | ouro/BC [stem_1] 3 | 4 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixesutf.wrong: -------------------------------------------------------------------------------- 1 | ⲧⲉⲕⲟⲩⲣⲟ 2 | ⲙⲉⲧⲧⲉⲕⲟⲩⲣⲟ 3 | -------------------------------------------------------------------------------- /tests/legacy/compoundflag.good: -------------------------------------------------------------------------------- 1 | foobar 2 | barfoo 3 | foobarfoo 4 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule.dic: -------------------------------------------------------------------------------- 1 | 3 2 | a/A 3 | b/B 4 | c/BC 5 | 6 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule2.dic: -------------------------------------------------------------------------------- 1 | 3 2 | a/A 3 | b/B 4 | c/C 5 | 6 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule3.dic: -------------------------------------------------------------------------------- 1 | 3 2 | a/A 3 | b/B 4 | c/C 5 | 6 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule6.dic: -------------------------------------------------------------------------------- 1 | 3 2 | a/A 3 | b/B 4 | c/C 5 | 6 | -------------------------------------------------------------------------------- /tests/legacy/encoding.good: -------------------------------------------------------------------------------- 1 | cœur 2 | œuvre 3 | CŒUR 4 | ŒUVRE 5 | -------------------------------------------------------------------------------- /tests/legacy/forceucase.dic: 
-------------------------------------------------------------------------------- 1 | 3 2 | foo/C 3 | bar/C 4 | baz/CA 5 | -------------------------------------------------------------------------------- /tests/legacy/fullstrip.dic: -------------------------------------------------------------------------------- 1 | 2 2 | andare/A 3 | riandare/A 4 | 5 | -------------------------------------------------------------------------------- /tests/legacy/iconv2.dic: -------------------------------------------------------------------------------- 1 | 4 2 | GAG 3 | GAGGNA 4 | GANA 5 | NANDA 6 | -------------------------------------------------------------------------------- /tests/legacy/maputf.dic: -------------------------------------------------------------------------------- 1 | 3 2 | Frühstück 3 | tükörfúró 4 | groß 5 | -------------------------------------------------------------------------------- /tests/legacy/nepali.dic: -------------------------------------------------------------------------------- 1 | 4 2 | अलम् 3 | क्यार 4 | न्न 5 | र्‌य 6 | -------------------------------------------------------------------------------- /tests/legacy/simplifiedtriple.good: -------------------------------------------------------------------------------- 1 | glass 2 | sko 3 | glassko 4 | -------------------------------------------------------------------------------- /tests/legacy/zeroaffix.dic: -------------------------------------------------------------------------------- 1 | 2 2 | foo/XA Baz 4 | TRY B 5 | -------------------------------------------------------------------------------- /tests/legacy/ignore.good: -------------------------------------------------------------------------------- 1 | example 2 | expression 3 | xmpl 4 | xprssn 5 | reexpression 6 | rxprssn 7 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_keepcase.dic: -------------------------------------------------------------------------------- 1 | 5 2 | tv-/KB 3 | -tv/KE 4 | word/C 5 | NATO-/B 6 | -NATO/E 7 | 8 | -------------------------------------------------------------------------------- /tests/legacy/phone.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/phone.aff -------------------------------------------------------------------------------- /tests/legacy/slash.aff: -------------------------------------------------------------------------------- 1 | # slashes in words (\/) 2 | 3 | # (only for tokenization) 4 | WORDCHARS /: 5 | -------------------------------------------------------------------------------- /tests/legacy/utfcompound.wrong: -------------------------------------------------------------------------------- 1 | xyyz 2 | fooxy 3 | xyfoo 4 | fooxybar 5 | ééőő 6 | fóóéé 7 | őőáár 8 | -------------------------------------------------------------------------------- /tests/legacy/1463589.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/1463589.dic -------------------------------------------------------------------------------- /tests/legacy/1463589.sug: -------------------------------------------------------------------------------- 1 | Kühlschrank 2 | Kühlschrank 3 | Kühlschrank 4 | Kühlschrank 5 | Kühlschrank 6 | -------------------------------------------------------------------------------- /tests/legacy/1463589.wrong: 
-------------------------------------------------------------------------------- 1 | kuhlschrank 2 | kuehlschrank 3 | kühlschrank 4 | Kuhlschrank 5 | Kuehlschrank 6 | -------------------------------------------------------------------------------- /tests/legacy/1463589_utf.sug: -------------------------------------------------------------------------------- 1 | Kühlschrank 2 | Kühlschrank 3 | Kühlschrank 4 | Kühlschrank 5 | Kühlschrank 6 | -------------------------------------------------------------------------------- /tests/legacy/2970242.aff: -------------------------------------------------------------------------------- 1 | CHECKCOMPOUNDPATTERN 1 2 | CHECKCOMPOUNDPATTERN /a /b 3 | COMPOUNDFLAG c 4 | 5 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule2.wrong: -------------------------------------------------------------------------------- 1 | ba 2 | aaabaaa 3 | bbaaa 4 | aaaaba 5 | bbbbbaa 6 | cba 7 | cab 8 | acb 9 | -------------------------------------------------------------------------------- /tests/legacy/dotless_i.good: -------------------------------------------------------------------------------- 1 | Diyarbakır 2 | DİYARBAKIR 3 | iç 4 | İç 5 | ışık 6 | Işık 7 | İÇ 8 | IŞIK 9 | -------------------------------------------------------------------------------- /tests/legacy/encoding.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/encoding.dic -------------------------------------------------------------------------------- /tests/legacy/forceucase.aff: -------------------------------------------------------------------------------- 1 | # force capitalized compound 2 | TRY F 3 | FORCEUCASE A 4 | COMPOUNDFLAG C 5 | -------------------------------------------------------------------------------- /tests/legacy/i54633.aff: -------------------------------------------------------------------------------- 1 | # Missing capitalized suggestion for capitalized bad words 2 | SET ISO8859-1 3 | -------------------------------------------------------------------------------- /tests/legacy/i54633.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/i54633.dic -------------------------------------------------------------------------------- /tests/legacy/i54980.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/i54980.dic -------------------------------------------------------------------------------- /tests/legacy/needaffix2.dic: -------------------------------------------------------------------------------- 1 | 4 2 | foo st:foo id:1 3 | foo/YX st:foo id:2 4 | foo/Y st:foo id:3 5 | bar/Y 6 | -------------------------------------------------------------------------------- /tests/legacy/1463589_utf.wrong: -------------------------------------------------------------------------------- 1 | kuhlschrank 2 | kuehlschrank 3 | kühlschrank 4 | Kuhlschrank 5 | Kuehlschrank 6 | -------------------------------------------------------------------------------- /tests/legacy/2999225.aff: -------------------------------------------------------------------------------- 1 | COMPOUNDRULE 1 2 | COMPOUNDRULE ab 3 | 4 | COMPOUNDBEGIN A 5 | COMPOUNDEND B 6 | 7 | 
-------------------------------------------------------------------------------- /tests/legacy/arabic.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | TRY أ 3 | IGNORE ٌٍَُِّْ 4 | 5 | PFX Aa Y 1 6 | PFX Aa 0 0/X0 أ[^ي] 7 | -------------------------------------------------------------------------------- /tests/legacy/breakdefault.good: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | foo- 4 | -foo 5 | scot-free 6 | foo-bar 7 | foo-bar-foo-bar 8 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundtriple.good: -------------------------------------------------------------------------------- 1 | operafoo 2 | operaeel 3 | operabare 4 | eelbare 5 | eelfoo 6 | eelopera 7 | -------------------------------------------------------------------------------- /tests/legacy/condition.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/condition.aff -------------------------------------------------------------------------------- /tests/legacy/condition.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/condition.dic -------------------------------------------------------------------------------- /tests/legacy/flag.good: -------------------------------------------------------------------------------- 1 | foo 2 | foos 3 | foosbar 4 | foosbaz 5 | unfoo 6 | unfoos 7 | unfoosbar 8 | unfoosbaz 9 | -------------------------------------------------------------------------------- /tests/legacy/utf8.good: -------------------------------------------------------------------------------- 1 | foo 2 | foó 3 | fooőő 4 | fooő 5 | foóő 6 | foő 7 | foőo 8 | foőoo 9 | foóó 10 | -------------------------------------------------------------------------------- /tests/legacy/utf8_nonbmp.dic: -------------------------------------------------------------------------------- 1 | 4 # Old Persian numbers (1-4), source: Wikipedia 2 | 𐏑 3 | 𐏒 4 | 𐏒𐏑 5 | 𐏒𐏒 6 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern.wrong: -------------------------------------------------------------------------------- 1 | könnynyelés 2 | hosszszámítás 3 | hosszkönnynyelés 4 | könnynyeléshossz 5 | -------------------------------------------------------------------------------- /tests/legacy/checksharps.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checksharps.aff -------------------------------------------------------------------------------- /tests/legacy/checksharps.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checksharps.dic -------------------------------------------------------------------------------- /tests/legacy/compoundrule6.aff: -------------------------------------------------------------------------------- 1 | COMPOUNDMIN 1 2 | COMPOUNDRULE 2 3 | COMPOUNDRULE A*A 4 | COMPOUNDRULE A*AAB*BBBC*C 5 | -------------------------------------------------------------------------------- /tests/legacy/conditionalprefix.good: 
-------------------------------------------------------------------------------- 1 | drink 2 | drinks 3 | drinkable 4 | drinkables 5 | undrinkable 6 | undrinkables 7 | -------------------------------------------------------------------------------- /tests/legacy/flaglong.good: -------------------------------------------------------------------------------- 1 | foo 2 | foos 3 | foosbar 4 | foosbaz 5 | unfoo 6 | unfoos 7 | unfoosbar 8 | unfoosbaz 9 | -------------------------------------------------------------------------------- /tests/legacy/flagnum.good: -------------------------------------------------------------------------------- 1 | foo 2 | foos 3 | foosbar 4 | foosbaz 5 | unfoo 6 | unfoos 7 | unfoosbar 8 | unfoosbaz 9 | -------------------------------------------------------------------------------- /tests/legacy/flagutf8.good: -------------------------------------------------------------------------------- 1 | foo 2 | foos 3 | foosbar 4 | foosbaz 5 | unfoo 6 | unfoos 7 | unfoosbar 8 | unfoosbaz 9 | -------------------------------------------------------------------------------- /tests/legacy/i58202.good: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | Foo 4 | Bar 5 | Baz 6 | Boo 7 | FOO 8 | BAR 9 | BAZ 10 | BOO 11 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword1.wrong: -------------------------------------------------------------------------------- 1 | foowordbar 2 | foowordbars 3 | foowordba 4 | foowordbas 5 | barwodfoo 6 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword2.wrong: -------------------------------------------------------------------------------- 1 | foowordbar 2 | foowordbars 3 | foowordba 4 | foowordbas 5 | barwodfoo 6 | -------------------------------------------------------------------------------- /tests/legacy/checkcompounddup.aff: -------------------------------------------------------------------------------- 1 | # Forbid compound word with triple letters 2 | CHECKCOMPOUNDDUP 3 | COMPOUNDFLAG A 4 | -------------------------------------------------------------------------------- /tests/legacy/checksharpsutf.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checksharpsutf.aff -------------------------------------------------------------------------------- /tests/legacy/i54980.aff: -------------------------------------------------------------------------------- 1 | # ISO-8859-15 (extended latin-1) support for French, Finnish and EURO symbol 2 | SET ISO8859-15 3 | -------------------------------------------------------------------------------- /tests/legacy/rep.sug: -------------------------------------------------------------------------------- 1 | form 2 | phantom 3 | vacation 4 | a lot, lot 5 | un alunno 6 | bar 7 | vinte e un 8 | auto's, auto 9 | -------------------------------------------------------------------------------- /tests/legacy/utfcompound.dic: -------------------------------------------------------------------------------- 1 | 8 2 | foo/A 3 | bar/A 4 | fóó/A 5 | áár/A 6 | xy/A 7 | yz/A 8 | éé/A 9 | őő/A 10 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundcase2.dic: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checkcompoundcase2.dic -------------------------------------------------------------------------------- /tests/legacy/checkcompoundrep.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checkcompoundrep.aff -------------------------------------------------------------------------------- /tests/legacy/checkcompoundrep.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checkcompoundrep.dic -------------------------------------------------------------------------------- /tests/legacy/checkcompoundtriple.aff: -------------------------------------------------------------------------------- 1 | # Forbid compound word with triple letters 2 | CHECKCOMPOUNDTRIPLE 3 | COMPOUNDFLAG A 4 | -------------------------------------------------------------------------------- /tests/legacy/checksharpsutf.dic: -------------------------------------------------------------------------------- 1 | 6 2 | müßig/k 3 | Ausstoß 4 | Abstoß. 5 | Außenabmessung 6 | Prozessionsstraße 7 | Außenmaße 8 | -------------------------------------------------------------------------------- /tests/legacy/fullstrip.good: -------------------------------------------------------------------------------- 1 | andare 2 | vado 3 | va 4 | andiamo 5 | riandare 6 | rivado 7 | riva 8 | riandiamo 9 | 10 | -------------------------------------------------------------------------------- /tests/legacy/germancompounding.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/germancompounding.aff -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/checkcompoundpattern.dic -------------------------------------------------------------------------------- /tests/legacy/compoundaffix3.wrong: -------------------------------------------------------------------------------- 1 | prefoobarsuf 2 | foosufbar 3 | fooprebar 4 | foosufprebar 5 | fooprebarsuf 6 | prefooprebarsuf 7 | -------------------------------------------------------------------------------- /tests/legacy/forbiddenword.dic: -------------------------------------------------------------------------------- 1 | 5 2 | foo/S [1] 3 | foo/YX [2] 4 | foo/Y [3] 5 | foo/S [4] 6 | bar/YS [5] 7 | bars/X 8 | foos/X 9 | -------------------------------------------------------------------------------- /tests/legacy/germancompoundingold.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helix-editor/spellbook/HEAD/tests/legacy/germancompoundingold.aff -------------------------------------------------------------------------------- /tests/legacy/ignoreutf.good: -------------------------------------------------------------------------------- 1 | طير 2 | فتحة 3 | ضمة 4 | كسرة 5 | فتحتان 6 | ضمتان 7 | كسرتان 8 | شدة 9 | سكون 10 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword2.aff: -------------------------------------------------------------------------------- 1 | 
TRY r 2 | 3 | FORBIDDENWORD F 4 | COMPOUNDFLAG W 5 | 6 | SFX S Y 1 7 | SFX S 0 s . 8 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundcase.aff: -------------------------------------------------------------------------------- 1 | # forbid upper case letters at word bounds in compounding 2 | CHECKCOMPOUNDCASE 3 | COMPOUNDFLAG A 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern3.wrong: -------------------------------------------------------------------------------- 1 | booban 2 | boobanfoo 3 | fozar 4 | fozarfoo 5 | fozan 6 | fozanfoo 7 | bozar 8 | bozarfoo 9 | -------------------------------------------------------------------------------- /tests/legacy/ignoreutf.dic: -------------------------------------------------------------------------------- 1 | 9 2 | طِير 3 | فَتحة 4 | ضُمة 5 | كِسرة 6 | فتحًتان 7 | ضمتانٌ 8 | كسرتاٍن 9 | شدّة 10 | سكوْن 11 | -------------------------------------------------------------------------------- /tests/legacy/2970240.aff: -------------------------------------------------------------------------------- 1 | # test words with three parts 2 | CHECKCOMPOUNDPATTERN 1 3 | CHECKCOMPOUNDPATTERN le fi 4 | COMPOUNDFLAG c 5 | 6 | -------------------------------------------------------------------------------- /tests/legacy/breakdefault.aff: -------------------------------------------------------------------------------- 1 | # default word break at hyphens and n-dashes 2 | 3 | SET UTF-8 4 | MAXNGRAMSUGS 0 5 | WORDCHARS - 6 | TRY ot 7 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_keepcase.aff: -------------------------------------------------------------------------------- 1 | KEEPCASE K 2 | COMPOUNDBEGIN B 3 | COMPOUNDEND E 4 | COMPOUNDFLAG C 5 | COMPOUNDMIN 1 6 | WORDCHARS - 7 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_keepcase.wrong: -------------------------------------------------------------------------------- 1 | TV-word 2 | Tv-word 3 | word-TV 4 | word-Tv 5 | wordword-TV 6 | TV-word-TV 7 | Nato-word 8 | word-nato 9 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix2.good: -------------------------------------------------------------------------------- 1 | foo 2 | prefoo 3 | foosuf 4 | prefoosuf 5 | prefoobarsuf 6 | foosufbar 7 | fooprebarsuf 8 | prefooprebarsuf 9 | -------------------------------------------------------------------------------- /tests/legacy/digits_in_words.dic: -------------------------------------------------------------------------------- 1 | 11 2 | 0/a 3 | 1/a 4 | 2/a 5 | 3/a 6 | 4/a 7 | 5/a 8 | 6/a 9 | 7/a 10 | 8/a 11 | 9/a 12 | -jährig/bc 13 | -------------------------------------------------------------------------------- /tests/legacy/hu.good: -------------------------------------------------------------------------------- 1 | majomkenyér 2 | majomkenyérfa 3 | majomkenyérfaág 4 | majomkenyérvirág 5 | kenyérfavirág 6 | sárkányfogvetemény 7 | kortárs 8 | -------------------------------------------------------------------------------- /tests/legacy/ignore.aff: -------------------------------------------------------------------------------- 1 | # ignore characters in words (for Arabic Harakat or Hebrew niqqud) 2 | IGNORE aeiou 3 | 4 | PFX A Y 1 5 | PFX A 0 re . 
6 | -------------------------------------------------------------------------------- /tests/legacy/needaffix3.aff: -------------------------------------------------------------------------------- 1 | # neeadaffix on affixes 2 | NEEDAFFIX X 3 | 4 | SFX A Y 1 5 | SFX A 0 s/XB . 6 | 7 | SFX B Y 1 8 | SFX B 0 baz . 9 | -------------------------------------------------------------------------------- /tests/legacy/sugutf.dic: -------------------------------------------------------------------------------- 1 | 10 2 | NASA 3 | Gandhi 4 | grateful 5 | permanent 6 | vacation 7 | a 8 | lot 9 | have 10 | which 11 | McDonald 12 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | watch_file flake.lock 2 | 3 | # try to use flakes, if it fails use normal nix (ie. shell.nix) 4 | use flake || use nix 5 | eval "$shellHook" 6 | -------------------------------------------------------------------------------- /tests/legacy/1463589.aff: -------------------------------------------------------------------------------- 1 | # capitalized ngram suggestion test data for 2 | # Sf.net Bug ID 1463589, reported by Frederik Fouvry. 3 | MAXNGRAMSUGS 1 4 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern3.good: -------------------------------------------------------------------------------- 1 | bozan 2 | barfoo 3 | banfoo 4 | banbar 5 | foobar 6 | fooban 7 | foobanbar 8 | boobar 9 | boobarfoo 10 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix.aff: -------------------------------------------------------------------------------- 1 | COMPOUNDFLAG X 2 | 3 | PFX P Y 1 4 | PFX P 0 pre . 5 | 6 | SFX S Y 1 7 | SFX S 0 suf . 8 | -------------------------------------------------------------------------------- /tests/legacy/allcaps3.aff: -------------------------------------------------------------------------------- 1 | # homonym support 2 | WORDCHARS ' 3 | 4 | SFX s N 1 5 | SFX s 0 s . 6 | 7 | SFX S N 1 8 | SFX S 0 's . 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/legacy/i68568.aff: -------------------------------------------------------------------------------- 1 | # Sant'Elia -> SANT'ELIA (Italian) 2 | # OpenOffice.org Issue 68658 3 | 4 | PFX a Y 1 5 | PFX a 0 Sant' E 6 | 7 | WORDCHARS ' 8 | -------------------------------------------------------------------------------- /tests/legacy/IJ.aff: -------------------------------------------------------------------------------- 1 | # check bad capitalisation of Dutch letter IJ. 2 | TRY i 3 | FORBIDDENWORD * 4 | PFX i N 1 5 | PFX i ij IJ ij 6 | 7 | REP 1 8 | REP ij IJ 9 | -------------------------------------------------------------------------------- /tests/legacy/base.wrong: -------------------------------------------------------------------------------- 1 | loooked 2 | texxt 3 | hlelo 4 | seid 5 | rottenday 6 | tomorow 7 | seeeven 8 | Nasa 9 | horrorfying 10 | peech 11 | sugesst 12 | -------------------------------------------------------------------------------- /tests/legacy/colons_in_words.aff: -------------------------------------------------------------------------------- 1 | # Colons in Finnish and Swedish words. Problem reported by Lars Aronsson. 
2 | # Parsing test (src/parsers) 3 | WORDCHARS : 4 | -------------------------------------------------------------------------------- /tests/legacy/onlyincompound.aff: -------------------------------------------------------------------------------- 1 | # words only in compounds (see also fogemorpheme example) 2 | ONLYINCOMPOUND O 3 | COMPOUNDFLAG A 4 | SFX B Y 1 5 | SFX B 0 s . 6 | -------------------------------------------------------------------------------- /tests/legacy/rep.wrong: -------------------------------------------------------------------------------- 1 | phorm 2 | fantom 3 | vacashun 4 | vacashuns 5 | alot 6 | un'alunno 7 | foo 8 | foobars 9 | barfoos 10 | vinteún 11 | autos 12 | -------------------------------------------------------------------------------- /tests/legacy/allcaps.aff: -------------------------------------------------------------------------------- 1 | # check uppercase forms of allcaps word + affix and words with mixed casing 2 | WORDCHARS '. 3 | 4 | SFX S N 1 5 | SFX S 0 's . 6 | -------------------------------------------------------------------------------- /tests/legacy/i35725.wrong: -------------------------------------------------------------------------------- 1 | permenant 2 | pernament 3 | pernemant 4 | Permenant 5 | Pernament 6 | Pernemant 7 | unesco 8 | Unesco 9 | unesco's 10 | Unesco's 11 | -------------------------------------------------------------------------------- /tests/legacy/affixes.aff: -------------------------------------------------------------------------------- 1 | # simple example for affix compression (see Hunspell(4)) 2 | PFX A Y 1 3 | PFX A 0 re . 4 | 5 | SFX B Y 2 6 | SFX B 0 ed [^y] 7 | SFX B y ied y 8 | -------------------------------------------------------------------------------- /tests/legacy/breakoff.aff: -------------------------------------------------------------------------------- 1 | # switch off default word break at hyphens and n-dashes by BREAK 0 2 | SET UTF-8 3 | MAXNGRAMSUGS 0 4 | WORDCHARS - 5 | TRY ot 6 | 7 | BREAK 0 8 | -------------------------------------------------------------------------------- /tests/legacy/i58202.wrong: -------------------------------------------------------------------------------- 1 | fOO 2 | BAr 3 | baz 4 | BOo 5 | foobar 6 | fooBar 7 | Foobar 8 | FooBar 9 | fooBaz 10 | FooBaz 11 | Bazfoo 12 | BazFoo 13 | BazBoo 14 | -------------------------------------------------------------------------------- /tests/legacy/i68568utf.aff: -------------------------------------------------------------------------------- 1 | # Sant'Elia -> SANT'ELIA (Italian) 2 | # OpenOffice.org Issue 68658 3 | SET UTF-8 4 | 5 | PFX a Y 1 6 | PFX a 0 Foó' B 7 | 8 | WORDCHARS ' 9 | -------------------------------------------------------------------------------- /tests/legacy/sug.dic: -------------------------------------------------------------------------------- 1 | 1 2 | NASA 3 | Gandhi 4 | grateful 5 | permanent 6 | vacation 7 | a 8 | lot 9 | have 10 | which 11 | McDonald 12 | permanent-vacation/? 13 | -------------------------------------------------------------------------------- /tests/legacy/allcaps2.aff: -------------------------------------------------------------------------------- 1 | # forbidden all caps words are case sensitive 2 | # iPod -> ipodos ("iPodic" in Hungarian) 3 | FORBIDDENWORD * 4 | SFX s N 1 5 | SFX s 0 os . 
6 | 7 | -------------------------------------------------------------------------------- /tests/legacy/rep.dic: -------------------------------------------------------------------------------- 1 | 10 2 | form 3 | phantom 4 | vacation 5 | vacations 6 | a 7 | lot 8 | un 9 | alunno 10 | bar 11 | barbars 12 | vinte 13 | e 14 | un 15 | auto/A 16 | -------------------------------------------------------------------------------- /tests/legacy/allcaps3.good: -------------------------------------------------------------------------------- 1 | UNESCO 2 | Unesco 3 | UNESCO's 4 | Unesco's 5 | UNESCO'S 6 | NASA 7 | Nasa 8 | NASA's 9 | Nasa's 10 | NASA'S 11 | ACTS 12 | acts 13 | Acts 14 | -------------------------------------------------------------------------------- /tests/legacy/base.sug: -------------------------------------------------------------------------------- 1 | looked, look 2 | text 3 | hello 4 | said 5 | rotten day, rotten-day, rotten 6 | tomorrow 7 | seven 8 | NASA 9 | horrifying 10 | speech 11 | suggest 12 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix2.aff: -------------------------------------------------------------------------------- 1 | COMPOUNDFLAG X 2 | COMPOUNDPERMITFLAG Y 3 | 4 | PFX P Y 1 5 | PFX P 0 pre/Y . 6 | 7 | SFX S Y 1 8 | SFX S 0 suf/Y . 9 | -------------------------------------------------------------------------------- /tests/legacy/compoundaffix3.aff: -------------------------------------------------------------------------------- 1 | COMPOUNDFLAG X 2 | COMPOUNDFORBIDFLAG Z 3 | 4 | PFX P Y 1 5 | PFX P 0 pre/Z . 6 | 7 | SFX S Y 1 8 | SFX S 0 suf/Z . 9 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule5.aff: -------------------------------------------------------------------------------- 1 | # number + percent 2 | SET UTF-8 3 | COMPOUNDMIN 1 4 | COMPOUNDRULE 2 5 | COMPOUNDRULE N*%? 6 | COMPOUNDRULE NN*.NN*%? 7 | WORDCHARS 0123456789‰. 8 | -------------------------------------------------------------------------------- /tests/legacy/i58202.sug: -------------------------------------------------------------------------------- 1 | foo, Boo 2 | Bar 3 | Baz 4 | Boo 5 | foo bar 6 | foo Bar 7 | Foo bar 8 | Foo Bar 9 | foo Baz 10 | Foo Baz 11 | Baz foo 12 | Baz Foo 13 | Baz Boo 14 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_forbiddenword1.aff: -------------------------------------------------------------------------------- 1 | TRY r 2 | 3 | FORBIDDENWORD F 4 | COMPOUNDRULE 2 5 | COMPOUNDRULE WW 6 | COMPOUNDRULE WWW 7 | 8 | SFX S Y 1 9 | SFX S 0 s . 10 | -------------------------------------------------------------------------------- /tests/legacy/1463589_utf.aff: -------------------------------------------------------------------------------- 1 | # capitalized ngram suggestion test data (Unicode version) for 2 | # Sf.net Bug ID 1463589, reported by Frederik Fouvry. 
3 | SET UTF-8 4 | MAXNGRAMSUGS 1 5 | -------------------------------------------------------------------------------- /tests/legacy/base_utf.wrong: -------------------------------------------------------------------------------- 1 | loooked 2 | texxt 3 | hlelo 4 | seid 5 | rottenday 6 | tomorow 7 | seeeven 8 | Nasa 9 | horrorfying 10 | peech 11 | sugesst 12 | İmply 13 | İMPLY 14 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule4.aff: -------------------------------------------------------------------------------- 1 | # English ordinal numbers 2 | WORDCHARS 0123456789 3 | COMPOUNDMIN 1 4 | ONLYINCOMPOUND c 5 | COMPOUNDRULE 2 6 | COMPOUNDRULE n*1t 7 | COMPOUNDRULE n*mp 8 | -------------------------------------------------------------------------------- /tests/legacy/utf8.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | 3 | SFX A Y 7 4 | SFX A 0 őő . 5 | SFX A 0 ő o 6 | SFX A 0 ő ó 7 | SFX A ó ő ó 8 | SFX A ó őoo ó 9 | SFX A o őo o 10 | SFX A 0 ó [abcdó] 11 | -------------------------------------------------------------------------------- /tests/legacy/phone.dic: -------------------------------------------------------------------------------- 1 | 10 2 | Brasilia 3 | brassily 4 | Brazilian 5 | brilliance 6 | brilliancy 7 | brilliant 8 | brain 9 | brass 10 | Churchillian 11 | xxxxxxxxxx ph:Brasilia 12 | -------------------------------------------------------------------------------- /tests/legacy/sugutf.sug: -------------------------------------------------------------------------------- 1 | NASA 2 | Gandhi 3 | grateful 4 | permanent 5 | vacation 6 | a lot, lot 7 | permanent. Vacation 8 | have 9 | which 10 | Gandhi 11 | McDonald 12 | permanent 13 | -------------------------------------------------------------------------------- /tests/legacy/sugutf.wrong: -------------------------------------------------------------------------------- 1 | nasa 2 | Ghandi 3 | greatful 4 | permenant 5 | vacacation 6 | alot 7 | permanent.Vacation 8 | ahev 9 | hwihc 10 | GAndhi 11 | Mcdonald 12 | permqnent 13 | -------------------------------------------------------------------------------- /tests/legacy/alias.aff: -------------------------------------------------------------------------------- 1 | # aliases for flag vectors (AF) 2 | # AB -> 1 3 | # A -> 2 4 | AF 2 5 | AF AB 6 | AF A 7 | 8 | SFX A Y 1 9 | SFX A 0 x . 10 | 11 | SFX B Y 1 12 | SFX B 0 y/2 . 
13 | -------------------------------------------------------------------------------- /tests/legacy/break.good: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | fox-bax 4 | foo-bar 5 | foo–bar 6 | foo-bar-foo-bar 7 | foo-bar–foo-bar 8 | bar-baz 9 | baz-foo 10 | foo-bar-foo-bar-foo-bar-foo-bar-foo-bar 11 | -------------------------------------------------------------------------------- /tests/legacy/iconv2.aff: -------------------------------------------------------------------------------- 1 | # The longer input pattern should be used if matched 2 | ICONV 6 3 | ICONV Da DA 4 | ICONV Ga GA 5 | ICONV Gag GAG 6 | ICONV Gagg GAGG 7 | ICONV Na NA 8 | ICONV Nan NAN 9 | -------------------------------------------------------------------------------- /tests/legacy/oconv.aff: -------------------------------------------------------------------------------- 1 | # output conversion 2 | SET UTF-8 3 | 4 | OCONV 7 5 | OCONV a A 6 | OCONV á Á 7 | OCONV b B 8 | OCONV c C 9 | OCONV d D 10 | OCONV e E 11 | OCONV é É 12 | 13 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_keepcase.sug: -------------------------------------------------------------------------------- 1 | Tv-word, Tv- word, Word 2 | Tv- word, Word 3 | word -tv, word-tv, word 4 | word -tv, word-tv, word 5 | wordword-tv, word 6 | Tv-word-tv 7 | NATO- 8 | -NATO 9 | -------------------------------------------------------------------------------- /tests/legacy/simplifiedtriple.aff: -------------------------------------------------------------------------------- 1 | # Forbid compound word with triple letters 2 | CHECKCOMPOUNDTRIPLE 3 | # Allow simplified forms 4 | SIMPLIFIEDTRIPLE 5 | 6 | COMPOUNDMIN 2 7 | 8 | COMPOUNDFLAG A 9 | -------------------------------------------------------------------------------- /tests/legacy/sug.sug: -------------------------------------------------------------------------------- 1 | NASA 2 | Gandhi 3 | grateful 4 | permanent 5 | vacation 6 | a lot, lot 7 | permanent. Vacation 8 | have 9 | which 10 | Gandhi 11 | McDonald 12 | permanent 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern.aff: -------------------------------------------------------------------------------- 1 | # forbid compounds with spec. 
pattern at word bounds 2 | COMPOUNDFLAG A 3 | CHECKCOMPOUNDPATTERN 2 4 | CHECKCOMPOUNDPATTERN nny ny 5 | CHECKCOMPOUNDPATTERN ssz sz 6 | -------------------------------------------------------------------------------- /tests/legacy/nepali.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | IGNORE ￰ 3 | WORDCHARS ःािीॉॊोौॎॏॕॖॗ‌‍ 4 | 5 | 6 | ICONV 5 7 | ICONV ‌_ ‌ 8 | ICONV र्‌य र्‌य 9 | ICONV र्‌व र्‌व 10 | ICONV ‌ ￰ 11 | ICONV ‍_ ￰ 12 | 13 | -------------------------------------------------------------------------------- /tests/legacy/base_utf.sug: -------------------------------------------------------------------------------- 1 | looked, look 2 | text 3 | hello 4 | said 5 | rotten day, rotten-day, rotten 6 | tomorrow 7 | seven 8 | NASA 9 | horrifying 10 | speech 11 | suggest 12 | Imply 13 | IMPLY 14 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixes.aff: -------------------------------------------------------------------------------- 1 | # set twofold prefix stripping 2 | # Coptic example by Moheb Mekhaiel 3 | COMPLEXPREFIXES 4 | 5 | PFX A Y 1 6 | PFX A 0 tek . 7 | 8 | PFX B Y 1 9 | PFX B 0 met/A . 10 | -------------------------------------------------------------------------------- /tests/legacy/flag.aff: -------------------------------------------------------------------------------- 1 | # base 1-character flags 2 | 3 | SFX A Y 1 4 | SFX A 0 s/123 . 5 | 6 | SFX 1 Y 1 7 | SFX 1 0 bar . 8 | 9 | SFX 2 Y 1 10 | SFX 2 0 baz . 11 | 12 | PFX 3 Y 1 13 | PFX 3 0 un . 14 | -------------------------------------------------------------------------------- /tests/legacy/i35725.sug: -------------------------------------------------------------------------------- 1 | permanent, preferment 2 | permanent, ornament 3 | permanent 4 | Permanent, Preferment 5 | Permanent, Ornament 6 | Permanent 7 | UNESCO 8 | UNESCO 9 | UNESCO's 10 | UNESCO's 11 | -------------------------------------------------------------------------------- /tests/legacy/hu.dic: -------------------------------------------------------------------------------- 1 | 5 2 | majom/Y 3 | kenyér/Y 4 | fa/Y 5 | ág/Y 6 | virág/Y 7 | sárkány/Y 8 | fog/Y 9 | vetemény/Y 10 | iskola/Y 11 | tej/Y 12 | akció/Y 13 | devon/Y 14 | kor/Y 15 | társ/Y 16 | devon kor 17 | -------------------------------------------------------------------------------- /tests/legacy/nosuggest.aff: -------------------------------------------------------------------------------- 1 | # don't suggest word with NOSUGGEST flag (for example vulgar or obscene words) 2 | # See OpenOffice.org Issue #55498 3 | # (nosuggest.sug is an empty file) 4 | NOSUGGEST A 5 | COMPOUNDFLAG B 6 | -------------------------------------------------------------------------------- /tests/legacy/needaffix5.good: -------------------------------------------------------------------------------- 1 | foo 2 | prefoo 3 | foosuf 4 | prefoosuf 5 | foosufbar 6 | prefoosufbar 7 | pseudoprefoosuf 8 | pseudoprefoosufbar 9 | pseudoprefoopseudosufbar 10 | prefoopseudosuf 11 | prefoopseudosufbar 12 | -------------------------------------------------------------------------------- /tests/legacy/reputf.aff: -------------------------------------------------------------------------------- 1 | # With REP suggestions, we can fix typical language specific misspellings. 
2 | 3 | SET UTF-8 4 | 5 | # switch off ngram suggestion for testing 6 | MAXNGRAMSUGS 0 7 | 8 | REP 1 9 | REP oo őő 10 | -------------------------------------------------------------------------------- /tests/legacy/checksharps.good: -------------------------------------------------------------------------------- 1 | müßig 2 | Müßig 3 | MÜSSIG 4 | Ausstoß 5 | Abstoß. 6 | Außenabmessung 7 | Prozessionsstraße 8 | Außenmaße 9 | AUSSTOSS 10 | ABSTOSS. 11 | AUSSENABMESSUNG 12 | PROZESSIONSSTRASSE 13 | AUSSENMASSE 14 | -------------------------------------------------------------------------------- /tests/legacy/conditionalprefix.aff: -------------------------------------------------------------------------------- 1 | PFX P Y 1 2 | PFX P 0 un . ip:un 3 | 4 | SFX S Y 1 5 | SFX S 0 s . is:PL 6 | 7 | SFX Q Y 1 8 | SFX Q 0 s . is:3SGV 9 | 10 | SFX R Y 1 11 | SFX R 0 able/PS . ds:DER_V_ADJ_ABLE 12 | -------------------------------------------------------------------------------- /tests/legacy/flaglong.aff: -------------------------------------------------------------------------------- 1 | # 2-character flags 2 | FLAG long 3 | 4 | SFX zx Y 1 5 | SFX zx 0 s/g?1G09 . 6 | 7 | SFX g? Y 1 8 | SFX g? 0 bar . 9 | 10 | SFX 1G Y 1 11 | SFX 1G 0 baz . 12 | 13 | PFX 09 Y 1 14 | PFX 09 0 un . 15 | -------------------------------------------------------------------------------- /tests/legacy/sug.wrong: -------------------------------------------------------------------------------- 1 | nasa 2 | Ghandi 3 | greatful 4 | permenant 5 | vacacation 6 | alot 7 | permanent.Vacation 8 | ahev 9 | hwihc 10 | GAndhi 11 | Mcdonald 12 | permqnent 13 | permanent-vacation 14 | permqnent-vacation 15 | -------------------------------------------------------------------------------- /tests/legacy/zeroaffix.aff: -------------------------------------------------------------------------------- 1 | PSEUDOROOT X 2 | COMPOUNDFLAG Y 3 | 4 | SFX A Y 1 5 | SFX A 0 0 . > 6 | 7 | SFX B Y 1 8 | SFX B 0 0 . > 9 | 10 | SFX C Y 2 11 | SFX C 0 0/XAB . 12 | SFX C 0 baz/XAB . 13 | -------------------------------------------------------------------------------- /vendor/en_US/README.txt: -------------------------------------------------------------------------------- 1 | US English dictionary. 2 | 3 | These files are licensed separately from spellbook. See the '*license.txt' 4 | files in this directory. 5 | 6 | Upstream 7 | -------------------------------------------------------------------------------- /tests/legacy/checksharpsutf.good: -------------------------------------------------------------------------------- 1 | müßig 2 | Müßig 3 | MÜSSIG 4 | Ausstoß 5 | Abstoß. 6 | Außenabmessung 7 | Prozessionsstraße 8 | Außenmaße 9 | AUSSTOSS 10 | ABSTOSS. 11 | AUSSENABMESSUNG 12 | PROZESSIONSSTRASSE 13 | AUSSENMASSE 14 | -------------------------------------------------------------------------------- /tests/legacy/needaffix5.aff: -------------------------------------------------------------------------------- 1 | # on affixes 2 | NEEDAFFIX X 3 | 4 | SFX A Y 2 5 | SFX A 0 suf/B . 6 | SFX A 0 pseudosuf/XB . 7 | 8 | SFX B Y 1 9 | SFX B 0 bar . 10 | 11 | PFX C Y 2 12 | PFX C 0 pre . 13 | PFX C 0 pseudopre/X . 
14 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule7.aff: -------------------------------------------------------------------------------- 1 | # English ordinal numbers (parenthesized long flags) 2 | FLAG long 3 | WORDCHARS 0123456789 4 | COMPOUNDMIN 1 5 | ONLYINCOMPOUND cc 6 | COMPOUNDRULE 2 7 | COMPOUNDRULE (nn)*(11)(tt) 8 | COMPOUNDRULE (nn)*(mm)(pp) 9 | -------------------------------------------------------------------------------- /tests/legacy/flagutf8.aff: -------------------------------------------------------------------------------- 1 | # UTF-8 flags 2 | FLAG UTF-8 3 | 4 | SFX A Y 1 5 | SFX A 0 s/ÖüÜ . 6 | #SFX A 0 s/ÖüÖÜ . 7 | 8 | SFX Ö Y 1 9 | SFX Ö 0 bar . 10 | 11 | SFX ü Y 1 12 | SFX ü 0 baz . 13 | 14 | PFX Ü Y 1 15 | PFX Ü 0 un . 16 | -------------------------------------------------------------------------------- /tests/legacy/maputf.aff: -------------------------------------------------------------------------------- 1 | # With MAP suggestion, Hunspell can add missing accents to a word. 2 | 3 | SET UTF-8 4 | 5 | # switch off ngram suggestion for testing 6 | MAXNGRAMSUGS 0 7 | 8 | MAP 3 9 | MAP uúü 10 | MAP öóo 11 | MAP ß(ss) 12 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern3.aff: -------------------------------------------------------------------------------- 1 | # forbid compounds with spec. pattern at word bound and allow modificated form 2 | # (for Indian languages) 3 | COMPOUNDFLAG A 4 | CHECKCOMPOUNDPATTERN 1 5 | CHECKCOMPOUNDPATTERN o/X b/Y z 6 | COMPOUNDMIN 1 7 | -------------------------------------------------------------------------------- /tests/legacy/flagnum.aff: -------------------------------------------------------------------------------- 1 | # numerical flags 2 | FLAG num 3 | 4 | SFX 999 Y 1 5 | SFX 999 0 s/214,216,54321 . 6 | 7 | SFX 214 Y 1 8 | SFX 214 0 bar . 9 | 10 | SFX 216 Y 1 11 | SFX 216 0 baz . 12 | 13 | PFX 54321 Y 1 14 | PFX 54321 0 un . 
15 | -------------------------------------------------------------------------------- /tests/legacy/iconv.aff: -------------------------------------------------------------------------------- 1 | # input conversion (accept comma acuted letters also with cedilla, 2 | # as de facto replacement of the Romanian standard) 3 | SET UTF-8 4 | 5 | ICONV 4 6 | ICONV ş ș 7 | ICONV ţ ț 8 | ICONV Ş Ș 9 | ICONV Ţ Ț 10 | 11 | -------------------------------------------------------------------------------- /tests/legacy/i53643.good: -------------------------------------------------------------------------------- 1 | 1 2 | 12 3 | 123 4 | 1234 5 | 12345 6 | 123456 7 | 1234567 8 | 1.1 9 | 1.12 10 | 1.123 11 | 1.1234 12 | 1.12345 13 | 1.123456 14 | 12.1 15 | 123.12 16 | 1234.123 17 | 12345.1234 18 | 123456.12345 19 | 1234567.123456 20 | -------------------------------------------------------------------------------- /tests/legacy/ignoreutf.aff: -------------------------------------------------------------------------------- 1 | # Arabic test for feature ignoring diacritics 2 | SET UTF-8 3 | # Arabic diacritics (harakat): 4 | # sukun, shadda, kasra, damma, fatha, kasratan, dammantan, fathatan (left to right) 5 | IGNORE ًٌٍَُِّْ 6 | WORDCHARS ًٌٍَُِّْ 7 | -------------------------------------------------------------------------------- /tests/legacy/break.wrong: -------------------------------------------------------------------------------- 1 | fox 2 | bax 3 | -foo 4 | bar- 5 | fox-bar 6 | foo-bax 7 | foo–bax 8 | fox–bar 9 | foo-bar-fox-bar 10 | foo-bax-foo-bar 11 | foo-bar–fox-bar 12 | foo-bax–foo-bar 13 | foo-baz 14 | foo-bar-foo-bar-foo-bar-foo-bar-foo-bar-foo 15 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern4.aff: -------------------------------------------------------------------------------- 1 | # sandhi in Telugu writing system, based on the Kiran Chittella's example 2 | 3 | COMPOUNDFLAG x 4 | COMPOUNDMIN 1 5 | CHECKCOMPOUNDPATTERN 2 6 | CHECKCOMPOUNDPATTERN a/A u/A O 7 | CHECKCOMPOUNDPATTERN u/B u/B u 8 | 9 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule8.aff: -------------------------------------------------------------------------------- 1 | # English ordinal numbers (parenthesized numerical flags) 2 | FLAG num 3 | WORDCHARS 0123456789 4 | COMPOUNDMIN 1 5 | ONLYINCOMPOUND 1000 6 | COMPOUNDRULE 2 7 | COMPOUNDRULE (1001)*(1002)(2001) 8 | COMPOUNDRULE (1001)*(2002)(2000) 9 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule5.dic: -------------------------------------------------------------------------------- 1 | 13 2 | 0/N po:num 3 | 1/N po:num 4 | 2/N po:num 5 | 3/N po:num 6 | 4/N po:num 7 | 5/N po:num 8 | 6/N po:num 9 | 7/N po:num 10 | 8/N po:num 11 | 9/N po:num 12 | ./. po:sign_dot 13 | %/% po:sign_percent 14 | ‰/% po:sign_per_mille 15 | -------------------------------------------------------------------------------- /tests/legacy/break.aff: -------------------------------------------------------------------------------- 1 | # word break points test, recursive break at dash and n-dash 2 | # note: spelling is incorrect when word has ten or more break patterns 3 | SET UTF-8 4 | 5 | BREAK 2 6 | BREAK - 7 | BREAK – 8 | 9 | WORDCHARS -– 10 | 11 | FORBIDDENWORD ! 
12 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixes2.aff: -------------------------------------------------------------------------------- 1 | # complex prefixes with morphological analysis 2 | COMPLEXPREFIXES 3 | WORDCHARS _ 4 | 5 | PFX A Y 1 6 | PFX A 0 tek . affix_1/ 7 | 8 | PFX B Y 1 9 | PFX B 0 met/A . affix_2/ 10 | 11 | SFX C Y 1 12 | SFX C 0 _test_ . /suffix_1 13 | -------------------------------------------------------------------------------- /tests/legacy/condition_utf.wrong: -------------------------------------------------------------------------------- 1 | óőósuf4 2 | pre4óőó 3 | óőósuf5 4 | pre5óőó 5 | óőósuf8 6 | pre8óőó 7 | óőósuf9 8 | pre9óőó 9 | óőósuf11 10 | pre11óőó 11 | óőósuf12 12 | pre12óőó 13 | óőósuf15 14 | pre15óőó 15 | óőósuf17 16 | óőósuf18 17 | pre17óőó 18 | pre18óőó 19 | -------------------------------------------------------------------------------- /tests/legacy/1695964.aff: -------------------------------------------------------------------------------- 1 | # fix NEEDAFFIX homonym suggestion. 2 | # Sf.net Bug ID 1695964, reported by Björn Jacke. 3 | TRY esianrtolcdugmphbyfvkwESIANRTOLCDUGMPHBYFVKW 4 | MAXNGRAMSUGS 0 5 | NEEDAFFIX h 6 | SFX S Y 1 7 | SFX S 0 s . 8 | 9 | SFX e Y 1 10 | SFX e 0 e . 11 | -------------------------------------------------------------------------------- /tests/legacy/condition_utf.good: -------------------------------------------------------------------------------- 1 | óőó 2 | óőósuf1 3 | pre1óőó 4 | óőósuf2 5 | pre2óőó 6 | óőósuf3 7 | pre3óőó 8 | óőósuf6 9 | pre6óőó 10 | óőósuf7 11 | pre7óőó 12 | óőósuf10 13 | pre10óőó 14 | óőósuf13 15 | pre13óőó 16 | óőósuf14 17 | pre14óőó 18 | óőósuf16 19 | pre16óőó 20 | -------------------------------------------------------------------------------- /tests/legacy/alias3.morph: -------------------------------------------------------------------------------- 1 | > ouro 2 | analyze(ouro) = [stem_1] ouro:ts 3 | > metouro 4 | analyze(metouro) = affix_2/ ouro:ts [stem_1] 5 | > tekmetouro 6 | analyze(tekmetouro) = affix_1/ affix_2/ ouro:ts [stem_1] 7 | > ouro_test_ 8 | analyze(ouro_test_) = [stem_1] ouro:ts /suffix_1 9 | -------------------------------------------------------------------------------- /tests/legacy/i35725.dic: -------------------------------------------------------------------------------- 1 | 15 2 | endangerment/SM 3 | ferment/FSCM 4 | preferment/SM 5 | impermanent/Y 6 | permanent/YSP 7 | semipermanent/Y 8 | empowerment/MS 9 | supermen 10 | tournament/MS 11 | ornamental/SY 12 | ornament/GSDM 13 | supernatant 14 | pimpernel 15 | UNESCO/M 16 | -------------------------------------------------------------------------------- /tests/legacy/warn.aff: -------------------------------------------------------------------------------- 1 | # WARN flag 2 | # The signed word, and its suffixed forms result warning message in command-line 3 | 4 | #Use to forbid the words with flag WARN 5 | #FORBIDWARN 6 | 7 | WARN W 8 | 9 | SFX A Y 1 10 | SFX A 0 s . 11 | 12 | REP 1 13 | REP foo bar 14 | -------------------------------------------------------------------------------- /tests/legacy/1706659.aff: -------------------------------------------------------------------------------- 1 | # test COMPOUNDRULE bug reported by Björn Jacke 2 | SET ISO8859-1 3 | TRY esijanrtolcdugmphbyfvkwqxz 4 | 5 | SFX A Y 5 6 | SFX A 0 e . 7 | SFX A 0 er . 8 | SFX A 0 en . 9 | SFX A 0 em . 10 | SFX A 0 es . 
11 | 12 | COMPOUNDRULE 1 13 | COMPOUNDRULE vw 14 | -------------------------------------------------------------------------------- /tests/legacy/checkcompoundpattern2.aff: -------------------------------------------------------------------------------- 1 | # forbid compounds with spec. pattern at word bound and allow modificated form 2 | # (for German and Indian languages) 3 | COMPOUNDFLAG A 4 | CHECKCOMPOUNDPATTERN 2 5 | CHECKCOMPOUNDPATTERN o b z 6 | CHECKCOMPOUNDPATTERN oo ba u 7 | COMPOUNDMIN 1 8 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule4.dic: -------------------------------------------------------------------------------- 1 | 22 2 | 0/nm 3 | 1/n1 4 | 2/nm 5 | 3/nm 6 | 4/nm 7 | 5/nm 8 | 6/nm 9 | 7/nm 10 | 8/nm 11 | 9/nm 12 | 0th/pt 13 | 1st/p 14 | 1th/tc 15 | 2nd/p 16 | 2th/tc 17 | 3rd/p 18 | 3th/tc 19 | 4th/pt 20 | 5th/pt 21 | 6th/pt 22 | 7th/pt 23 | 8th/pt 24 | 9th/pt 25 | -------------------------------------------------------------------------------- /tests/legacy/fogemorpheme.aff: -------------------------------------------------------------------------------- 1 | # fogemorphemes: special morphemes in compounds 2 | # 3 | # Swedish example: 4 | # gata + kontoret = gatukontoret 5 | 6 | COMPOUNDFLAG X 7 | COMPOUNDBEGIN Y 8 | ONLYINCOMPOUND Z 9 | COMPOUNDPERMITFLAG P 10 | 11 | SFX A Y 1 12 | SFX A a u/YPZ . 13 | -------------------------------------------------------------------------------- /tests/legacy/morph.aff: -------------------------------------------------------------------------------- 1 | # example for morphological analysis, stemming and generation 2 | PFX P Y 1 3 | PFX P 0 un . dp:pfx_un sp:un 4 | 5 | SFX S Y 1 6 | SFX S 0 s . is:plur 7 | 8 | SFX Q Y 1 9 | SFX Q 0 s . is:sg_3 10 | 11 | SFX R Y 1 12 | SFX R 0 able/PS . ds:der_able 13 | -------------------------------------------------------------------------------- /tests/legacy/complexprefixesutf.aff: -------------------------------------------------------------------------------- 1 | # Coptic example by Moheb Mekhaiel 2 | # Encoded with the new Coptic character encoding of Unicode 4.1 3 | SET UTF-8 4 | 5 | # set twofold prefix stripping 6 | COMPLEXPREFIXES 7 | 8 | PFX A Y 1 9 | PFX A 0 ⲧⲉⲕ . 10 | 11 | PFX B Y 1 12 | PFX B 0 ⲙⲉⲧ/A . 13 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat.aff: -------------------------------------------------------------------------------- 1 | FLAG long 2 | COMPOUNDBEGIN Ca 3 | COMPOUNDMIDDLE Cb 4 | COMPOUNDEND Cc 5 | COMPOUNDPERMITFLAG Cp 6 | ONLYINCOMPOUND Cx 7 | 8 | CHECKCOMPOUNDPATTERN 1 9 | CHECKCOMPOUNDPATTERN /Ch /Xs 10 | 11 | SFX Ch Y 2 12 | SFX Ch 0 s/CaCbCxCp . 13 | SFX Ch 0 s-/CaCbCcCp . 14 | -------------------------------------------------------------------------------- /tests/legacy/forbiddenword.aff: -------------------------------------------------------------------------------- 1 | # FORBIDDENWORD flag 2 | # The signed word, and its suffixed forms are all forbidden, 3 | # excepts with root homonyms. 4 | # Useful for forbidding bad suffixed forms or compounds. 5 | 6 | 7 | FORBIDDENWORD X 8 | COMPOUNDFLAG Y 9 | 10 | SFX A Y 1 11 | SFX A 0 s . 
12 | -------------------------------------------------------------------------------- /tests/legacy/alias2.aff: -------------------------------------------------------------------------------- 1 | # aliases for flag vectors (AF) and morphological descriptions (AM) 2 | # AB -> 1 3 | # A -> 2 4 | AF 2 5 | AF AB 6 | AF A 7 | 8 | AM 3 9 | AM is:affix_x 10 | AM ds:affix_y 11 | AM po:noun xx:other_data 12 | 13 | SFX A Y 1 14 | SFX A 0 x . 1 15 | 16 | SFX B Y 1 17 | SFX B 0 y/2 . 2 18 | -------------------------------------------------------------------------------- /tests/legacy/alias3.aff: -------------------------------------------------------------------------------- 1 | # morph. aliases with complex prefixes 2 | COMPLEXPREFIXES 3 | WORDCHARS _ 4 | 5 | AM 4 6 | AM affix_1/ 7 | AM affix_2/ 8 | AM /suffix_1 9 | AM [stem_1] 10 | 11 | PFX A Y 1 12 | PFX A 0 tek . 1 13 | 14 | PFX B Y 1 15 | PFX B 0 met/A . 2 16 | 17 | SFX C Y 1 18 | SFX C 0 _test_ . 3 19 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule4.good: -------------------------------------------------------------------------------- 1 | 1st 2 | 2nd 3 | 3rd 4 | 4th 5 | 5th 6 | 6th 7 | 7th 8 | 8th 9 | 9th 10 | 10th 11 | 11th 12 | 12th 13 | 13th 14 | 14th 15 | 15th 16 | 16th 17 | 17th 18 | 18th 19 | 19th 20 | 20th 21 | 21st 22 | 22nd 23 | 23rd 24 | 24th 25 | 25th 26 | 100th 27 | 1000th 28 | 10001st 29 | 10011th 30 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule7.good: -------------------------------------------------------------------------------- 1 | 1st 2 | 2nd 3 | 3rd 4 | 4th 5 | 5th 6 | 6th 7 | 7th 8 | 8th 9 | 9th 10 | 10th 11 | 11th 12 | 12th 13 | 13th 14 | 14th 15 | 15th 16 | 16th 17 | 17th 18 | 18th 19 | 19th 20 | 20th 21 | 21st 22 | 22nd 23 | 23rd 24 | 24th 25 | 25th 26 | 100th 27 | 1000th 28 | 10001st 29 | 10011th 30 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule8.good: -------------------------------------------------------------------------------- 1 | 1st 2 | 2nd 3 | 3rd 4 | 4th 5 | 5th 6 | 6th 7 | 7th 8 | 8th 9 | 9th 10 | 10th 11 | 11th 12 | 12th 13 | 13th 14 | 14th 15 | 15th 16 | 16th 17 | 17th 18 | 18th 19 | 19th 20 | 20th 21 | 21st 22 | 22nd 23 | 23rd 24 | 24th 25 | 25th 26 | 100th 27 | 1000th 28 | 10001st 29 | 10011th 30 | -------------------------------------------------------------------------------- /tests/legacy/condition.wrong: -------------------------------------------------------------------------------- 1 | ofosuf4 2 | pre4ofo 3 | ofosuf5 4 | pre5ofo 5 | ofosuf8 6 | pre8ofo 7 | ofosuf9 8 | pre9ofo 9 | ofosuf11 10 | pre10ofo 11 | pre11ofo 12 | ofosuf12 13 | pre12ofo 14 | ofosuf15 15 | pre15ofo 16 | ofosuf17 17 | pre17ofo 18 | ofosuf18 19 | pre18ofo 20 | entertainning 21 | gninnianretne 22 | -------------------------------------------------------------------------------- /tests/legacy/digits_in_words.aff: -------------------------------------------------------------------------------- 1 | # Digits in words, handled by COMPOUNDRULE. 2 | # 1-jährig, 2-jährig, 100-jährig etc. 3 | SET UTF-8 4 | COMPOUNDMIN 1 5 | # recognize ab, aab, aaab etc. 
compounds (a=digits, b=-jährig, see dic file) 6 | COMPOUNDRULE 1 7 | COMPOUNDRULE a*b 8 | ONLYINCOMPOUND c 9 | WORDCHARS 0123456789- 10 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule7.dic: -------------------------------------------------------------------------------- 1 | 22 2 | 0/nnmm 3 | 1/nn11 4 | 2/nnmm 5 | 3/nnmm 6 | 4/nnmm 7 | 5/nnmm 8 | 6/nnmm 9 | 7/nnmm 10 | 8/nnmm 11 | 9/nnmm 12 | 0th/pptt 13 | 1st/pp 14 | 1th/ttcc 15 | 2nd/pp 16 | 2th/ttcc 17 | 3rd/pp 18 | 3th/ttcc 19 | 4th/pptt 20 | 5th/pptt 21 | 6th/pptt 22 | 7th/pptt 23 | 8th/pptt 24 | 9th/pptt 25 | -------------------------------------------------------------------------------- /tests/legacy/base.good: -------------------------------------------------------------------------------- 1 | created 2 | uncreate 3 | uncreated 4 | imply 5 | implied 6 | unnatural 7 | conveyed 8 | sawyer 9 | NASA 10 | FAQs 11 | can't 12 | doesn't 13 | won't 14 | Created 15 | Hello 16 | HELLO 17 | NASA 18 | etc. 19 | etc 20 | HELLO 21 | lip. 22 | text. 23 | NASA. 24 | Text. 25 | TEXT. 26 | Nuspell. 27 | NUSPELL. 28 | -------------------------------------------------------------------------------- /tests/legacy/base_utf.good: -------------------------------------------------------------------------------- 1 | created 2 | uncreate 3 | uncreated 4 | imply 5 | implied 6 | unnatural 7 | conveyed 8 | sawyer 9 | NASA 10 | FAQs 11 | can’t 12 | doesn’t 13 | won’t 14 | Created 15 | Hello 16 | HELLO 17 | NASA 18 | etc. 19 | etc 20 | HELLO 21 | lip. 22 | text. 23 | NASA. 24 | Text. 25 | TEXT. 26 | Imply 27 | IMPLY 28 | -------------------------------------------------------------------------------- /tests/legacy/germancompoundingold.good: -------------------------------------------------------------------------------- 1 | Computer 2 | Computern 3 | Arbeit 4 | Arbeits- 5 | Computerarbeit 6 | Computerarbeits- 7 | Arbeitscomputer 8 | Arbeitscomputern 9 | Computerarbeitscomputer 10 | Computerarbeitscomputern 11 | Arbeitscomputerarbeit 12 | Computerarbeits-Computer 13 | Computerarbeits-Computern 14 | Computer-Arbeit 15 | -------------------------------------------------------------------------------- /tests/legacy/onlyincompound2.aff: -------------------------------------------------------------------------------- 1 | # affixes only in compounds (see also fogemorpheme example) 2 | ONLYINCOMPOUND O 3 | COMPOUNDFLAG A 4 | COMPOUNDPERMITFLAG P 5 | 6 | SFX B Y 1 7 | SFX B 0 s/OP . 
8 | 9 | # obligate fogemorpheme by forbidding the stem (0) in compounds 10 | 11 | CHECKCOMPOUNDPATTERN 1 12 | CHECKCOMPOUNDPATTERN 0/B /A 13 | -------------------------------------------------------------------------------- /tests/legacy/condition.good: -------------------------------------------------------------------------------- 1 | ofo 2 | ofosuf1 3 | pre1ofo 4 | ofosuf2 5 | pre2ofo 6 | ofosuf3 7 | pre3ofo 8 | ofosuf6 9 | pre6ofo 10 | ofosuf7 11 | pre7ofo 12 | ofosuf10 13 | ofosuf13 14 | pre13ofo 15 | ofosuf14 16 | pre14ofo 17 | ofosuf16 18 | pre16ofo 19 | entertain 20 | entertaining 21 | gninianretne 22 | ér 23 | éram 24 | érach 25 | wries 26 | unwry 27 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | # Flake's devShell for non-flake-enabled nix instances 2 | let 3 | compat = builtins.fetchTarball { 4 | url = "https://github.com/edolstra/flake-compat/archive/b4a34015c698c7793d592d66adbab377907a2be8.tar.gz"; 5 | sha256 = "sha256:1qc703yg0babixi6wshn5wm2kgl5y1drcswgszh4xxzbrwkk9sv7"; 6 | }; 7 | in 8 | (import compat { src = ./.; }).shellNix.default 9 | -------------------------------------------------------------------------------- /tests/legacy/morph.dic: -------------------------------------------------------------------------------- 1 | 9 2 | drink/S po:noun 3 | drink/RQ po:verb al:drank al:drunk ts:present 4 | drank po:verb st:drink is:past_1 5 | drunk po:verb st:drink is:past_2 6 | eat/RQ po:verb al:ate al:eaten ts:present 7 | ate po:verb st:eat is:past_1 8 | eaten po:verb st:eat is:past_2 9 | phenomenon po:noun al:phenomena 10 | phenomena po:noun st:phenomenon is:plur 11 | -------------------------------------------------------------------------------- /tests/legacy/needaffix2.morph: -------------------------------------------------------------------------------- 1 | > foo 2 | analyze(foo) = st:foo id:1 3 | analyze(foo) = st:foo id:3 4 | stem(foo) = foo 5 | > bar 6 | analyze(bar) = st:bar 7 | stem(bar) = bar 8 | > foobar 9 | analyze(foobar) = pa:foo st:foo id:3 pa:bar 10 | stem(foobar) = foo 11 | > barfoo 12 | analyze(barfoo) = pa:bar st:bar pa:foo st:foo id:3 13 | stem(barfoo) = barfoo 14 | -------------------------------------------------------------------------------- /tests/legacy/base_utf.dic: -------------------------------------------------------------------------------- 1 | 28 2 | created/U 3 | create/XKVNGADS 4 | imply/GNSDX 5 | natural/PUY 6 | like/USPBY 7 | convey/BDGS 8 | look/GZRDS 9 | text 10 | hello 11 | said 12 | sawyer 13 | NASA 14 | rotten 15 | day 16 | tomorrow 17 | seven 18 | FAQ/SM 19 | can’t 20 | doesn’t 21 | etc 22 | won’t 23 | lip 24 | text 25 | horrifying 26 | speech 27 | suggest 28 | uncreate/V 29 | -------------------------------------------------------------------------------- /tests/legacy/base.dic: -------------------------------------------------------------------------------- 1 | 28 2 | created/U 3 | create/XKVNGADS 4 | imply/GNSDX 5 | natural/PUY 6 | like/USPBY 7 | convey/BDGS 8 | look/GZRDS 9 | text 10 | hello 11 | said 12 | sawyer 13 | NASA 14 | rotten 15 | day 16 | tomorrow 17 | seven 18 | FAQ/SM 19 | can't 20 | doesn't 21 | etc 22 | won't 23 | lip 24 | text 25 | horrifying 26 | speech 27 | suggest 28 | uncreate/V 29 | Nuspell 30 | -------------------------------------------------------------------------------- /tests/legacy/1592880.aff: 
-------------------------------------------------------------------------------- 1 | # fix homonym handling for German dictionary project, 2 | # reported by Björn Jacke (sf.net Bug ID 1592880). 3 | SET ISO8859-1 4 | 5 | SFX N Y 1 6 | SFX N 0 n . 7 | 8 | SFX S Y 1 9 | SFX S 0 s . 10 | 11 | SFX P Y 1 12 | SFX P 0 en . 13 | 14 | SFX Q Y 2 15 | SFX Q 0 e . 16 | SFX Q 0 en . 17 | 18 | COMPOUNDEND z 19 | COMPOUNDPERMITFLAG c 20 | ONLYINCOMPOUND o 21 | -------------------------------------------------------------------------------- /tests/legacy/circumfix.aff: -------------------------------------------------------------------------------- 1 | # circumfixes: ~ obligate prefix/suffix combinations 2 | # superlative in Hungarian: leg- (prefix) AND -bb (suffix) 3 | 4 | CIRCUMFIX X 5 | 6 | PFX A Y 1 7 | PFX A 0 leg/X . 8 | 9 | PFX B Y 1 10 | PFX B 0 legesleg/X . 11 | 12 | SFX C Y 3 13 | SFX C 0 obb . is:COMPARATIVE 14 | SFX C 0 obb/AX . is:SUPERLATIVE 15 | SFX C 0 obb/BX . is:SUPERSUPERLATIVE 16 | 17 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule2.good: -------------------------------------------------------------------------------- 1 | aa 2 | aaa 3 | aaaa 4 | ab 5 | aab 6 | aaab 7 | aaaab 8 | abb 9 | aabb 10 | aaabbb 11 | bb 12 | bbb 13 | bbbb 14 | aaab 15 | abc 16 | abcc 17 | abbc 18 | abbcc 19 | aabc 20 | aabcc 21 | aabbc 22 | aabbcc 23 | aaabbbccc 24 | ac 25 | acc 26 | aac 27 | aacc 28 | aaaccc 29 | bc 30 | bcc 31 | bbc 32 | bbcc 33 | bbbccc 34 | cc 35 | ccc 36 | cccccc 37 | abcc 38 | -------------------------------------------------------------------------------- /tests/legacy/alias2.morph: -------------------------------------------------------------------------------- 1 | > foo 2 | analyze(foo) = st:foo po:noun xx:other_data 3 | stem(foo) = foo 4 | > foox 5 | analyze(foox) = st:foo po:noun xx:other_data is:affix_x 6 | stem(foox) = foo 7 | > fooy 8 | analyze(fooy) = st:foo po:noun xx:other_data ds:affix_y 9 | stem(fooy) = fooy 10 | > fooyx 11 | analyze(fooyx) = st:foo po:noun xx:other_data ds:affix_y is:affix_x 12 | stem(fooyx) = fooy 13 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule.wrong: -------------------------------------------------------------------------------- 1 | ba 2 | aaabaaa 3 | bbaaa 4 | aaaaba 5 | bbbbbaa 6 | aa 7 | aaa 8 | aaaa 9 | ab 10 | aab 11 | aaab 12 | aaaab 13 | abb 14 | aabb 15 | aaabbb 16 | bb 17 | bbb 18 | bbbb 19 | aaab 20 | abcc 21 | abbc 22 | abbcc 23 | aabc 24 | aabcc 25 | aabbc 26 | aabbcc 27 | aaabbbccc 28 | ac 29 | aac 30 | aacc 31 | aaaccc 32 | bc 33 | bcc 34 | bbc 35 | bbcc 36 | bbbccc 37 | cc 38 | ccc 39 | cccccc 40 | -------------------------------------------------------------------------------- /tests/legacy/circumfix.morph: -------------------------------------------------------------------------------- 1 | > nagy 2 | analyze(nagy) = st:nagy po:adj 3 | stem(nagy) = nagy 4 | > nagyobb 5 | analyze(nagyobb) = st:nagy po:adj is:COMPARATIVE 6 | stem(nagyobb) = nagy 7 | > legnagyobb 8 | analyze(legnagyobb) = fl:A st:nagy po:adj is:SUPERLATIVE 9 | stem(legnagyobb) = nagy 10 | > legeslegnagyobb 11 | analyze(legeslegnagyobb) = fl:B st:nagy po:adj is:SUPERSUPERLATIVE 12 | stem(legeslegnagyobb) = nagy 13 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule8.dic: -------------------------------------------------------------------------------- 1 | 22 2 | 0/1001,2002 3 | 1/1001,1002 4 | 
2/1001,2002 5 | 3/1001,2002 6 | 4/1001,2002 7 | 5/1001,2002 8 | 6/1001,2002 9 | 7/1001,2002 10 | 8/1001,2002 11 | 9/1001,2002 12 | 0th/2000,2001 13 | 1st/2000 14 | 1th/2001,1000 15 | 2nd/2000 16 | 2th/2001,1000 17 | 3rd/2000 18 | 3th/2001,1000 19 | 4th/2000,2001 20 | 5th/2000,2001 21 | 6th/2000,2001 22 | 7th/2000,2001 23 | 8th/2000,2001 24 | 9th/2000,2001 25 | -------------------------------------------------------------------------------- /tests/legacy/zeroaffix.morph: -------------------------------------------------------------------------------- 1 | > bar 2 | analyze(bar) = st:bar > 3 | analyze(bar) = st:bar 4 | analyze(bar) = st:bar > 5 | analyze(bar) = st:bar > 6 | stem(bar) = bar 7 | > foo 8 | analyze(foo) = st:foo 9 | stem(foo) = foo 10 | > barbaz 11 | analyze(barbaz) = st:bar > 12 | analyze(barbaz) = st:bar > 13 | stem(barbaz) = bar 14 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule3.wrong: -------------------------------------------------------------------------------- 1 | aa 2 | aaa 3 | aaaa 4 | aab 5 | aaab 6 | aaaab 7 | abb 8 | aabb 9 | aaabbb 10 | bb 11 | bbb 12 | bbbb 13 | aaab 14 | abcc 15 | abbc 16 | abbcc 17 | aabc 18 | aabcc 19 | aabbc 20 | aabbcc 21 | aaabbbccc 22 | acc 23 | aac 24 | aacc 25 | aaaccc 26 | bcc 27 | bbc 28 | bbcc 29 | bbbccc 30 | cc 31 | ccc 32 | cccccc 33 | abcc 34 | ba 35 | aaabaaa 36 | bbaaa 37 | aaaaba 38 | bbbbbaa 39 | cba 40 | cab 41 | acb 42 | -------------------------------------------------------------------------------- /tests/legacy/morph.good: -------------------------------------------------------------------------------- 1 | drink 2 | drinks 3 | drinkable 4 | drinkables 5 | undrinkable 6 | undrinkables 7 | drank 8 | drunk 9 | phenomenon 10 | phenomena 11 | drink eat 12 | drink eats 13 | drink ate 14 | drink eaten 15 | drink eatable 16 | drink eatables 17 | drink phenomena 18 | drinks eat 19 | drinks eats 20 | drinks ate 21 | drinks eaten 22 | drinks eatable 23 | drinks eatables 24 | drinks phenomena 25 | undrinkable phenomena 26 | phenomenon drinks 27 | -------------------------------------------------------------------------------- /tests/legacy/ngram_utf_fix.aff: -------------------------------------------------------------------------------- 1 | # Test fix of suffixed ngram suggestions with UTF-8 encoding and long flags. 2 | # Based on Vitaly Piryatinsky's bug report and example. 3 | SET UTF-8 4 | FLAG num 5 | 6 | PFX 101 Y 1 7 | PFX 101 0 пред . 8 | 9 | SFX 1381 Y 1 10 | SFX 1381 0 о . 11 | 12 | SFX 2000 Y 3 13 | SFX 2000 0 ам . 14 | SFX 2000 0 ами . 15 | SFX 2000 0 ах . 16 | 17 | SFX 2022 Y 4 18 | SFX 2022 0 а . 19 | SFX 2022 0 у . 20 | SFX 2022 0 ом . 21 | SFX 2022 0 е . 
22 | -------------------------------------------------------------------------------- /tests/legacy/fullstrip.aff: -------------------------------------------------------------------------------- 1 | # FULLSTRIP option: Hunspell can strip full words by affix rules 2 | # see OpenOffice.org Issue #80145 3 | # test data from Davide Prina 4 | 5 | FULLSTRIP 6 | 7 | SET ISO8859-15 8 | TRY aioertnsclmdpgubzfvhàq'ACMSkBGPLxEyRTVòIODNwFéùèìjUZKHWJYQX 9 | 10 | SFX A Y 3 # verbo andare (verb to go) 11 | SFX A andare vado andare # io vado (I go) 12 | SFX A andare va andare # tu vai (you go) 13 | SFX A are iamo andare # noi andiamo (we go) 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/legacy/germancompounding.good: -------------------------------------------------------------------------------- 1 | Computer 2 | Computern 3 | Arbeit 4 | Arbeits- 5 | Computerarbeit 6 | Computerarbeits- 7 | Arbeitscomputer 8 | Computercomputer 9 | Computercomputern 10 | Arbeitscomputern 11 | Computerarbeitscomputer 12 | Computerarbeitscomputern 13 | Arbeitscomputercomputer 14 | Computercomputerarbeit 15 | Arbeitscomputerarbeit 16 | Arbeitsarbeitsarbeit 17 | Computerarbeitsarbeit 18 | Computerarbeits-Computer 19 | Computerarbeits-Computern 20 | Computer-Arbeit 21 | -------------------------------------------------------------------------------- /tests/legacy/sug.aff: -------------------------------------------------------------------------------- 1 | # new suggestion methods of Hunspell 1.5: 2 | # capitalization: nasa -> NASA 3 | # long swap: permenant -> permanent 4 | # long mov: Ghandi -> Gandhi 5 | # double two characters: vacacation -> vacation 6 | # space with REP: "alot" -> "a lot" ("a lot" need to be in the dic file.) 7 | 8 | # switch off ngram suggestion for testing 9 | MAXNGRAMSUGS 0 10 | REP 1 11 | REP alot a_lot 12 | KEY qwertzuiop|asdfghjkl|yxcvbnm|aq 13 | WORDCHARS .- 14 | FORBIDDENWORD ? 15 | 16 | -------------------------------------------------------------------------------- /tests/legacy/sugutf.aff: -------------------------------------------------------------------------------- 1 | # new suggestion methods of Hunspell 1.5: 2 | # capitalization: nasa -> NASA 3 | # long swap: permenant -> permanent 4 | # long mov: Ghandi -> Gandhi 5 | # double two characters: vacacation -> vacation 6 | # space with REP: "alot" -> "a lot" ("a lot" need to be in the dic file.) 7 | 8 | SET UTF-8 9 | # switch off ngram suggestion for testing 10 | MAXNGRAMSUGS 0 11 | REP 1 12 | REP alot a_lot 13 | KEY qwertzuiop|asdfghjkl|yxcvbnm|aq 14 | WORDCHARS . 15 | FORBIDDENWORD ? 16 | -------------------------------------------------------------------------------- /tests/legacy/opentaal_cpdpat2.aff: -------------------------------------------------------------------------------- 1 | # Test file based on OpenTaal's Dutch dictionary, coded by Ruud Baars 2 | 3 | WORDCHARS - 4 | NOSPLITSUGS 5 | FLAG long 6 | 7 | COMPOUNDBEGIN Ca 8 | COMPOUNDMIDDLE Cb 9 | COMPOUNDEND Cc 10 | COMPOUNDPERMITFLAG Cp 11 | ONLYINCOMPOUND Cx 12 | 13 | CHECKCOMPOUNDPATTERN 2 14 | CHECKCOMPOUNDPATTERN 0/Ch /Xs 15 | CHECKCOMPOUNDPATTERN 0/Xm /Xm 16 | 17 | SFX CA Y 2 18 | SFX CA 0 /CaCp . 19 | SFX CA 0 -/CaCp . 20 | 21 | SFX CB Y 2 22 | SFX CB 0 /CbCp . 23 | SFX CB 0 -/CbCp . 24 | 25 | SFX Ch Y 2 26 | SFX Ch 0 s/CaCbCxCp . 27 | SFX Ch 0 s-/CaCbCcCp . 
28 | -------------------------------------------------------------------------------- /tests/legacy/hu.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | LANG hu_HU 3 | 4 | # words with flag Y can form compound words 5 | COMPOUNDFLAG Y 6 | 7 | # min. word length in compounds: 8 | # allow 2-letter words 9 | COMPOUNDMIN 2 10 | 11 | # max. word count in compounds 12 | COMPOUNDWORDMAX 2 13 | 14 | # exception for Hungarian: 15 | # allow more words in a compound, than COMPOUNDWORDMAX, 16 | # if syllable count of the compound is 6 or less 17 | COMPOUNDSYLLABLE 6 aáeéiíoóöőuúüű 18 | 19 | # test case for commit 1fada01 "fix other regression in compounding" 20 | REP 1 21 | REP kor _kor 22 | 23 | CHECKCOMPOUNDREP 24 | -------------------------------------------------------------------------------- /tests/legacy/conditionalprefix.morph: -------------------------------------------------------------------------------- 1 | > drink 2 | analyze(drink) = st:drink po:verb 3 | analyze(drink) = st:drink po:noun 4 | stem(drink) = drink 5 | > drinks 6 | analyze(drinks) = st:drink po:verb is:3SGV 7 | analyze(drinks) = st:drink po:noun is:PL 8 | stem(drinks) = drink 9 | > drinkable 10 | analyze(drinkable) = st:drink po:verb ds:DER_V_ADJ_ABLE 11 | stem(drinkable) = drinkable 12 | > drinkables 13 | analyze(drinkables) = st:drink po:verb ds:DER_V_ADJ_ABLE is:PL 14 | stem(drinkables) = drinkable 15 | > undrinkable 16 | analyze(undrinkable) = ip:un st:drink po:verb ds:DER_V_ADJ_ABLE 17 | stem(undrinkable) = drinkable 18 | > undrinkables 19 | analyze(undrinkables) = ip:un st:drink po:verb ds:DER_V_ADJ_ABLE is:PL 20 | stem(undrinkables) = drinkable 21 | -------------------------------------------------------------------------------- /benches/compilation.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use foldhash::fast::FixedState; 6 | use spellbook::Dictionary; 7 | use test::{black_box, Bencher}; 8 | 9 | const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff"); 10 | const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic"); 11 | 12 | /// A random seed from a sample run. The values aren't important here: just that they're constant. 13 | /// We don't want the benchmark outputs to reflect random changes to the seed. 14 | const HASHER: FixedState = FixedState::with_seed(16553733157538299820); 15 | 16 | #[bench] 17 | #[allow(non_snake_case)] 18 | fn compile_en_US(b: &mut Bencher) { 19 | b.iter(|| Dictionary::new_with_hasher(black_box(EN_US_AFF), black_box(EN_US_DIC), HASHER)) 20 | } 21 | -------------------------------------------------------------------------------- /examples/load-dictionary.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, io, time::Instant}; 2 | 3 | use spellbook::Dictionary; 4 | 5 | macro_rules! 
usage { 6 | () => { 7 | eprintln!("Usage: load-dictionary path/to/dict.aff path/to/dict.dic"); 8 | eprintln!(" Note: some shells accept a syntax like path/to/dict.{{aff,dic}}"); 9 | std::process::exit(1); 10 | }; 11 | } 12 | 13 | fn main() -> io::Result<()> { 14 | let mut args = std::env::args().skip(1); 15 | let Some(aff) = args.next() else { 16 | usage!(); 17 | }; 18 | let Some(dic) = args.next() else { 19 | usage!(); 20 | }; 21 | let aff = fs::read_to_string(aff)?; 22 | let dic = fs::read_to_string(dic)?; 23 | let now = Instant::now(); 24 | match Dictionary::new(&aff, &dic) { 25 | Ok(_) => println!("Compiled the dictionary in {:?}", now.elapsed()), 26 | Err(err) => eprintln!("Failed to compile the dictionary: {err}"), 27 | } 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /tests/legacy/compoundrule5.morph: -------------------------------------------------------------------------------- 1 | > 10% 2 | analyze(10%) = pa:1 st:1 po:num pa:0 st:0 po:num pa:% st:% po:sign_percent 3 | stem(10%) = 10% 4 | > 0.2% 5 | analyze(0.2%) = pa:0 st:0 po:num pa:. st:. po:sign_dot pa:2 st:2 po:num pa:% st:% po:sign_percent 6 | stem(0.2%) = 0.2% 7 | > 0.20% 8 | analyze(0.20%) = pa:0 st:0 po:num pa:. st:. po:sign_dot pa:2 st:2 po:num pa:0 st:0 po:num pa:% st:% po:sign_percent 9 | stem(0.20%) = 0.20% 10 | > 123.4561‰ 11 | analyze(123.4561‰) = pa:1 st:1 po:num pa:2 st:2 po:num pa:3 st:3 po:num pa:. st:. po:sign_dot pa:4 st:4 po:num pa:5 st:5 po:num pa:6 st:6 po:num pa:1 st:1 po:num pa:‰ st:‰ po:sign_per_mille 12 | stem(123.4561‰) = 123.4561‰ 13 | > 10 14 | analyze(10) = pa:1 st:1 po:num pa:0 st:0 po:num 15 | stem(10) = 10 16 | > 0000 17 | analyze(0000) = pa:0 st:0 po:num pa:0 st:0 po:num pa:0 st:0 po:num pa:0 st:0 po:num 18 | stem(0000) = 0000 19 | > 10.25 20 | analyze(10.25) = pa:1 st:1 po:num pa:0 st:0 po:num pa:. st:. 
po:sign_dot pa:2 st:2 po:num pa:5 st:5 po:num 21 | stem(10.25) = 10.25 22 | -------------------------------------------------------------------------------- /tests/legacy/germancompounding.wrong: -------------------------------------------------------------------------------- 1 | computer 2 | computern 3 | arbeit 4 | Arbeits 5 | arbeits 6 | ComputerArbeit 7 | ComputernArbeit 8 | Computernarbeit 9 | ComputerArbeits 10 | Arbeitcomputer 11 | Arbeitcomputern 12 | ArbeitsComputer 13 | ArbeitsComputern 14 | Computerarbeitcomputer 15 | ComputerArbeitcomputer 16 | ComputerArbeitscomputer 17 | Computerarbeitcomputern 18 | ComputerArbeitcomputern 19 | ComputerArbeitscomputern 20 | Arbeitscomputerarbeits 21 | Arbeitscomputernarbeits 22 | Computerarbeits-computer 23 | Arbeitsnehmer 24 | computers 25 | computern 26 | computernarbeit 27 | computernArbeit 28 | computerArbeit 29 | computerArbeits 30 | arbeitcomputer 31 | arbeitsComputer 32 | computerarbeitcomputer 33 | computerArbeitcomputer 34 | computerArbeitscomputer 35 | arbeitscomputerarbeits 36 | computerarbeits-computer 37 | arbeitsnehmer 38 | computernarbeit 39 | computernArbeit 40 | arbeits- 41 | computerarbeit 42 | computerarbeits- 43 | arbeitscomputer 44 | arbeitscomputern 45 | computerarbeitscomputer 46 | computerarbeitscomputern 47 | computerarbeitscomputers 48 | arbeitscomputerarbeit 49 | computerarbeits-Computer 50 | computerarbeits-Computern 51 | -------------------------------------------------------------------------------- /tests/legacy/germancompoundingold.wrong: -------------------------------------------------------------------------------- 1 | computer 2 | computern 3 | arbeit 4 | Arbeits 5 | arbeits 6 | ComputerArbeit 7 | ComputernArbeit 8 | Computernarbeit 9 | ComputerArbeits 10 | Arbeitcomputer 11 | Arbeitcomputern 12 | ArbeitsComputer 13 | ArbeitsComputern 14 | Computerarbeitcomputer 15 | ComputerArbeitcomputer 16 | ComputerArbeitscomputer 17 | Computerarbeitcomputern 18 | ComputerArbeitcomputern 19 | ComputerArbeitscomputern 20 | Arbeitscomputerarbeits 21 | Arbeitscomputernarbeits 22 | Computerarbeits-computer 23 | Arbeitsnehmer 24 | computers 25 | computern 26 | computernarbeit 27 | computernArbeit 28 | computerArbeit 29 | computerArbeits 30 | arbeitcomputer 31 | arbeitsComputer 32 | computerarbeitcomputer 33 | computerArbeitcomputer 34 | computerArbeitscomputer 35 | arbeitscomputerarbeits 36 | computerarbeits-computer 37 | arbeitsnehmer 38 | computernarbeit 39 | computernArbeit 40 | arbeits- 41 | computerarbeit 42 | computerarbeits- 43 | arbeitscomputer 44 | arbeitscomputern 45 | computerarbeitscomputer 46 | computerarbeitscomputern 47 | computerarbeitscomputers 48 | arbeitscomputerarbeit 49 | computerarbeits-Computer 50 | computerarbeits-Computern 51 | -------------------------------------------------------------------------------- /tests/legacy/condition_utf.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | WORDCHARS 0123456789 3 | 4 | SFX S N 18 5 | SFX S 0 suf1 . 6 | SFX S 0 suf2 ó 7 | SFX S 0 suf3 [áéóú] 8 | SFX S 0 suf4 [^ó] 9 | SFX S 0 suf5 [^áéóú] 10 | SFX S 0 suf6 őó 11 | SFX S 0 suf7 ő[áéóú] 12 | SFX S 0 suf8 ő[^ó] 13 | SFX S 0 suf9 ő[^áéóú] 14 | SFX S 0 suf10 [áéóőú]ó 15 | SFX S 0 suf11 [^ő]ó 16 | SFX S 0 suf12 [^áéóőú]ó 17 | SFX S 0 suf13 [áéőú][^ú] 18 | SFX S 0 suf14 [^ú][áéóú] 19 | SFX S 0 suf15 [áéóú][^áéőú] 20 | SFX S 0 suf16 [^áéóú][^áéőú] 21 | SFX S 0 suf17 [áéóú][bcdfgkmnóprstvz] 22 | SFX S 0 suf18 [áéóú]ó 23 | 24 | PFX P N 18 25 | PFX P 0 pre1 . 
26 | PFX P 0 pre2 ó 27 | PFX P 0 pre3 [áéóú] 28 | PFX P 0 pre4 [^ó] 29 | PFX P 0 pre5 [^áéóú] 30 | PFX P 0 pre6 óő 31 | PFX P 0 pre7 ó[áéőú] 32 | PFX P 0 pre8 ó[^ő] 33 | PFX P 0 pre9 ó[^áéóőú] 34 | PFX P 0 pre10 [áéóőú]ő 35 | PFX P 0 pre11 [^ó]ő 36 | PFX P 0 pre12 [^áéóőú]ő 37 | PFX P 0 pre13 [áéóú][áéőú] 38 | PFX P 0 pre14 [áéóú][^áéóú] 39 | PFX P 0 pre15 [áéóú][^áéőú] 40 | PFX P 0 pre16 [^áéőú][^áéóú] 41 | PFX P 0 pre17 [bcdfgkmnóprstvz][áéóú] 42 | PFX P 0 pre18 ó[áéóú] 43 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spellbook" 3 | version = "0.3.5" 4 | authors = ["Michael Davis "] 5 | description = "A spellchecking library compatible with Hunspell dictionaries" 6 | readme = "README.md" 7 | repository = "https://github.com/helix-editor/spellbook" 8 | keywords = ["spellcheck", "no_std"] 9 | edition = "2021" 10 | license = "MPL-2.0" 11 | rust-version = "1.70" 12 | 13 | [dependencies] 14 | # Used for HashMap and HashTable for a custom bag structure. 15 | hashbrown = { version = ">=0.15", default-features = false } 16 | # Used as the default BuildHasher when the `default-hasher` feature flag 17 | # is enabled (which it is by default). 18 | foldhash = { version = ">=0.1", default-features = false, optional = true } 19 | 20 | [dev-dependencies] 21 | # Used in unit tests to lazily compile en_US. Used instead of 22 | # `core::cell::OnceCell` since it implements `Send + Sync`. 23 | once_cell = "1.19" 24 | # Used in the integration tests to read Hunspell test case files, potentially 25 | # with odd encodings. 26 | encoding_rs = "0.8" 27 | chardetng = "0.1" 28 | 29 | [features] 30 | default = ["default-hasher"] 31 | # Sets a default hasher (currently foldhash). 32 | default-hasher = ["dep:foldhash"] 33 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "A Hunspell-like spell-checking library in Rust"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; 6 | rust-overlay.url = "github:oxalica/rust-overlay"; 7 | }; 8 | 9 | outputs = 10 | { nixpkgs, rust-overlay, ... }: 11 | let 12 | inherit (nixpkgs) lib; 13 | forEachSystem = lib.genAttrs lib.systems.flakeExposed; 14 | in 15 | { 16 | devShell = forEachSystem ( 17 | system: 18 | let 19 | pkgs = import nixpkgs { 20 | inherit system; 21 | overlays = [ rust-overlay.overlays.default ]; 22 | }; 23 | toolchain = pkgs.rust-bin.stable.latest.default; 24 | in 25 | pkgs.mkShell { 26 | nativeBuildInputs = with pkgs; [ 27 | (toolchain.override { 28 | extensions = [ 29 | "rust-src" 30 | "clippy" 31 | "llvm-tools-preview" 32 | ]; 33 | }) 34 | rust-analyzer 35 | cargo-flamegraph 36 | cargo-llvm-cov 37 | valgrind 38 | ]; 39 | RUST_BACKTRACE = "1"; 40 | } 41 | ); 42 | }; 43 | } 44 | -------------------------------------------------------------------------------- /examples/check.rs: -------------------------------------------------------------------------------- 1 | /* 2 | Most basic example for the checker for quick debugging. 
3 | 4 | ## Usage 5 | 6 | ``` 7 | $ cargo run --example check hello 8 | Compiled the dictionary in 113ms 9 | "hello" is in the dictionary (checked in 3µs) 10 | $ cargo run --example check helol 11 | Compiled the dictionary in 110ms 12 | "helol" is NOT in the dictionary (checked in 21µs) 13 | ``` 14 | */ 15 | use std::time::Instant; 16 | 17 | use spellbook::Dictionary; 18 | 19 | const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff"); 20 | const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic"); 21 | 22 | fn main() { 23 | let mut args = std::env::args().skip(1); 24 | let word = match args.next() { 25 | Some(arg) => arg, 26 | None => { 27 | eprintln!("Usage: check WORD"); 28 | std::process::exit(1); 29 | } 30 | }; 31 | 32 | let now = Instant::now(); 33 | let dict = Dictionary::new(EN_US_AFF, EN_US_DIC).unwrap(); 34 | println!("Compiled the dictionary in {:?}", now.elapsed()); 35 | 36 | let now = Instant::now(); 37 | if dict.check(&word) { 38 | println!( 39 | "\"{word}\" is in the dictionary (checked in {:?})", 40 | now.elapsed() 41 | ); 42 | } else { 43 | eprintln!( 44 | "\"{word}\" is NOT in the dictionary (checked in {:?})", 45 | now.elapsed() 46 | ); 47 | std::process::exit(1); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /vendor/en_US/WordNet_license.txt: -------------------------------------------------------------------------------- 1 | WordNet Release 2.1 2 | 3 | This software and database is being provided to you, the LICENSEE, by 4 | Princeton University under the following license. By obtaining, using 5 | and/or copying this software and database, you agree that you have 6 | read, understood, and will comply with these terms and conditions.: 7 | 8 | Permission to use, copy, modify and distribute this software and 9 | database and its documentation for any purpose and without fee or 10 | royalty is hereby granted, provided that you agree to comply with 11 | the following copyright notice and statements, including the disclaimer, 12 | and that the same appear on ALL copies of the software, database and 13 | documentation, including modifications that you make for internal 14 | use or for distribution. 15 | 16 | WordNet 2.1 Copyright 2005 by Princeton University. All rights reserved. 17 | 18 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON 19 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 20 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON 21 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- 22 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE 23 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT 24 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 25 | OTHER RIGHTS. 26 | 27 | The name of Princeton University or Princeton may not be used in 28 | advertising or publicity pertaining to distribution of the software 29 | and/or database. Title to copyright in this software, database and 30 | any associated documentation shall at all times remain with 31 | Princeton University and LICENSEE agrees to preserve same. 
32 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "nixpkgs": { 4 | "locked": { 5 | "lastModified": 1724224976, 6 | "narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=", 7 | "owner": "nixos", 8 | "repo": "nixpkgs", 9 | "rev": "c374d94f1536013ca8e92341b540eba4c22f9c62", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "nixos", 14 | "ref": "nixos-unstable", 15 | "repo": "nixpkgs", 16 | "type": "github" 17 | } 18 | }, 19 | "nixpkgs_2": { 20 | "locked": { 21 | "lastModified": 1718428119, 22 | "narHash": "sha256-WdWDpNaq6u1IPtxtYHHWpl5BmabtpmLnMAx0RdJ/vo8=", 23 | "owner": "NixOS", 24 | "repo": "nixpkgs", 25 | "rev": "e6cea36f83499eb4e9cd184c8a8e823296b50ad5", 26 | "type": "github" 27 | }, 28 | "original": { 29 | "owner": "NixOS", 30 | "ref": "nixpkgs-unstable", 31 | "repo": "nixpkgs", 32 | "type": "github" 33 | } 34 | }, 35 | "root": { 36 | "inputs": { 37 | "nixpkgs": "nixpkgs", 38 | "rust-overlay": "rust-overlay" 39 | } 40 | }, 41 | "rust-overlay": { 42 | "inputs": { 43 | "nixpkgs": "nixpkgs_2" 44 | }, 45 | "locked": { 46 | "lastModified": 1724293269, 47 | "narHash": "sha256-x/XhOAszT/ejditCHUtGOjQcVg2AQhrC/QVew3i7kTI=", 48 | "owner": "oxalica", 49 | "repo": "rust-overlay", 50 | "rev": "6dc6d34a3a217457d7044dcce32b6d537480a6a1", 51 | "type": "github" 52 | }, 53 | "original": { 54 | "owner": "oxalica", 55 | "repo": "rust-overlay", 56 | "type": "github" 57 | } 58 | } 59 | }, 60 | "root": "root", 61 | "version": 7 62 | } 63 | -------------------------------------------------------------------------------- /examples/suggest.rs: -------------------------------------------------------------------------------- 1 | /* 2 | Most basic example for the suggester for quick debugging. 3 | 4 | This example doesn't check whether the input word is in the dictionary first. 
5 | 6 | ## Usage 7 | 8 | ``` 9 | $ cargo run --example suggest ansi 10 | Compiled the dictionary in 127ms 11 | Suggestions for "ansi": "ANSI", "ans", "anti", "ans i" (checked in 1367µs) 12 | ``` 13 | */ 14 | use std::time::Instant; 15 | 16 | use spellbook::Dictionary; 17 | 18 | const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff"); 19 | const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic"); 20 | 21 | fn main() { 22 | let mut args = std::env::args().skip(1); 23 | let word = match args.next() { 24 | Some(arg) => arg, 25 | None => { 26 | eprintln!("Usage: suggest WORD"); 27 | std::process::exit(1); 28 | } 29 | }; 30 | 31 | let now = Instant::now(); 32 | let dict = Dictionary::new(EN_US_AFF, EN_US_DIC).unwrap(); 33 | println!("Compiled the dictionary in {:?}", now.elapsed()); 34 | 35 | let mut suggestions = Vec::with_capacity(5); 36 | let now = Instant::now(); 37 | dict.suggest(&word, &mut suggestions); 38 | let time = now.elapsed(); 39 | if suggestions.is_empty() { 40 | println!("No suggestions found for \"{word}\" (checked in {time:?})"); 41 | } else { 42 | let suggestions = suggestions 43 | .into_iter() 44 | .fold(String::new(), |mut s, suggestion| { 45 | if !s.is_empty() { 46 | s.push_str(", "); 47 | } 48 | s.push('"'); 49 | s.push_str(&suggestion); 50 | s.push('"'); 51 | s 52 | }); 53 | println!("Suggestions for \"{word}\": {suggestions} (checked in {time:?})"); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /examples/prose.rs: -------------------------------------------------------------------------------- 1 | /* 2 | An example that checks prose piped in via stdin. 3 | 4 | This is meant to get a rough idea of how performant and correct the checker is. The way this 5 | example tokenizes input is very basic. 6 | 7 | ``` 8 | cat "Grapes of Wrath.txt" | cargo run --release --example prose 9 | ``` 10 | */ 11 | 12 | use std::time::Instant; 13 | 14 | use spellbook::Dictionary; 15 | 16 | const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff"); 17 | const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic"); 18 | 19 | fn main() { 20 | if std::env::args().nth(1).is_some() { 21 | eprintln!("This example takes no arguments. 
Pipe in prose through stdin."); 22 | std::process::exit(1); 23 | } 24 | let mut total_words = 0; 25 | let mut misspelled = 0; 26 | 27 | let now = Instant::now(); 28 | let dict = Dictionary::new(EN_US_AFF, EN_US_DIC).unwrap(); 29 | println!("Compiled the dictionary in {}ms", now.elapsed().as_millis()); 30 | 31 | let now = Instant::now(); 32 | for line in std::io::stdin().lines() { 33 | let line = match line { 34 | Ok(line) => line, 35 | Err(err) => { 36 | eprintln!("Failed to read line from stdin: {err}"); 37 | std::process::exit(1); 38 | } 39 | }; 40 | for word in line.split_whitespace().flat_map(|s| s.split("--")) { 41 | let word = word.trim_matches(|ch: char| !ch.is_ascii_alphabetic()); 42 | total_words += 1; 43 | 44 | if !dict.check(word) { 45 | eprintln!("* {word}"); 46 | misspelled += 1; 47 | } 48 | } 49 | } 50 | 51 | println!( 52 | "Finished in {}ms: {misspelled}/{total_words} words misspelled.", 53 | now.elapsed().as_millis() 54 | ); 55 | } 56 | -------------------------------------------------------------------------------- /tests/legacy/morph.morph: -------------------------------------------------------------------------------- 1 | > drink 2 | analyze(drink) = st:drink po:noun 3 | analyze(drink) = st:drink po:verb al:drank al:drunk ts:present 4 | stem(drink) = drink 5 | > drinks 6 | analyze(drinks) = st:drink po:verb al:drank al:drunk ts:present is:sg_3 7 | analyze(drinks) = st:drink po:noun is:plur 8 | stem(drinks) = drink 9 | > drinkable 10 | analyze(drinkable) = st:drink po:verb al:drank al:drunk ts:present ds:der_able 11 | stem(drinkable) = drinkable 12 | > drinkables 13 | analyze(drinkables) = st:drink po:verb al:drank al:drunk ts:present ds:der_able is:plur 14 | stem(drinkables) = drinkable 15 | > undrinkable 16 | analyze(undrinkable) = dp:pfx_un sp:un st:drink po:verb al:drank al:drunk ts:present ds:der_able 17 | stem(undrinkable) = undrinkable 18 | > undrinkables 19 | analyze(undrinkables) = dp:pfx_un sp:un st:drink po:verb al:drank al:drunk ts:present ds:der_able is:plur 20 | stem(undrinkables) = undrinkable 21 | > drank 22 | analyze(drank) = po:verb st:drink is:past_1 23 | stem(drank) = drink 24 | > drunk 25 | analyze(drunk) = po:verb st:drink is:past_2 26 | stem(drunk) = drink 27 | > phenomenon 28 | analyze(phenomenon) = st:phenomenon po:noun al:phenomena 29 | stem(phenomenon) = phenomenon 30 | > phenomena 31 | analyze(phenomena) = po:noun st:phenomenon is:plur 32 | stem(phenomena) = phenomenon 33 | generate(drink, eat) = drink 34 | generate(drink, eats) = drinks 35 | generate(drink, ate) = drank 36 | generate(drink, eaten) = drunk 37 | generate(drink, eatable) = drinkable 38 | generate(drink, eatables) = drinkables 39 | generate(drink, phenomena) = drinks 40 | generate(drinks, eat) = drink 41 | generate(drinks, eats) = drinks 42 | generate(drinks, ate) = drank 43 | generate(drinks, eaten) = drunk 44 | generate(drinks, eatable) = drinkable 45 | generate(drinks, eatables) = drinkables 46 | generate(drinks, phenomena) = drinks 47 | generate(undrinkable, phenomena) = undrinkables 48 | generate(phenomenon, drinks) = phenomena 49 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "cfg-if" 7 | version = "1.0.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" 10 | 11 | [[package]] 12 | name = "chardetng" 13 | version = "0.1.17" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" 16 | dependencies = [ 17 | "cfg-if", 18 | "encoding_rs", 19 | "memchr", 20 | ] 21 | 22 | [[package]] 23 | name = "encoding_rs" 24 | version = "0.8.35" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" 27 | dependencies = [ 28 | "cfg-if", 29 | ] 30 | 31 | [[package]] 32 | name = "foldhash" 33 | version = "0.2.0" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" 36 | 37 | [[package]] 38 | name = "hashbrown" 39 | version = "0.16.0" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" 42 | 43 | [[package]] 44 | name = "memchr" 45 | version = "2.7.5" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" 48 | 49 | [[package]] 50 | name = "once_cell" 51 | version = "1.21.3" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 54 | 55 | [[package]] 56 | name = "spellbook" 57 | version = "0.3.5" 58 | dependencies = [ 59 | "chardetng", 60 | "encoding_rs", 61 | "foldhash", 62 | "hashbrown", 63 | "once_cell", 64 | ] 65 | -------------------------------------------------------------------------------- /benches/check.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use once_cell::sync::Lazy; 6 | use spellbook::Dictionary; 7 | use test::{black_box, Bencher}; 8 | 9 | const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff"); 10 | const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic"); 11 | 12 | type RandomState = foldhash::fast::FixedState; 13 | /// A random seed from a sample run. The values aren't important here: just that they're constant. 14 | /// We don't want the benchmark outputs to reflect random changes to the seed. 
15 | const HASHER: RandomState = RandomState::with_seed(16553733157538299820); 16 | 17 | static EN_US: Lazy> = 18 | Lazy::new(|| Dictionary::new_with_hasher(EN_US_AFF, EN_US_DIC, HASHER).unwrap()); 19 | 20 | #[bench] 21 | fn in_dictionary_word(b: &mut Bencher) { 22 | b.iter(|| EN_US.check(black_box("earth"))) 23 | } 24 | 25 | #[bench] 26 | fn number(b: &mut Bencher) { 27 | b.iter(|| EN_US.check(black_box("8675,309.0"))) 28 | } 29 | 30 | #[bench] 31 | fn word_with_suffix(b: &mut Bencher) { 32 | b.iter(|| EN_US.check(black_box("earthly"))) 33 | } 34 | 35 | #[bench] 36 | fn word_with_prefix(b: &mut Bencher) { 37 | b.iter(|| EN_US.check(black_box("unearth"))) 38 | } 39 | 40 | #[bench] 41 | fn word_with_prefix_and_suffix(b: &mut Bencher) { 42 | b.iter(|| EN_US.check(black_box("unearthed"))) 43 | } 44 | 45 | #[bench] 46 | fn incorrect_prefix(b: &mut Bencher) { 47 | b.iter(|| EN_US.check(black_box("reearth"))) 48 | } 49 | 50 | #[bench] 51 | fn uppercase_in_dictionary_word(b: &mut Bencher) { 52 | b.iter(|| EN_US.check(black_box("EARTH"))) 53 | } 54 | 55 | #[bench] 56 | fn titlecase_in_dictionary_word(b: &mut Bencher) { 57 | b.iter(|| EN_US.check(black_box("Earth"))) 58 | } 59 | 60 | #[bench] 61 | fn breaks(b: &mut Bencher) { 62 | b.iter(|| EN_US.check(black_box("light-weight-like"))) 63 | } 64 | 65 | #[bench] 66 | fn compound_word(b: &mut Bencher) { 67 | b.iter(|| EN_US.check(black_box("20000th"))) 68 | } 69 | -------------------------------------------------------------------------------- /tests/legacy/license.hunspell: -------------------------------------------------------------------------------- 1 | /* ***** BEGIN LICENSE BLOCK ***** 2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 3 | * 4 | * Copyright (C) 2002-2022 Németh László 5 | * 6 | * The contents of this file are subject to the Mozilla Public License Version 7 | * 1.1 (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * http://www.mozilla.org/MPL/ 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" basis, 12 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 | * for the specific language governing rights and limitations under the 14 | * License. 15 | * 16 | * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. 17 | * 18 | * Contributor(s): 19 | * David Einstein 20 | * Davide Prina 21 | * Giuseppe Modugno 22 | * Gianluca Turconi 23 | * Simon Brouwer 24 | * Noll János 25 | * Bíró Árpád 26 | * Goldman Eleonóra 27 | * Sarlós Tamás 28 | * Bencsáth Boldizsár 29 | * Halácsy Péter 30 | * Dvornik László 31 | * Gefferth András 32 | * Nagy Viktor 33 | * Varga Dániel 34 | * Chris Halls 35 | * Rene Engelhard 36 | * Bram Moolenaar 37 | * Dafydd Jones 38 | * Harri Pitkänen 39 | * Andras Timar 40 | * Tor Lillqvist 41 | * 42 | * Alternatively, the contents of this file may be used under the terms of 43 | * either the GNU General Public License Version 2 or later (the "GPL"), or 44 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 45 | * in which case the provisions of the GPL or the LGPL are applicable instead 46 | * of those above. 
If you wish to allow use of your version of this file only 47 | * under the terms of either the GPL or the LGPL, and not to allow others to 48 | * use your version of this file under the terms of the MPL, indicate your 49 | * decision by deleting the provisions above and replace them with the notice 50 | * and other provisions required by the GPL or the LGPL. If you do not delete 51 | * the provisions above, a recipient may use your version of this file under 52 | * the terms of any one of the MPL, the GPL or the LGPL. 53 | * 54 | * ***** END LICENSE BLOCK ***** */ 55 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | env: 8 | MSRV: "1.70" 9 | jobs: 10 | check: 11 | name: Check 12 | strategy: 13 | matrix: 14 | toolchain: 15 | - MSRV 16 | - stable 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout sources 20 | uses: actions/checkout@v4 21 | 22 | - name: Install toolchain 23 | uses: dtolnay/rust-toolchain@master 24 | with: 25 | toolchain: ${{ matrix.toolchain == 'MSRV' && env.MSRV || 'stable' }} 26 | 27 | - uses: Swatinem/rust-cache@v2 28 | 29 | - name: Run cargo check 30 | run: | 31 | rustc --version 32 | cargo check 33 | 34 | test: 35 | name: Test 36 | runs-on: ubuntu-latest 37 | steps: 38 | - name: Checkout sources 39 | uses: actions/checkout@v4 40 | 41 | - name: Install MSRV toolchain 42 | uses: dtolnay/rust-toolchain@master 43 | with: 44 | toolchain: "${{ env.MSRV }}" 45 | 46 | - uses: Swatinem/rust-cache@v2 47 | 48 | - name: Check rust version 49 | run: rustc --version 50 | 51 | - name: Run cargo test 52 | run: cargo test 53 | 54 | lints: 55 | name: Lints 56 | runs-on: ubuntu-latest 57 | steps: 58 | - name: Checkout sources 59 | uses: actions/checkout@v4 60 | 61 | - name: Install MSRV toolchain 62 | uses: dtolnay/rust-toolchain@master 63 | with: 64 | toolchain: "${{ env.MSRV }}" 65 | components: rustfmt, clippy 66 | 67 | - uses: Swatinem/rust-cache@v2 68 | 69 | - name: Check rust version 70 | run: rustc --version 71 | 72 | - name: Run cargo fmt 73 | run: cargo fmt --check 74 | 75 | - name: Run cargo clippy with default features 76 | run: cargo clippy -- -D warnings 77 | 78 | - name: Run cargo clippy with no default features 79 | run: cargo clippy --no-default-features -- -D warnings 80 | 81 | - name: Run cargo doc 82 | run: cargo doc --no-deps --document-private-items 83 | env: 84 | RUSTDOCFLAGS: -D warnings 85 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | 9 | 10 | ## [v0.3.5] - 2025-09-12 11 | 12 | ### Fixed 13 | 14 | * [UTF-8 BOM](https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8) characters are 15 | now stripped from dictionary text if present. 16 | * The `#` character is no special-cased as a comment, fixing parsing of some 17 | dictionaries `en_GB`. 18 | * Version requirements on Hashbrown and Foldhash have been loosened. 
19 | 20 | ## [v0.3.4] - 2025-04-30 21 | 22 | ### Fixed 23 | 24 | * Fixed a panic similar to the one fixed in v0.3.3 but within the checker instead 25 | of the suggester. 26 | * This panic could happen in dictionaries which used `REP` patterns with end 27 | anchors (i.e. the first word after `REP` ends in `$`) which also sets 28 | `CHECKCOMPOUNDREP` and other related compounding rules. 29 | 30 | ## [v0.3.3] - 2025-04-21 31 | 32 | ### Fixed 33 | 34 | * Fixed a panic possible in the suggester when suggesting corrections for a word with 35 | non-ASCII characters near the end in dictionaries with replacement patterns with end 36 | anchors. 37 | * For example `caféx` in a french dictionary which has a `REP è$ e` rule. 38 | 39 | ## [v0.3.2] - 2025-04-15 40 | 41 | ### Fixed 42 | 43 | * Aligned parsing of flags with Hunspell. This fixes cases where a dictionary would 44 | use non-ASCII characters for flags without setting `FLAG UTF-8`. 45 | 46 | ## [v0.3.1] - 2025-03-11 47 | 48 | ### Fixed 49 | 50 | * Fixed handling of Unicode flags which are represented by more than one code 51 | unit in UTF-16 representation, for example emoji such as '🔭'. 52 | 53 | ## [v0.3.0] - 2025-02-04 54 | 55 | ### Added 56 | 57 | * Exposed the `Checker` type. 58 | * Added `Checker::check_lower_as_title` and `Checker::check_lower_as_upper` to 59 | configure the checker to try lowercase words as title and/or uppercase. 60 | 61 | ### Changed 62 | 63 | * The `default-hasher` feature flag now uses [`foldhash`](https://github.com/orlp/foldhash) 64 | instead of [`ahash`](https://github.com/tkaitchuck/aHash). 65 | 66 | ## [v0.2.0] - 2024-11-18 67 | 68 | ### Added 69 | 70 | * Added support for `Dictionary::suggest` and the `Suggester` type. 71 | 72 | ### Updated 73 | 74 | * Changed the internal representation of word stems and flagsets for reduced 75 | memory consumption. [More...](https://the-mikedavis.github.io/posts/german-string-optimizations-in-spellbook/) 76 | 77 | ## [v0.1.0] - 2024-09-08 78 | 79 | ### Added 80 | 81 | * Initial support for `Dictionary::new`, `Dictionary::check` and `Dictionary::add` 82 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | New to contributing to projects on GitHub? GitHub provides [getting started documentation](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests) for contributing via pull requests (PRs). 4 | 5 | ## Installing Rust 6 | 7 | To build and test Spellbook you will need a Rust compiler and the `cargo` command. [rustup](https://rustup.rs/) is the recommended way to install Rust: it manages versions and components and tools of Rust like `cargo`, `clippy` and `rust-analyzer`. Install `rustup` as [the Rust documentation recommends](https://www.rust-lang.org/tools/install). 8 | 9 | If you use [Nix or NixOS](https://nixos.org/) you can use the [flake](https://nixos.wiki/wiki/flakes) via `nix develop` or `shell.nix` via `nix-shell` in the repository root to spawn a shell with the necessary dependencies. Using Nix is entirely optional and is only provided for convenience. 10 | 11 | ## MSRV 12 | 13 | Spellbook keeps an intentionally low minimum supported Rust version for the sake of easy compatibility with consuming applications. 
14 | 15 | The MSRV should never rise higher than allowed by [Mozilla's Firefox MSRV policy](https://firefox-source-docs.mozilla.org/writing-rust-code/update-policy.html) and should lag behind by as many more versions as is practical. The current MSRV can be found in `Cargo.toml` under the `package.rust-version` key. 16 | 17 | ## Linting 18 | 19 | Spellbook's CI expects the following lints to pass: 20 | 21 | * `cargo fmt --check` - the project is formatted 22 | * `cargo clippy` - there are no lints with default features enabled 23 | * `cargo clippy --no-default-features` - there are no lints with no default features 24 | * `cargo doc --document-private-items` - documentation can be generated without warnings 25 | 26 | ## Testing 27 | 28 | Run the testsuite with: 29 | 30 | ``` 31 | cargo test 32 | ``` 33 | 34 | The testsuite has three parts: 35 | 36 | * Unit tests. These are in `src/**.rs` in `test` modules at the end of each file. 37 | * Doc tests. These are in markdown codefences in documentation comments (comments starting with `///`). 38 | * "Legacy" tests. These tests live under `tests/legacy/` and are run by `tests/legacy.rs`. These cases are ported originally from the Hunspell codebase and should not be modified or expanded unless an equivalent test exists upstream in Hunspell or Nuspell. 39 | 40 | ### Adding a test case 41 | 42 | Avoid adding a test case under `tests/legacy/`. Instead if you're fixing a bug with the checker for example, add a unit test case to the `test` module at the bottom of `src/checker.rs`. Follow examples in that module for creating small dictionaries within the unit test function. 43 | 44 | ### Generating code coverage reports 45 | 46 | You can generate a human readable coverage report using [`cargo-llvm-cov`](https://github.com/taiki-e/cargo-llvm-cov). With that tool installed: 47 | 48 | ``` 49 | cargo llvm-cov --html test 50 | ``` 51 | 52 | For this to work you also need `llvm-tools-preview` which can be installed by `rustup`. `cargo-llvm-cov` and `llvm-tools-preview` are included in the Nix development shell for convenience. 53 | 54 | ## Benchmarking 55 | 56 | There are a few benchmarks in the `examples/` directory which use the nightly benchmarking feature. When using Rustup, use `cargo +nightly bench` before and after a change to see the difference in timings. Note that the timing each benchmark might vary slightly between runs: the timing of a case swaying plus or minus 5% is not unusual. Run the benchmark multiple times to get a feel for how a change impacted performance. 57 | -------------------------------------------------------------------------------- /tests/legacy/base.aff: -------------------------------------------------------------------------------- 1 | # OpenOffice.org's en_US.aff file 2 | 3 | SET ISO8859-1 4 | TRY esianrtolcdugmphbyfvkwz' 5 | 6 | WORDCHARS .' 7 | 8 | PFX A Y 1 9 | PFX A 0 re . 10 | 11 | PFX I Y 1 12 | PFX I 0 in . 13 | 14 | PFX U Y 1 15 | PFX U 0 un . 16 | 17 | PFX C Y 1 18 | PFX C 0 de . 19 | 20 | PFX E Y 1 21 | PFX E 0 dis . 22 | 23 | PFX F Y 1 24 | PFX F 0 con . 25 | 26 | PFX K Y 1 27 | PFX K 0 pro . 28 | 29 | SFX V N 2 30 | SFX V e ive e 31 | SFX V 0 ive [^e] 32 | 33 | SFX N Y 3 34 | SFX N e ion e 35 | SFX N y ication y 36 | SFX N 0 en [^ey] 37 | 38 | SFX X Y 3 39 | SFX X e ions e 40 | SFX X y ications y 41 | SFX X 0 ens [^ey] 42 | 43 | SFX H N 2 44 | SFX H y ieth y 45 | SFX H 0 th [^y] 46 | 47 | SFX Y Y 1 48 | SFX Y 0 ly . 
49 | 50 | SFX G Y 2 51 | SFX G e ing e 52 | SFX G 0 ing [^e] 53 | 54 | SFX J Y 2 55 | SFX J e ings e 56 | SFX J 0 ings [^e] 57 | 58 | SFX D Y 4 59 | SFX D 0 d e 60 | SFX D y ied [^aeiou]y 61 | SFX D 0 ed [^ey] 62 | SFX D 0 ed [aeiou]y 63 | 64 | SFX T N 4 65 | SFX T 0 st e 66 | SFX T y iest [^aeiou]y 67 | SFX T 0 est [aeiou]y 68 | SFX T 0 est [^ey] 69 | 70 | SFX R Y 4 71 | SFX R 0 r e 72 | SFX R y ier [^aeiou]y 73 | SFX R 0 er [aeiou]y 74 | SFX R 0 er [^ey] 75 | 76 | SFX Z Y 4 77 | SFX Z 0 rs e 78 | SFX Z y iers [^aeiou]y 79 | SFX Z 0 ers [aeiou]y 80 | SFX Z 0 ers [^ey] 81 | 82 | SFX S Y 4 83 | SFX S y ies [^aeiou]y 84 | SFX S 0 s [aeiou]y 85 | SFX S 0 es [sxzh] 86 | SFX S 0 s [^sxzhy] 87 | 88 | SFX P Y 3 89 | SFX P y iness [^aeiou]y 90 | SFX P 0 ness [aeiou]y 91 | SFX P 0 ness [^y] 92 | 93 | SFX M Y 1 94 | SFX M 0 's . 95 | 96 | SFX B Y 3 97 | SFX B 0 able [^aeiou] 98 | SFX B 0 able ee 99 | SFX B e able [^aeiou]e 100 | 101 | SFX L Y 1 102 | SFX L 0 ment . 103 | 104 | REP 88 105 | REP a ei 106 | REP ei a 107 | REP a ey 108 | REP ey a 109 | REP ai ie 110 | REP ie ai 111 | REP are air 112 | REP are ear 113 | REP are eir 114 | REP air are 115 | REP air ere 116 | REP ere air 117 | REP ere ear 118 | REP ere eir 119 | REP ear are 120 | REP ear air 121 | REP ear ere 122 | REP eir are 123 | REP eir ere 124 | REP ch te 125 | REP te ch 126 | REP ch ti 127 | REP ti ch 128 | REP ch tu 129 | REP tu ch 130 | REP ch s 131 | REP s ch 132 | REP ch k 133 | REP k ch 134 | REP f ph 135 | REP ph f 136 | REP gh f 137 | REP f gh 138 | REP i igh 139 | REP igh i 140 | REP i uy 141 | REP uy i 142 | REP i ee 143 | REP ee i 144 | REP j di 145 | REP di j 146 | REP j gg 147 | REP gg j 148 | REP j ge 149 | REP ge j 150 | REP s ti 151 | REP ti s 152 | REP s ci 153 | REP ci s 154 | REP k cc 155 | REP cc k 156 | REP k qu 157 | REP qu k 158 | REP kw qu 159 | REP o eau 160 | REP eau o 161 | REP o ew 162 | REP ew o 163 | REP oo ew 164 | REP ew oo 165 | REP ew ui 166 | REP ui ew 167 | REP oo ui 168 | REP ui oo 169 | REP ew u 170 | REP u ew 171 | REP oo u 172 | REP u oo 173 | REP u oe 174 | REP oe u 175 | REP u ieu 176 | REP ieu u 177 | REP ue ew 178 | REP ew ue 179 | REP uff ough 180 | REP oo ieu 181 | REP ieu oo 182 | REP ier ear 183 | REP ear ier 184 | REP ear air 185 | REP air ear 186 | REP w qu 187 | REP qu w 188 | REP z ss 189 | REP ss z 190 | REP shun tion 191 | REP shun sion 192 | REP shun cion 193 | -------------------------------------------------------------------------------- /tests/legacy/base_utf.aff: -------------------------------------------------------------------------------- 1 | # OpenOffice.org’s en_US.aff file 2 | # with Unicode apostrophe: ’ 3 | 4 | SET UTF-8 5 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 6 | 7 | MAXNGRAMSUGS 1 8 | WORDCHARS .'’ 9 | 10 | PFX A Y 1 11 | PFX A 0 re . 12 | 13 | PFX I Y 1 14 | PFX I 0 in . 15 | 16 | PFX U Y 1 17 | PFX U 0 un . 18 | 19 | PFX C Y 1 20 | PFX C 0 de . 21 | 22 | PFX E Y 1 23 | PFX E 0 dis . 24 | 25 | PFX F Y 1 26 | PFX F 0 con . 27 | 28 | PFX K Y 1 29 | PFX K 0 pro . 30 | 31 | SFX V N 2 32 | SFX V e ive e 33 | SFX V 0 ive [^e] 34 | 35 | SFX N Y 3 36 | SFX N e ion e 37 | SFX N y ication y 38 | SFX N 0 en [^ey] 39 | 40 | SFX X Y 3 41 | SFX X e ions e 42 | SFX X y ications y 43 | SFX X 0 ens [^ey] 44 | 45 | SFX H N 2 46 | SFX H y ieth y 47 | SFX H 0 th [^y] 48 | 49 | SFX Y Y 1 50 | SFX Y 0 ly . 
51 | 52 | SFX G Y 2 53 | SFX G e ing e 54 | SFX G 0 ing [^e] 55 | 56 | SFX J Y 2 57 | SFX J e ings e 58 | SFX J 0 ings [^e] 59 | 60 | SFX D Y 4 61 | SFX D 0 d e 62 | SFX D y ied [^aeiou]y 63 | SFX D 0 ed [^ey] 64 | SFX D 0 ed [aeiou]y 65 | 66 | SFX T N 4 67 | SFX T 0 st e 68 | SFX T y iest [^aeiou]y 69 | SFX T 0 est [aeiou]y 70 | SFX T 0 est [^ey] 71 | 72 | SFX R Y 4 73 | SFX R 0 r e 74 | SFX R y ier [^aeiou]y 75 | SFX R 0 er [aeiou]y 76 | SFX R 0 er [^ey] 77 | 78 | SFX Z Y 4 79 | SFX Z 0 rs e 80 | SFX Z y iers [^aeiou]y 81 | SFX Z 0 ers [aeiou]y 82 | SFX Z 0 ers [^ey] 83 | 84 | SFX S Y 4 85 | SFX S y ies [^aeiou]y 86 | SFX S 0 s [aeiou]y 87 | SFX S 0 es [sxzh] 88 | SFX S 0 s [^sxzhy] 89 | 90 | SFX P Y 3 91 | SFX P y iness [^aeiou]y 92 | SFX P 0 ness [aeiou]y 93 | SFX P 0 ness [^y] 94 | 95 | SFX M Y 1 96 | SFX M 0 's . 97 | 98 | SFX B Y 3 99 | SFX B 0 able [^aeiou] 100 | SFX B 0 able ee 101 | SFX B e able [^aeiou]e 102 | 103 | SFX L Y 1 104 | SFX L 0 ment . 105 | 106 | REP 88 107 | REP a ei 108 | REP ei a 109 | REP a ey 110 | REP ey a 111 | REP ai ie 112 | REP ie ai 113 | REP are air 114 | REP are ear 115 | REP are eir 116 | REP air are 117 | REP air ere 118 | REP ere air 119 | REP ere ear 120 | REP ere eir 121 | REP ear are 122 | REP ear air 123 | REP ear ere 124 | REP eir are 125 | REP eir ere 126 | REP ch te 127 | REP te ch 128 | REP ch ti 129 | REP ti ch 130 | REP ch tu 131 | REP tu ch 132 | REP ch s 133 | REP s ch 134 | REP ch k 135 | REP k ch 136 | REP f ph 137 | REP ph f 138 | REP gh f 139 | REP f gh 140 | REP i igh 141 | REP igh i 142 | REP i uy 143 | REP uy i 144 | REP i ee 145 | REP ee i 146 | REP j di 147 | REP di j 148 | REP j gg 149 | REP gg j 150 | REP j ge 151 | REP ge j 152 | REP s ti 153 | REP ti s 154 | REP s ci 155 | REP ci s 156 | REP k cc 157 | REP cc k 158 | REP k qu 159 | REP qu k 160 | REP kw qu 161 | REP o eau 162 | REP eau o 163 | REP o ew 164 | REP ew o 165 | REP oo ew 166 | REP ew oo 167 | REP ew ui 168 | REP ui ew 169 | REP oo ui 170 | REP ui oo 171 | REP ew u 172 | REP u ew 173 | REP oo u 174 | REP u oo 175 | REP u oe 176 | REP oe u 177 | REP u ieu 178 | REP ieu u 179 | REP ue ew 180 | REP ew ue 181 | REP uff ough 182 | REP oo ieu 183 | REP ieu oo 184 | REP ier ear 185 | REP ear ier 186 | REP ear air 187 | REP air ear 188 | REP w qu 189 | REP qu w 190 | REP z ss 191 | REP ss z 192 | REP shun tion 193 | REP shun sion 194 | REP shun cion 195 | McDonalds’sá/w 196 | McDonald’sszá/g3) st:McDonald’s po:noun_prs is:TRANS 197 | McDonald’sszal/g4) st:McDonald’s po:noun_prs is:INSTR 198 | McDonald’ssal/w 199 | -------------------------------------------------------------------------------- /vendor/en_US/en_US.aff: -------------------------------------------------------------------------------- 1 | SET UTF-8 2 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 3 | ICONV 1 4 | ICONV ’ ' 5 | NOSUGGEST ! 6 | 7 | # ordinal numbers 8 | COMPOUNDMIN 1 9 | # only in compounds: 1th, 2th, 3th 10 | ONLYINCOMPOUND c 11 | # compound rules: 12 | # 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.) 13 | # 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.) 14 | COMPOUNDRULE 2 15 | COMPOUNDRULE n*1t 16 | COMPOUNDRULE n*mp 17 | WORDCHARS 0123456789 18 | 19 | PFX A Y 1 20 | PFX A 0 re . 21 | 22 | PFX I Y 1 23 | PFX I 0 in . 24 | 25 | PFX U Y 1 26 | PFX U 0 un . 27 | 28 | PFX C Y 1 29 | PFX C 0 de . 30 | 31 | PFX E Y 1 32 | PFX E 0 dis . 33 | 34 | PFX F Y 1 35 | PFX F 0 con . 36 | 37 | PFX K Y 1 38 | PFX K 0 pro . 
39 | 40 | SFX V N 2 41 | SFX V e ive e 42 | SFX V 0 ive [^e] 43 | 44 | SFX N Y 3 45 | SFX N e ion e 46 | SFX N y ication y 47 | SFX N 0 en [^ey] 48 | 49 | SFX X Y 3 50 | SFX X e ions e 51 | SFX X y ications y 52 | SFX X 0 ens [^ey] 53 | 54 | SFX H N 2 55 | SFX H y ieth y 56 | SFX H 0 th [^y] 57 | 58 | SFX Y Y 1 59 | SFX Y 0 ly . 60 | 61 | SFX G Y 2 62 | SFX G e ing e 63 | SFX G 0 ing [^e] 64 | 65 | SFX J Y 2 66 | SFX J e ings e 67 | SFX J 0 ings [^e] 68 | 69 | SFX D Y 4 70 | SFX D 0 d e 71 | SFX D y ied [^aeiou]y 72 | SFX D 0 ed [^ey] 73 | SFX D 0 ed [aeiou]y 74 | 75 | SFX T N 4 76 | SFX T 0 st e 77 | SFX T y iest [^aeiou]y 78 | SFX T 0 est [aeiou]y 79 | SFX T 0 est [^ey] 80 | 81 | SFX R Y 4 82 | SFX R 0 r e 83 | SFX R y ier [^aeiou]y 84 | SFX R 0 er [aeiou]y 85 | SFX R 0 er [^ey] 86 | 87 | SFX Z Y 4 88 | SFX Z 0 rs e 89 | SFX Z y iers [^aeiou]y 90 | SFX Z 0 ers [aeiou]y 91 | SFX Z 0 ers [^ey] 92 | 93 | SFX S Y 4 94 | SFX S y ies [^aeiou]y 95 | SFX S 0 s [aeiou]y 96 | SFX S 0 es [sxzh] 97 | SFX S 0 s [^sxzhy] 98 | 99 | SFX P Y 3 100 | SFX P y iness [^aeiou]y 101 | SFX P 0 ness [aeiou]y 102 | SFX P 0 ness [^y] 103 | 104 | SFX M Y 1 105 | SFX M 0 's . 106 | 107 | SFX B Y 3 108 | SFX B 0 able [^aeiou] 109 | SFX B 0 able ee 110 | SFX B e able [^aeiou]e 111 | 112 | SFX L Y 1 113 | SFX L 0 ment . 114 | 115 | REP 90 116 | REP a ei 117 | REP ei a 118 | REP a ey 119 | REP ey a 120 | REP ai ie 121 | REP ie ai 122 | REP alot a_lot 123 | REP are air 124 | REP are ear 125 | REP are eir 126 | REP air are 127 | REP air ere 128 | REP ere air 129 | REP ere ear 130 | REP ere eir 131 | REP ear are 132 | REP ear air 133 | REP ear ere 134 | REP eir are 135 | REP eir ere 136 | REP ch te 137 | REP te ch 138 | REP ch ti 139 | REP ti ch 140 | REP ch tu 141 | REP tu ch 142 | REP ch s 143 | REP s ch 144 | REP ch k 145 | REP k ch 146 | REP f ph 147 | REP ph f 148 | REP gh f 149 | REP f gh 150 | REP i igh 151 | REP igh i 152 | REP i uy 153 | REP uy i 154 | REP i ee 155 | REP ee i 156 | REP j di 157 | REP di j 158 | REP j gg 159 | REP gg j 160 | REP j ge 161 | REP ge j 162 | REP s ti 163 | REP ti s 164 | REP s ci 165 | REP ci s 166 | REP k cc 167 | REP cc k 168 | REP k qu 169 | REP qu k 170 | REP kw qu 171 | REP o eau 172 | REP eau o 173 | REP o ew 174 | REP ew o 175 | REP oo ew 176 | REP ew oo 177 | REP ew ui 178 | REP ui ew 179 | REP oo ui 180 | REP ui oo 181 | REP ew u 182 | REP u ew 183 | REP oo u 184 | REP u oo 185 | REP u oe 186 | REP oe u 187 | REP u ieu 188 | REP ieu u 189 | REP ue ew 190 | REP ew ue 191 | REP uff ough 192 | REP oo ieu 193 | REP ieu oo 194 | REP ier ear 195 | REP ear ier 196 | REP ear air 197 | REP air ear 198 | REP w qu 199 | REP qu w 200 | REP z ss 201 | REP ss z 202 | REP shun tion 203 | REP shun sion 204 | REP shun cion 205 | REP size cise 206 | -------------------------------------------------------------------------------- /tests/legacy/i35725.aff: -------------------------------------------------------------------------------- 1 | # Ngram suggestions 2 | # - fix case problem 3 | # - detect character swapping (keep only these suggestions) 4 | # - lesser suggestions 5 | # - weight with common subsequence algorithm 6 | # - suggest uppercased words 7 | 8 | # 2007-02-05: 9 | # now not neighbour character replacements and character movings are 10 | # detected by not ngram suggestions, too. 11 | 12 | # OpenOffice.org's en_US.aff file 13 | 14 | SET ISO8859-1 15 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 16 | 17 | WORDCHARS ' 18 | 19 | PFX A Y 1 20 | PFX A 0 re . 
21 | 22 | PFX I Y 1 23 | PFX I 0 in . 24 | 25 | PFX U Y 1 26 | PFX U 0 un . 27 | 28 | PFX C Y 1 29 | PFX C 0 de . 30 | 31 | PFX E Y 1 32 | PFX E 0 dis . 33 | 34 | PFX F Y 1 35 | PFX F 0 con . 36 | 37 | PFX K Y 1 38 | PFX K 0 pro . 39 | 40 | SFX V N 2 41 | SFX V e ive e 42 | SFX V 0 ive [^e] 43 | 44 | SFX N Y 3 45 | SFX N e ion e 46 | SFX N y ication y 47 | SFX N 0 en [^ey] 48 | 49 | SFX X Y 3 50 | SFX X e ions e 51 | SFX X y ications y 52 | SFX X 0 ens [^ey] 53 | 54 | SFX H N 2 55 | SFX H y ieth y 56 | SFX H 0 th [^y] 57 | 58 | SFX Y Y 1 59 | SFX Y 0 ly . 60 | 61 | SFX G Y 2 62 | SFX G e ing e 63 | SFX G 0 ing [^e] 64 | 65 | SFX J Y 2 66 | SFX J e ings e 67 | SFX J 0 ings [^e] 68 | 69 | SFX D Y 4 70 | SFX D 0 d e 71 | SFX D y ied [^aeiou]y 72 | SFX D 0 ed [^ey] 73 | SFX D 0 ed [aeiou]y 74 | 75 | SFX T N 4 76 | SFX T 0 st e 77 | SFX T y iest [^aeiou]y 78 | SFX T 0 est [aeiou]y 79 | SFX T 0 est [^ey] 80 | 81 | SFX R Y 4 82 | SFX R 0 r e 83 | SFX R y ier [^aeiou]y 84 | SFX R 0 er [aeiou]y 85 | SFX R 0 er [^ey] 86 | 87 | SFX Z Y 4 88 | SFX Z 0 rs e 89 | SFX Z y iers [^aeiou]y 90 | SFX Z 0 ers [aeiou]y 91 | SFX Z 0 ers [^ey] 92 | 93 | SFX S Y 4 94 | SFX S y ies [^aeiou]y 95 | SFX S 0 s [aeiou]y 96 | SFX S 0 es [sxzh] 97 | SFX S 0 s [^sxzhy] 98 | 99 | SFX P Y 3 100 | SFX P y iness [^aeiou]y 101 | SFX P 0 ness [aeiou]y 102 | SFX P 0 ness [^y] 103 | 104 | SFX M Y 1 105 | SFX M 0 's . 106 | 107 | SFX B Y 3 108 | SFX B 0 able [^aeiou] 109 | SFX B 0 able ee 110 | SFX B e able [^aeiou]e 111 | 112 | SFX L Y 1 113 | SFX L 0 ment . 114 | 115 | REP 88 116 | REP a ei 117 | REP ei a 118 | REP a ey 119 | REP ey a 120 | REP ai ie 121 | REP ie ai 122 | REP are air 123 | REP are ear 124 | REP are eir 125 | REP air are 126 | REP air ere 127 | REP ere air 128 | REP ere ear 129 | REP ere eir 130 | REP ear are 131 | REP ear air 132 | REP ear ere 133 | REP eir are 134 | REP eir ere 135 | REP ch te 136 | REP te ch 137 | REP ch ti 138 | REP ti ch 139 | REP ch tu 140 | REP tu ch 141 | REP ch s 142 | REP s ch 143 | REP ch k 144 | REP k ch 145 | REP f ph 146 | REP ph f 147 | REP gh f 148 | REP f gh 149 | REP i igh 150 | REP igh i 151 | REP i uy 152 | REP uy i 153 | REP i ee 154 | REP ee i 155 | REP j di 156 | REP di j 157 | REP j gg 158 | REP gg j 159 | REP j ge 160 | REP ge j 161 | REP s ti 162 | REP ti s 163 | REP s ci 164 | REP ci s 165 | REP k cc 166 | REP cc k 167 | REP k qu 168 | REP qu k 169 | REP kw qu 170 | REP o eau 171 | REP eau o 172 | REP o ew 173 | REP ew o 174 | REP oo ew 175 | REP ew oo 176 | REP ew ui 177 | REP ui ew 178 | REP oo ui 179 | REP ui oo 180 | REP ew u 181 | REP u ew 182 | REP oo u 183 | REP u oo 184 | REP u oe 185 | REP oe u 186 | REP u ieu 187 | REP ieu u 188 | REP ue ew 189 | REP ew ue 190 | REP uff ough 191 | REP oo ieu 192 | REP ieu oo 193 | REP ier ear 194 | REP ear ier 195 | REP ear air 196 | REP air ear 197 | REP w qu 198 | REP qu w 199 | REP z ss 200 | REP ss z 201 | REP shun tion 202 | REP shun sion 203 | REP shun cion 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spellbook 2 | 3 | [![Crates.io](https://img.shields.io/crates/v/spellbook.svg)](https://crates.io/crates/spellbook) 4 | [![Documentation](https://docs.rs/spellbook/badge.svg)](https://docs.rs/spellbook) 5 | 6 | Spellbook is a Rust spellchecking library compatible with the Hunspell dictionary format. 
7 | 8 | ```rust 9 | fn main() { 10 | let aff = std::fs::read_to_string("en_US.aff").unwrap(); 11 | let dic = std::fs::read_to_string("en_US.dic").unwrap(); 12 | let dict = spellbook::Dictionary::new(&aff, &dic).unwrap(); 13 | 14 | let word = std::env::args().nth(1).expect("expected a word to check"); 15 | 16 | if dict.check(&word) { 17 | println!("{word:?} is in the dictionary."); 18 | } else { 19 | let mut suggestions = Vec::new(); 20 | dict.suggest(&word, &mut suggestions); 21 | eprintln!("{word:?} is NOT in the dictionary. Did you mean {suggestions:?}?"); 22 | std::process::exit(1); 23 | } 24 | } 25 | ``` 26 | 27 | Spellbook aims to be minimal: it is `no_std` and only requires [`hashbrown`] as a dependency. (Note that [`foldhash`] is included by default, see the feature flags section below.) 28 | 29 | ### Maturity 30 | 31 | Spellbook is a work in progress and might see breaking changes to any part of the API as well as updates to the MSRV and dependencies. 32 | 33 | Currently the `check` API works well for `en_US` - a relatively simple dictionary - though it should work reasonably well for most other dictionaries. Some dictionaries which use complex compounding directives may work less well. 34 | 35 | The `suggest` API was added in v0.2.0 and should behave the same as Nuspell's `suggest`. (Meaning that phonetic suggestions are not implemented.) 36 | 37 | Spellbook should be considered to be in _alpha_. Almost all of the Hunspell test corpus tested by Nuspell is passing. 38 | 39 | ### Feature flags 40 | 41 | Spellbook follows [`hashbrown`] by including a `default-hasher` feature flag which is enabled by default. Like Hashbrown v0.15+, the default hasher is [`foldhash`]. 42 | 43 | A non-cryptographic hash significantly improves the time it takes to initialize a dictionary and check and suggest words. Denial-of-service attacks are not usually relevant for this use-case since you would usually not take dictionary files as arbitrary inputs, so a non-cryptographic hash is probably ok. (I am not a cryptologist.) Note that Hashbrown v0.14 and lower used [`ahash`](https://github.com/tkaitchuck/aHash) instead of foldhash. In my runs of the Spellbook benchmarks there is no perceptible performance difference between `foldhash` and `ahash`. 44 | 45 | If you wish to use a different hasher you may turn this default feature off: 46 | 47 | ```toml 48 | [dependencies] 49 | spellbook = { version = "1.0", default-features = false } 50 | ``` 51 | 52 | and specify a hasher of your choosing instead: 53 | 54 | ```rust 55 | use std::hash::BuildHasherDefault; 56 | type Dictionary = spellbook::Dictionary>; 57 | ``` 58 | 59 | ### Spellbook in practice 60 | 61 | Spellbook is used "in the wild" in these projects: 62 | 63 | * [`codebook`](https://github.com/blopker/codebook) uses [`tree-sitter`](https://github.com/tree-sitter/tree-sitter) and Spellbook for a programming-language-friendly spell checking extension for the Zed editor and a LSP language server 64 | * [`cargo-spellcheck`](https://github.com/drahnr/cargo-spellcheck) uses Spellbook as an optional back end under the `spellbook` feature flag for a CLI spell checking tool for [`cargo`](https://github.com/rust-lang/cargo) Rust projects 65 | 66 | ### How does it work? 67 | 68 | For a more in depth overview, check out [`@zverok`]'s blog series [Rebuilding the spellchecker][zverok-blog]. 69 | 70 | Hunspell dictionaries are split into two files: `.dic` and `.aff`. 71 | The `.dic` file has a listing of stems and flags associated with that stem. 
For example `en_US.dic` contains the word `adventure/DRSMZG` meaning that "adventure" is a stem in the dictionary with flags `D`, `R`, `S`, `M`, `Z` and `G`. 72 | The `.aff` file contains a bunch of rules to use when determining if a word is correct or figuring out which words to suggest. The most intuitive of these are prefixes and suffixes. `en_US` contains suffixes like `R` and `G`: 73 | 74 | ``` 75 | SFX R Y 4 76 | SFX R 0 r e 77 | SFX R y ier [^aeiou]y 78 | SFX R 0 er [aeiou]y 79 | SFX R 0 er [^ey] 80 | 81 | SFX G Y 2 82 | SFX G e ing e 83 | SFX G 0 ing [^e] 84 | ``` 85 | 86 | Since "adventure" has these flags, these suffixes can be applied. The rules are structured as tables that define the flag (like `R`), what to strip from the end of the word (`0` for nothing), what to add to the end (`er` for example) and under what condition the suffix applies (matches `[^aeiou]y` meaning not 'a' 'e' 'i' 'o' 'u' and then 'y' for example). When checking a word like "adventurer" you find any suffixes where the "add" portion of the suffix matches the ending of the word and check if the condition applies. The first clause of `R` applies since the "adventure" ends in 'e', and we add a 'r' to the end. When checking this happens in reverse. Starting with a word like "adventurer" we strip the 'r' and check the condition. Similarly with `G`, the first clause matches "adventuring" because "adventure" ends with 'e' and we add an "ing". 87 | 88 | Hunspell dictionaries use these prefixing and suffixing rules to compress the dictionary. Without prefixes and suffixes we'd need a big set of every possible conjugation of every word in the dictionary. That might be possible with the gigabytes of RAM we have today but it certainly isn't efficient. 89 | 90 | Another way Hunspell dictionaries "compress" words like this is compounding. For example with the COMPOUNDRULE directive: 91 | 92 | ``` 93 | # compound rules: 94 | # 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.) 95 | # 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.) 96 | COMPOUNDRULE 2 97 | COMPOUNDRULE n*1t 98 | COMPOUNDRULE n*mp 99 | ``` 100 | 101 | `en_US.dic` has words for digits like `0/nm`, `0th/pt`, `1/n1`, `1st/p`, etc. The COMPOUNDRULE directive describes a regex-like pattern using flags and `*` (zero-or-more) and `?` (zero-or-one) modifiers. For example the first compound rule in the table `n*1t` allows a word like "10th": it matches the `n` flag zero times and then "1" (the stem of the `1` flag in the `.dic` file) and "0th". The `n*` modifier at the front allows adding any number of any other digit, so this rule also allows words like "110th" or "10000th". 102 | 103 | ### Other docs 104 | 105 | * An overview of [internals](./docs/internals.md) 106 | * [Comparisons](./docs/compare.md) to other spellcheckers 107 | * [Development and contributing notes](./docs/CONTRIBUTING.md) 108 | 109 | ### Credits 110 | 111 | * [`@zverok`]'s [blog series on rebuilding Hunspell][zverok-blog] was an invaluable resource during early prototypes. The old [`spylls`](https://github.com/zverok/spylls)-like prototype can be found on the `spylls` branch. 112 | * Ultimately [Nuspell](https://github.com/nuspell/nuspell)'s codebase became the reference for Spellbook though as C++ idioms mesh better with Rust than Python's. Nuspell's code is in great shape and is much more readable than Hunspell so for now Spellbook is essentially a Rust rewrite of Nuspell (though we may diverge in the future). 
113 | * The parser for `.dic` and `.aff` files is loosely based on [ZSpell](https://github.com/pluots/zspell). 114 | 115 | [`hashbrown`]: https://github.com/rust-lang/hashbrown 116 | [`foldhash`]: https://github.com/orlp/foldhash 117 | [`@zverok`]: https://github.com/zverok 118 | [zverok-blog]: https://zverok.space/spellchecker.html 119 | -------------------------------------------------------------------------------- /docs/internals.md: -------------------------------------------------------------------------------- 1 | # Internals 2 | 3 | ## Data structures 4 | 5 | ### Boxed slices 6 | 7 | By default Spellbook prefers boxed slices (`Box<[T]>`) and boxed strs (`Box<str>`) rather than their resizable counterparts `Vec` and `String`. Boxed slices can be used as drop-in replacements for the most part. For example you can index into a boxed slice and iterate on the elements. The difference is that boxed slices have a fixed size once created: you can push to a `Vec` but not a `Box<[T]>`. They also discard any excess capacity and don't need to track length and capacity separately, saving a very small amount of memory per instance. 8 | 9 | ```rust 10 | type Stem = UmbraString; 11 | ``` 12 | 13 | `Box<str>` was the representation for stems in the dictionary but this type has been changed to an even further optimized structure based on a "German string." A deep dive into the optimization can be found [here](https://the-mikedavis.github.io/posts/german-string-optimizations-in-spellbook/). 14 | 15 | ### Flag sets 16 | 17 | ```rust 18 | struct FlagSet(UmbraSlice<Flag>); 19 | ``` 20 | 21 | As mentioned above, `FlagSet` uses a "German string"-inspired optimization to store small sets inline. You can imagine this type as basically `Box<[Flag]>` though. Further discussion of the `FlagSet` optimization in particular can be found [here](https://the-mikedavis.github.io/posts/german-string-optimizations-in-spellbook/#bonus-points-the-flagset-can-also-be-german). 22 | 23 | Words in the dictionary are associated with any number of flags, like `adventure/DRSMZG` mentioned above. The order of the flags as written in the dictionary isn't important. We need a way to look up whether a flag exists in that set quickly. The right tool for the job might seem like a `HashSet` or a `BTreeSet`. Those are mutable though, so they carry some extra overhead. A dictionary contains many, many flag sets and the overhead adds up. So what we use instead is an optimized version of a sorted `Box<[Flag]>` and we look up flags with `slice::contains` or `slice::binary_search` depending on length. 24 | 25 | Binary searching a small slice is a tiny bit slower than `slice::contains` but we prefer `slice::binary_search` for its consistent performance on outlier flag sets. See [`benches/slice-contains.rs`](../benches/slice-contains.rs) for more details. 26 | 27 | ### Flags 28 | 29 | ```rust 30 | type Flag = core::num::NonZeroU16; 31 | ``` 32 | 33 | Spellbook represents flags with non-zero `u16`s. Non-zero numbers are special core types in Rust that enable a memory layout optimization: the size of an `Option<NonZeroU16>` is the same as the size of a `u16` - you don't pay for the `Option`. This optimization isn't useful for flags in flag sets, but flags are also used in `.aff` files to mark stems as having some special properties. For example `en_US` uses `ONLYINCOMPOUND c` to declare that stems in the dictionary with the `c` flag are only valid when used in a compound, for example `1th/tc`, `2th/tc` or `3th/tc`.
These stems are only correct when used in a compound like "11th", "12th" or "13th" and aren't correct alone. Internally Spellbook keeps a bunch of these `Option<Flag>`s so the layout optimization saves a small amount of space. 34 | 35 | By default, flags are encoded in a dictionary with the `UTF-8` flag type. For `en_US` that means that each character after the `/` in a word in the dictionary and any flags declared in `en_US.aff` are converted to a `u16` (and then a `NonZeroU16`). A 16-bit integer can't fit every Unicode character (a code point may need up to 32 bits) but the lower 16 bits of the Unicode range are more than sufficient for declaring flags. Flags are only used to specify properties within the `.aff` file rather than stems. There are other encodings for flags used by some dictionaries. See the `FlagType` enum for more details. 36 | 37 | ### Word list 38 | 39 | ```rust 40 | type WordList = HashBag<Stem, FlagSet>; 41 | ``` 42 | 43 | The word list is one of the two central data structures. It's a lookup table for the pairs of `(stem, flag_set)` defined in a dictionary's `.dic` file. We need to look up whether a word is in the dictionary (and what flags it has) very quickly. A natural choice might be a `HashMap` or `BTreeSet`. Unlike the flag sets and boxed slices and strs mentioned above, it's ok for this type to be resizable: there's only one instance of it in a dictionary and the API can support adding words to the dictionary to enable building a personal dictionary feature. Instead, the snag with this type is that there can be duplicate stems in the dictionary with different flag sets. Merging the flag sets together isn't correct: the combination of flags might allow one prefix/suffix to apply but not work in compounds, while another entry provides a different prefix/suffix which can compound. 44 | 45 | So what we need is something closer to `HashMap<Stem, Vec<FlagSet>>`. The extra `Vec` is overhead that isn't necessary in most cases though, since duplicate stems are fairly rare. In other languages like C++ this is where a [multi map](https://en.cppreference.com/w/cpp/container/unordered_multimap) might fit. It's the same idea as a hash map but allows for duplicate keys. Building a type like this in Rust is actually pretty straightforward with the [`hashbrown`] `HashTable` API. Insertion is slightly simpler than a `HashMap`: we don't need to check if the key is already in the table, we can just blindly insert. Reading from the table works very similarly to `HashMap::get`. Lookup in a regular hash map can stop searching the table when the first entry matching the key is found. For a multi map though, we keep searching so that every entry matching the key is found. 46 | 47 | See the implementation details for this in [`src/hash_bag.rs`](../src/hash_bag.rs). 48 | 49 | ### Affix index 50 | 51 | Affixes (i.e. prefixes and suffixes) are stored in an "index" that allows quick lookup. For example `en_US` has prefixes like these: 52 | 53 | ``` 54 | PFX C Y 1 55 | PFX C 0 de . 56 | 57 | PFX E Y 1 58 | PFX E 0 dis . 59 | ``` 60 | 61 | These might apply to a stem in the dictionary like `pose/CAKEGDS` to allow the words "depose" and "dispose". When checking "depose" we look in the set of prefixes to find any where the input word starts with the "add" part (for example `"depose".starts_with("de")`). 62 | 63 | A [prefix tree](https://en.wikipedia.org/wiki/Trie) would allow very quick lookup. Trees and graph-like structures are not the most straightforward things to write in Rust though.
Luckily Nuspell has a trick for this type which works well in Rust. Instead of a tree, we collect the set of prefixes into a `Box<[Prefix]>` table sorted by the "add" part of a prefix/suffix ("de" or "dis" above, for example). We can then binary search based on whether a prefix matches (`str::starts_with`). There are some additional optimizations, like an extra lookup table that maps the first character in a prefix to the starting index in the `Box<[Prefix]>` table so that we can jump to the right region of the table quickly. 64 | 65 | ## Unsafe code 66 | 67 | Spellbook uses `unsafe` in three ways: 68 | 69 | 1. Small-string/slice optimizations. The `umbra_slice` module uses `unsafe` to interpret itself as either an inline or allocated string/slice. 70 | 2. UTF-8 manipulation. Spellbook manipulates UTF-8 encoded strings as bytes in some cases for performance reasons. For example when checking German sharps, Spellbook might replace "ss" with "ß". These two strings have the same UTF-8 length (2 bytes) so the bytes can be overwritten directly. This kind of edit can't be done as efficiently in safe Rust. 71 | 3. A `CharsStr` type in the ngram suggester (`src/suggester/ngram.rs`) indexes into its underlying `str` without bounds checks for performance reasons. 72 | 73 | These uses of `unsafe` could theoretically be eliminated: 74 | 75 | 1. The `Stem` and `FlagSlice` types could switch from `umbra_slice` types to `Box<str>` and `Box<[Flag]>` respectively, with the tradeoff of significantly higher total dictionary memory size (around 25% more for `en_US`). 76 | 2. String edits could be done using safe methods only, for an unknown performance hit to the checker and likely a larger hit to the suggester. 77 | 3. `CharsStr` could use checked lookups into its underlying `str` for a small performance hit. 78 | 79 | But eliminating `unsafe` is not really interesting to me. The uses rely on solid assumptions and are typically documented with "SAFETY" comments. 80 | 81 | [`hashbrown`]: https://github.com/rust-lang/hashbrown 82 | -------------------------------------------------------------------------------- /src/hash_bag.rs: -------------------------------------------------------------------------------- 1 | use core::{ 2 | borrow::Borrow, 3 | fmt::Debug, 4 | hash::{BuildHasher, Hash}, 5 | }; 6 | 7 | use hashbrown::hash_table::{self, HashTable, IterHash}; 8 | 9 | /// A collection of key-value pairs - similar to a HashMap - which allows for duplicate keys. 10 | /// 11 | /// The name is inspired by Erlang's ETS bag table type which also allows duplicate records. 12 | /// Entire key-value pairs may be duplicated. Conceptually this is a lot like 13 | /// `HashMap<K, Vec<V>>`. In other languages like C++ this is called a [multimap]. 14 | /// Multimaps are usually preferred over `HashMap<K, Vec<V>>` in cases where there are few 15 | /// duplicates since the overhead of the Vec is unnecessary in most lookups. 16 | /// 17 | /// In Spellbook this type is used to represent the "WordList". Hunspell-like dictionaries are 18 | /// defined as sets of "stems" and a collection of "flags" that apply to that stem. Some 19 | /// dictionaries provide multiple definitions of a stem with different sets of flags. Naively 20 | /// merging these stems is not correct: the flags in one set might prevent an affix from 21 | /// compounding while another set of flags provides a different affix which supports compounding. 22 | /// 23 | /// Internally this is built on Hashbrown's "raw" API - a set of tools for building [Swiss 24 | /// Tables].
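///
/// A rough sketch of the intended usage (an illustration rather than a doctest from the
/// crate; `flags_a`/`flags_b` are placeholders for whatever flag sets the caller builds):
///
/// ```ignore
/// let mut bag = HashBag::with_capacity_and_hasher(2, crate::DefaultHashBuilder::default());
/// bag.insert("ampere", flags_a);
/// // Inserting the same key again keeps both entries instead of merging them.
/// bag.insert("ampere", flags_b);
/// // `get` yields every key-value pair whose key matches.
/// assert_eq!(bag.get("ampere").count(), 2);
/// ```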
25 | /// 26 | /// [multimap]: https://en.cppreference.com/w/cpp/container/multimap 27 | /// [Swiss Tables]: https://abseil.io/blog/20180927-swisstables 28 | #[derive(Clone)] 29 | pub struct HashBag<K, V, S> { 30 | table: HashTable<(K, V)>, 31 | build_hasher: S, 32 | } 33 | 34 | impl<K, V, S> HashBag<K, V, S> { 35 | /// Returns an iterator over the key-value pairs in the bag. 36 | /// 37 | /// The ordering of the pairs returned by the iterator is undefined. 38 | pub fn iter(&self) -> Iter<'_, K, V> { 39 | Iter { 40 | inner: self.table.iter(), 41 | } 42 | } 43 | 44 | /// The number of key-value pairs in the table. 45 | pub fn len(&self) -> usize { 46 | self.table.len() 47 | } 48 | } 49 | 50 | impl<K, V, S> HashBag<K, V, S> 51 | where 52 | K: Hash + Eq, 53 | S: BuildHasher, 54 | { 55 | pub fn with_capacity_and_hasher(capacity: usize, build_hasher: S) -> Self { 56 | Self { 57 | table: HashTable::with_capacity(capacity), 58 | build_hasher, 59 | } 60 | } 61 | 62 | /// Inserts a key-value pair into the bag. 63 | /// 64 | /// Duplicate keys or entire key-value pairs are permitted. 65 | pub fn insert(&mut self, k: K, v: V) { 66 | let hash = make_hash(&self.build_hasher, &k); 67 | let hasher = make_hasher(&self.build_hasher); 68 | // Insert without attempting to find an existing entry with this key. 69 | self.table.insert_unique(hash, (k, v), hasher); 70 | } 71 | 72 | /// Gets all key-value pairs in the bag with the given key. 73 | // NOTE: we return the key strictly for lifetime reasons: we can "smuggle" owned Cows through 74 | // the bag. 75 | pub fn get<'bag, 'key, Q>(&'bag self, k: &'key Q) -> GetAllIter<'bag, 'key, Q, K, V> 76 | where 77 | K: Borrow<Q>, 78 | Q: Hash + Eq + ?Sized, 79 | { 80 | let hash = make_hash(&self.build_hasher, k); 81 | 82 | GetAllIter { 83 | inner: self.table.iter_hash(hash), 84 | key: k, 85 | } 86 | } 87 | } 88 | 89 | impl<K, V, S> Debug for HashBag<K, V, S> 90 | where 91 | K: Debug + Hash + Eq, 92 | V: Debug, 93 | { 94 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 95 | f.debug_map().entries(self.iter()).finish() 96 | } 97 | } 98 | 99 | // `make_hash`, `make_hasher`, and `Iter` are pulled from Hashbrown's `map` module 100 | // at `274c7bbd79398881e0225c0133e423ce60d7a8f1`. 101 | 102 | fn make_hash<Q, S>(hash_builder: &S, val: &Q) -> u64 103 | where 104 | Q: Hash + ?Sized, 105 | S: BuildHasher, 106 | { 107 | use core::hash::Hasher; 108 | let mut state = hash_builder.build_hasher(); 109 | val.hash(&mut state); 110 | state.finish() 111 | } 112 | 113 | fn make_hasher<Q, V, S>(hash_builder: &S) -> impl Fn(&(Q, V)) -> u64 + '_ 114 | where 115 | Q: Hash, 116 | S: BuildHasher, 117 | { 118 | move |val| make_hash::<Q, S>(hash_builder, &val.0) 119 | } 120 | 121 | // This is a very thin wrapper around `hash_table::Iter` which rearranges the reference so that 122 | // we return `(&k, &v)` instead of `&(k, v)`.
123 | pub struct Iter<'a, K, V> { 124 | inner: hash_table::Iter<'a, (K, V)>, 125 | } 126 | 127 | impl<'a, K, V> Iterator for Iter<'a, K, V> { 128 | type Item = (&'a K, &'a V); 129 | 130 | fn next(&mut self) -> Option<(&'a K, &'a V)> { 131 | let (k, v) = self.inner.next()?; 132 | Some((k, v)) 133 | } 134 | 135 | fn size_hint(&self) -> (usize, Option<usize>) { 136 | self.inner.size_hint() 137 | } 138 | } 139 | 140 | impl<K, V> ExactSizeIterator for Iter<'_, K, V> { 141 | fn len(&self) -> usize { 142 | self.inner.len() 143 | } 144 | } 145 | 146 | pub struct GetAllIter<'bag, 'key, Q: ?Sized, K, V> 147 | where 148 | K: Borrow<Q>, 149 | Q: Hash + Eq, 150 | { 151 | inner: IterHash<'bag, (K, V)>, 152 | key: &'key Q, 153 | } 154 | 155 | impl<'bag, Q: ?Sized, K, V> Iterator for GetAllIter<'bag, '_, Q, K, V> 156 | where 157 | K: Borrow<Q>, 158 | Q: Hash + Eq, 159 | { 160 | type Item = (&'bag K, &'bag V); 161 | 162 | fn next(&mut self) -> Option<Self::Item> { 163 | loop { 164 | match self.inner.next() { 165 | Some((k, v)) => { 166 | if self.key.eq(k.borrow()) { 167 | return Some((k, v)); 168 | } 169 | continue; 170 | } 171 | None => return None, 172 | } 173 | } 174 | } 175 | } 176 | 177 | #[cfg(test)] 178 | mod test { 179 | use core::hash::BuildHasher; 180 | 181 | use crate::alloc::{string::ToString, vec::Vec}; 182 | use crate::DefaultHashBuilder; 183 | 184 | impl<K, V, S: BuildHasher + Default> super::HashBag<K, V, S> { 185 | pub fn new() -> Self { 186 | Self { 187 | table: super::HashTable::new(), 188 | build_hasher: S::default(), 189 | } 190 | } 191 | } 192 | 193 | type HashBag<K, V> = super::HashBag<K, V, DefaultHashBuilder>; 194 | 195 | #[test] 196 | fn insert_and_get_duplicate_keys() { 197 | let mut bag = HashBag::new(); 198 | bag.insert(1, 1); 199 | bag.insert(5, 5); 200 | assert!(bag.len() == 2); 201 | bag.insert(1, 1); 202 | bag.insert(1, 2); 203 | assert!(bag.len() == 4); 204 | 205 | let mut vals: Vec<_> = bag.get(&1).map(|kv| kv.1).copied().collect(); 206 | vals.sort_unstable(); 207 | assert_eq!(&[1, 1, 2], vals.as_slice()); 208 | let vals = bag.get(&5).map(|kv| kv.1).copied().collect::<Vec<_>>(); 209 | assert_eq!(&[5], vals.as_slice()); 210 | } 211 | 212 | #[test] 213 | fn string_keys() { 214 | let mut bag = HashBag::new(); 215 | bag.insert("hello".to_string(), "bob"); 216 | bag.insert("hello".to_string(), "world"); 217 | bag.insert("bye".to_string(), "bob"); 218 | 219 | let mut hellos: Vec<_> = bag.get("hello").map(|kv| kv.1).copied().collect(); 220 | hellos.sort_unstable(); 221 | assert_eq!(&["bob", "world"], hellos.as_slice()); 222 | 223 | let vals: Vec<_> = bag.get("bye").map(|kv| kv.1).copied().collect(); 224 | assert_eq!(&["bob"], vals.as_slice()); 225 | } 226 | 227 | #[test] 228 | fn lookup_correctness_on_large_corpus() { 229 | let max = 100_000; 230 | let expected: Vec<_> = (1..max).flat_map(|n| [(n, n), (n, n + 1)]).collect(); 231 | 232 | let mut bag = HashBag::new(); 233 | for (k, v) in expected.iter() { 234 | bag.insert(*k, *v); 235 | } 236 | 237 | let mut buf = Vec::with_capacity(2); 238 | for n in 1..max { 239 | buf.clear(); 240 | buf.extend(bag.get(&n).map(|(k, v)| (*k, *v))); 241 | buf.sort_unstable(); 242 | assert_eq!(&[(n, n), (n, n + 1)], buf.as_slice()); 243 | } 244 | } 245 | 246 | #[test] 247 | fn iter() { 248 | // The iterator is currently unused but very small and could be useful for debugging.
249 | let pairs = &[(1, 1), (1, 2), (1, 3), (3, 1)]; 250 | let mut bag = HashBag::new(); 251 | for (k, v) in pairs { 252 | bag.insert(k, v); 253 | } 254 | 255 | assert_eq!(bag.iter().len(), pairs.len()); 256 | 257 | let mut values: Vec<_> = bag.iter().map(|(k, v)| (**k, **v)).collect(); 258 | values.sort_unstable(); 259 | assert_eq!(&values, pairs); 260 | } 261 | 262 | #[test] 263 | fn display() { 264 | // Shameless coverage test, it brings the file to 100% :P 265 | let pairs = &[(1, 1), (1, 1), (1, 2), (1, 3), (3, 1)]; 266 | let mut bag = super::HashBag::with_capacity_and_hasher( 267 | pairs.len(), 268 | // We use a hard-coded seed so that the display is deterministic. 269 | foldhash::fast::FixedState::with_seed(1234567810), 270 | ); 271 | for (k, v) in pairs { 272 | bag.insert(k, v); 273 | } 274 | 275 | assert_eq!( 276 | "{1: 1, 1: 1, 1: 2, 1: 3, 3: 1}", 277 | crate::alloc::format!("{bag:?}").as_str() 278 | ); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /docs/compare.md: -------------------------------------------------------------------------------- 1 | # Comparisons to other spellcheckers 2 | 3 | Spellbook is mainly a rewrite of Nuspell so the ways Spellbook diverges from Nuspell are the most notable: 4 | 5 | * Instead of a custom hash table implementation Spellbook uses [`hashbrown`]. 6 | * Spellbook uses a different string and slice representation for word-list stems and flagsets. See the [internals] document for more info. 7 | * Nuspell switches to UTF-32 for ngram suggestions while Spellbook consistently uses UTF-8. This performs better in Rust as the standard library has optimized routines for UTF-8 strings. 8 | 9 | ## Performance comparisons of "Hunspell-like" `check` 10 | 11 | When it comes to performance I mainly care about the time it takes to check a word - dictionary initialization (`new`) and suggestion (`suggest`) don't happen often enough to be really concerning. To measure `check`/`spell` we use `cargo bench` benchmarks as of Rust nightly-2024-08-21 (arbitrarily). Benchmarks for Spellbook can be seen in the `benches/check.rs` file in this repository. Benchmarks for Hunspell and Nuspell can be found in [`the-mikedavis/ffi-dictionaries`](https://github.com/the-mikedavis/ffi-dictionaries/tree/3b1dc8fb4caf1961a958011d0255ed7d31696616/benches)'s `benches` directory. Note that these use `c"word"` sigils so the benchmark doesn't pay the cost of converting from a Rust string to a C/C++ string. These tests are run against the stock en_US dictionary. 
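To make the setup concrete, here is a rough sketch of what one of these bench cases can look like - a reconstruction for illustration, not the actual contents of `benches/check.rs`; the dictionary file paths are assumptions:

```rust
// Requires a nightly toolchain (the runs above used nightly-2024-08-21) for the
// unstable `test` crate and `#[bench]`.
#![feature(test)]
extern crate test;

use test::{black_box, Bencher};

#[bench]
fn in_dictionary_word(b: &mut Bencher) {
    // Assumed location of the stock en_US dictionary files.
    let aff = std::fs::read_to_string("vendor/en_US/en_US.aff").unwrap();
    let dic = std::fs::read_to_string("vendor/en_US/en_US.dic").unwrap();
    let dict = spellbook::Dictionary::new(&aff, &dic).unwrap();
    // Only the `check` call is measured; dictionary initialization happens outside `iter`.
    b.iter(|| assert!(dict.check(black_box("earth"))));
}

// The other cases in the table below differ only in the word passed to `check`,
// e.g. "earthly" for `word_with_suffix` or "20000th" for `compound_word`.
```

The measured results for each case: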
12 | 13 | | Test name | Word | Hunspell | Nuspell | Spellbook | 14 | |--------------------------------|-------------------|------------------------------|------------------------------|----------------------------| 15 | | `breaks` | light-weight-like | 4,433.77 ns/iter (+/- 44.80) | 1,250.27 ns/iter (+/- 23.00) | 865.61 ns/iter (+/- 31.54) | 16 | | `compound_word` | 20000th | 3,432.38 ns/iter (+/- 49.68) | 736.53 ns/iter (+/- 15.57) | 708.11 ns/iter (+/- 20.72) | 17 | | `in_dictionary_word` | earth | 212.30 ns/iter (+/- 3.67) | 126.97 ns/iter (+/- 7.39) | 65.84 ns/iter (+/- 2.12) | 18 | | `incorrect_prefix` | reearth | 1,175.15 ns/iter (+/- 12.86) | 520.51 ns/iter (+/- 19.37) | 493.24 ns/iter (+/- 18.18) | 19 | | `number` | 8675,309.0 | 239.64 ns/iter (+/- 6.47) | 92.12 ns/iter (+/- 1.29) | 41.10 ns/iter (+/- 1.13) | 20 | | `titlecase_in_dictionary_word` | Earth | 827.88 ns/iter (+/- 13.48) | 513.78 ns/iter (+/- 9.10) | 314.48 ns/iter (+/- 8.61) | 21 | | `uppercase_in_dictionary_word` | EARTH | 1,426.46 ns/iter (+/- 15.70) | 280.78 ns/iter (+/- 8.03) | 182.43 ns/iter (+/- 6.41) | 22 | | `word_with_prefix` | unearth | 288.74 ns/iter (+/- 7.82) | 924.66 ns/iter (+/- 35.69) | 539.69 ns/iter (+/- 10.97) | 23 | | `word_with_prefix_and_suffix` | unearthed | 448.48 ns/iter (+/- 9.64) | 534.85 ns/iter (+/- 14.11) | 395.25 ns/iter (+/- 9.71) | 24 | | `word_with_suffix` | earthly | 236.94 ns/iter (+/- 6.17) | 141.47 ns/iter (+/- 2.98) | 69.81 ns/iter (+/- 7.60) | 25 | 26 | When it comes to memory, Valgrind's [DHAT](https://valgrind.org/docs/manual/dh-manual.html) is a good tool for reporting total allocations and breaking down heap interaction. We'll run `valgrind --tool=dhat hello` for an example binary that checks the given word against en_US. 27 | 28 | Hunspell: 29 | 30 | ``` 31 | Total: 3,325,493 bytes in 90,736 blocks 32 | At t-gmax: 3,164,250 bytes in 90,144 blocks 33 | At t-end: 0 bytes in 0 blocks 34 | Reads: 16,733,982 bytes 35 | Writes: 6,264,702 bytes 36 | ``` 37 | 38 | Nuspell: 39 | 40 | ``` 41 | Total: 4,437,599 bytes in 50,419 blocks 42 | At t-gmax: 4,208,644 bytes in 49,870 blocks 43 | At t-end: 3,486 bytes in 6 blocks 44 | Reads: 16,320,481 bytes 45 | Writes: 8,027,585 bytes 46 | ``` 47 | 48 | Spellbook: 49 | 50 | ``` 51 | Total: 2,580,386 bytes in 44,741 blocks 52 | At t-gmax: 2,189,633 bytes in 947 blocks 53 | At t-end: 0 bytes in 0 blocks 54 | Reads: 1,728,454 bytes 55 | Writes: 2,105,758 bytes 56 | ``` 57 | 58 | ### Analysis 59 | 60 | I am mostly familiar with Nuspell, so I'll be talking about Spellbook vs. Nuspell in this section. 61 | 62 | The `check` code is basically a rewrite so they should perform very similarly. One major difference that might affect lookup time is the main lookup table. It's meant to be a hash multi-map, like a `HashMap` but allowing duplicate keys. Nuspell rolls its own hash table type for this while Spellbook uses `hashbrown::HashTable` which has SIMD optimizations for searching. Spellbook also uses `foldhash` by default, which is quite fast, while Nuspell uses `std::hash` (implementation-specific). This sometimes happens with Rust rewrites: it's a pain to take a dependency in C/C++ so C/C++ libraries/tools might leave performance on the table by not taking advantage of available high-performance dependencies. To confirm or deny this suspicion, one could replace Nuspell's `Word_List` type with an adaptation from Google's `SwissTable` library (on which `hashbrown` is based).
63 | 64 | Otherwise I suspect that Rust's standard library has better optimizations for string searching and equality, as I know it uses `memchr` and SIMD operations when available. 65 | 66 | When it comes to memory, Spellbook is optimized to save memory by cutting out unnecessary bytes from the common string type used in the lookup table, as well as small-string and small-slice optimizations for the stem and flagsets. The [internals] document has more details. 67 | 68 | ## ZSpell 69 | 70 | [`pluots/zspell`](https://github.com/pluots/zspell) is an interesting alternative to the Hunspell-like spellcheckers mentioned above. ZSpell also takes the `.dic` and `.aff` Hunspell-style dictionary files. At time of writing ZSpell doesn't support suggestions. The interesting part of ZSpell is how it checks words instead. 71 | 72 | ZSpell expands affixes during instantiation of a dictionary. (See the `README.md` doc in this repository for a basic intro on affixes.) The "classic" spellcheckers mentioned above contain a subset of the possible dictionary words in a main lookup table. For example Spellbook's table includes "adventure" but not some of its conjugations made possible by prefixes/suffixes like "adventurer" or "adventured". In contrast, ZSpell expands each stem so that its tables include "adventure", "adventures", "adventurer", "adventured", "adventuring" and more. When checking a word, ZSpell performs a lookup into a handful of hash maps, short-circuiting if a word is found. 73 | 74 | The benefit is a basically constant-time `Dictionary::check_word` performance: 75 | 76 | | Test name | Word | ZSpell | Notes... | 77 | |--------------------------------|-------------------|------------------------------|----------| 78 | | `breaks` | light-weight-like | N/A | ZSpell has a custom tokenization/breaking strategy not based on Hunspell | 79 | | `compound_word` | 20000th | N/A | ZSpell does not support compounds | 80 | | `in_dictionary_word` | earth | 46.86 ns/iter (+/- 1.35) | | 81 | | `incorrect_prefix` | reearth | 62.47 ns/iter (+/- 1.14) | | 82 | | `number` | 8675,309.0 | N/A | ZSpell does not detect/support numbers | 83 | | `titlecase_in_dictionary_word` | Earth | 50.51 ns/iter (+/- 0.42) | | 84 | | `uppercase_in_dictionary_word` | EARTH | 52.63 ns/iter (+/- 0.46) | | 85 | | `word_with_prefix` | unearth | 54.52 ns/iter (+/- 1.84) | | 86 | | `word_with_prefix_and_suffix` | unearthed | 61.13 ns/iter (+/- 2.08) | | 87 | | `word_with_suffix` | earthly | 54.94 ns/iter (+/- 1.52) | | 88 | 89 | This comes with costs however. Behold the `DHAT` output for an example `check` binary run: 90 | 91 | ``` 92 | Total: 83,209,456 bytes in 731,177 blocks 93 | At t-gmax: 57,081,051 bytes in 347,038 blocks 94 | At t-end: 246,796 bytes in 459 blocks 95 | Reads: 130,487,585 bytes 96 | Writes: 69,845,862 bytes 97 | ``` 98 | 99 | So the tradeoff is much more memory usage. There's also a correctness issue with compounds: "20000th" from the benchmark fails to check. Checking compounds involves slicing up the input word and checking the components to see if they are compound components laid out in a pattern declared by the `.aff` file. This part of Hunspell/Nuspell is not implemented by ZSpell. 100 | 101 | For `en_US` specifically you might accept these tradeoffs. It's more memory but the check time is nearly constant. `en_US` only uses compounds for numbers, for example "7th", "21st" or "20000th" from the benchmark. 
If you have a large corpus to check, don't care much for memory and can skip compounds then it's not a bad tradeoff. 102 | 103 | The approach of expanding affixes is not scalable however and the tradeoff becomes worse with other Hunspell dictionaries. `en_US` is quite slim and simple with 50,000 stems, 7 prefixes and 16 suffixes. Brazilian Portuguese (`pt_BR`) is a far more complicated real-world dictionary weighing in at over 312,000 stems, 47 prefixes and 57 suffixes. Even with Spellbook this dictionary takes a hefty 100ms to initialize but with ZSpell, initialization runs for more than six minutes and consumes more than 100GB of memory before I kill it. 104 | 105 | The reason I mention ZSpell specifically in the comparison is it's a good example of the strategy taken by other implementations that consume Hunspell dictionary files. [Harper](https://github.com/elijah-potter/harper) and [Vale](https://github.com/errata-ai/vale) are two other projects in the wild that expand affixes in their checkers. 106 | 107 | [`hashbrown`]: https://github.com/rust-lang/hashbrown 108 | [internals]: ./internals.md 109 | -------------------------------------------------------------------------------- /tests/legacy.rs: -------------------------------------------------------------------------------- 1 | /* 2 | These are "legacy" tests (borrowing Nuspell's terminology) which are ported originally from 3 | Hunspell's codebase. Each case has a `.dic` and `.aff` file which set up an example 4 | dictionary and each case might have any of... 5 | 6 | * `.good`: a listing of words for which `Dictionary::check` should return `true`. 7 | * `.wrong`: a listing of words for which `Dictionary::check` should return `false`. 8 | * `.sug`: a listing of words which should be suggested for the given wrong word on the 9 | corresponding line in `.wrong`, separated by commas and whitespace. 10 | 11 | We use a simple declarative macro to create a `#[test]` for each case. The advantage of a 12 | `#[test]` for each case is that a single test can fail but the others will run. We could use a 13 | single test which globs for `.dic` files instead: that would make it very easy to add a case. But 14 | these cases are meant just to ensure we have parity with the Hunspell/Nuspell test beds, so cases 15 | should not be added often. The test runs faster without the glob (and requires no glob 16 | dependency). 17 | */ 18 | use std::{ 19 | fs::{self, File}, 20 | io::{self, Read}, 21 | path::{Path, PathBuf}, 22 | }; 23 | 24 | use spellbook::Dictionary; 25 | 26 | macro_rules! 
check { 27 | ($case:ident) => { 28 | #[allow(non_snake_case)] 29 | #[test] 30 | fn $case() { 31 | let case = stringify!($case).strip_prefix("check_").unwrap(); 32 | do_check_case(case); 33 | } 34 | }; 35 | } 36 | 37 | fn do_check_case(case: &str) { 38 | let manifest_path = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); 39 | let path = manifest_path.join("tests/legacy").join(case); 40 | let aff = read_to_string(path.with_extension("aff")).unwrap(); 41 | let dic = read_to_string(path.with_extension("dic")).unwrap(); 42 | let dict = Dictionary::new(&aff, &dic).unwrap(); 43 | 44 | for good_word in fs::read_to_string(path.with_extension("good")) 45 | .iter() 46 | .flat_map(|text| text.lines()) 47 | { 48 | let word = good_word.trim(); 49 | assert!( 50 | dict.check(word), 51 | "expected {word:?} to be correct but it was incorrect" 52 | ); 53 | } 54 | 55 | for wrong_word in fs::read_to_string(path.with_extension("wrong")) 56 | .iter() 57 | .flat_map(|text| text.lines()) 58 | { 59 | let word = wrong_word.trim(); 60 | assert!( 61 | !dict.check(word), 62 | "expected {word:?} to be incorrect but it was correct" 63 | ); 64 | } 65 | } 66 | 67 | check!(check_1463589); 68 | check!(check_1463589_utf); 69 | check!(check_1592880); 70 | check!(check_1695964); 71 | check!(check_1706659); 72 | check!(check_1975530); 73 | check!(check_2970240); 74 | check!(check_2970242); 75 | check!(check_2999225); 76 | check!(check_affixes); 77 | check!(check_alias2); 78 | check!(check_alias3); 79 | check!(check_alias); 80 | check!(check_allcaps2); 81 | check!(check_allcaps3); 82 | check!(check_allcaps); 83 | check!(check_allcaps_utf); 84 | check!(check_arabic); 85 | check!(check_base); 86 | check!(check_base_utf); 87 | check!(check_breakdefault); 88 | check!(check_break); 89 | check!(check_breakoff); 90 | check!(check_checkcompoundcase2); 91 | check!(check_checkcompoundcase); 92 | check!(check_checkcompoundcaseutf); 93 | check!(check_checkcompounddup); 94 | // Use CHECKCOMPOUNDPATTERN replacements which aren't implemented yet: 95 | // check!(check_checkcompoundpattern2); 96 | // check!(check_checkcompoundpattern3); 97 | // check!(check_checkcompoundpattern4); 98 | check!(check_checkcompoundpattern); 99 | check!(check_checkcompoundrep); 100 | check!(check_checkcompoundtriple); 101 | check!(check_checksharps); 102 | check!(check_checksharpsutf); 103 | check!(check_circumfix); 104 | check!(check_colons_in_words); 105 | check!(check_complexprefixes2); 106 | check!(check_complexprefixes); 107 | check!(check_complexprefixesutf); 108 | check!(check_compoundaffix2); 109 | check!(check_compoundaffix3); 110 | check!(check_compoundaffix); 111 | check!(check_compoundflag); 112 | check!(check_compoundrule2); 113 | check!(check_compoundrule3); 114 | check!(check_compoundrule4); 115 | check!(check_compoundrule5); 116 | check!(check_compoundrule6); 117 | check!(check_compoundrule7); 118 | check!(check_compoundrule8); 119 | check!(check_compoundrule); 120 | check!(check_conditionalprefix); 121 | // Fails due to weird encoding of the aff/dic: 122 | // check!(check_condition); 123 | check!(check_condition_utf); 124 | check!(check_digits_in_words); 125 | check!(check_dotless_i); 126 | // Fails due to weird encoding of the aff/dic: 127 | // check!(check_encoding); 128 | check!(check_flag); 129 | check!(check_flaglong); 130 | check!(check_flagnum); 131 | check!(check_flagutf8); 132 | check!(check_fogemorpheme); 133 | check!(check_forbiddenword); 134 | check!(check_forceucase); 135 | check!(check_fullstrip); 136 | 
check!(check_germancompounding); 137 | check!(check_germancompoundingold); 138 | check!(check_hu); 139 | check!(check_i35725); 140 | check!(check_i53643); 141 | check!(check_i54633); 142 | // Fails due to weird encoding of the aff/dic: 143 | // check!(check_i54980); 144 | check!(check_i58202); 145 | check!(check_i68568); 146 | check!(check_i68568utf); 147 | check!(check_iconv2); 148 | check!(check_iconv); 149 | check!(check_ignore); 150 | check!(check_ignoreutf); 151 | check!(check_IJ); 152 | check!(check_keepcase); 153 | check!(check_korean); 154 | check!(check_map); 155 | check!(check_maputf); 156 | // Presumably needs morphology support? 157 | // check!(check_morph); 158 | check!(check_needaffix2); 159 | check!(check_needaffix3); 160 | check!(check_needaffix4); 161 | check!(check_needaffix5); 162 | check!(check_needaffix); 163 | check!(check_nepali); 164 | check!(check_ngram_utf_fix); 165 | check!(check_nosuggest); 166 | check!(check_oconv); 167 | check!(check_onlyincompound2); 168 | check!(check_onlyincompound); 169 | check!(check_opentaal_cpdpat2); 170 | check!(check_opentaal_cpdpat); 171 | check!(check_opentaal_forbiddenword1); 172 | check!(check_opentaal_forbiddenword2); 173 | check!(check_opentaal_keepcase); 174 | check!(check_phone); 175 | check!(check_rep); 176 | check!(check_reputf); 177 | check!(check_simplifiedtriple); 178 | check!(check_slash); 179 | check!(check_sug); 180 | check!(check_sugutf); 181 | check!(check_utf8_bom2); 182 | check!(check_utf8_bom); 183 | check!(check_utf8); 184 | check!(check_utf8_nonbmp); 185 | check!(check_utfcompound); 186 | check!(check_warn); 187 | check!(check_zeroaffix); 188 | 189 | macro_rules! suggest { 190 | ($case:ident) => { 191 | #[allow(non_snake_case)] 192 | #[test] 193 | fn $case() { 194 | let case = stringify!($case).strip_prefix("suggest_").unwrap(); 195 | do_suggest_case(case); 196 | } 197 | }; 198 | } 199 | 200 | fn do_suggest_case(case: &str) { 201 | let manifest_path = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); 202 | let path = manifest_path.join("tests/legacy").join(case); 203 | let aff = read_to_string(path.with_extension("aff")).unwrap(); 204 | let dic = read_to_string(path.with_extension("dic")).unwrap(); 205 | let dict = Dictionary::new(&aff, &dic).unwrap(); 206 | 207 | let mut list_sugs = Vec::new(); 208 | let mut sugs = Vec::new(); 209 | for word in fs::read_to_string(path.with_extension("wrong")) 210 | .iter() 211 | .flat_map(|text| text.lines()) 212 | .filter(|line| !line.is_empty()) 213 | { 214 | assert!( 215 | !dict.check(word), 216 | "word {word:?} was in the .wrong file but was actually correct" 217 | ); 218 | 219 | dict.suggest(word, &mut sugs); 220 | 221 | if !sugs.is_empty() { 222 | list_sugs.push(std::mem::take(&mut sugs)); 223 | } 224 | } 225 | 226 | let mut expected_list_sugs = Vec::new(); 227 | for line in fs::read_to_string(path.with_extension("sug")) 228 | .iter() 229 | .flat_map(|text| text.lines()) 230 | .filter(|line| !line.is_empty()) 231 | { 232 | let sugs: Vec<_> = line.split(", ").map(ToOwned::to_owned).collect(); 233 | if !sugs.is_empty() { 234 | expected_list_sugs.push(sugs); 235 | } 236 | } 237 | 238 | assert_eq!( 239 | expected_list_sugs, list_sugs, 240 | "(left: expected, right: actual)" 241 | ); 242 | } 243 | 244 | suggest!(suggest_1463589); 245 | suggest!(suggest_1463589_utf); 246 | suggest!(suggest_1695964); 247 | suggest!(suggest_IJ); 248 | suggest!(suggest_allcaps); 249 | suggest!(suggest_allcaps_utf); 250 | suggest!(suggest_allcaps2); 251 | suggest!(suggest_base); 252 | 
suggest!(suggest_base_utf); 253 | suggest!(suggest_breakdefault); 254 | suggest!(suggest_forceucase); 255 | suggest!(suggest_i35725); 256 | suggest!(suggest_i54633); 257 | suggest!(suggest_i58202); 258 | suggest!(suggest_keepcase); 259 | suggest!(suggest_map); 260 | suggest!(suggest_maputf); 261 | suggest!(suggest_ngram_utf_fix); 262 | suggest!(suggest_oconv); 263 | suggest!(suggest_onlyincompound); 264 | suggest!(suggest_opentaal_forbiddenword1); 265 | suggest!(suggest_opentaal_forbiddenword2); 266 | suggest!(suggest_opentaal_keepcase); 267 | suggest!(suggest_rep); 268 | suggest!(suggest_reputf); 269 | suggest!(suggest_sug); 270 | suggest!(suggest_sugutf); 271 | 272 | // These are marked as failing in Nuspell: 273 | // suggest!(suggest_checksharps); 274 | // suggest!(suggest_checksharpsutf); 275 | // suggest!(suggest_nosuggest); 276 | // suggest!(suggest_phone); 277 | // suggest!(suggest_utf8_nonbmp); 278 | 279 | /// Reads the contents of a file into a String, handling detecting and decoding of non-UTF-8 280 | /// contents. 281 | fn read_to_string<P: AsRef<Path>>(path: P) -> io::Result<String> { 282 | // Adapted from Helix's document opening function: 283 | // 284 | 285 | const BUF_SIZE: usize = 8 * 1024; 286 | let mut buf = [0u8; BUF_SIZE]; 287 | let mut reader = File::open(path).unwrap(); 288 | let read = reader.read(&mut buf).unwrap(); 289 | assert_ne!(read, 0); 290 | 291 | // Guess encoding. 292 | let mut detector = chardetng::EncodingDetector::new(); 293 | detector.feed(&buf, read < BUF_SIZE); 294 | let encoding = detector.guess(None, true); 295 | 296 | let mut decoder = encoding.new_decoder(); 297 | let mut output = String::new(); 298 | let mut slice = &buf[..read]; 299 | let mut is_empty = read == 0; 300 | let mut total_written = 0usize; 301 | // Zero-initialized bytes are a valid str. 302 | let mut buf_out = [0u8; BUF_SIZE]; 303 | let buf_str = unsafe { std::str::from_utf8_unchecked_mut(&mut buf_out) }; 304 | loop { 305 | let mut total_read = 0usize; 306 | 307 | // An inner loop is necessary as it is possible that the input buffer 308 | // may not be completely decoded on the first `decode_to_str()` call 309 | // which would happen in cases where the output buffer is filled to 310 | // capacity. 311 | loop { 312 | let (result, read, written, ..) = decoder.decode_to_str( 313 | &slice[total_read..], 314 | &mut buf_str[total_written..], 315 | is_empty, 316 | ); 317 | 318 | // These variables act as the read and write cursors of `buf` and `buf_str` respectively. 319 | // They are necessary in case the output buffer fills before decoding of the entire input 320 | // loop is complete. Otherwise, the loop would endlessly iterate over the same `buf` and 321 | // the data inside the output buffer would be overwritten. 322 | total_read += read; 323 | total_written += written; 324 | match result { 325 | encoding_rs::CoderResult::InputEmpty => { 326 | assert_eq!(slice.len(), total_read); 327 | break; 328 | } 329 | encoding_rs::CoderResult::OutputFull => { 330 | assert!(slice.len() > total_read); 331 | output.push_str(&buf_str[..total_written]); 332 | total_written = 0; 333 | } 334 | } 335 | } 336 | // Once the end of the stream is reached, the output buffer is 337 | // flushed and the loop terminates. 338 | if is_empty { 339 | assert_eq!(reader.read(&mut buf)?, 0); 340 | output.push_str(&buf_str[..total_written]); 341 | break; 342 | } 343 | 344 | // Once the previous input has been processed and decoded, the next set of 345 | // data is fetched from the reader.
The end of the reader is determined to 346 | // be when exactly `0` bytes were read from the reader, as per the invariants 347 | // of the `Read` trait. 348 | let read = reader.read(&mut buf)?; 349 | slice = &buf[..read]; 350 | is_empty = read == 0; 351 | } 352 | 353 | Ok(output) 354 | } 355 | --------------------------------------------------------------------------------